From e9f5de11d4039f3440adbdb766189d20302ac42b Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 26 Aug 2022 18:09:27 +0200 Subject: [PATCH] tu: Initial implementation of VK_EXT_graphics_pipeline_library Now that the state for each pipeline is split into pieces, we can mostly implement it by stitching together the pieces. One TODO is that we could do more to split up the pre-rast and FS commands into separate draw states so that we have to emit fewer commands when fast linking. Currently we compile the variants but delay emitting the commands until link time; note that even the Gallium driver doesn't currently do this. Given the strict SSO model (e.g. with separate VPC registers for each stage) it may even be possible to do most of the linking ahead of time with only a few fixups for corner cases. Part-of: --- src/freedreno/ci/freedreno-a618-fails.txt | 32 + .../ci/freedreno-a630-asan-fails.txt | 27 + src/freedreno/ci/freedreno-a630-fails.txt | 32 + src/freedreno/ir3/ir3_shader.c | 10 +- src/freedreno/vulkan/tu_descriptor_set.c | 71 +- src/freedreno/vulkan/tu_descriptor_set.h | 4 + src/freedreno/vulkan/tu_device.c | 16 + src/freedreno/vulkan/tu_pipeline.c | 1229 +++++++++++++---- src/freedreno/vulkan/tu_pipeline.h | 42 + src/freedreno/vulkan/tu_shader.c | 30 +- src/freedreno/vulkan/tu_shader.h | 1 + 11 files changed, 1222 insertions(+), 272 deletions(-) diff --git a/src/freedreno/ci/freedreno-a618-fails.txt b/src/freedreno/ci/freedreno-a618-fails.txt index 4258e2f4ff5..c5895225675 100644 --- a/src/freedreno/ci/freedreno-a618-fails.txt +++ b/src/freedreno/ci/freedreno-a618-fails.txt @@ -15,6 +15,22 @@ dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments5_ dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash 
+dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash @@ -23,6 +39,22 @@ gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachme gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash spill-dEQP-VK.subgroups.ballot_broadcast.compute.subgroupbroadcast_bool,Fail spill-dEQP-VK.subgroups.ballot_broadcast.compute.subgroupbroadcast_bool_requiredsubgroupsize128,Fail diff --git a/src/freedreno/ci/freedreno-a630-asan-fails.txt b/src/freedreno/ci/freedreno-a630-asan-fails.txt index 25d7994d5e5..7f45d307e71 100644 --- a/src/freedreno/ci/freedreno-a630-asan-fails.txt +++ b/src/freedreno/ci/freedreno-a630-asan-fails.txt @@ -1 +1,28 @@ dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.12,Crash + +# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3759 +# deqp-vk: ../src/freedreno/vulkan/tu_pipeline.c:3894: tu_pipeline_builder_init_graphics: Assertion `subpass->color_count == 0 || !create_info->pColorBlendState || subpass->color_count == create_info->pColorBlendState->attachmentCount' failed +dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash 
+dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash +dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash diff --git a/src/freedreno/ci/freedreno-a630-fails.txt b/src/freedreno/ci/freedreno-a630-fails.txt index 99a9feacd54..5ca2147794e 100644 --- a/src/freedreno/ci/freedreno-a630-fails.txt +++ b/src/freedreno/ci/freedreno-a630-fails.txt @@ -431,6 +431,22 @@ dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments5_ dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash 
+dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash +dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash @@ -439,6 +455,22 @@ gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachme gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash gmem-dEQP-VK.pipeline.monolithic.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash +gmem-dEQP-VK.pipeline.pipeline_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more0,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more1,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments4_more3,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more0,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more1,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments5_more3,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more0,Crash +gmem-dEQP-VK.pipeline.fast_linked_library.color_write_enable_maxa.cwe_after_bind.attachments6_more1,Crash # https://gitlab.freedesktop.org/mesa/mesa/-/issues/7152 spec@ext_transform_feedback@builtin-varyings gl_culldistance,Fail diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c index 797aac513e6..3c2fde55cae 100644 --- 
a/src/freedreno/ir3/ir3_shader.c +++ b/src/freedreno/ir3/ir3_shader.c @@ -615,17 +615,19 @@ ir3_trim_constlen(struct ir3_shader_variant **variants, { unsigned constlens[MESA_SHADER_STAGES] = {}; + bool shared_consts_enable = false; + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { - if (variants[i]) + if (variants[i]) { constlens[i] = variants[i]->constlen; + shared_consts_enable = + ir3_const_state(variants[i])->shared_consts_enable; + } } uint32_t trimmed = 0; STATIC_ASSERT(MESA_SHADER_STAGES <= 8 * sizeof(trimmed)); - bool shared_consts_enable = - ir3_const_state(variants[MESA_SHADER_VERTEX])->shared_consts_enable; - /* Use a hw quirk for geometry shared consts, not matched with actual * shared consts size (on a6xx). */ diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c index d9d4a051f8f..14d8b4b07da 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.c +++ b/src/freedreno/vulkan/tu_descriptor_set.c @@ -410,6 +410,46 @@ sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx, * just multiple descriptor set layouts pasted together. */ +void +tu_pipeline_layout_init(struct tu_pipeline_layout *layout) +{ + unsigned dynamic_offset_size = 0; + + for (uint32_t set = 0; set < layout->num_sets; set++) { + assert(set < MAX_SETS); + layout->set[set].dynamic_offset_start = dynamic_offset_size; + + if (layout->set[set].layout) + dynamic_offset_size += layout->set[set].layout->dynamic_offset_size; + } + + layout->dynamic_offset_size = dynamic_offset_size; + + /* We only care about INDEPENDENT_SETS for dynamic-offset descriptors, + * where all the descriptors from all the sets are combined into one set + * and we have to provide the dynamic_offset_start dynamically with fast + * linking. + */ + if (dynamic_offset_size == 0) { + layout->independent_sets = false; + } + + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + for (unsigned s = 0; s < layout->num_sets; s++) { + if (layout->set[s].layout) + sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout); + _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start, + sizeof(layout->set[s].dynamic_offset_start)); + } + _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets)); + _mesa_sha1_update(&ctx, &layout->push_constant_size, + sizeof(layout->push_constant_size)); + _mesa_sha1_update(&ctx, &layout->independent_sets, + sizeof(layout->independent_sets)); + _mesa_sha1_final(&ctx, layout->sha1); +} + VKAPI_ATTR VkResult VKAPI_CALL tu_CreatePipelineLayout(VkDevice _device, const VkPipelineLayoutCreateInfo *pCreateInfo, @@ -428,23 +468,16 @@ tu_CreatePipelineLayout(VkDevice _device, return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); layout->num_sets = pCreateInfo->setLayoutCount; - layout->dynamic_offset_size = 0; - - unsigned dynamic_offset_size = 0; - for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) { TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout, pCreateInfo->pSetLayouts[set]); assert(set < MAX_SETS); layout->set[set].layout = set_layout; - layout->set[set].dynamic_offset_start = dynamic_offset_size; - vk_descriptor_set_layout_ref(&set_layout->vk); - - dynamic_offset_size += set_layout->dynamic_offset_size; + if (set_layout) + vk_descriptor_set_layout_ref(&set_layout->vk); } - layout->dynamic_offset_size = dynamic_offset_size; layout->push_constant_size = 0; for (unsigned i = 0; i < pCreateInfo->pushConstantRangeCount; ++i) { @@ -454,18 +487,10 @@ tu_CreatePipelineLayout(VkDevice _device, } layout->push_constant_size = align(layout->push_constant_size, 16); 
+ layout->independent_sets = + pCreateInfo->flags & VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT; - struct mesa_sha1 ctx; - _mesa_sha1_init(&ctx); - for (unsigned s = 0; s < layout->num_sets; s++) { - sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout); - _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start, - sizeof(layout->set[s].dynamic_offset_start)); - } - _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets)); - _mesa_sha1_update(&ctx, &layout->push_constant_size, - sizeof(layout->push_constant_size)); - _mesa_sha1_final(&ctx, layout->sha1); + tu_pipeline_layout_init(layout); *pPipelineLayout = tu_pipeline_layout_to_handle(layout); @@ -483,8 +508,10 @@ tu_DestroyPipelineLayout(VkDevice _device, if (!pipeline_layout) return; - for (uint32_t i = 0; i < pipeline_layout->num_sets; i++) - vk_descriptor_set_layout_unref(&device->vk, &pipeline_layout->set[i].layout->vk); + for (uint32_t i = 0; i < pipeline_layout->num_sets; i++) { + if (pipeline_layout->set[i].layout) + vk_descriptor_set_layout_unref(&device->vk, &pipeline_layout->set[i].layout->vk); + } vk_object_free(&device->vk, pAllocator, pipeline_layout); } diff --git a/src/freedreno/vulkan/tu_descriptor_set.h b/src/freedreno/vulkan/tu_descriptor_set.h index 6b7b1bafdae..270afcfdc30 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.h +++ b/src/freedreno/vulkan/tu_descriptor_set.h @@ -93,6 +93,8 @@ struct tu_pipeline_layout uint32_t dynamic_offset_start; } set[MAX_SETS]; + bool independent_sets; + uint32_t num_sets; uint32_t push_constant_size; uint32_t dynamic_offset_size; @@ -102,6 +104,8 @@ struct tu_pipeline_layout VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout, VK_OBJECT_TYPE_PIPELINE_LAYOUT) +void tu_pipeline_layout_init(struct tu_pipeline_layout *layout); + struct tu_descriptor_set { struct vk_object_base base; diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 894d9875b24..c63610dd277 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -238,11 +238,14 @@ get_device_extensions(const struct tu_physical_device *device, .EXT_tooling_info = true, .EXT_inline_uniform_block = true, .EXT_mutable_descriptor_type = true, + .KHR_pipeline_library = true, + .EXT_graphics_pipeline_library = true, }; } static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = { &tu_shaders_ops, + &tu_nir_shaders_ops, NULL, }; @@ -906,6 +909,12 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, features->multiDraw = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_FEATURES_EXT: { + VkPhysicalDeviceGraphicsPipelineLibraryFeaturesEXT *features = + (VkPhysicalDeviceGraphicsPipelineLibraryFeaturesEXT *)ext; + features->graphicsPipelineLibrary = true; + break; + } default: break; @@ -1371,6 +1380,13 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, properties->maxMultiDrawCount = 2048; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_PROPERTIES_EXT: { + VkPhysicalDeviceGraphicsPipelineLibraryPropertiesEXT *props = + (VkPhysicalDeviceGraphicsPipelineLibraryPropertiesEXT *)ext; + props->graphicsPipelineLibraryFastLinking = true; + props->graphicsPipelineLibraryIndependentInterpolationDecoration = true; + break; + } default: break; } diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index 08659a45cfb..68b59b65466 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ 
b/src/freedreno/vulkan/tu_pipeline.c @@ -15,6 +15,7 @@ #include "main/menums.h" #include "nir/nir.h" #include "nir/nir_builder.h" +#include "nir/nir_serialize.h" #include "spirv/nir_spirv.h" #include "util/debug.h" #include "util/mesa-sha1.h" @@ -243,11 +244,15 @@ struct tu_pipeline_builder struct tu_device *device; void *mem_ctx; struct vk_pipeline_cache *cache; - struct tu_pipeline_layout *layout; const VkAllocationCallbacks *alloc; const VkGraphicsPipelineCreateInfo *create_info; - struct tu_compiled_shaders *shaders; + struct tu_pipeline_layout layout; + + struct tu_compiled_shaders *compiled_shaders; + + struct tu_const_state const_state[MESA_SHADER_FRAGMENT + 1]; + struct ir3_shader_variant *variants[MESA_SHADER_FRAGMENT + 1]; struct ir3_shader_variant *binning_variant; uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1]; uint64_t binning_vs_iova; @@ -260,6 +265,7 @@ struct tu_pipeline_builder /* these states are affectd by rasterizer_discard */ bool depth_clip_disable; bool use_color_attachments; + bool attachment_state_valid; VkFormat color_attachment_formats[MAX_RTS]; VkFormat depth_attachment_format; uint32_t multiview_mask; @@ -268,6 +274,24 @@ struct tu_pipeline_builder bool subpass_feedback_loop_color; bool subpass_feedback_loop_ds; bool feedback_loop_may_involve_textures; + + /* Each library defines at least one piece of state in + * VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so + * there can be at most as many libraries as pieces of state, of which + * there are currently 4. + */ +#define MAX_LIBRARIES 4 + + unsigned num_libraries; + struct tu_pipeline *libraries[MAX_LIBRARIES]; + + /* This is just the state that we are compiling now, whereas the final + * pipeline will include the state from the libraries. + */ + VkGraphicsPipelineLibraryFlagsEXT state; + + /* The stages we are compiling now. */ + VkShaderStageFlags active_stages; }; static bool @@ -648,6 +672,30 @@ tu6_emit_xs(struct tu_cs *cs, } } +static void +tu6_emit_dynamic_offset(struct tu_cs *cs, + const struct ir3_shader_variant *xs, + struct tu_pipeline_builder *builder) +{ + if (!xs || builder->const_state[xs->type].dynamic_offset_loc == UINT32_MAX) + return; + + tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + MAX_SETS); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(builder->const_state[xs->type].dynamic_offset_loc / 4) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) | + CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(MAX_SETS, 4))); + tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + + for (unsigned i = 0; i < MAX_SETS; i++) { + unsigned dynamic_offset_start = + builder->layout.set[i].dynamic_offset_start / (A6XX_TEX_CONST_DWORDS * 4); + tu_cs_emit(cs, i < builder->layout.num_sets ? 
dynamic_offset_start : 0); + } +} + static void tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable) { @@ -1767,7 +1815,7 @@ tu6_emit_program_config(struct tu_cs *cs, STATIC_ASSERT(MESA_SHADER_VERTEX == 0); - bool shared_consts_enable = tu6_shared_constants_enable(builder->layout, + bool shared_consts_enable = tu6_shared_constants_enable(&builder->layout, builder->device->compiler); tu6_emit_shared_consts_enable(cs, shared_consts_enable); @@ -1780,7 +1828,7 @@ tu6_emit_program_config(struct tu_cs *cs, .gfx_ibo = true, .gfx_shared_const = shared_consts_enable)); for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) { - tu6_emit_xs_config(cs, stage, builder->shaders->variants[stage]); + tu6_emit_xs_config(cs, stage, builder->variants[stage]); } } @@ -1790,15 +1838,14 @@ tu6_emit_program(struct tu_cs *cs, bool binning_pass, struct tu_pipeline *pipeline) { - const struct ir3_shader_variant *vs = builder->shaders->variants[MESA_SHADER_VERTEX]; + const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX]; const struct ir3_shader_variant *bs = builder->binning_variant; - const struct ir3_shader_variant *hs = builder->shaders->variants[MESA_SHADER_TESS_CTRL]; - const struct ir3_shader_variant *ds = builder->shaders->variants[MESA_SHADER_TESS_EVAL]; - const struct ir3_shader_variant *gs = builder->shaders->variants[MESA_SHADER_GEOMETRY]; - const struct ir3_shader_variant *fs = builder->shaders->variants[MESA_SHADER_FRAGMENT]; + const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL]; + const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL]; + const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY]; + const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT]; gl_shader_stage stage = MESA_SHADER_VERTEX; - uint32_t cps_per_patch = builder->create_info->pTessellationState ? - builder->create_info->pTessellationState->patchControlPoints : 0; + uint32_t cps_per_patch = pipeline->tess.patch_control_points; bool multi_pos_output = vs->multi_pos_output; /* Don't use the binning pass variant when GS is present because we don't @@ -1807,20 +1854,22 @@ tu6_emit_program(struct tu_cs *cs, if (binning_pass && !gs) { vs = bs; tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova); + tu6_emit_dynamic_offset(cs, bs, builder); stage++; } for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) { - const struct ir3_shader_variant *xs = builder->shaders->variants[stage]; + const struct ir3_shader_variant *xs = builder->variants[stage]; if (stage == MESA_SHADER_FRAGMENT && binning_pass) fs = xs = NULL; tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]); + tu6_emit_dynamic_offset(cs, xs, builder); } - uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1; - uint32_t multiview_cntl = builder->multiview_mask ? + uint32_t multiview_views = util_logbase2(pipeline->rast.multiview_mask) + 1; + uint32_t multiview_cntl = pipeline->rast.multiview_mask ? 
A6XX_PC_MULTIVIEW_CNTL_ENABLE | A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) | COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS) @@ -1845,7 +1894,7 @@ tu6_emit_program(struct tu_cs *cs, if (multiview_cntl && builder->device->physical_device->info->a6xx.supports_multiview_mask) { tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1); - tu_cs_emit(cs, builder->multiview_mask); + tu_cs_emit(cs, pipeline->rast.multiview_mask); } tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); @@ -2366,6 +2415,38 @@ tu_setup_pvtmem(struct tu_device *dev, return VK_SUCCESS; } +static bool +contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state) +{ + return (state & + (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) == + (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT); +} + +/* Return true if this pipeline contains all of the GPL stages listed but none + * of the libraries it uses do, so this is "the first time" that all of them + * are defined together. This is useful for state that needs to be combined + * from multiple GPL stages. + */ + +static bool +set_combined_state(struct tu_pipeline_builder *builder, + struct tu_pipeline *pipeline, + VkGraphicsPipelineLibraryFlagsEXT state) +{ + if ((pipeline->state & state) != state) + return false; + + for (unsigned i = 0; i < builder->num_libraries; i++) { + if ((builder->libraries[i]->state & state) == state) + return false; + } + + return true; +} + static VkResult tu_pipeline_allocate_cs(struct tu_device *dev, struct tu_pipeline *pipeline, @@ -2373,38 +2454,49 @@ tu_pipeline_allocate_cs(struct tu_device *dev, struct tu_pipeline_builder *builder, struct ir3_shader_variant *compute) { - uint32_t size = 1024 + tu6_load_state_size(pipeline, layout); + uint32_t size = 1024; /* graphics case: */ if (builder) { - size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS + - 2 * TU6_EMIT_VFD_DEST_MAX_DWORDS; - - for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) { - if (builder->shaders->variants[i]) { - size += builder->shaders->variants[i]->info.size / 4; - } + if (builder->state & + VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) { + size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS; } - size += builder->binning_variant->info.size / 4; + if (set_combined_state(builder, pipeline, + VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) { + size += 2 * TU6_EMIT_VFD_DEST_MAX_DWORDS; + size += tu6_load_state_size(pipeline, layout); - builder->additional_cs_reserve_size = 0; - for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) { - struct ir3_shader_variant *variant = builder->shaders->variants[i]; - if (variant) { - builder->additional_cs_reserve_size += - tu_xs_get_additional_cs_size_dwords(variant); - - if (variant->binning) { - builder->additional_cs_reserve_size += - tu_xs_get_additional_cs_size_dwords(variant->binning); + for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) { + if (builder->variants[i]) { + size += builder->variants[i]->info.size / 4; } } - } - /* The additional size is used twice, once per tu6_emit_program() call. 
*/ - size += builder->additional_cs_reserve_size * 2; + size += builder->binning_variant->info.size / 4; + + builder->additional_cs_reserve_size = 0; + for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) { + struct ir3_shader_variant *variant = builder->variants[i]; + if (variant) { + builder->additional_cs_reserve_size += + tu_xs_get_additional_cs_size_dwords(variant); + + if (variant->binning) { + builder->additional_cs_reserve_size += + tu_xs_get_additional_cs_size_dwords(variant->binning); + } + } + } + + /* The additional size is used twice, once per tu6_emit_program() call. */ + size += builder->additional_cs_reserve_size * 2; + } } else { + size += tu6_load_state_size(pipeline, layout); + size += compute->info.size / 4; size += tu_xs_get_additional_cs_size_dwords(compute); @@ -2436,20 +2528,42 @@ tu_pipeline_allocate_cs(struct tu_device *dev, static void tu_pipeline_shader_key_init(struct ir3_shader_key *key, const struct tu_pipeline *pipeline, - const VkGraphicsPipelineCreateInfo *pipeline_info) + struct tu_pipeline_builder *builder, + nir_shader **nir) { - for (uint32_t i = 0; i < pipeline_info->stageCount; i++) { - if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) { + /* We set this after we compile to NIR because we need the prim mode */ + key->tessellation = IR3_TESS_NONE; + + for (unsigned i = 0; i < builder->num_libraries; i++) { + if (!(builder->libraries[i]->state & + (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT))) + continue; + + const struct ir3_shader_key *library_key = + &builder->libraries[i]->ir3_key; + + if (library_key->tessellation != IR3_TESS_NONE) + key->tessellation = library_key->tessellation; + key->has_gs |= library_key->has_gs; + key->sample_shading |= library_key->sample_shading; + } + + for (uint32_t i = 0; i < builder->create_info->stageCount; i++) { + if (builder->create_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) { key->has_gs = true; break; } } - if (pipeline_info->pRasterizationState->rasterizerDiscardEnable && - !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD))) + if (!(builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) return; - const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState; + if (builder->rasterizer_discard) + return; + + const VkPipelineMultisampleStateCreateInfo *msaa_info = + builder->create_info->pMultisampleState; /* The 1.3.215 spec says: * @@ -2475,12 +2589,9 @@ tu_pipeline_shader_key_init(struct ir3_shader_key *key, * just checked in tu6_emit_fs_inputs. We will also copy the value to * tu_shader_key::force_sample_interp in a bit. 
*/ - if (msaa_info->sampleShadingEnable && + if (msaa_info && msaa_info->sampleShadingEnable && (msaa_info->minSampleShading * msaa_info->rasterizationSamples) > 1.0f) key->sample_shading = true; - - /* We set this after we compile to NIR because we need the prim mode */ - key->tessellation = IR3_TESS_NONE; } static uint32_t @@ -2624,12 +2735,21 @@ tu_shader_key_init(struct tu_shader_key *key, static void tu_hash_stage(struct mesa_sha1 *ctx, const VkPipelineShaderStageCreateInfo *stage, + const nir_shader *nir, const struct tu_shader_key *key) { - unsigned char stage_hash[SHA1_DIGEST_LENGTH]; - vk_pipeline_hash_shader_stage(stage, stage_hash); - _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash)); + if (nir) { + struct blob blob; + blob_init(&blob); + nir_serialize(&blob, nir, true); + _mesa_sha1_update(ctx, blob.data, blob.size); + blob_finish(&blob); + } else { + unsigned char stage_hash[SHA1_DIGEST_LENGTH]; + vk_pipeline_hash_shader_stage(stage, stage_hash); + _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash)); + } _mesa_sha1_update(ctx, key, sizeof(*key)); } @@ -2647,9 +2767,11 @@ tu_hash_compiler(struct mesa_sha1 *ctx, const struct ir3_compiler *compiler) static void tu_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInfo **stages, + nir_shader *const *nir, const struct tu_pipeline_layout *layout, const struct tu_shader_key *keys, const struct ir3_shader_key *ir3_key, + VkGraphicsPipelineLibraryFlagsEXT state, const struct ir3_compiler *compiler) { struct mesa_sha1 ctx; @@ -2662,10 +2784,11 @@ tu_hash_shaders(unsigned char *hash, _mesa_sha1_update(&ctx, ir3_key, sizeof(ir3_key)); for (int i = 0; i < MESA_SHADER_STAGES; ++i) { - if (stages[i]) { - tu_hash_stage(&ctx, stages[i], &keys[i]); + if (stages[i] || nir[i]) { + tu_hash_stage(&ctx, stages[i], nir[i], &keys[i]); } } + _mesa_sha1_update(&ctx, &state, sizeof(state)); tu_hash_compiler(&ctx, compiler); _mesa_sha1_final(&ctx, hash); } @@ -2684,7 +2807,7 @@ tu_hash_compute(unsigned char *hash, if (layout) _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); - tu_hash_stage(&ctx, stage, key); + tu_hash_stage(&ctx, stage, NULL, key); tu_hash_compiler(&ctx, compiler); _mesa_sha1_final(&ctx, hash); @@ -2708,6 +2831,9 @@ tu_shaders_destroy(struct vk_pipeline_cache_object *object) for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) ralloc_free(shaders->variants[i]); + for (unsigned i = 0; i < ARRAY_SIZE(shaders->safe_const_variants); i++) + ralloc_free(shaders->safe_const_variants[i]); + vk_pipeline_cache_object_finish(&shaders->base); vk_free(&object->device->alloc, shaders); } @@ -2753,6 +2879,13 @@ tu_shaders_serialize(struct vk_pipeline_cache_object *object, } else { blob_write_uint8(blob, 0); } + + if (shaders->safe_const_variants[i]) { + blob_write_uint8(blob, 1); + ir3_store_variant(blob, shaders->safe_const_variants[i]); + } else { + blob_write_uint8(blob, 0); + } } return true; @@ -2774,10 +2907,13 @@ tu_shaders_deserialize(struct vk_device *_device, shaders->active_desc_sets = blob_read_uint8(blob); for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) { - bool has_shader = blob_read_uint8(blob); - if (has_shader) { + if (blob_read_uint8(blob)) { shaders->variants[i] = ir3_retrieve_variant(blob, dev->compiler, NULL); } + + if (blob_read_uint8(blob)) { + shaders->safe_const_variants[i] = ir3_retrieve_variant(blob, dev->compiler, NULL); + } } return &shaders->base; @@ -2806,6 +2942,117 @@ tu_pipeline_cache_insert(struct vk_pipeline_cache *cache, return container_of(object, struct 
tu_compiled_shaders, base); } +static bool +tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object, + struct blob *blob); + +static struct vk_pipeline_cache_object * +tu_nir_shaders_deserialize(struct vk_device *device, + const void *key_data, size_t key_size, + struct blob_reader *blob); + +static void +tu_nir_shaders_destroy(struct vk_pipeline_cache_object *object) +{ + struct tu_nir_shaders *shaders = + container_of(object, struct tu_nir_shaders, base); + + for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) + ralloc_free(shaders->nir[i]); + + vk_pipeline_cache_object_finish(&shaders->base); + vk_free(&object->device->alloc, shaders); +} + +const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = { + .serialize = tu_nir_shaders_serialize, + .deserialize = tu_nir_shaders_deserialize, + .destroy = tu_nir_shaders_destroy, +}; + +static struct tu_nir_shaders * +tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size) +{ + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1); + VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size); + + if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE)) + return NULL; + + memcpy(obj_key_data, key_data, key_size); + vk_pipeline_cache_object_init(&dev->vk, &shaders->base, + &tu_nir_shaders_ops, obj_key_data, key_size); + + return shaders; +} + +static bool +tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object, + struct blob *blob) +{ + struct tu_nir_shaders *shaders = + container_of(object, struct tu_nir_shaders, base); + + for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) { + if (shaders->nir[i]) { + blob_write_uint8(blob, 1); + nir_serialize(blob, shaders->nir[i], true); + } else { + blob_write_uint8(blob, 0); + } + } + + return true; +} + +static struct vk_pipeline_cache_object * +tu_nir_shaders_deserialize(struct vk_device *_device, + const void *key_data, size_t key_size, + struct blob_reader *blob) +{ + struct tu_device *dev = container_of(_device, struct tu_device, vk); + struct tu_nir_shaders *shaders = + tu_nir_shaders_init(dev, key_data, key_size); + + if (!shaders) + return NULL; + + for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) { + if (blob_read_uint8(blob)) { + shaders->nir[i] = + nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob); + } + } + + return &shaders->base; +} + +static struct tu_nir_shaders * +tu_nir_cache_lookup(struct vk_pipeline_cache *cache, + const void *key_data, size_t key_size, + bool *application_cache_hit) +{ + struct vk_pipeline_cache_object *object = + vk_pipeline_cache_lookup_object(cache, key_data, key_size, + &tu_nir_shaders_ops, application_cache_hit); + if (object) + return container_of(object, struct tu_nir_shaders, base); + else + return NULL; +} + +static struct tu_nir_shaders * +tu_nir_cache_insert(struct vk_pipeline_cache *cache, + struct tu_nir_shaders *shaders) +{ + struct vk_pipeline_cache_object *object = + vk_pipeline_cache_add_object(cache, &shaders->base); + return container_of(object, struct tu_nir_shaders, base); +} + + static VkResult tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) @@ -2825,45 +3072,79 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, const VkPipelineCreationFeedbackCreateInfo *creation_feedback = vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); + bool must_compile = + builder->state & 
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT; for (uint32_t i = 0; i < builder->create_info->stageCount; i++) { gl_shader_stage stage = vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage); stage_infos[stage] = &builder->create_info->pStages[i]; - - pipeline->active_stages |= builder->create_info->pStages[i].stage; + must_compile = true; } - if (tu6_shared_constants_enable(builder->layout, builder->device->compiler)) { + if (tu6_shared_constants_enable(&builder->layout, builder->device->compiler)) { pipeline->shared_consts = (struct tu_push_constant_range) { .lo = 0, - .dwords = builder->layout->push_constant_size / 4, + .dwords = builder->layout.push_constant_size / 4, }; } + nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL }; + struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { }; for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(keys); stage++) { tu_shader_key_init(&keys[stage], stage_infos[stage], builder->device); } - struct ir3_shader_key ir3_key = {}; - tu_pipeline_shader_key_init(&ir3_key, pipeline, builder->create_info); + if (builder->create_info->flags & + VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT) { + for (unsigned i = 0; i < builder->num_libraries; i++) { + struct tu_pipeline *library = builder->libraries[i]; - keys[MESA_SHADER_VERTEX].multiview_mask = builder->multiview_mask; - keys[MESA_SHADER_FRAGMENT].multiview_mask = builder->multiview_mask; - keys[MESA_SHADER_FRAGMENT].force_sample_interp = ir3_key.sample_shading; + for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) { + if (library->shaders[j].nir) { + assert(!nir[j]); + nir[j] = nir_shader_clone(NULL, library->shaders[j].nir); + keys[j] = library->shaders[j].key; + must_compile = true; + } + } + } + } + + struct ir3_shader_key ir3_key = {}; + tu_pipeline_shader_key_init(&ir3_key, pipeline, builder, nir); + + struct tu_compiled_shaders *compiled_shaders = NULL; + struct tu_nir_shaders *nir_shaders = NULL; + if (!must_compile) + goto done; + + if (builder->state & + VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) { + keys[MESA_SHADER_VERTEX].multiview_mask = builder->multiview_mask; + } + + if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) { + keys[MESA_SHADER_FRAGMENT].multiview_mask = builder->multiview_mask; + keys[MESA_SHADER_FRAGMENT].force_sample_interp = ir3_key.sample_shading; + } unsigned char pipeline_sha1[20]; - tu_hash_shaders(pipeline_sha1, stage_infos, builder->layout, keys, &ir3_key, compiler); + tu_hash_shaders(pipeline_sha1, stage_infos, nir, &builder->layout, keys, + &ir3_key, builder->state, compiler); + + unsigned char nir_sha1[21]; + memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1)); + nir_sha1[20] = 'N'; const bool executable_info = builder->create_info->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL }; - struct tu_compiled_shaders *compiled_shaders; - if (!executable_info) { + bool cache_hit = false; bool application_cache_hit = false; compiled_shaders = @@ -2871,12 +3152,31 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, sizeof(pipeline_sha1), &application_cache_hit); + cache_hit = !!compiled_shaders; + + /* If the user asks us to keep the NIR around, we need to have it for a + * successful cache hit. If we only have a "partial" cache hit, then we + * still need to recompile in order to get the NIR. 
+ */ + if (compiled_shaders && + (builder->create_info->flags & + VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) { + bool nir_application_cache_hit = false; + nir_shaders = + tu_nir_cache_lookup(builder->cache, &nir_sha1, + sizeof(nir_sha1), + &nir_application_cache_hit); + + application_cache_hit &= nir_application_cache_hit; + cache_hit &= !!nir_shaders; + } + if (application_cache_hit && builder->cache != builder->device->mem_cache) { pipeline_feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; } - if (compiled_shaders) + if (cache_hit) goto done; } @@ -2885,8 +3185,6 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, return VK_PIPELINE_COMPILE_REQUIRED; } - nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL }; - struct tu_shader *shaders[ARRAY_SIZE(nir)] = { NULL }; for (gl_shader_stage stage = MESA_SHADER_VERTEX; @@ -2907,7 +3205,8 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; } - if (!nir[MESA_SHADER_FRAGMENT]) { + if (!nir[MESA_SHADER_FRAGMENT] && + (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) { const nir_shader_compiler_options *nir_options = ir3_get_compiler_options(builder->device->compiler); nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, @@ -2929,6 +3228,32 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, tu_link_shaders(builder, nir, ARRAY_SIZE(nir)); + if (builder->create_info->flags & + VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT) { + nir_shaders = + tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1)); + for (gl_shader_stage stage = MESA_SHADER_VERTEX; + stage < ARRAY_SIZE(nir); stage++) { + if (!nir[stage]) + continue; + + nir_shaders->nir[stage] = nir_shader_clone(NULL, nir[stage]); + } + + nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders); + + if (compiled_shaders) + goto done; + } + + compiled_shaders = + tu_shaders_init(builder->device, &pipeline_sha1, sizeof(pipeline_sha1)); + + if (!compiled_shaders) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + uint32_t desc_sets = 0; for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir); stage++) { @@ -2939,7 +3264,7 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, struct tu_shader *shader = tu_shader_create(builder->device, nir[stage], &keys[stage], - builder->layout, builder->alloc); + &builder->layout, builder->alloc); if (!shader) { result = VK_ERROR_OUT_OF_HOST_MEMORY; goto fail; @@ -2972,20 +3297,18 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; } + /* In the tess-but-not-FS case we don't know whether the FS will read + * PrimID so we need to unconditionally store it. 
+ */ + if (nir[MESA_SHADER_TESS_CTRL] && !nir[MESA_SHADER_FRAGMENT]) + ir3_key.tcs_store_primid = true; + struct tu_shader *last_shader = shaders[MESA_SHADER_GEOMETRY]; if (!last_shader) last_shader = shaders[MESA_SHADER_TESS_EVAL]; if (!last_shader) last_shader = shaders[MESA_SHADER_VERTEX]; - compiled_shaders = - tu_shaders_init(builder->device, &pipeline_sha1, sizeof(pipeline_sha1)); - - if (!compiled_shaders) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } - compiled_shaders->active_desc_sets = desc_sets; for (gl_shader_stage stage = MESA_SHADER_VERTEX; @@ -3028,9 +3351,19 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, } stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; + } else if (contains_all_shader_state(builder->state)) { + compiled_shaders->safe_const_variants[stage] = + ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key, + executable_info); + if (!compiled_shaders->variants[stage]) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } } } + ir3_key.safe_constlen = false; + for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir); stage++) { if (shaders[stage]) { @@ -3041,42 +3374,133 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, compiled_shaders = tu_pipeline_cache_insert(builder->cache, compiled_shaders); -done: - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(nir); stage++) { - if (compiled_shaders->variants[stage]) { - tu_append_executable(pipeline, compiled_shaders->variants[stage], - nir_initial_disasm[stage]); +done:; + + struct ir3_shader_variant *safe_const_variants[ARRAY_SIZE(nir)] = { NULL }; + nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL }; + + if (compiled_shaders) { + for (gl_shader_stage stage = MESA_SHADER_VERTEX; + stage < ARRAY_SIZE(nir); stage++) { + if (compiled_shaders->variants[stage]) { + tu_append_executable(pipeline, compiled_shaders->variants[stage], + nir_initial_disasm[stage]); + builder->variants[stage] = compiled_shaders->variants[stage]; + safe_const_variants[stage] = + compiled_shaders->safe_const_variants[stage]; + builder->const_state[stage] = + compiled_shaders->const_state[stage]; + } } } - struct ir3_shader_variant *vs = - compiled_shaders->variants[MESA_SHADER_VERTEX]; - - struct ir3_shader_variant *variant; - if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) { - tu_append_executable(pipeline, vs->binning, NULL); - variant = vs->binning; - } else { - variant = vs; + if (nir_shaders) { + for (gl_shader_stage stage = MESA_SHADER_VERTEX; + stage < ARRAY_SIZE(nir); stage++) { + if (nir_shaders->nir[stage]) { + post_link_nir[stage] = nir_shaders->nir[stage]; + } + } } - builder->binning_variant = variant; + /* In the case where we're building a library without link-time + * optimization but with sub-libraries that retain LTO info, we should + * retain it ourselves in case another pipeline includes us with LTO. 
+ */ + for (unsigned i = 0; i < builder->num_libraries; i++) { + struct tu_pipeline *library = builder->libraries[i]; + for (gl_shader_stage stage = MESA_SHADER_VERTEX; + stage < ARRAY_SIZE(library->shaders); stage++) { + if (!post_link_nir[stage] && library->shaders[stage].nir) { + post_link_nir[stage] = library->shaders[stage].nir; + keys[stage] = library->shaders[stage].key; + } + } + } - builder->shaders = compiled_shaders; + if (!(builder->create_info->flags & + VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT)) { + for (unsigned i = 0; i < builder->num_libraries; i++) { + struct tu_pipeline *library = builder->libraries[i]; + for (gl_shader_stage stage = MESA_SHADER_VERTEX; + stage < ARRAY_SIZE(library->shaders); stage++) { + if (library->shaders[stage].variant) { + assert(!builder->variants[stage]); + builder->variants[stage] = library->shaders[stage].variant; + safe_const_variants[stage] = + library->shaders[stage].safe_const_variant; + builder->const_state[stage] = + library->shaders[stage].const_state; + post_link_nir[stage] = library->shaders[stage].nir; + } + } + } - pipeline->active_desc_sets = compiled_shaders->active_desc_sets; - if (compiled_shaders->variants[MESA_SHADER_TESS_CTRL]) { + /* Because we added more variants, we need to trim constlen again. + */ + if (builder->num_libraries > 0) { + uint32_t safe_constlens = ir3_trim_constlen(builder->variants, compiler); + for (gl_shader_stage stage = MESA_SHADER_VERTEX; + stage < ARRAY_SIZE(builder->variants); stage++) { + if (safe_constlens & (1u << stage)) + builder->variants[stage] = safe_const_variants[stage]; + } + } + } + + if (compiled_shaders) + pipeline->active_desc_sets = compiled_shaders->active_desc_sets; + + for (unsigned i = 0; i < builder->num_libraries; i++) { + struct tu_pipeline *library = builder->libraries[i]; + pipeline->active_desc_sets |= library->active_desc_sets; + } + + if (compiled_shaders && compiled_shaders->variants[MESA_SHADER_TESS_CTRL]) { pipeline->tess.patch_type = compiled_shaders->variants[MESA_SHADER_TESS_CTRL]->key.tessellation; } + if (contains_all_shader_state(pipeline->state)) { + struct ir3_shader_variant *vs = + builder->variants[MESA_SHADER_VERTEX]; + + struct ir3_shader_variant *variant; + if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) { + tu_append_executable(pipeline, vs->binning, NULL); + variant = vs->binning; + } else { + variant = vs; + } + + builder->binning_variant = variant; + + builder->compiled_shaders = compiled_shaders; + + /* It doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO + * when compiling all stages, but make sure we don't leak. 
+ */ + if (nir_shaders) + vk_pipeline_cache_object_unref(&nir_shaders->base); + } else { + pipeline->compiled_shaders = compiled_shaders; + pipeline->nir_shaders = nir_shaders; + pipeline->ir3_key = ir3_key; + for (gl_shader_stage stage = MESA_SHADER_VERTEX; + stage < ARRAY_SIZE(pipeline->shaders); stage++) { + pipeline->shaders[stage].nir = post_link_nir[stage]; + pipeline->shaders[stage].key = keys[stage]; + pipeline->shaders[stage].const_state = builder->const_state[stage]; + pipeline->shaders[stage].variant = builder->variants[stage]; + pipeline->shaders[stage].safe_const_variant = + safe_const_variants[stage]; + } + } + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; if (creation_feedback) { *creation_feedback->pPipelineCreationFeedback = pipeline_feedback; - assert(builder->create_info->stageCount == - creation_feedback->pipelineStageCreationFeedbackCount); for (uint32_t i = 0; i < builder->create_info->stageCount; i++) { gl_shader_stage s = vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage); @@ -3097,6 +3521,9 @@ fail: if (compiled_shaders) vk_pipeline_cache_object_unref(&compiled_shaders->base); + if (nir_shaders) + vk_pipeline_cache_object_unref(&nir_shaders->base); + return result; } @@ -3227,6 +3654,155 @@ tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder, } } +static void +tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder, + struct tu_pipeline *pipeline) +{ + const VkPipelineLibraryCreateInfoKHR *library_info = + vk_find_struct_const(builder->create_info->pNext, + PIPELINE_LIBRARY_CREATE_INFO_KHR); + + if (library_info) { + assert(library_info->libraryCount <= MAX_LIBRARIES); + builder->num_libraries = library_info->libraryCount; + for (unsigned i = 0; i < library_info->libraryCount; i++) { + TU_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]); + builder->libraries[i] = library; + } + } + + /* Merge in the state from libraries. The program state is a bit special + * and is handled separately. 
+ */ + pipeline->state = builder->state; + for (unsigned i = 0; i < builder->num_libraries; i++) { + struct tu_pipeline *library = builder->libraries[i]; + pipeline->state |= library->state; + + uint32_t library_dynamic_state = 0; + if (library->state & + VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) { + pipeline->vi = library->vi; + pipeline->ia = library->ia; + library_dynamic_state |= + BIT(TU_DYNAMIC_STATE_VERTEX_INPUT) | + BIT(TU_DYNAMIC_STATE_VB_STRIDE) | + BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY) | + BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE); + pipeline->shared_consts = library->shared_consts; + } + + if (library->state & + VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) { + pipeline->tess = library->tess; + pipeline->rast = library->rast; + pipeline->viewport = library->viewport; + library_dynamic_state |= + BIT(VK_DYNAMIC_STATE_VIEWPORT) | + BIT(VK_DYNAMIC_STATE_SCISSOR) | + BIT(VK_DYNAMIC_STATE_LINE_WIDTH) | + BIT(VK_DYNAMIC_STATE_DEPTH_BIAS) | + BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD); + } + + if (library->state & + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) { + pipeline->ds = library->ds; + pipeline->lrz.fs = library->lrz.fs; + pipeline->lrz.force_disable_mask |= library->lrz.force_disable_mask; + pipeline->lrz.force_late_z |= library->lrz.force_late_z; + library_dynamic_state |= + BIT(VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK) | + BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK) | + BIT(VK_DYNAMIC_STATE_STENCIL_REFERENCE) | + BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL) | + BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL) | + BIT(VK_DYNAMIC_STATE_DEPTH_BOUNDS); + pipeline->shared_consts = library->shared_consts; + } + + if (library->state & + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) { + pipeline->blend = library->blend; + pipeline->output = library->output; + pipeline->lrz.force_disable_mask |= library->lrz.force_disable_mask; + pipeline->lrz.force_late_z |= library->lrz.force_late_z; + pipeline->prim_order = library->prim_order; + library_dynamic_state |= + BIT(VK_DYNAMIC_STATE_BLEND_CONSTANTS) | + BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS) | + BIT(TU_DYNAMIC_STATE_BLEND) | + BIT(TU_DYNAMIC_STATE_LOGIC_OP) | + BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE); + } + + if ((library->state & + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) && + (library->state & + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) { + pipeline->prim_order = library->prim_order; + } + + pipeline->dynamic_state_mask = + (pipeline->dynamic_state_mask & ~library_dynamic_state) | + (library->dynamic_state_mask & library_dynamic_state); + + u_foreach_bit (i, library_dynamic_state & ~library->dynamic_state_mask) { + if (i >= TU_DYNAMIC_STATE_COUNT) + break; + + pipeline->dynamic_state[i] = library->dynamic_state[i]; + } + + if (contains_all_shader_state(library->state)) { + pipeline->program = library->program; + pipeline->load_state = library->load_state; + } + } +} + +static void +tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder, + struct tu_pipeline *pipeline) +{ + TU_FROM_HANDLE(tu_pipeline_layout, layout, builder->create_info->layout); + + if (layout) { + /* Note: it's still valid to have a layout even if there are libraries. + * This allows the app to e.g. overwrite an INDEPENDENT_SET layout with + * a non-INDEPENDENT_SET layout which may make us use a faster path, + * currently this just affects dynamic offset descriptors. 
+ */ + builder->layout = *layout; + } else { + for (unsigned i = 0; i < builder->num_libraries; i++) { + struct tu_pipeline *library = builder->libraries[i]; + builder->layout.num_sets = MAX2(builder->layout.num_sets, + library->num_sets); + for (unsigned j = 0; j < library->num_sets; j++) { + if (library->layouts[i]) + builder->layout.set[i].layout = library->layouts[i]; + } + + builder->layout.push_constant_size = pipeline->push_constant_size; + builder->layout.independent_sets |= pipeline->independent_sets; + } + + tu_pipeline_layout_init(&builder->layout); + } + + if (builder->create_info->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR) { + pipeline->num_sets = builder->layout.num_sets; + for (unsigned i = 0; i < pipeline->num_sets; i++) { + pipeline->layouts[i] = builder->layout.set[i].layout; + if (pipeline->layouts[i]) + vk_descriptor_set_layout_ref(&pipeline->layouts[i]->vk); + } + pipeline->push_constant_size = builder->layout.push_constant_size; + pipeline->independent_sets = builder->layout.independent_sets; + } +} + static void tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link, struct tu_const_state *const_state, @@ -3266,13 +3842,13 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, tu6_emit_program(&prog_cs, builder, true, pipeline); pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); - for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) { - if (!builder->shaders->variants[i]) + for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) { + if (!builder->variants[i]) continue; tu_pipeline_set_linkage(&pipeline->program.link[i], - &builder->shaders->const_state[i], - builder->shaders->variants[i]); + &builder->const_state[i], + builder->variants[i]); } } @@ -3377,8 +3953,8 @@ static void tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) { - if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) || - !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) + if (!(builder->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) || + !(builder->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) return; const VkPipelineTessellationStateCreateInfo *tess_info = @@ -3390,7 +3966,7 @@ tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder, vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO); pipeline->tess.upper_left_domain_origin = !domain_info || domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT; - const struct ir3_shader_variant *hs = builder->shaders->variants[MESA_SHADER_TESS_CTRL]; + const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL]; pipeline->tess.param_stride = hs->output_size * 4; } @@ -3516,6 +4092,8 @@ tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder, vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT); pipeline->rast.provoking_vertex_last = provoking_vtx_state && provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT; + + pipeline->rast.multiview_mask = builder->multiview_mask; } static void @@ -3532,13 +4110,12 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder, */ const VkPipelineDepthStencilStateCreateInfo *ds_info = builder->create_info->pDepthStencilState; - const enum pipe_format pipe_format = - 
vk_format_to_pipe_format(builder->depth_attachment_format); uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0; struct tu_cs cs; - if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED && - builder->depth_attachment_format != VK_FORMAT_S8_UINT) { + if (!builder->attachment_state_valid || + (builder->depth_attachment_format != VK_FORMAT_UNDEFINED && + builder->depth_attachment_format != VK_FORMAT_S8_UINT)) { if (ds_info->depthTestEnable) { rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | @@ -3557,19 +4134,10 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder, if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable) tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl); - - pipeline->output.depth_cpp_per_sample = util_format_get_component_bits( - pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8; - } else { - /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set - * to 0 when this pipeline is used, as enabling depth test when there - * is no depth attachment is a problem (at least for the S8_UINT case) - */ - if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL)) - pipeline->output.rb_depth_cntl_disable = true; } - if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) { + if (!builder->attachment_state_valid || + builder->depth_attachment_format != VK_FORMAT_UNDEFINED) { const VkStencilOpState *front = &ds_info->front; const VkStencilOpState *back = &ds_info->back; @@ -3590,9 +4158,6 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder, A6XX_RB_STENCIL_CONTROL_STENCIL_READ; } - pipeline->output.stencil_cpp_per_sample = util_format_get_component_bits( - pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8; - pipeline->ds.raster_order_attachment_access = ds_info->flags & (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM | @@ -3615,7 +4180,8 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder, pipeline->ds.rb_stencil_cntl = rb_stencil_cntl; /* the remaining draw states arent used if there is no d/s, leave them empty */ - if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED) + if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED && + builder->attachment_state_valid) return; if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) { @@ -3640,8 +4206,8 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder, .bfref = ds_info->back.reference & 0xff)); } - if (builder->shaders->variants[MESA_SHADER_FRAGMENT]) { - const struct ir3_shader_variant *fs = builder->shaders->variants[MESA_SHADER_FRAGMENT]; + if (builder->variants[MESA_SHADER_FRAGMENT]) { + const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT]; if (fs->has_kill) { pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; } @@ -3651,6 +4217,26 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder, } } +static void +tu_pipeline_builder_parse_ds_disable(struct tu_pipeline_builder *builder, + struct tu_pipeline *pipeline) +{ + if (builder->rasterizer_discard) + return; + + /* If RB_DEPTH_CNTL is static state, then we can disable it ahead of time. + * However we only know whether RB_DEPTH_CNTL is dynamic in the fragment + * shader state and we only know whether it needs to be force-disabled in + * the output interface state. 
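+    * Therefore this is deferred until both the fragment shader and fragment
+    * output interface state are known, see the set_combined_state() call
+    * site in tu_pipeline_builder_build().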
+ */ + struct tu_cs cs; + if (pipeline->output.rb_depth_cntl_disable && + tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) { + tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1); + tu_cs_emit(&cs, 0); + } +} + static void tu_pipeline_builder_parse_multisample_and_color_blend( struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) @@ -3704,6 +4290,27 @@ tu_pipeline_builder_parse_multisample_and_color_blend( VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_ARM; } + const enum pipe_format ds_pipe_format = + vk_format_to_pipe_format(builder->depth_attachment_format); + + if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED && + builder->depth_attachment_format != VK_FORMAT_S8_UINT) { + pipeline->output.depth_cpp_per_sample = util_format_get_component_bits( + ds_pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8; + } else { + /* We need to make sure RB_DEPTH_CNTL is set to 0 when this pipeline is + * used, regardless of whether it's linked with a fragment shader + * pipeline that has an enabled depth test or if RB_DEPTH_CNTL is set + * dynamically. + */ + pipeline->output.rb_depth_cntl_disable = true; + } + + if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) { + pipeline->output.stencil_cpp_per_sample = util_format_get_component_bits( + ds_pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8; + } + struct tu_cs cs; tu6_emit_rb_mrt_controls(pipeline, blend_info, builder->color_attachment_formats, @@ -3848,9 +4455,21 @@ tu_pipeline_finish(struct tu_pipeline *pipeline, if (pipeline->pvtmem_bo) tu_bo_finish(dev, pipeline->pvtmem_bo); + if (pipeline->compiled_shaders) + vk_pipeline_cache_object_unref(&pipeline->compiled_shaders->base); + + if (pipeline->nir_shaders) + vk_pipeline_cache_object_unref(&pipeline->nir_shaders->base); + + for (unsigned i = 0; i < pipeline->num_sets; i++) { + if (pipeline->layouts[i]) + vk_descriptor_set_layout_unref(&dev->vk, &pipeline->layouts[i]->vk); + } + ralloc_free(pipeline->executables_mem_ctx); } + static VkResult tu_pipeline_builder_build(struct tu_pipeline_builder *builder, struct tu_pipeline **pipeline) @@ -3865,66 +4484,118 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, (*pipeline)->executables_mem_ctx = ralloc_context(NULL); util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx); - /* compile and upload shaders */ - result = tu_pipeline_builder_compile_shaders(builder, *pipeline); - if (result != VK_SUCCESS) { - vk_object_free(&builder->device->vk, builder->alloc, *pipeline); - return result; + tu_pipeline_builder_parse_dynamic(builder, *pipeline); + tu_pipeline_builder_parse_libraries(builder, *pipeline); + + VkShaderStageFlags stages = 0; + for (unsigned i = 0; i < builder->create_info->stageCount; i++) { + stages |= builder->create_info->pStages[i].stage; } + builder->active_stages = stages; - result = tu_pipeline_allocate_cs(builder->device, *pipeline, - builder->layout, builder, NULL); - if (result != VK_SUCCESS) { - vk_object_free(&builder->device->vk, builder->alloc, *pipeline); - return result; - } + (*pipeline)->active_stages = stages; + for (unsigned i = 0; i < builder->num_libraries; i++) + (*pipeline)->active_stages |= builder->libraries[i]->active_stages; - for (uint32_t i = 0; i < ARRAY_SIZE(builder->shader_iova); i++) - builder->shader_iova[i] = - tu_upload_variant(*pipeline, builder->shaders->variants[i]); + /* Compile and upload shaders unless a library has already done that. 
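+    * A library that contains all of the shader state already provided
+    * pipeline->program via tu_pipeline_builder_parse_libraries(), in which
+    * case the program draw state checked below is non-empty and compilation
+    * is skipped.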
*/ + if ((*pipeline)->program.state.size == 0) { + tu_pipeline_builder_parse_layout(builder, *pipeline); - builder->binning_vs_iova = - tu_upload_variant(*pipeline, builder->binning_variant); - - /* Setup private memory. Note that because we're sharing the same private - * memory for all stages, all stages must use the same config, or else - * fibers from one stage might overwrite fibers in another. - */ - - uint32_t pvtmem_size = 0; - bool per_wave = true; - for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) { - if (builder->shaders->variants[i]) { - pvtmem_size = MAX2(pvtmem_size, builder->shaders->variants[i]->pvtmem_size); - if (!builder->shaders->variants[i]->pvtmem_per_wave) - per_wave = false; + result = tu_pipeline_builder_compile_shaders(builder, *pipeline); + if (result != VK_SUCCESS) { + vk_object_free(&builder->device->vk, builder->alloc, *pipeline); + return result; } } - if (builder->binning_variant) { - pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size); - if (!builder->binning_variant->pvtmem_per_wave) - per_wave = false; + result = tu_pipeline_allocate_cs(builder->device, *pipeline, + &builder->layout, builder, NULL); + + + /* This has to come before emitting the program so that + * pipeline->tess.patch_control_points and pipeline->rast.multiview_mask + * are always set. + */ + if (builder->state & + VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) { + tu_pipeline_builder_parse_tessellation(builder, *pipeline); + (*pipeline)->rast.multiview_mask = builder->multiview_mask; } - result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem, - pvtmem_size, per_wave); - if (result != VK_SUCCESS) { - vk_object_free(&builder->device->vk, builder->alloc, *pipeline); - return result; + if (set_combined_state(builder, *pipeline, + VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) { + if (result != VK_SUCCESS) { + vk_object_free(&builder->device->vk, builder->alloc, *pipeline); + return result; + } + + for (uint32_t i = 0; i < ARRAY_SIZE(builder->shader_iova); i++) + builder->shader_iova[i] = + tu_upload_variant(*pipeline, builder->variants[i]); + + builder->binning_vs_iova = + tu_upload_variant(*pipeline, builder->binning_variant); + + /* Setup private memory. Note that because we're sharing the same private + * memory for all stages, all stages must use the same config, or else + * fibers from one stage might overwrite fibers in another. 
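+       * Take the maximum pvtmem size across all variants and only use the
+       * per-wave layout if every variant supports it.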
+ */ + + uint32_t pvtmem_size = 0; + bool per_wave = true; + for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) { + if (builder->variants[i]) { + pvtmem_size = MAX2(pvtmem_size, builder->variants[i]->pvtmem_size); + if (!builder->variants[i]->pvtmem_per_wave) + per_wave = false; + } + } + + if (builder->binning_variant) { + pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size); + if (!builder->binning_variant->pvtmem_per_wave) + per_wave = false; + } + + result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem, + pvtmem_size, per_wave); + if (result != VK_SUCCESS) { + vk_object_free(&builder->device->vk, builder->alloc, *pipeline); + return result; + } + + tu_pipeline_builder_parse_shader_stages(builder, *pipeline); + tu6_emit_load_state(*pipeline, &builder->layout); } - tu_pipeline_builder_parse_dynamic(builder, *pipeline); - tu_pipeline_builder_parse_shader_stages(builder, *pipeline); - tu_pipeline_builder_parse_vertex_input(builder, *pipeline); - tu_pipeline_builder_parse_input_assembly(builder, *pipeline); - tu_pipeline_builder_parse_tessellation(builder, *pipeline); - tu_pipeline_builder_parse_viewport(builder, *pipeline); - tu_pipeline_builder_parse_rasterization(builder, *pipeline); - tu_pipeline_builder_parse_depth_stencil(builder, *pipeline); - tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline); - tu_pipeline_builder_parse_rasterization_order(builder, *pipeline); - tu6_emit_load_state(*pipeline, builder->layout); + if (builder->state & + VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) { + tu_pipeline_builder_parse_vertex_input(builder, *pipeline); + tu_pipeline_builder_parse_input_assembly(builder, *pipeline); + } + + if (builder->state & + VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) { + tu_pipeline_builder_parse_viewport(builder, *pipeline); + tu_pipeline_builder_parse_rasterization(builder, *pipeline); + } + + if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) { + tu_pipeline_builder_parse_depth_stencil(builder, *pipeline); + } + + if (builder->state & + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) { + tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline); + } + + if (set_combined_state(builder, *pipeline, + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) { + tu_pipeline_builder_parse_rasterization_order(builder, *pipeline); + tu_pipeline_builder_parse_ds_disable(builder, *pipeline); + } return VK_SUCCESS; } @@ -3932,8 +4603,8 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, static void tu_pipeline_builder_finish(struct tu_pipeline_builder *builder) { - if (builder->shaders) - vk_pipeline_cache_object_unref(&builder->shaders->base); + if (builder->compiled_shaders) + vk_pipeline_cache_object_unref(&builder->compiled_shaders->base); ralloc_free(builder->mem_ctx); } @@ -3945,17 +4616,49 @@ tu_pipeline_builder_init_graphics( const VkGraphicsPipelineCreateInfo *create_info, const VkAllocationCallbacks *alloc) { - TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout); - *builder = (struct tu_pipeline_builder) { .device = dev, .mem_ctx = ralloc_context(NULL), .cache = cache, .create_info = create_info, .alloc = alloc, - .layout = layout, }; + const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info = + vk_find_struct_const(builder->create_info->pNext, + GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT); + + const 
VkPipelineLibraryCreateInfoKHR *library_info = + vk_find_struct_const(builder->create_info->pNext, + PIPELINE_LIBRARY_CREATE_INFO_KHR); + + if (gpl_info) { + builder->state = gpl_info->flags; + } else { + /* Implement this bit of spec text: + * + * If this structure is omitted, and either + * VkGraphicsPipelineCreateInfo::flags includes + * VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the + * VkGraphicsPipelineCreateInfo::pNext chain includes a + * VkPipelineLibraryCreateInfoKHR structure with a libraryCount + * greater than 0, it is as if flags is 0. Otherwise if this + * structure is omitted, it is as if flags includes all possible + * subsets of the graphics pipeline (i.e. a complete graphics + * pipeline). + */ + if ((library_info && library_info->libraryCount > 0) || + (builder->create_info->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR)) { + builder->state = 0; + } else { + builder->state = + VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT; + } + } + bool rasterizer_discard_dynamic = false; if (create_info->pDynamicState) { for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) { @@ -3968,76 +4671,120 @@ tu_pipeline_builder_init_graphics( } builder->rasterizer_discard = + (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) && builder->create_info->pRasterizationState->rasterizerDiscardEnable && !rasterizer_discard_dynamic; - const VkPipelineRenderingCreateInfo *rendering_info = - vk_find_struct_const(create_info->pNext, PIPELINE_RENDERING_CREATE_INFO); + if (builder->state & + (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) { + const VkPipelineRenderingCreateInfo *rendering_info = + vk_find_struct_const(create_info->pNext, PIPELINE_RENDERING_CREATE_INFO); - if (unlikely(dev->instance->debug_flags & TU_DEBUG_DYNAMIC) && !rendering_info) - rendering_info = vk_get_pipeline_rendering_create_info(create_info); + if (unlikely(dev->instance->debug_flags & TU_DEBUG_DYNAMIC) && !rendering_info) + rendering_info = vk_get_pipeline_rendering_create_info(create_info); - if (rendering_info) { - builder->subpass_raster_order_attachment_access = false; - builder->subpass_feedback_loop_ds = false; - builder->subpass_feedback_loop_color = false; - - builder->multiview_mask = rendering_info->viewMask; - - const VkRenderingSelfDependencyInfoMESA *self_dependency = - vk_find_struct_const(rendering_info->pNext, RENDERING_SELF_DEPENDENCY_INFO_MESA); - - if (self_dependency) { - builder->subpass_feedback_loop_ds = - self_dependency->depthSelfDependency || - self_dependency->stencilSelfDependency; - builder->subpass_feedback_loop_color = - self_dependency->colorSelfDependencies; + /* Get multiview_mask, which is only used for shaders */ + if (builder->state & + (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) { + if (rendering_info) { + builder->multiview_mask = rendering_info->viewMask; + } else { + const struct tu_render_pass *pass = + tu_render_pass_from_handle(create_info->renderPass); + const struct tu_subpass *subpass = + &pass->subpasses[create_info->subpass]; + builder->multiview_mask = subpass->multiview_mask; + } } - if 
(!builder->rasterizer_discard) { - builder->depth_attachment_format = - rendering_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ? - rendering_info->stencilAttachmentFormat : - rendering_info->depthAttachmentFormat; + /* Get the attachment state. This is valid: + * + * - With classic renderpasses, when either fragment shader or fragment + * output interface state is being compiled. This includes when we + * emulate classic renderpasses with dynamic rendering with the debug + * flag. + * - With dynamic rendering (renderPass is NULL) only when compiling the + * output interface state. + * + * We only actually need this for the fragment output interface state, + * but the spec also requires us to skip parsing depth/stencil state + * when the attachment state is defined *and* no depth/stencil + * attachment is not used, so we have to parse it for fragment shader + * state when possible. Life is pain. + */ + if (((builder->state & + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) || + ((builder->state & + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) && + builder->create_info->renderPass)) && + rendering_info) { + builder->subpass_raster_order_attachment_access = false; + builder->subpass_feedback_loop_ds = false; + builder->subpass_feedback_loop_color = false; - for (unsigned i = 0; i < rendering_info->colorAttachmentCount; i++) { - builder->color_attachment_formats[i] = - rendering_info->pColorAttachmentFormats[i]; - if (builder->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) { + const VkRenderingSelfDependencyInfoMESA *self_dependency = + vk_find_struct_const(rendering_info->pNext, RENDERING_SELF_DEPENDENCY_INFO_MESA); + + if (self_dependency) { + builder->subpass_feedback_loop_ds = + self_dependency->depthSelfDependency || + self_dependency->stencilSelfDependency; + builder->subpass_feedback_loop_color = + self_dependency->colorSelfDependencies; + } + + if (!builder->rasterizer_discard) { + builder->depth_attachment_format = + rendering_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ? + rendering_info->stencilAttachmentFormat : + rendering_info->depthAttachmentFormat; + + for (unsigned i = 0; i < rendering_info->colorAttachmentCount; i++) { + builder->color_attachment_formats[i] = + rendering_info->pColorAttachmentFormats[i]; + if (builder->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) { + builder->use_color_attachments = true; + } + } + } + + builder->attachment_state_valid = true; + } else if ((builder->state & + (VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT | + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) && + create_info->renderPass != VK_NULL_HANDLE) { + const struct tu_render_pass *pass = + tu_render_pass_from_handle(create_info->renderPass); + const struct tu_subpass *subpass = + &pass->subpasses[create_info->subpass]; + + builder->subpass_raster_order_attachment_access = + subpass->raster_order_attachment_access; + builder->subpass_feedback_loop_color = subpass->feedback_loop_color; + builder->subpass_feedback_loop_ds = subpass->feedback_loop_ds; + + if (!builder->rasterizer_discard) { + const uint32_t a = subpass->depth_stencil_attachment.attachment; + builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ? 
+ pass->attachments[a].format : VK_FORMAT_UNDEFINED; + + assert(subpass->color_count == 0 || + !create_info->pColorBlendState || + subpass->color_count == create_info->pColorBlendState->attachmentCount); + for (uint32_t i = 0; i < subpass->color_count; i++) { + const uint32_t a = subpass->color_attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; + + builder->color_attachment_formats[i] = pass->attachments[a].format; builder->use_color_attachments = true; } } - } - } else { - const struct tu_render_pass *pass = - tu_render_pass_from_handle(create_info->renderPass); - const struct tu_subpass *subpass = - &pass->subpasses[create_info->subpass]; - builder->subpass_raster_order_attachment_access = - subpass->raster_order_attachment_access; - builder->subpass_feedback_loop_color = subpass->feedback_loop_color; - builder->subpass_feedback_loop_ds = subpass->feedback_loop_ds; - - builder->multiview_mask = subpass->multiview_mask; - - if (!builder->rasterizer_discard) { - const uint32_t a = subpass->depth_stencil_attachment.attachment; - builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ? - pass->attachments[a].format : VK_FORMAT_UNDEFINED; - - assert(subpass->color_count == 0 || - !create_info->pColorBlendState || - subpass->color_count == create_info->pColorBlendState->attachmentCount); - for (uint32_t i = 0; i < subpass->color_count; i++) { - const uint32_t a = subpass->color_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - - builder->color_attachment_formats[i] = pass->attachments[a].format; - builder->use_color_attachments = true; - } + builder->attachment_state_valid = true; } } diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h index a06599fadf5..6b26bd2a9fc 100644 --- a/src/freedreno/vulkan/tu_pipeline.h +++ b/src/freedreno/vulkan/tu_pipeline.h @@ -60,9 +60,22 @@ struct tu_compiled_shaders uint8_t active_desc_sets; struct ir3_shader_variant *variants[MESA_SHADER_STAGES]; + + struct ir3_shader_variant *safe_const_variants[MESA_SHADER_STAGES]; +}; + +struct tu_nir_shaders +{ + struct vk_pipeline_cache_object base; + + /* This is optional, and is only filled out when a library pipeline is + * compiled with RETAIN_LINK_TIME_OPTIMIZATION_INFO. + */ + nir_shader *nir[MESA_SHADER_STAGES]; }; extern const struct vk_pipeline_cache_object_ops tu_shaders_ops; +extern const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops; static bool inline tu6_shared_constants_enable(const struct tu_pipeline_layout *layout, @@ -111,6 +124,8 @@ struct tu_pipeline uint32_t dynamic_state_mask; struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; + VkGraphicsPipelineLibraryFlagsEXT state; + /* for dynamic states which use the same register: */ struct { uint32_t gras_su_cntl, gras_su_cntl_mask; @@ -119,6 +134,8 @@ struct tu_pipeline enum a5xx_line_mode line_mode; bool provoking_vertex_last; + uint32_t multiview_mask; + struct tu_draw_state state; } rast; @@ -209,6 +226,31 @@ struct tu_pipeline bool z_negative_one_to_one; } viewport; + /* Used only for libraries. compiled_shaders only contains variants compiled + * by this pipeline, and it owns them, so when it is freed they disappear. + * Similarly, nir_shaders owns the link-time NIR. shaders points to the + * shaders from this pipeline and all libraries included in it, for + * convenience. 
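+    * Only the graphics stages are tracked here, hence the
+    * MESA_SHADER_FRAGMENT + 1 array size.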
+ */ + struct tu_compiled_shaders *compiled_shaders; + struct tu_nir_shaders *nir_shaders; + struct { + nir_shader *nir; + struct tu_shader_key key; + struct tu_const_state const_state; + struct ir3_shader_variant *variant, *safe_const_variant; + } shaders[MESA_SHADER_FRAGMENT + 1]; + + struct ir3_shader_key ir3_key; + + /* Used for libraries, to stitch together an overall layout for the final + * pipeline. + */ + struct tu_descriptor_set_layout *layouts[MAX_SETS]; + unsigned num_sets; + unsigned push_constant_size; + bool independent_sets; + void *executables_mem_ctx; /* tu_pipeline_executable */ struct util_dynarray executables; diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c index b8e0f28c81a..4f9d847e230 100644 --- a/src/freedreno/vulkan/tu_shader.c +++ b/src/freedreno/vulkan/tu_shader.c @@ -176,19 +176,32 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, struct tu_descriptor_set_layout *set_layout = layout->set[set].layout; struct tu_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; - uint32_t base; + nir_ssa_def *base; shader->active_desc_sets |= 1u << set; switch (binding_layout->type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - base = (layout->set[set].dynamic_offset_start + - binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS); + if (layout->independent_sets) { + /* With independent sets, we don't know + * layout->set[set].dynamic_offset_start until after link time which + * with fast linking means after the shader is compiled. We have to + * get it from the const file instead. + */ + base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS)); + nir_ssa_def *dynamic_offset_start = + nir_load_uniform(b, 1, 32, nir_imm_int(b, 0), + .base = shader->const_state.dynamic_offset_loc + set); + base = nir_iadd(b, base, dynamic_offset_start); + } else { + base = nir_imm_int(b, (layout->set[set].dynamic_offset_start + + binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS)); + } set = MAX_SETS; break; default: - base = binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS); + base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)); break; } @@ -204,7 +217,7 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, } nir_ssa_def *def = nir_vec3(b, nir_imm_int(b, set), - nir_iadd(b, nir_imm_int(b, base), + nir_iadd(b, base, nir_ishl(b, vulkan_idx, shift)), shift); @@ -658,6 +671,13 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev, align(DIV_ROUND_UP(const_state->push_consts.dwords, 4), dev->compiler->const_upload_unit); + if (layout->independent_sets) { + const_state->dynamic_offset_loc = reserved_consts_vec4 * 4; + reserved_consts_vec4 += DIV_ROUND_UP(MAX_SETS, 4); + } else { + const_state->dynamic_offset_loc = UINT32_MAX; + } + tu_shader->reserved_user_consts_vec4 = reserved_consts_vec4; struct lower_instr_params params = { diff --git a/src/freedreno/vulkan/tu_shader.h b/src/freedreno/vulkan/tu_shader.h index 6f529cde058..69e0a20683b 100644 --- a/src/freedreno/vulkan/tu_shader.h +++ b/src/freedreno/vulkan/tu_shader.h @@ -21,6 +21,7 @@ struct tu_push_constant_range struct tu_const_state { struct tu_push_constant_range push_consts; + uint32_t dynamic_offset_loc; }; struct tu_shader