diff --git a/src/intel/vulkan/anv_astc_emu.c b/src/intel/vulkan/anv_astc_emu.c
index e447db303c0..83c6cc660e7 100644
--- a/src/intel/vulkan/anv_astc_emu.c
+++ b/src/intel/vulkan/anv_astc_emu.c
@@ -5,6 +5,8 @@
 
 #include "anv_private.h"
 
+#include "vk_common_entrypoints.h"
+
 #include "compiler/nir/nir_builder.h"
 
 static void
@@ -293,8 +295,9 @@ astc_emu_flush_denorm_slice(struct anv_cmd_buffer *cmd_buffer,
                                           set_writes);
    VkDescriptorSet set = anv_descriptor_set_to_handle(&push_set.set);
 
-   anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE,
-                       astc_emu->pipeline);
+   vk_common_CmdBindPipeline(cmd_buffer_,
+                             VK_PIPELINE_BIND_POINT_COMPUTE,
+                             astc_emu->pipeline);
 
    VkPushConstantsInfoKHR push_info = {
       .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
@@ -351,7 +354,9 @@ astc_emu_decompress_slice(struct anv_cmd_buffer *cmd_buffer,
       return;
    }
 
-   anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
+   vk_common_CmdBindPipeline(cmd_buffer_,
+                             VK_PIPELINE_BIND_POINT_COMPUTE,
+                             pipeline);
 
    struct vk_texcompress_astc_write_descriptor_set writes;
    vk_texcompress_astc_fill_write_descriptor_sets(astc_emu->texcompress,
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 945ac7686cd..e22612c35a5 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -30,6 +30,7 @@
 
 #include "anv_private.h"
 #include "anv_measure.h"
+#include "vk_common_entrypoints.h"
 #include "vk_util.h"
 
 /** \file anv_cmd_buffer.c
@@ -435,17 +436,16 @@ set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer,
 }
 
 static void
-anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer,
-                                    struct anv_cmd_pipeline_state *pipeline_state,
-                                    struct anv_pipeline *pipeline,
-                                    VkShaderStageFlags stages)
+anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_cmd_pipeline_state *pipeline_state,
+                                   uint32_t ray_queries,
+                                   VkShaderStageFlags stages)
 {
    struct anv_device *device = cmd_buffer->device;
    uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
 
    uint64_t ray_shadow_size =
-      align64(brw_rt_ray_queries_shadow_stacks_size(device->info,
-                                                    pipeline->ray_queries),
+      align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
               4096);
    if (ray_shadow_size > 0 &&
        (!cmd_buffer->state.ray_query_shadow_bo ||
@@ -497,112 +497,6 @@ anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer,
    pipeline_state->push_constants_data_dirty = true;
 }
 
-/**
- * This function compute changes between 2 pipelines and flags the dirty HW
- * state appropriately.
- */
-static void
-anv_cmd_buffer_flush_pipeline_hw_state(struct anv_cmd_buffer *cmd_buffer,
-                                       struct anv_graphics_pipeline *old_pipeline,
-                                       struct anv_graphics_pipeline *new_pipeline)
-{
-   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
-   struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
-
-#define diff_fix_state(bit, name)                                       \
-   do {                                                                 \
-      /* Fixed states should always have matching sizes */              \
-      assert(old_pipeline == NULL ||                                    \
-             old_pipeline->name.len == new_pipeline->name.len);         \
-      /* Don't bother memcmp if the state is already dirty */           \
-      if (!BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##bit) &&    \
-          (old_pipeline == NULL ||                                      \
-           memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
-                  &new_pipeline->batch_data[new_pipeline->name.offset], \
-                  4 * new_pipeline->name.len) != 0))                    \
-         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
-   } while (0)
-#define diff_var_state(bit, name)                                       \
-   do {                                                                 \
-      /* Don't bother memcmp if the state is already dirty */           \
-      /* Also if the new state is empty, avoid marking dirty */         \
-      if (!BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##bit) &&    \
-          new_pipeline->name.len != 0 &&                                \
-          (old_pipeline == NULL ||                                      \
-           old_pipeline->name.len != new_pipeline->name.len ||          \
-           memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
-                  &new_pipeline->batch_data[new_pipeline->name.offset], \
-                  4 * new_pipeline->name.len) != 0))                    \
-         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
-   } while (0)
-#define assert_identical(bit, name)                                     \
-   do {                                                                 \
-      /* Fixed states should always have matching sizes */              \
-      assert(old_pipeline == NULL ||                                    \
-             old_pipeline->name.len == new_pipeline->name.len);         \
-      assert(old_pipeline == NULL ||                                    \
-             memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
-                    &new_pipeline->batch_data[new_pipeline->name.offset], \
-                    4 * new_pipeline->name.len) == 0);                  \
-   } while (0)
-#define assert_empty(name) assert(new_pipeline->name.len == 0)
-
-   /* Compare all states, including partial packed ones, the dynamic part is
-    * left at 0 but the static part could still change.
-    *
-    * We avoid comparing protected packets as all the fields but the scratch
-    * surface are identical. we just need to select the right one at emission.
-    */
-   diff_fix_state(VF_SGVS, final.vf_sgvs);
-   if (cmd_buffer->device->info->ver >= 11)
-      diff_fix_state(VF_SGVS_2, final.vf_sgvs_2);
-   diff_fix_state(VF_COMPONENT_PACKING, final.vf_component_packing);
-   diff_fix_state(VS, final.vs);
-   diff_fix_state(HS, final.hs);
-   diff_fix_state(DS, final.ds);
-
-   diff_fix_state(WM, partial.wm);
-   diff_fix_state(STREAMOUT, partial.so);
-   diff_fix_state(GS, partial.gs);
-   diff_fix_state(TE, partial.te);
-   diff_fix_state(PS, partial.ps);
-   diff_fix_state(PS_EXTRA, partial.ps_extra);
-
-   if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
-      diff_fix_state(TASK_CONTROL, final.task_control);
-      diff_fix_state(TASK_SHADER, final.task_shader);
-      diff_fix_state(TASK_REDISTRIB, final.task_redistrib);
-      diff_fix_state(MESH_CONTROL, final.mesh_control);
-      diff_fix_state(MESH_SHADER, final.mesh_shader);
-      diff_fix_state(MESH_DISTRIB, final.mesh_distrib);
-      diff_fix_state(CLIP_MESH, final.clip_mesh);
-   } else {
-      assert_empty(final.task_control);
-      assert_empty(final.task_shader);
-      assert_empty(final.task_redistrib);
-      assert_empty(final.mesh_control);
-      assert_empty(final.mesh_shader);
-      assert_empty(final.mesh_distrib);
-      assert_empty(final.clip_mesh);
-   }
-
-   /* States that can vary in length */
-   diff_var_state(VF_SGVS_INSTANCING, final.vf_sgvs_instancing);
-   diff_var_state(SO_DECL_LIST, final.so_decl_list);
-
-#undef diff_fix_state
-#undef diff_var_state
-#undef assert_identical
-#undef assert_empty
-
-   /* We're not diffing the following :
-    * - anv_graphics_pipeline::vertex_input_data
-    * - anv_graphics_pipeline::final::vf_instancing
-    *
-    * since they are tracked by the runtime.
-    */
-}
-
 static enum anv_cmd_dirty_bits
 get_pipeline_dirty_stages(struct anv_device *device,
                           struct anv_graphics_pipeline *old_pipeline,
@@ -636,7 +530,7 @@ get_pipeline_dirty_stages(struct anv_device *device,
 
 static void
 update_push_descriptor_flags(struct anv_cmd_pipeline_state *state,
-                             struct anv_shader_bin **shaders,
+                             struct anv_shader ** const shaders,
                              uint32_t shader_count)
 {
    state->push_buffer_stages = 0;
@@ -646,7 +540,7 @@ update_push_descriptor_flags(struct anv_cmd_pipeline_state *state,
       if (shaders[i] == NULL)
          continue;
 
-      VkShaderStageFlags stage = mesa_to_vk_shader_stage(shaders[i]->stage);
+      VkShaderStageFlags stage = mesa_to_vk_shader_stage(shaders[i]->vk.stage);
 
       if (shaders[i]->push_desc_info.used_descriptors)
          state->push_descriptor_stages |= stage;
@@ -656,145 +550,6 @@ update_push_descriptor_flags(struct anv_cmd_pipeline_state *state,
    }
 }
 
-void anv_CmdBindPipeline(
-    VkCommandBuffer                             commandBuffer,
-    VkPipelineBindPoint                         pipelineBindPoint,
-    VkPipeline                                  _pipeline)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
-   struct anv_cmd_pipeline_state *state;
-   VkShaderStageFlags stages = 0;
-
-   switch (pipelineBindPoint) {
-   case VK_PIPELINE_BIND_POINT_COMPUTE: {
-      if (cmd_buffer->state.compute.base.pipeline == pipeline)
-         return;
-
-      struct anv_compute_pipeline *compute_pipeline =
-         anv_pipeline_to_compute(pipeline);
-
-      cmd_buffer->state.compute.shader = compute_pipeline->cs;
-      cmd_buffer->state.compute.pipeline_dirty = true;
-
-      set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE,
-                             &compute_pipeline->cs->bind_map);
-
-      state = &cmd_buffer->state.compute.base;
-      stages = VK_SHADER_STAGE_COMPUTE_BIT;
-
-      update_push_descriptor_flags(state, &compute_pipeline->cs, 1);
-      break;
-   }
-
-   case VK_PIPELINE_BIND_POINT_GRAPHICS: {
-      struct anv_graphics_pipeline *new_pipeline =
-         anv_pipeline_to_graphics(pipeline);
-
-      /* Apply the non dynamic state from the pipeline */
-      vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
-                                        &new_pipeline->dynamic_state);
-
-      if (cmd_buffer->state.gfx.base.pipeline == pipeline)
-         return;
-
-      struct anv_graphics_pipeline *old_pipeline =
-         cmd_buffer->state.gfx.base.pipeline == NULL ? NULL :
-         anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
-
-      cmd_buffer->state.gfx.dirty |=
-         get_pipeline_dirty_stages(cmd_buffer->device,
-                                   old_pipeline, new_pipeline);
-
-      STATIC_ASSERT(sizeof(cmd_buffer->state.gfx.shaders) ==
-                    sizeof(new_pipeline->base.shaders));
-      memcpy(cmd_buffer->state.gfx.shaders,
-             new_pipeline->base.shaders,
-             sizeof(cmd_buffer->state.gfx.shaders));
-      cmd_buffer->state.gfx.active_stages = pipeline->active_stages;
-
-      anv_foreach_stage(stage, new_pipeline->base.base.active_stages) {
-         set_dirty_for_bind_map(cmd_buffer, stage,
-                                &new_pipeline->base.shaders[stage]->bind_map);
-      }
-
-      state = &cmd_buffer->state.gfx.base;
-      stages = new_pipeline->base.base.active_stages;
-
-      update_push_descriptor_flags(state,
-                                   new_pipeline->base.shaders,
-                                   ARRAY_SIZE(new_pipeline->base.shaders));
-
-      /* When the pipeline is using independent states and dynamic buffers,
-       * this will trigger an update of anv_push_constants::dynamic_base_index
-       * & anv_push_constants::dynamic_offsets.
-       */
-      struct anv_push_constants *push =
-         &cmd_buffer->state.gfx.base.push_constants;
-      struct anv_pipeline_sets_layout *layout = &new_pipeline->base.base.layout;
-      if (layout->independent_sets && layout->num_dynamic_buffers > 0) {
-         bool modified = false;
-         for (uint32_t s = 0; s < layout->num_sets; s++) {
-            if (layout->set_layouts[s] == NULL)
-               continue;
-
-            assert(layout->dynamic_offset_start[s] < MAX_DYNAMIC_BUFFERS);
-            if (layout->set_layouts[s]->vk.dynamic_descriptor_count > 0 &&
-                (push->desc_surface_offsets[s] & ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK) !=
-                layout->dynamic_offset_start[s]) {
-               push->desc_surface_offsets[s] &= ~ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK;
-               push->desc_surface_offsets[s] |= (layout->dynamic_offset_start[s] &
-                                                 ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
-               modified = true;
-            }
-         }
-         if (modified) {
-            cmd_buffer->state.push_constants_dirty |= stages;
-            state->push_constants_data_dirty = true;
-         }
-      }
-
-      cmd_buffer->state.gfx.vs_source_hash = new_pipeline->vs_source_hash;
-      cmd_buffer->state.gfx.fs_source_hash = new_pipeline->fs_source_hash;
-
-      cmd_buffer->state.gfx.instance_multiplier = new_pipeline->instance_multiplier;
-
-      anv_cmd_buffer_flush_pipeline_hw_state(cmd_buffer, old_pipeline, new_pipeline);
-      break;
-   }
-
-   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
-      if (cmd_buffer->state.rt.base.pipeline == pipeline)
-         return;
-
-      cmd_buffer->state.rt.pipeline_dirty = true;
-
-      struct anv_ray_tracing_pipeline *rt_pipeline =
-         anv_pipeline_to_ray_tracing(pipeline);
-      if (rt_pipeline->stack_size > 0) {
-         anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer,
-                                                  rt_pipeline->stack_size);
-      }
-
-      state = &cmd_buffer->state.rt.base;
-
-      state->push_buffer_stages = pipeline->use_push_descriptor_buffer;
-      state->push_descriptor_stages = pipeline->use_push_descriptor_buffer;
-      state->push_descriptor_index = pipeline->layout.push_descriptor_set_index;
-      break;
-   }
-
-   default:
-      UNREACHABLE("invalid bind point");
-      break;
-   }
-
-   state->pipeline = pipeline;
-
-   if (pipeline->ray_queries > 0)
-      anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages);
-}
-
 static struct anv_cmd_pipeline_state *
 anv_cmd_buffer_get_pipeline_layout_state(struct anv_cmd_buffer *cmd_buffer,
                                          VkPipelineBindPoint bind_point,
@@ -1519,20 +1274,37 @@ void anv_CmdPushDescriptorSetWithTemplate2KHR(
                                       NULL, NULL);
 }
 
-void anv_CmdSetRayTracingPipelineStackSizeKHR(
-    VkCommandBuffer                             commandBuffer,
-    uint32_t                                    pipelineStackSize)
+void
+anv_cmd_buffer_set_rt_state(struct vk_command_buffer *vk_cmd_buffer,
+                            VkDeviceSize scratch_size,
+                            uint32_t ray_queries)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_cmd_buffer *cmd_buffer =
+      container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
+
+   rt->scratch_size = MAX2(rt->scratch_size, scratch_size);
+   if (ray_queries > 0) {
+      anv_cmd_buffer_set_rt_query_buffer(cmd_buffer, &rt->base, ray_queries,
+                                         ANV_RT_STAGE_BITS);
+   }
+}
+
+void
+anv_cmd_buffer_set_stack_size(struct vk_command_buffer *vk_cmd_buffer,
+                              VkDeviceSize stack_size)
+{
+   struct anv_cmd_buffer *cmd_buffer =
+      container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
    struct anv_device *device = cmd_buffer->device;
+   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
 
    if (anv_batch_has_error(&cmd_buffer->batch))
      return;
 
    uint32_t stack_ids_per_dss = 2048; /* TODO */
 
-   unsigned stack_size_log2 = util_logbase2_ceil(pipelineStackSize);
+   unsigned stack_size_log2 = util_logbase2_ceil(stack_size);
    if (stack_size_log2 < 10)
       stack_size_log2 = 10;
 
@@ -1585,7 +1357,7 @@ anv_cmd_buffer_save_state(struct anv_cmd_buffer *cmd_buffer,
       &cmd_buffer->state.compute.base;
 
    if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE)
-      state->pipeline = pipe_state->pipeline;
+      state->shader = &cmd_buffer->state.compute.shader->vk;
 
    if (state->flags & ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0)
       state->descriptor_set[0] = pipe_state->descriptors[0];
@@ -1614,11 +1386,11 @@ anv_cmd_buffer_restore_state(struct anv_cmd_buffer *cmd_buffer,
    struct anv_cmd_pipeline_state *pipe_state =
       &cmd_buffer->state.compute.base;
 
    if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE) {
-      if (state->pipeline) {
-         anv_CmdBindPipeline(cmd_buffer_, bind_point,
-                             anv_pipeline_to_handle(state->pipeline));
+      if (state->shader) {
+         mesa_shader_stage stage = MESA_SHADER_COMPUTE;
+         anv_cmd_buffer_bind_shaders(&cmd_buffer->vk, 1, &stage, &state->shader);
       } else {
-         pipe_state->pipeline = NULL;
+         cmd_buffer->state.compute.shader = NULL;
       }
    }
 
@@ -1693,3 +1465,285 @@ anv_cmd_dispatch_unaligned(VkCommandBuffer commandBuffer,
    anv_genX(cmd_buffer->device->info, cmd_dispatch_unaligned)
       (commandBuffer, invocations_x, invocations_y, invocations_z);
 }
+
+static void
+bind_compute_shader(struct anv_cmd_buffer *cmd_buffer,
+                    struct anv_shader *shader)
+{
+   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+
+   cmd_buffer->state.compute.shader = shader;
+   if (shader == NULL)
+      return;
+
+   cmd_buffer->state.compute.pipeline_dirty = true;
+   set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, &shader->bind_map);
+
+   update_push_descriptor_flags(&comp_state->base,
+                                &cmd_buffer->state.compute.shader, 1);
+
+   if (shader->vk.ray_queries > 0) {
+      assert(cmd_buffer->device->info->verx10 >= 125);
+      anv_cmd_buffer_set_rt_query_buffer(cmd_buffer, &comp_state->base,
+                                         shader->vk.ray_queries,
+                                         VK_SHADER_STAGE_COMPUTE_BIT);
+   }
+}
+
+static void
+bind_graphics_shaders(struct anv_cmd_buffer *cmd_buffer,
+                      struct anv_shader *new_shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT])
+{
+   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+   struct anv_gfx_dynamic_state *hw_state =
+      &gfx->dyn_state;
+   uint32_t ray_queries = 0;
+
+   static const enum anv_cmd_dirty_bits mesa_stage_to_dirty_bit[] = {
+      [MESA_SHADER_VERTEX] = ANV_CMD_DIRTY_VS,
+      [MESA_SHADER_TESS_CTRL] = ANV_CMD_DIRTY_HS,
+      [MESA_SHADER_TESS_EVAL] = ANV_CMD_DIRTY_DS,
+      [MESA_SHADER_GEOMETRY] = ANV_CMD_DIRTY_GS,
+      [MESA_SHADER_TASK] = ANV_CMD_DIRTY_TASK,
+      [MESA_SHADER_MESH] = ANV_CMD_DIRTY_MESH,
+      [MESA_SHADER_FRAGMENT] = ANV_CMD_DIRTY_PS,
+   };
+
+   gfx->active_stages = 0;
+   gfx->instance_multiplier = 0;
+
+   mesa_shader_stage new_streamout_stage = -1;
+   /* Find the last pre-rasterization stage */
+   for (uint32_t i = 0; i < ANV_GRAPHICS_SHADER_STAGE_COUNT; i++) {
+      mesa_shader_stage s = ANV_GRAPHICS_SHADER_STAGE_COUNT - i - 1;
+      if (new_shaders[s] == NULL)
+         continue;
+
+      assert(gfx->instance_multiplier == 0 ||
+             gfx->instance_multiplier == new_shaders[s]->instance_multiplier);
+      gfx->active_stages |= mesa_to_vk_shader_stage(s);
+      gfx->instance_multiplier = new_shaders[s]->instance_multiplier;
+
+      if (s == MESA_SHADER_FRAGMENT ||
+          s == MESA_SHADER_TASK ||
+          s == MESA_SHADER_TESS_CTRL)
+         continue;
+
+      new_streamout_stage = MAX2(new_streamout_stage, s);
+   }
+
+   for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+      struct anv_shader *shader = new_shaders[s];
+
+      if (shader != NULL) {
+         gfx->active_stages |= mesa_to_vk_shader_stage(s);
+
+         ray_queries = MAX2(ray_queries, shader->vk.ray_queries);
+         if (gfx->shaders[s] != shader)
+            set_dirty_for_bind_map(cmd_buffer, s, &shader->bind_map);
+      }
+
+      if (gfx->shaders[s] != shader)
+         gfx->dirty |= mesa_stage_to_dirty_bit[s];
+      else
+         continue;
+
+#define diff_fix_state(bit, name)                                       \
+   do {                                                                 \
+      /* Fixed states should always have matching sizes */              \
+      assert(gfx->shaders[s] == NULL ||                                 \
+             gfx->shaders[s]->name.len == shader->name.len);            \
+      /* Don't bother memcmp if the state is already dirty */           \
+      if (!BITSET_TEST(hw_state->pack_dirty,                            \
+                       ANV_GFX_STATE_##bit) &&                          \
+          (gfx->shaders[s] == NULL ||                                   \
+           memcmp(&gfx->shaders[s]->cmd_data[                           \
+                     gfx->shaders[s]->name.offset],                     \
+                  &shader->cmd_data[                                    \
+                     shader->name.offset],                              \
+                  4 * shader->name.len) != 0))                          \
+         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
+   } while (0)
+#define diff_var_state(bit, name)                                       \
+   do {                                                                 \
+      /* Don't bother memcmp if the state is already dirty */           \
+      /* Also if the new state is empty, avoid marking dirty */         \
+      if (!BITSET_TEST(hw_state->pack_dirty,                            \
+                       ANV_GFX_STATE_##bit) &&                          \
+          shader->name.len != 0 &&                                      \
+          (gfx->shaders[s] == NULL ||                                   \
+           gfx->shaders[s]->name.len != shader->name.len ||             \
+           memcmp(&gfx->shaders[s]->cmd_data[                           \
+                     gfx->shaders[s]->name.offset],                     \
+                  &shader->cmd_data[shader->name.offset],               \
+                  4 * shader->name.len) != 0))                          \
+         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
+   } while (0)
+#define diff_fix_state_stage(bit, name, old_stage)                      \
+   do {                                                                 \
+      /* Fixed states should always have matching sizes */              \
+      assert(old_stage == MESA_SHADER_NONE ||                           \
+             gfx->shaders[old_stage] == NULL ||                         \
+             gfx->shaders[old_stage]->name.len == shader->name.len);    \
+      /* Don't bother memcmp if the state is already dirty */           \
+      if (!BITSET_TEST(hw_state->pack_dirty,                            \
+                       ANV_GFX_STATE_##bit) &&                          \
+          (old_stage == MESA_SHADER_NONE ||                             \
+           gfx->shaders[old_stage] == NULL ||                           \
+           memcmp(&gfx->shaders[old_stage]->cmd_data[                   \
+                     gfx->shaders[old_stage]->name.offset],             \
+                  &shader->cmd_data[                                    \
+                     shader->name.offset],                              \
+                  4 * shader->name.len) != 0))                          \
+         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
+   } while (0)
+#define diff_var_state_stage(bit, name, old_stage)                      \
+   do {                                                                 \
+      /* Don't bother memcmp if the state is already dirty */           \
+      /* Also if the new state is empty, avoid marking dirty */         \
+      if (!BITSET_TEST(hw_state->pack_dirty,                            \
+                       ANV_GFX_STATE_##bit) &&                          \
+          shader->name.len != 0 &&                                      \
+          (gfx->shaders[old_stage] == NULL ||                           \
+           gfx->shaders[old_stage]->name.len != shader->name.len ||     \
+           memcmp(&gfx->shaders[old_stage]->cmd_data[                   \
+                     gfx->shaders[old_stage]->name.offset],             \
+                  &shader->cmd_data[shader->name.offset],               \
+                  4 * shader->name.len) != 0))                          \
+         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
+   } while (0)
+
+      switch (s) {
+      case MESA_SHADER_VERTEX:
+         if (shader != NULL) {
+            diff_fix_state(VS, vs.vs);
+            diff_fix_state(VF_SGVS, vs.vf_sgvs);
+            if (cmd_buffer->device->info->ver >= 11)
+               diff_fix_state(VF_SGVS_2, vs.vf_sgvs_2);
+            diff_fix_state(VF_COMPONENT_PACKING, vs.vf_component_packing);
+            diff_var_state(VF_SGVS_INSTANCING, vs.vf_sgvs_instancing);
+            gfx->vs_source_hash = shader->prog_data->source_hash;
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_VS);
+         }
+         break;
+
+      case MESA_SHADER_TESS_CTRL:
+         if (shader != NULL)
+            diff_fix_state(HS, hs.hs);
+         else
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_HS);
+         break;
+
+      case MESA_SHADER_TESS_EVAL:
+         if (shader != NULL) {
+            diff_fix_state(DS, ds.ds);
+            diff_fix_state(TE, ds.te);
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_DS);
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_TE);
+         }
+         break;
+
+      case MESA_SHADER_GEOMETRY:
+         if (shader != NULL)
+            diff_fix_state(GS, gs.gs);
+         else
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_GS);
+         break;
+
+      case MESA_SHADER_MESH:
+         if (shader != NULL) {
+            diff_fix_state(MESH_CONTROL, ms.control);
+            diff_fix_state(MESH_SHADER, ms.shader);
+            diff_fix_state(MESH_DISTRIB, ms.distrib);
+            diff_fix_state(CLIP_MESH, ms.clip);
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_MESH_CONTROL);
+         }
+         break;
+
+      case MESA_SHADER_TASK:
+         if (shader != NULL) {
+            diff_fix_state(TASK_CONTROL, ts.control);
+            diff_fix_state(TASK_SHADER, ts.shader);
+            diff_fix_state(TASK_REDISTRIB, ts.redistrib);
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_TASK_CONTROL);
+         }
+         break;
+
+      case MESA_SHADER_FRAGMENT:
+         if (shader != NULL) {
+            diff_fix_state(WM, ps.wm);
+            diff_fix_state(PS, ps.ps);
+            diff_fix_state(PS_EXTRA, ps.ps_extra);
+            gfx->fs_source_hash = shader->prog_data->source_hash;
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_PS_EXTRA);
+         }
+         break;
+
+      default:
+         UNREACHABLE("Invalid shader stage");
+      }
+
+      /* Only diff those fields on the streamout stage */
+      if (s == new_streamout_stage) {
+         diff_fix_state_stage(STREAMOUT, so, gfx->streamout_stage);
+         diff_var_state_stage(SO_DECL_LIST, so_decl_list, gfx->streamout_stage);
+      }
+
+      gfx->shaders[s] = shader;
+   }
+
+   gfx->streamout_stage = new_streamout_stage;
+
+#undef diff_fix_state
+#undef diff_var_state
+#undef diff_fix_state_stage
+#undef diff_var_state_stage
+
+   update_push_descriptor_flags(&gfx->base,
+                                cmd_buffer->state.gfx.shaders,
+                                ARRAY_SIZE(cmd_buffer->state.gfx.shaders));
+
+   if (ray_queries > 0) {
+      assert(cmd_buffer->device->info->verx10 >= 125);
+      anv_cmd_buffer_set_rt_query_buffer(cmd_buffer, &gfx->base, ray_queries,
+                                         cmd_buffer->state.gfx.active_stages);
+   }
+}
+
+void
+anv_cmd_buffer_bind_shaders(struct vk_command_buffer *vk_cmd_buffer,
+                            uint32_t stage_count,
+                            const mesa_shader_stage *stages,
+                            struct vk_shader ** const vk_shaders)
+{
+   struct anv_shader ** const shaders = (struct anv_shader ** const)vk_shaders;
+   struct anv_cmd_buffer *cmd_buffer =
+      container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
+
+   /* Append any scratch surface used by the shaders */
+   for (uint32_t i = 0; i < stage_count; i++) {
+      if (shaders[i] != NULL) {
+         anv_reloc_list_append(cmd_buffer->batch.relocs,
+                               &shaders[i]->relocs);
+      }
+   }
+
+   struct anv_shader *cs_shader = cmd_buffer->state.compute.shader;
+   struct anv_shader *gfx_shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
+   memcpy(gfx_shaders, cmd_buffer->state.gfx.shaders, sizeof(gfx_shaders));
+   for (uint32_t i = 0; i < stage_count; i++) {
+      if (mesa_shader_stage_is_compute(stages[i]))
+         cs_shader = shaders[i];
+      else
+         gfx_shaders[stages[i]] = shaders[i];
+   }
+
+   if (cs_shader != cmd_buffer->state.compute.shader)
+      bind_compute_shader(cmd_buffer, cs_shader);
+   if (memcmp(gfx_shaders, cmd_buffer->state.gfx.shaders, sizeof(gfx_shaders)))
+      bind_graphics_shaders(cmd_buffer, gfx_shaders);
+}
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 6b21b498e3a..00885daec5a 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -31,6 +31,7 @@
 
 #include "anv_private.h"
 #include "anv_measure.h"
+#include "anv_shader.h"
 #include "anv_slab_bo.h"
 #include "util/u_debug.h"
 #include "util/os_file.h"
@@ -380,6 +381,8 @@ VkResult anv_CreateDevice(
    if (result != VK_SUCCESS)
       goto fail_alloc;
 
+   device->vk.shader_ops = &anv_device_shader_ops;
+
    if (INTEL_DEBUG(DEBUG_BATCH) || INTEL_DEBUG(DEBUG_BATCH_STATS)) {
       for (unsigned i = 0; i < physical_device->queue.family_count; i++) {
          struct intel_batch_decode_ctx *decoder = &device->decoder[i];
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index ed079f59cec..342e00d31d9 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -223,7 +223,7 @@ uint32_t
 genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
                                        struct anv_cmd_pipeline_state *pipe_state,
                                        const VkShaderStageFlags dirty,
-                                       const struct anv_shader_bin **shaders,
+                                       const struct anv_shader **shaders,
                                        uint32_t num_shaders);
 
 void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 9364eb4b909..ead66968608 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -173,17 +173,29 @@ anv_pipeline_finish(struct anv_pipeline *pipeline,
    vk_object_base_finish(&pipeline->vk.base);
 }
 
+VKAPI_ATTR void VKAPI_CALL
+vk_common_DestroyPipeline(VkDevice _device,
+                          VkPipeline _pipeline,
+                          const VkAllocationCallbacks *pAllocator);
+
 void anv_DestroyPipeline(
     VkDevice                                    _device,
     VkPipeline                                  _pipeline,
     const VkAllocationCallbacks*                pAllocator)
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+   VK_FROM_HANDLE(vk_pipeline, vk_pipeline, _pipeline);
 
-   if (!pipeline)
+   if (!vk_pipeline)
       return;
 
+   if (vk_pipeline->ops != NULL) {
+      vk_common_DestroyPipeline(_device, _pipeline, pAllocator);
+      return;
+   }
+
+   ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+
    ANV_RMV(resource_destroy, device, pipeline);
 
    switch (pipeline->type) {
@@ -2851,6 +2863,7 @@ anv_compute_pipeline_create(struct anv_device *device,
    return pipeline->base.batch.status;
 }
 
+#if 0
 VkResult anv_CreateComputePipelines(
     VkDevice                                    _device,
     VkPipelineCache                             pipelineCache,
@@ -2885,6 +2898,7 @@ VkResult anv_CreateComputePipelines(
 
    return result;
 }
+#endif
 
 static uint32_t
 get_vs_input_elements(const struct brw_vs_prog_data *vs_prog_data)
@@ -3343,6 +3357,7 @@ anv_graphics_pipeline_create(struct anv_device *device,
    return pipeline->base.base.batch.status;
 }
 
+#if 0
 VkResult anv_CreateGraphicsPipelines(
     VkDevice                                    _device,
     VkPipelineCache                             pipelineCache,
@@ -3388,6 +3403,7 @@ VkResult anv_CreateGraphicsPipelines(
 
    return result;
 }
+#endif
 
 static bool
 should_remat_cb(nir_instr *instr, void *data)
@@ -4083,6 +4099,7 @@ anv_ray_tracing_pipeline_create(
    return pipeline->base.batch.status;
 }
 
+#if 0
 VkResult
 anv_CreateRayTracingPipelinesKHR(
     VkDevice                                    _device,
@@ -4491,3 +4508,4 @@ anv_GetRayTracingShaderGroupStackSizeKHR(
 
    return brw_bs_prog_data_const(bin->prog_data)->max_stack_size;
 }
+#endif
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index de85f360815..00e48e319e7 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1224,7 +1224,6 @@ struct anv_shader {
    struct anv_state kernel;
 
    const struct brw_stage_prog_data *prog_data;
-   uint32_t prog_data_size;
 
    struct brw_compile_stats stats[3];
    uint32_t num_stats;
@@ -2186,6 +2185,11 @@ struct anv_gfx_dynamic_state {
       uint32_t PrimitiveTopologyType;
    } vft;
 
+   /* 3DSTATE_VS */
+   struct {
+      bool VertexCacheDisable;
+   } vs;
+
    /* 3DSTATE_VIEWPORT_STATE_POINTERS_CC */
    struct {
      uint32_t count;
@@ -4422,7 +4426,7 @@ struct anv_cmd_graphics_state {
    struct anv_cmd_pipeline_state base;
 
    /* Shaders bound */
-   struct anv_shader_bin *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
+   struct anv_shader *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
 
    /* Bitfield of valid entries in the shaders array */
    VkShaderStageFlags active_stages;
@@ -4436,6 +4440,9 @@ struct anv_cmd_graphics_state {
    bool kill_pixel;
    bool uses_xfb;
 
+   /* Shader stage in base.shaders[] responsible for streamout */
+   mesa_shader_stage streamout_stage;
+
    /* Render pass information */
    VkRenderingFlags rendering_flags;
    VkRect2D render_area;
@@ -4530,7 +4537,7 @@ struct anv_cmd_graphics_state {
 struct anv_cmd_compute_state {
    struct anv_cmd_pipeline_state base;
 
-   struct anv_shader_bin *shader;
+   struct anv_shader *shader;
 
    bool pipeline_dirty;
 
@@ -4551,6 +4558,8 @@ struct anv_cmd_ray_tracing_state {
       struct brw_rt_scratch_layout layout;
    } scratch;
 
+   VkDeviceSize scratch_size;
+
    uint32_t debug_marker_count;
 
    uint32_t num_tlas;
    uint32_t num_blas;
@@ -5022,6 +5031,12 @@ void
 anv_cmd_buffer_update_pending_query_bits(struct anv_cmd_buffer *cmd_buffer,
                                          enum anv_pipe_bits flushed_bits);
 
+void
+anv_cmd_buffer_bind_shaders(struct vk_command_buffer *cmd_buffer,
+                            uint32_t stage_count,
+                            const mesa_shader_stage *stages,
+                            struct vk_shader ** const shaders);
+
 /**
  * A allocation tied to a command buffer.
  *
@@ -5083,7 +5098,7 @@ enum anv_cmd_saved_state_flags {
 
 struct anv_cmd_saved_state {
    uint32_t flags;
 
-   struct anv_pipeline *pipeline;
+   struct vk_shader *shader;
    struct anv_descriptor_set *descriptor_set[MAX_SETS];
    uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];
 };
@@ -5444,7 +5459,6 @@ struct anv_graphics_pipeline {
                  4 * _cmd_state->len);                                  \
    } while (0)
 
-
 struct anv_compute_pipeline {
    struct anv_pipeline base;
 
@@ -6484,6 +6498,15 @@ anv_cmd_flush_buffer_write_cp(VkCommandBuffer cmd_buffer);
 VkResult
 anv_cmd_buffer_ensure_rcs_companion(struct anv_cmd_buffer *cmd_buffer);
 
+void
+anv_cmd_buffer_set_rt_state(struct vk_command_buffer *vk_cmd_buffer,
+                            VkDeviceSize scratch_size,
+                            uint32_t ray_queries);
+
+void
+anv_cmd_buffer_set_stack_size(struct vk_command_buffer *vk_cmd_buffer,
+                              VkDeviceSize stack_size);
+
 bool
 anv_can_hiz_clear_image(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_image *image,
diff --git a/src/intel/vulkan/anv_shader_compile.c b/src/intel/vulkan/anv_shader_compile.c
index 14eb0d2adf9..666082b4f4b 100644
--- a/src/intel/vulkan/anv_shader_compile.c
+++ b/src/intel/vulkan/anv_shader_compile.c
@@ -1886,5 +1886,8 @@ struct vk_device_shader_ops anv_device_shader_ops = {
    .deserialize = anv_shader_deserialize,
    .write_rt_shader_group = anv_write_rt_shader_group,
    .write_rt_shader_group_replay_handle = anv_write_rt_shader_group_replay_handle,
+   .cmd_bind_shaders = anv_cmd_buffer_bind_shaders,
    .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
+   .cmd_set_rt_state = anv_cmd_buffer_set_rt_state,
+   .cmd_set_stack_size = anv_cmd_buffer_set_stack_size,
 };
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 33325da7197..46d84535a01 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2121,7 +2121,7 @@ emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
 static VkResult
 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_cmd_pipeline_state *pipe_state,
-                   const struct anv_shader_bin *shader,
+                   const struct anv_shader *shader,
                    struct anv_state *bt_state)
 {
    uint32_t state_offset;
@@ -2153,7 +2153,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
 
       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
          /* Color attachment binding */
-         assert(shader->stage == MESA_SHADER_FRAGMENT);
+         assert(shader->vk.stage == MESA_SHADER_FRAGMENT);
 
         uint32_t index = binding->index < MAX_RTS ?
                          cmd_buffer->state.gfx.color_output_mapping[binding->index] :
                          binding->index;
@@ -2268,7 +2268,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
 
 static VkResult
 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
               struct anv_cmd_pipeline_state *pipe_state,
-              const struct anv_shader_bin *shader,
+              const struct anv_shader *shader,
               struct anv_state *state)
 {
    const struct anv_pipeline_bind_map *map = &shader->bind_map;
@@ -2312,7 +2312,7 @@ uint32_t
 genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
                                        struct anv_cmd_pipeline_state *pipe_state,
                                        const VkShaderStageFlags dirty,
-                                       const struct anv_shader_bin **shaders,
+                                       const struct anv_shader **shaders,
                                        uint32_t num_shaders)
 {
    VkShaderStageFlags flushed = 0;
@@ -2322,7 +2322,7 @@ genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
       if (!shaders[i])
         continue;
 
-      mesa_shader_stage stage = shaders[i]->stage;
+      mesa_shader_stage stage = shaders[i]->vk.stage;
       VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
       if ((vk_stage & dirty) == 0)
         continue;
@@ -2361,7 +2361,7 @@ genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
       if (!shaders[i])
         continue;
 
-      mesa_shader_stage stage = shaders[i]->stage;
+      mesa_shader_stage stage = shaders[i]->vk.stage;
 
       result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
                              &cmd_buffer->state.samplers[stage]);
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index 2eb8fceaa83..2fcbda1fef3 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -105,13 +105,11 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
    struct anv_device *device = cmd_buffer->device;
    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
    const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;
-   struct anv_compute_pipeline *pipeline =
-      anv_pipeline_to_compute(comp_state->base.pipeline);
 
    assert(comp_state->shader);
 
    genX(cmd_buffer_config_l3)(cmd_buffer,
-                              pipeline->cs->prog_data->total_shared > 0 ?
+                              comp_state->shader->prog_data->total_shared > 0 ?
                               device->l3_slm_config : device->l3_config);
 
    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
@@ -127,7 +125,7 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
     */
    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 
-   if (cmd_buffer->state.compute.pipeline_dirty) {
+   if (comp_state->pipeline_dirty) {
 #if GFX_VERx10 < 125
       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 #endif
 
-      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
+#define anv_batch_emit_cs(batch, cmd, field) ({                         \
+         void *__dst = anv_batch_emit_dwords(                           \
+            batch, __anv_cmd_length(cmd));                              \
+         memcpy(__dst,                                                  \
+                &comp_state->shader->cmd_data[                          \
+                   comp_state->shader->field.offset],                   \
+                4 * __anv_cmd_length(cmd));                             \
+         VG(VALGRIND_CHECK_MEM_IS_DEFINED(                              \
+               __dst, __anv_cmd_length(cmd) * 4));                      \
+         __dst;                                                         \
+      })
+
 #if GFX_VERx10 >= 125
       const struct brw_cs_prog_data *prog_data = get_cs_prog_data(comp_state);
       genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
+#else
+      anv_batch_emit_cs(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), cs.gfx9.vfe);
 #endif
 
+#undef anv_batch_emit_cs
+
       /* Changing the pipeline affects the push constants layout (different
        * amount of cross/per thread allocations).
The allocation is also
+       * bounded to just the amount consumed by the pipeline (see
@@ -179,7 +192,7 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
          cmd_buffer,
          &cmd_buffer->state.compute.base,
          VK_SHADER_STAGE_COMPUTE_BIT,
-         (const struct anv_shader_bin **)&comp_state->shader, 1);
+         (const struct anv_shader **)&comp_state->shader, 1);
       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
 
 #if GFX_VERx10 < 125
@@ -194,7 +207,7 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
 
       struct anv_state state =
          anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
-                                      pipeline->gfx9.interface_descriptor_data,
+                                      comp_state->shader->cs.gfx9.idd,
                                       GENX(INTERFACE_DESCRIPTOR_DATA_length),
                                       64);
@@ -439,7 +452,7 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
       &cmd_buffer->batch,
       GENX(EXECUTE_INDIRECT_DISPATCH_length),
       GENX(EXECUTE_INDIRECT_DISPATCH_body_start) / 32,
-      anv_pipeline_to_compute(comp_state->base.pipeline)->gfx125.compute_walker_body,
+      comp_state->shader->cs.gfx125.compute_walker_body,
      GENX(EXECUTE_INDIRECT_DISPATCH),
      .PredicateEnable = predicate,
      .MaxCount = 1,
@@ -520,7 +533,7 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
       &cmd_buffer->batch,
       GENX(COMPUTE_WALKER_length),
       GENX(COMPUTE_WALKER_body_start) / 32,
-      anv_pipeline_to_compute(comp_state->base.pipeline)->gfx125.compute_walker_body,
+      comp_state->shader->cs.gfx125.compute_walker_body,
      GENX(COMPUTE_WALKER),
      .IndirectParameterEnable = !anv_address_is_null(indirect_addr),
      .PredicateEnable = predicate,
@@ -1051,8 +1064,6 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
 {
    struct anv_device *device = cmd_buffer->device;
    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
-   struct anv_ray_tracing_pipeline *pipeline =
-      anv_pipeline_to_ray_tracing(rt->base.pipeline);
 
    if (INTEL_DEBUG(DEBUG_RT_NO_TRACE))
      return;
@@ -1211,18 +1222,18 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
        */
      btd.PerDSSMemoryBackedBufferSize = 6;
      btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
-      if (pipeline->base.scratch_size > 0) {
+      if (rt->scratch_size > 0) {
          struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(device,
                                   &device->scratch_pool,
                                   MESA_SHADER_COMPUTE,
-                                  pipeline->base.scratch_size);
+                                  rt->scratch_size);
          anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                                scratch_bo);
          uint32_t scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &device->scratch_pool,
-                                      pipeline->base.scratch_size);
+                                      rt->scratch_size);
          btd.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
      }
 #if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
@@ -1234,7 +1245,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
 #endif
    }
 
-   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);
+   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, rt->scratch_size);
 
    const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(device->rt_trampoline->prog_data);
@@ -1273,7 +1284,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
      .ThreadGroupIDZDimension = global_size[2],
      .ExecutionMask = 0xff,
      .EmitInlineParameter = true,
-      .PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0),
+      .PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
 
 #if GFX_VER >= 30
      /* HSD 14016252163 */
      .DispatchWalkOrder = cs_prog_data->uses_sampler ?
                           MortonWalk : LinearWalk,
diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c
index f7f61b38471..743012011ec 100644
--- a/src/intel/vulkan/genX_cmd_draw.c
+++ b/src/intel/vulkan/genX_cmd_draw.c
@@ -162,7 +162,7 @@ cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
 
 static struct anv_address
 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
-                       const struct anv_shader_bin *shader,
+                       const struct anv_shader *shader,
                        const struct anv_push_range *range)
 {
    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
@@ -242,10 +242,10 @@ get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
  */
 static uint32_t
 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
-                          const struct anv_shader_bin *shader,
+                          const struct anv_shader *shader,
                          const struct anv_push_range *range)
 {
-   assert(shader->stage != MESA_SHADER_COMPUTE);
+   assert(shader->vk.stage != MESA_SHADER_COMPUTE);
    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
    switch (range->set) {
    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
@@ -443,7 +443,7 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
      if (!anv_gfx_has_stage(gfx, stage))
         continue;
 
-      const struct anv_shader_bin *shader = gfx->shaders[stage];
+      const struct anv_shader *shader = gfx->shaders[stage];
 
      if (shader->prog_data->robust_ubo_ranges) {
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
         struct anv_push_constants *push = &gfx->base.push_constants;
@@ -509,7 +509,7 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
      struct anv_address buffers[4] = {};
      if (anv_gfx_has_stage(gfx, stage)) {
-         const struct anv_shader_bin *shader = gfx->shaders[stage];
+         const struct anv_shader *shader = gfx->shaders[stage];
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
 
         /* We have to gather buffer addresses as a second step because the
@@ -593,7 +593,7 @@ get_mesh_task_push_addr64(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_cmd_graphics_state *gfx,
                           mesa_shader_stage stage)
 {
-   const struct anv_shader_bin *shader = gfx->shaders[stage];
+   const struct anv_shader *shader = gfx->shaders[stage];
    const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
    if (bind_map->push_ranges[0].length == 0)
      return 0;
@@ -645,31 +645,50 @@ cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
 
 ALWAYS_INLINE static void
 cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
-                                 const struct anv_graphics_pipeline *pipeline)
+                                 struct anv_cmd_graphics_state *gfx,
+                                 const struct vk_dynamic_graphics_state *dyn)
 {
-   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
+   if (!anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT))
      return;
 
-   UNUSED bool need_rt_flush = false;
-   for (uint32_t rt = 0; rt < pipeline->num_color_outputs; rt++) {
-      /* No writes going to this render target so it won't affect the RT cache
-       */
-      if (pipeline->color_output_mapping[rt] == ANV_COLOR_OUTPUT_UNUSED)
-         continue;
+   /* Count the number of color attachments in the binding table */
+   const struct anv_pipeline_bind_map *bind_map =
+      &gfx->shaders[MESA_SHADER_FRAGMENT]->bind_map;
 
-      /* No change */
-      if (cmd_buffer->state.gfx.color_output_mapping[rt] ==
-          pipeline->color_output_mapping[rt])
-         continue;
-
-      cmd_buffer->state.gfx.color_output_mapping[rt] =
-         pipeline->color_output_mapping[rt];
-      need_rt_flush = true;
-      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+   /* Build a map of fragment color output to attachment */
+   uint8_t rt_to_att[MAX_RTS];
+   memset(rt_to_att, ANV_COLOR_OUTPUT_DISABLED, MAX_RTS);
+   for (uint32_t i = 0; i < MAX_RTS; i++) {
+      if (dyn->cal.color_map[i] != MESA_VK_ATTACHMENT_UNUSED)
+         rt_to_att[dyn->cal.color_map[i]] = i;
+   }
+
+   /* For each fragment shader output, if not unused, apply the remapping to
+    * gfx->color_output_mapping.
+    */
+   UNUSED bool need_rt_flush = false;
+   for (unsigned rt = 0; rt < MIN2(bind_map->surface_count, MAX_RTS); rt++) {
+      if (bind_map->surface_to_descriptor[rt].set !=
+          ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
+         break;
+
+      uint32_t index = bind_map->surface_to_descriptor[rt].index;
+      if (index == ANV_COLOR_OUTPUT_UNUSED)
+         continue;
+
+      if (index == ANV_COLOR_OUTPUT_DISABLED &&
+          gfx->color_output_mapping[rt] != index) {
+         gfx->color_output_mapping[rt] = index;
+         need_rt_flush = true;
+      } else if (gfx->color_output_mapping[rt] != rt_to_att[rt]) {
+         gfx->color_output_mapping[rt] = rt_to_att[rt];
+         need_rt_flush = true;
+      }
    }
 
-#if GFX_VER >= 11
    if (need_rt_flush) {
+      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+#if GFX_VER >= 11
      /* The PIPE_CONTROL command description says:
       *
       *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
@@ -689,8 +708,8 @@ cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                                ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
                                "change RT due to shader outputs");
-   }
 #endif
+   }
 }
 
 ALWAYS_INLINE static void
@@ -750,8 +769,6 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_device *device = cmd_buffer->device;
    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
-   struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(gfx->base.pipeline);
    const struct vk_dynamic_graphics_state *dyn =
      &cmd_buffer->vk.dynamic_graphics_state;
@@ -772,16 +789,16 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
       *
       * Apply task URB workaround when switching from task to primitive.
       */
-      if (anv_pipeline_is_primitive(pipeline)) {
+      if (!anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
         genX(apply_task_urb_workaround)(cmd_buffer);
-      } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+      } else if (anv_gfx_has_stage(gfx, MESA_SHADER_TASK)) {
         cmd_buffer->state.gfx.used_task_shader = true;
      }
    }
 
    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP) ||
        (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PS))
-      cmd_buffer_maybe_flush_rt_writes(cmd_buffer, pipeline);
+      cmd_buffer_maybe_flush_rt_writes(cmd_buffer, gfx, dyn);
 
    /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
@@ -887,17 +904,29 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
    /* If the pipeline changed, we may need to re-allocate push constant space
    * in the URB.
    */
-   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PUSH_CONSTANT_SHADERS) {
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PUSH_CONSTANT_SHADERS)
      cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
 
-      /* Also add the relocations (scratch buffers) */
-      VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
-                                              pipeline->base.base.batch.relocs);
-      if (result != VK_SUCCESS) {
-         anv_batch_set_error(&cmd_buffer->batch, result);
-         return;
+#if GFX_VERx10 < 125
+   if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_VS |
+                                      ANV_CMD_DIRTY_HS |
+                                      ANV_CMD_DIRTY_DS |
+                                      ANV_CMD_DIRTY_GS |
+                                      ANV_CMD_DIRTY_PS)) {
+      for (unsigned s = 0; s <= MESA_SHADER_FRAGMENT; s++) {
+         if (gfx->shaders[s] == NULL)
+            continue;
+
+         /* Also add the relocations (scratch buffers) */
+         VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
+                                                 &gfx->shaders[s]->relocs);
+         if (result != VK_SUCCESS) {
+            anv_batch_set_error(&cmd_buffer->batch, result);
+            return;
+         }
      }
    }
+#endif
 
    /* Render targets live in the same binding table as fragment descriptors */
    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
@@ -916,7 +945,7 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
         cmd_buffer,
         &cmd_buffer->state.gfx.base,
         descriptors_dirty,
-         (const struct anv_shader_bin **)gfx->shaders,
+         (const struct anv_shader **)gfx->shaders,
         ARRAY_SIZE(gfx->shaders));
      cmd_buffer->state.descriptors_dirty &= ~dirty;
    }
@@ -989,23 +1018,13 @@ anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
 ALWAYS_INLINE static void
 cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
 {
+   UNUSED const struct anv_device *device = cmd_buffer->device;
+   UNUSED const struct anv_instance *instance =
+      device->physical->instance;
    UNUSED const bool protected = cmd_buffer->vk.pool->flags &
                                  VK_COMMAND_POOL_CREATE_PROTECTED_BIT;
-   UNUSED struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
-   UNUSED struct anv_device *device = cmd_buffer->device;
-   UNUSED struct anv_instance *instance = device->physical->instance;
-
-#define DEBUG_SHADER_HASH(stage) do {                                   \
-      if (unlikely(                                                     \
-             (instance->debug & ANV_DEBUG_SHADER_HASH) &&               \
-             anv_pipeline_has_stage(pipeline, stage))) {                \
-         mi_store(&b,                                                   \
-                  mi_mem32(device->workaround_address),                 \
-                  mi_imm(pipeline->base.shaders[stage]->                \
-                         prog_data->source_hash));                      \
-      }                                                                 \
-   } while (0)
+   UNUSED struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+   UNUSED struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
 
    struct mi_builder b;
    if (unlikely(instance->debug & ANV_DEBUG_SHADER_HASH)) {
@@ -1013,18 +1032,35 @@ cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
      mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
    }
 
+#define DEBUG_SHADER_HASH(stage) do {                                   \
+      if (unlikely(instance->debug & ANV_DEBUG_SHADER_HASH)) {          \
+         mi_store(&b,                                                   \
+                  mi_mem32(device->workaround_address),                 \
+                  mi_imm(gfx->shaders[stage]->prog_data->source_hash)); \
+      }                                                                 \
+   } while (0)
+
+#define anv_batch_emit_gfx(batch, cmd, name) ({                         \
+      void *__dst = anv_batch_emit_dwords(                              \
+         batch, __anv_cmd_length(cmd));                                 \
+      memcpy(__dst, hw_state->packed.name,                              \
+             4 * __anv_cmd_length(cmd));                                \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(                                 \
+            __dst, __anv_cmd_length(cmd) * 4));                         \
+      __dst;                                                            \
+   })
+
 #if INTEL_WA_16011107343_GFX_VER
    if (intel_needs_workaround(cmd_buffer->device->info, 16011107343) &&
-       anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
+       anv_gfx_has_stage(gfx, MESA_SHADER_TESS_CTRL)) {
      DEBUG_SHADER_HASH(MESA_SHADER_TESS_CTRL);
-      anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
-                                              final.hs, protected);
+      anv_batch_emit_gfx(&cmd_buffer->batch, GENX(3DSTATE_HS), hs);
    }
 #endif
 
 #if INTEL_WA_22018402687_GFX_VER
    if (intel_needs_workaround(cmd_buffer->device->info, 22018402687) &&
-       anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
+       anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
      DEBUG_SHADER_HASH(MESA_SHADER_TESS_EVAL);
      /* Wa_22018402687:
       *   In any 3D enabled context, just before any Tessellation enabled
       *   draw call (3D Primitive), re-send the last programmed 3DSTATE_DS
       *   now again programmed with the pipeline values.
       *
       * said switch, as it matters at the HW level, and can be triggered even
       * across processes, so we apply the Wa at all times.
       */
-      anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
-                                              final.ds, protected);
+      anv_batch_emit_gfx(&cmd_buffer->batch, GENX(3DSTATE_DS), ds);
    }
 #endif
 
    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
 
+#undef anv_batch_emit_gfx
 #undef DEBUG_SHADER_HASH
 }
diff --git a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
index f5fd7c71d37..90584136ec6 100644
--- a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
+++ b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
@@ -96,18 +96,10 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
      ANV_STATE_NULL;
    UNUSED uint32_t wa_insts_offset = 0;
 
-#if INTEL_WA_16011107343_GFX_VER || INTEL_WA_22018402687_GFX_VER
-   struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(gfx->base.pipeline);
-#endif
-
 #if INTEL_WA_16011107343_GFX_VER
    if (wa_16011107343) {
      memcpy(wa_insts_state.map + wa_insts_offset,
-             &pipeline->batch_data[
-                protected ?
-                pipeline->final.hs_protected.offset :
-                pipeline->final.hs.offset],
+             gfx->dyn_state.packed.hs,
             GENX(3DSTATE_HS_length) * 4);
      wa_insts_offset += GENX(3DSTATE_HS_length) * 4;
    }
@@ -116,10 +108,7 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
 #if INTEL_WA_22018402687_GFX_VER
    if (wa_22018402687) {
      memcpy(wa_insts_state.map + wa_insts_offset,
-             &pipeline->batch_data[
-                protected ?
-                pipeline->final.ds_protected.offset :
-                pipeline->final.ds.offset],
+             gfx->dyn_state.packed.ds,
             GENX(3DSTATE_DS_length) * 4);
      wa_insts_offset += GENX(3DSTATE_DS_length) * 4;
    }
diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c
index 9892780b3b3..547cef56242 100644
--- a/src/intel/vulkan/genX_gfx_state.c
+++ b/src/intel/vulkan/genX_gfx_state.c
@@ -209,7 +209,7 @@ genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer,
    if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
      return;
 
-   if (gfx->uses_xfb) {
+   if (gfx->shaders[gfx->streamout_stage]->xfb_info != NULL) {
      genX(cmd_buffer_set_preemption)(cmd_buffer, false);
      return;
    }
@@ -417,10 +417,10 @@ want_stencil_pma_fix(const struct vk_dynamic_graphics_state *dyn,
    *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    *    (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
    */
-   struct anv_shader_bin *fs_bin = gfx->shaders[MESA_SHADER_FRAGMENT];
+   struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
 
    return kill_pixel(wm_prog_data, dyn) ||
-          has_ds_feedback_loop(&fs_bin->bind_map, dyn) ||
+          has_ds_feedback_loop(&fs->bind_map, dyn) ||
          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
 }
 
@@ -1012,21 +1012,21 @@ update_ps(struct anv_gfx_dynamic_state *hw_state,
      return;
    }
 
-   const struct anv_shader_bin *fs_bin = gfx->shaders[MESA_SHADER_FRAGMENT];
+   const struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
    struct GENX(3DSTATE_PS) ps = {};
    intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
                                MAX2(dyn->ms.rasterization_samples, 1),
                                hw_state->fs_msaa_flags);
 
    SET(PS, ps.KernelStartPointer0,
-       fs_bin->kernel.offset +
+       fs->kernel.offset +
       brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
    SET(PS, ps.KernelStartPointer1,
-       fs_bin->kernel.offset +
+       fs->kernel.offset +
       brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
 #if GFX_VER < 20
    SET(PS, ps.KernelStartPointer2,
-       fs_bin->kernel.offset +
+       fs->kernel.offset +
       brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
 #endif
 
@@ -1124,12 +1124,12 @@ update_ps_extra_kills_pixel(struct anv_gfx_dynamic_state *hw_state,
                             const struct vk_dynamic_graphics_state *dyn,
                             const struct anv_cmd_graphics_state *gfx)
 {
-   struct anv_shader_bin *fs_bin = gfx->shaders[MESA_SHADER_FRAGMENT];
+   struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
    const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
 
    SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
             wm_prog_data &&
-             (has_ds_feedback_loop(&fs_bin->bind_map, dyn) ||
+             (has_ds_feedback_loop(&fs->bind_map, dyn) ||
              wm_prog_data->uses_kill),
             FRAGMENT);
 }
@@ -2174,6 +2174,35 @@ update_tbimr_info(struct anv_gfx_dynamic_state *hw_state,
 }
 #endif
 
+#if GFX_VERx10 == 90
+ALWAYS_INLINE static void
+update_vs(struct anv_gfx_dynamic_state *hw_state,
+          const struct anv_cmd_graphics_state *gfx,
+          const struct anv_device *device)
+{
+   if (device->info->gt < 4)
+      return;
+
+   /* On Sky Lake GT4, we have experienced some hangs related to the VS cache
+    * and tessellation.  It is unknown exactly what is happening but the
+    * Haswell docs for the "VS Reference Count Full Force Miss Enable" field
+    * of the "Thread Mode" register refer to a HSW bug in which the VUE handle
+    * reference count would overflow resulting in internal reference counting
+    * bugs.  My (Faith's) best guess is that this bug cropped back up on SKL
+    * GT4 when we suddenly had more threads in play than any previous gfx9
+    * hardware.
+    *
+    * What we do know for sure is that setting this bit when tessellation
+    * shaders are in use fixes a GPU hang in Batman: Arkham City when playing
+    * with DXVK (https://bugs.freedesktop.org/107280).  Disabling the vertex
+    * cache with tessellation shaders should only have a minor performance
+    * impact as the tessellation shaders are likely generating and processing
+    * far more geometry than the vertex stage.
+    */
+   SET(VS, vs.VertexCacheDisable, anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL));
+}
+#endif
+
 #if INTEL_WA_18019110168_GFX_VER
 static inline unsigned
 compute_mesh_provoking_vertex(const struct brw_mesh_prog_data *mesh_prog_data,
@@ -2215,11 +2244,13 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
                                    const struct anv_device *device,
                                    const struct vk_dynamic_graphics_state *dyn,
                                    struct anv_cmd_graphics_state *gfx,
-                                   const struct anv_graphics_pipeline *pipeline,
                                    VkCommandBufferLevel cmd_buffer_level)
 {
    UNUSED bool fs_msaa_changed = false;
 
+   assert(gfx->shaders[gfx->streamout_stage] != NULL);
+   assert(gfx->instance_multiplier != 0);
+
    /* Do this before update_fs_msaa_flags() for primitive_id_index */
    if (gfx->dirty & ANV_CMD_DIRTY_ALL_SHADERS(device))
      update_sbe(hw_state, gfx, device);
@@ -2234,6 +2265,11 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
    if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
      update_urb_config(hw_state, gfx, device);
 
+#if GFX_VERx10 == 90
+   if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
+      update_vs(hw_state, gfx, device);
+#endif
+
    if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
        BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_FS_MSAA_FLAGS)) {
      update_ps(hw_state, device, dyn, gfx);
@@ -2482,8 +2518,7 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
 static void
 cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
                             struct anv_cmd_buffer *cmd_buffer,
-                            const struct anv_cmd_graphics_state *gfx,
-                            const struct anv_graphics_pipeline *pipeline)
+                            const struct anv_cmd_graphics_state *gfx)
 {
    struct anv_device *device = cmd_buffer->device;
    struct anv_instance *instance = device->physical->instance;
@@ -2502,73 +2537,107 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
    } while (0)
 
 #define IS_DIRTY(name) BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##name)
-#define anv_gfx_copy(field, cmd, source) ({                             \
-      assert(sizeof(hw_state->packed.field) >=                          \
-             4 * __anv_cmd_length(cmd));                                \
-      assert((source).len == __anv_cmd_length(cmd));                    \
-      memcpy(&hw_state->packed.field,                                   \
-             &pipeline->batch_data[(source).offset],                    \
-             4 * __anv_cmd_length(cmd));                                \
+#define anv_gfx_copy(field, cmd, stage, source) ({                      \
+      if (gfx->shaders[stage] != NULL) {                                \
+         assert(sizeof(hw_state->packed.field) >=                       \
+                4 * __anv_cmd_length(cmd));                             \
+         assert((gfx->shaders[stage]->source).len ==                    \
+                __anv_cmd_length(cmd));                                 \
+         memcpy(&hw_state->packed.field,                                \
+                &gfx->shaders[stage]->cmd_data[                         \
+                   (gfx->shaders[stage]->source).offset],               \
+                4 * __anv_cmd_length(cmd));                             \
+      } else {                                                          \
+         anv_gfx_pack(field, cmd, __unused_name);                       \
+      }                                                                 \
    })
-#define anv_gfx_copy_variable(field, source) ({                         \
-      assert(sizeof(hw_state->packed.field) >=                          \
-             4 * (source).len);                                         \
-      memcpy(&hw_state->packed.field,                                   \
-             &pipeline->batch_data[(source).offset],                    \
-             4 * (source).len);                                         \
-      hw_state->packed.field##_len = (source).len;                      \
+#define anv_gfx_copy_variable(field, stage, source) ({                  \
+      if (gfx->shaders[stage] != NULL) {                                \
+         assert(sizeof(hw_state->packed.field) >=                       \
+                4 * gfx->shaders[stage]->source.len);                   \
+         memcpy(&hw_state->packed.field,                                \
+                &gfx->shaders[stage]->cmd_data[                         \
+                   (gfx->shaders[stage]->source).offset],               \
+                4 * gfx->shaders[stage]->source.len);                   \
+         hw_state->packed.field##_len =                                 \
+            gfx->shaders[stage]->source.len;                            \
+      }                                                                 \
    })
-#define anv_gfx_copy_protected(field, cmd, source) ({                   \
+#define anv_gfx_copy_protected(field, cmd, stage, source) ({            \
       const bool __protected = (cmd_buffer->vk.pool->flags &            \
                                 VK_COMMAND_POOL_CREATE_PROTECTED_BIT);  \
       assert(sizeof(hw_state->packed.field) >=                          \
              4 * __anv_cmd_length(cmd));                                \
-      assert((source).len == __anv_cmd_length(cmd));                    \
-      memcpy(&hw_state->packed.field,                                   \
-             &pipeline->batch_data[                                     \
-                __protected ?                                           \
-                (source##_protected).offset :                           \
-                (source).offset],                                       \
-             4 * __anv_cmd_length(cmd));                                \
+      if (gfx->shaders[stage] != NULL) {                                \
+         assert((gfx->shaders[stage]->source).len ==                    \
                __anv_cmd_length(cmd));                                  \
+         memcpy(&hw_state->packed.field,                                \
+                &gfx->shaders[stage]->cmd_data[                         \
+                   __protected ?                                        \
+                   gfx->shaders[stage]->source##_protected.offset :     \
+                   gfx->shaders[stage]->source.offset],                 \
+                4 * __anv_cmd_length(cmd));                             \
+      } else {                                                          \
+         memcpy(&hw_state->packed.field,                                \
+                device->physical->gfx_default.field,                    \
+                4 * __anv_cmd_length(cmd));                             \
+      }                                                                 \
    })
-#define anv_gfx_pack_merge(field, cmd, prepacked, name)                 \
-   for (struct cmd name = { 0 },                                        \
+#define anv_gfx_pack_merge(field, cmd, stage, source, name)             \
+   for (struct cmd name = (struct cmd) { 0 },                           \
         *_dst = (struct cmd *)hw_state->packed.field;                   \
         __builtin_expect(_dst != NULL, 1);                              \
-        ({ const struct anv_gfx_state_ptr *_cmd_state = &prepacked;     \
+        ({                                                              \
           uint32_t _partial[__anv_cmd_length(cmd)];                     \
-           assert(_cmd_state->len == __anv_cmd_length(cmd));            \
           assert(sizeof(hw_state->packed.field) >=                      \
                  4 * __anv_cmd_length(cmd));                            \
           __anv_cmd_pack(cmd)(NULL, _partial, &name);                   \
-           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {       \
-              assert((_partial[i] &                                     \
-                      (pipeline)->batch_data[                           \
-                         (prepacked).offset + i]) == 0);                \
-              ((uint32_t *)_dst)[i] = _partial[i] |                     \
-                 (pipeline)->batch_data[_cmd_state->offset + i];        \
+           if (gfx->shaders[stage] != NULL) {                           \
+              const struct anv_gfx_state_ptr *_cmd_state =              \
+                 &gfx->shaders[stage]->source;                          \
+              assert(_cmd_state->len == __anv_cmd_length(cmd));         \
+              for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {    \
+                 assert((_partial[i] &                                  \
+                         gfx->shaders[stage]->cmd_data[                 \
+                            _cmd_state->offset + i]) == 0);             \
+                 ((uint32_t *)_dst)[i] = _partial[i] |                  \
+                    gfx->shaders[stage]->cmd_data[_cmd_state->offset + i]; \
+              }                                                         \
+           } else {                                                     \
+              for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {    \
+                 assert((_partial[i] &                                  \
+                         device->physical->gfx_default.field[i]) == 0); \
+                 ((uint32_t *)_dst)[i] = _partial[i] |                  \
+                    device->physical->gfx_default.field[i];             \
+              }                                                         \
           }                                                             \
           _dst = NULL;                                                  \
-        }))
-#define anv_gfx_pack_merge_protected(field, cmd, prepacked, name)       \
-   for (struct cmd name = { 0 },                                        \
+        }))
+#define anv_gfx_pack_merge_protected(field, cmd, stage, source, name)   \
+   for (struct cmd name = (struct cmd) { 0 },                           \
         *_dst = (struct cmd *)hw_state->packed.field;                   \
         __builtin_expect(_dst != NULL, 1);                              \
-        ({ const struct anv_gfx_state_ptr *_cmd_state =                 \
-              (cmd_buffer->vk.pool->flags &                             \
-               VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?                  \
-              &prepacked##_protected : &prepacked;                      \
+        ({                                                              \
           uint32_t _partial[__anv_cmd_length(cmd)];                     \
-           assert(_cmd_state->len == __anv_cmd_length(cmd));            \
           assert(sizeof(hw_state->packed.field) >=                      \
                  4 * __anv_cmd_length(cmd));                            \
           __anv_cmd_pack(cmd)(NULL, _partial, &name);                   \
+           const struct anv_gfx_state_ptr *_cmd_state =                 \
+              gfx->shaders[stage] != NULL ?                             \
+                 ((cmd_buffer->vk.pool->flags &                         \
+                   VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?              \
+                  &gfx->shaders[stage]->source##_protected :            \
+                  &gfx->shaders[stage]->source) :                       \
+              NULL;                                                     \
+           assert(_cmd_state == NULL ||                                 \
+                  _cmd_state->len == __anv_cmd_length(cmd));            \
+           const uint32_t *_inst_data =                                 \
+              gfx->shaders[stage] != NULL ?                             \
+              &gfx->shaders[stage]->cmd_data[_cmd_state->offset] :      \
+              device->physical->gfx_default.field;                      \
           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {        \
-              assert((_partial[i] &                                     \
-                      (pipeline)->batch_data[                           \
-                         (prepacked).offset + i]) == 0);                \
-              ((uint32_t *)_dst)[i] = _partial[i] |                     \
-                 (pipeline)->batch_data[_cmd_state->offset + i];        \
+              assert((_partial[i] & _inst_data[i]) == 0);               \
+              ((uint32_t *)_dst)[i] = _partial[i] | _inst_data[i];      \
           }                                                             \
           _dst = NULL;                                                  \
        }))
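
Note: every anv_gfx_pack_merge*() expansion above relies on the same invariant: the DWords prepacked against the shader (static fields) and the DWords packed at record time (dynamic fields) must occupy disjoint bits, so the merge is a plain per-DWord OR. A freestanding sketch of that step, with bare arrays standing in for cmd_data and the repacked partial state:

#include <assert.h>
#include <stdint.h>

/* Merge dynamic fields (packed with every static field left zero) into
 * the DWords prepacked for the shader.  The assert is the disjointness
 * check the macros above perform for each DWord. */
static void
merge_packed_dwords(uint32_t *dst,
                    const uint32_t *dyn_partial,
                    const uint32_t *prepacked_static,
                    uint32_t len)
{
   for (uint32_t i = 0; i < len; i++) {
      assert((dyn_partial[i] & prepacked_static[i]) == 0);
      dst[i] = dyn_partial[i] | prepacked_static[i];
   }
}

When no shader is bound to the stage, device->physical->gfx_default.field plays the role of prepacked_static here, so unbound stages merge against device-wide defaults instead of pipeline data.
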
@@ -2624,19 +2693,19 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 #endif
 
    if (IS_DIRTY(VF_SGVS))
-      anv_gfx_copy(vf_sgvs, GENX(3DSTATE_VF_SGVS), pipeline->final.vf_sgvs);
+      anv_gfx_copy(vf_sgvs, GENX(3DSTATE_VF_SGVS), MESA_SHADER_VERTEX, vs.vf_sgvs);
 
 #if GFX_VER >= 11
    if (IS_DIRTY(VF_SGVS_2))
-      anv_gfx_copy(vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), pipeline->final.vf_sgvs_2);
+      anv_gfx_copy(vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), MESA_SHADER_VERTEX, vs.vf_sgvs_2);
 #endif
 
    if (IS_DIRTY(VF_SGVS_INSTANCING))
-      anv_gfx_copy_variable(vf_sgvs_instancing, pipeline->final.vf_sgvs_instancing);
+      anv_gfx_copy_variable(vf_sgvs_instancing, MESA_SHADER_VERTEX, vs.vf_sgvs_instancing);
 
    if (instance->vf_component_packing && IS_DIRTY(VF_COMPONENT_PACKING)) {
       anv_gfx_copy(vf_component_packing, GENX(3DSTATE_VF_COMPONENT_PACKING),
-                   pipeline->final.vf_component_packing);
+                   MESA_SHADER_VERTEX, vs.vf_component_packing);
    }
 
    if (IS_DIRTY(INDEX_BUFFER)) {
@@ -2655,7 +2724,7 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 
    if (IS_DIRTY(STREAMOUT)) {
       anv_gfx_pack_merge(so, GENX(3DSTATE_STREAMOUT),
-                         pipeline->partial.so, so) {
+                         gfx->streamout_stage, so, so) {
          SET(so, so, RenderingDisable);
          SET(so, so, RenderStreamSelect);
          SET(so, so, ReorderMode);
@@ -2664,7 +2733,7 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
    }
 
    if (IS_DIRTY(SO_DECL_LIST))
-      anv_gfx_copy_variable(so_decl_list, pipeline->final.so_decl_list);
+      anv_gfx_copy_variable(so_decl_list, gfx->streamout_stage, so_decl_list);
 
    if (IS_DIRTY(CLIP)) {
       anv_gfx_pack(clip, GENX(3DSTATE_CLIP), clip) {
@@ -2886,7 +2955,8 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 
    if (IS_DIRTY(TE)) {
       if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
-         anv_gfx_pack_merge(te, GENX(3DSTATE_TE), pipeline->partial.te, te) {
+         anv_gfx_pack_merge(te, GENX(3DSTATE_TE),
+                            MESA_SHADER_TESS_EVAL, ds.te, te) {
            SET(te, te, OutputTopology);
 #if GFX_VERx10 >= 125
            SET(te, te, TessellationDistributionMode);
@@ -2986,7 +3056,8 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
    }
 
    if (IS_DIRTY(WM)) {
-      anv_gfx_pack_merge(wm, GENX(3DSTATE_WM), pipeline->partial.wm, wm) {
+      anv_gfx_pack_merge(wm, GENX(3DSTATE_WM),
+                         MESA_SHADER_FRAGMENT, ps.wm, wm) {
          SET(wm, wm, LineStippleEnable);
          SET(wm, wm, BarycentricInterpolationMode);
       }
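
Note: for the *_protected() copies used below, the protected and unprotected prepacked variants of an instruction differ only in which scratch surface they reference, so the choice can be deferred to copy time and keyed off the command pool's flags. A reduced sketch of that selection, using a hypothetical two-variant holder in place of the source/source##_protected pair:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical holder: both prepacked variants of one instruction. */
struct prepacked_pair {
   const uint32_t *plain;      /* regular scratch surface */
   const uint32_t *protected_; /* protected scratch surface */
};

/* Mirrors the VK_COMMAND_POOL_CREATE_PROTECTED_BIT test performed by
 * anv_gfx_copy_protected() / anv_gfx_pack_merge_protected(). */
static const uint32_t *
select_prepacked(const struct prepacked_pair *p, bool pool_is_protected)
{
   return pool_is_protected ? p->protected_ : p->plain;
}
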
@@ -3079,12 +3150,12 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
    }
 
 #if GFX_VERx10 >= 125
-   if (device->vk.enabled_features.meshShader) {
+   if (device->vk.enabled_extensions.EXT_mesh_shader) {
       if (IS_DIRTY(MESH_CONTROL)) {
          if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
            anv_gfx_copy_protected(mesh_control, GENX(3DSTATE_MESH_CONTROL),
-                                   pipeline->final.mesh_control);
+                                   MESA_SHADER_MESH, ms.control);
          } else {
            anv_gfx_pack(mesh_control, GENX(3DSTATE_MESH_CONTROL), mc);
          }
@@ -3092,8 +3163,9 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 
       if (IS_DIRTY(TASK_CONTROL)) {
          if (anv_gfx_has_stage(gfx, MESA_SHADER_TASK)) {
-            anv_gfx_copy_protected(task_control, GENX(3DSTATE_TASK_CONTROL),
-                                   pipeline->final.task_control);
+            anv_gfx_copy_protected(task_control,
+                                   GENX(3DSTATE_TASK_CONTROL),
+                                   MESA_SHADER_TASK, ts.control);
          } else {
            anv_gfx_pack(task_control, GENX(3DSTATE_TASK_CONTROL), tc);
          }
@@ -3101,101 +3173,86 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 
       if (IS_DIRTY(MESH_SHADER)) {
          anv_gfx_copy(mesh_shader, GENX(3DSTATE_MESH_SHADER),
-                      pipeline->final.mesh_shader);
+                      MESA_SHADER_MESH, ms.shader);
       }
 
       if (IS_DIRTY(MESH_DISTRIB)) {
          anv_gfx_copy(mesh_distrib, GENX(3DSTATE_MESH_DISTRIB),
-                      pipeline->final.mesh_distrib);
+                      MESA_SHADER_MESH, ms.distrib);
       }
 
       if (IS_DIRTY(CLIP_MESH)) {
          anv_gfx_copy(clip_mesh, GENX(3DSTATE_CLIP_MESH),
-                      pipeline->final.clip_mesh);
+                      MESA_SHADER_MESH, ms.clip);
       }
 
       if (IS_DIRTY(TASK_SHADER)) {
          anv_gfx_copy(task_shader, GENX(3DSTATE_TASK_SHADER),
-                      pipeline->final.task_shader);
+                      MESA_SHADER_TASK, ts.shader);
       }
 
       if (IS_DIRTY(TASK_REDISTRIB)) {
          anv_gfx_copy(task_redistrib, GENX(3DSTATE_TASK_REDISTRIB),
-                      pipeline->final.task_redistrib);
+                      MESA_SHADER_TASK, ts.redistrib);
       }
    }
 #endif /* GFX_VERx10 >= 125 */
 
    if (IS_DIRTY(VS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_VERTEX)) {
-         anv_gfx_copy_protected(vs, GENX(3DSTATE_VS), pipeline->final.vs);
-      } else {
-         anv_gfx_pack(vs, GENX(3DSTATE_VS), vs);
+#if GFX_VERx10 == 90
+      anv_gfx_pack_merge_protected(vs, GENX(3DSTATE_VS),
+                                   MESA_SHADER_VERTEX, vs.vs, vs) {
+         SET(vs, vs, VertexCacheDisable);
       }
+#else
+      anv_gfx_copy_protected(vs, GENX(3DSTATE_VS), MESA_SHADER_VERTEX, vs.vs);
+#endif
    }
 
-   if (IS_DIRTY(HS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_CTRL)) {
-         anv_gfx_copy_protected(hs, GENX(3DSTATE_HS), pipeline->final.hs);
-      } else {
-         anv_gfx_pack(hs, GENX(3DSTATE_HS), hs);
-      }
-   }
+   if (IS_DIRTY(HS))
+      anv_gfx_copy_protected(hs, GENX(3DSTATE_HS), MESA_SHADER_TESS_CTRL, hs.hs);
 
-   if (IS_DIRTY(DS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
-         anv_gfx_copy_protected(ds, GENX(3DSTATE_DS), pipeline->final.ds);
-      } else {
-         anv_gfx_pack(ds, GENX(3DSTATE_DS), ds);
-      }
-   }
+   if (IS_DIRTY(DS))
+      anv_gfx_copy_protected(ds, GENX(3DSTATE_DS), MESA_SHADER_TESS_EVAL, ds.ds);
 
    if (IS_DIRTY(GS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_GEOMETRY)) {
-         anv_gfx_pack_merge_protected(gs, GENX(3DSTATE_GS),
-                                      pipeline->partial.gs, gs) {
-            SET(gs, gs, ReorderMode);
-         }
-      } else {
-         anv_gfx_pack(gs, GENX(3DSTATE_GS), gs);
+      anv_gfx_pack_merge_protected(gs, GENX(3DSTATE_GS),
+                                   MESA_SHADER_GEOMETRY, gs.gs, gs) {
+         SET(gs, gs, ReorderMode);
       }
    }
 
    if (IS_DIRTY(PS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT)) {
-         anv_gfx_pack_merge_protected(ps, GENX(3DSTATE_PS),
-                                      pipeline->partial.ps, ps) {
-            SET(ps, ps, KernelStartPointer0);
-            SET(ps, ps, KernelStartPointer1);
-            SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
-            SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
+      anv_gfx_pack_merge_protected(ps, GENX(3DSTATE_PS),
+                                   MESA_SHADER_FRAGMENT, ps.ps, ps) {
+         SET(ps, ps, KernelStartPointer0);
+         SET(ps, ps, KernelStartPointer1);
+         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
+         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
 #if GFX_VER < 20
-            SET(ps, ps, KernelStartPointer2);
-            SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
+         SET(ps, ps, KernelStartPointer2);
+         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
 
-            SET(ps, ps, _8PixelDispatchEnable);
-            SET(ps, ps, _16PixelDispatchEnable);
-            SET(ps, ps, _32PixelDispatchEnable);
+         SET(ps, ps, _8PixelDispatchEnable);
+         SET(ps, ps, _16PixelDispatchEnable);
+         SET(ps, ps, _32PixelDispatchEnable);
 #else
-            SET(ps, ps, Kernel0Enable);
-            SET(ps, ps, Kernel1Enable);
-            SET(ps, ps, Kernel0SIMDWidth);
-            SET(ps, ps, Kernel1SIMDWidth);
-            SET(ps, ps, Kernel0PolyPackingPolicy);
-            SET(ps, ps, Kernel0MaximumPolysperThread);
+         SET(ps, ps, Kernel0Enable);
+         SET(ps, ps, Kernel1Enable);
+         SET(ps, ps, Kernel0SIMDWidth);
+         SET(ps, ps, Kernel1SIMDWidth);
+         SET(ps, ps, Kernel0PolyPackingPolicy);
+         SET(ps, ps, Kernel0MaximumPolysperThread);
 #endif
-            SET(ps, ps, PositionXYOffsetSelect);
-         }
-      } else {
-         anv_gfx_pack(ps, GENX(3DSTATE_PS), ps);
+         SET(ps, ps, PositionXYOffsetSelect);
       }
    }
 
    if (IS_DIRTY(PS_EXTRA)) {
       if (anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT)) {
          anv_gfx_pack_merge(ps_extra, GENX(3DSTATE_PS_EXTRA),
-                            pipeline->partial.ps_extra, pse) {
+                            MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
            SET(pse, ps_extra, PixelShaderHasUAV);
            SET(pse, ps_extra, PixelShaderIsPerSample);
 #if GFX_VER >= 11
@@ -3213,7 +3270,7 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
        * change through pre-rasterization shader) or if we notice a change.
        */
       anv_gfx_pack_merge(ps_extra_dep, GENX(3DSTATE_PS_EXTRA),
-                         pipeline->partial.ps_extra, pse) {
+                         MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
          SET(pse, ps_extra, PixelShaderHasUAV);
          SET(pse, ps_extra, PixelShaderIsPerSample);
 #if GFX_VER >= 11
@@ -3269,15 +3326,13 @@ genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
                                       cmd_buffer->device,
                                       &cmd_buffer->vk.dynamic_graphics_state,
                                       &cmd_buffer->state.gfx,
-                                      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline),
                                       cmd_buffer->vk.level);
 
    vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
 
    cmd_buffer_repack_gfx_state(&cmd_buffer->state.gfx.dyn_state,
                                cmd_buffer,
-                               &cmd_buffer->state.gfx,
-                               anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline));
+                               &cmd_buffer->state.gfx);
 }
 
 static void
@@ -3431,8 +3486,6 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
    struct anv_device *device = cmd_buffer->device;
    struct anv_instance *instance = device->physical->instance;
    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
-   struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(gfx->base.pipeline);
    const struct vk_dynamic_graphics_state *dyn =
       &cmd_buffer->vk.dynamic_graphics_state;
    struct anv_push_constants *push_consts =
@@ -3493,7 +3546,7 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
      const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);
      if (mesh_prog_data) {
         push_consts->gfx.fs_per_prim_remap_offset =
-            pipeline->base.shaders[MESA_SHADER_MESH]->kernel.offset +
+            gfx->shaders[MESA_SHADER_MESH]->kernel.offset +
            mesh_prog_data->wa_18019110168_mapping_offset;
      }
 
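
Note: with no pipeline object to carry a precomputed uses_xfb flag, the Wa_16011773973 paths below ask the bound streamout stage directly whether it recorded transform-feedback outputs. A sketch of that predicate, assuming a pared-down shader struct that only exposes the xfb_info pointer (the real code asserts earlier that the streamout shader is non-NULL, so it dereferences unconditionally):

#include <stdbool.h>
#include <stddef.h>

struct xfb_shader {        /* stand-in for struct anv_shader */
   const void *xfb_info;   /* non-NULL when the stage writes XFB */
};

/* The condition that replaces pipeline->uses_xfb in this patch. */
static bool
streamout_stage_uses_xfb(struct xfb_shader *const *shaders,
                         unsigned streamout_stage)
{
   const struct xfb_shader *s = shaders[streamout_stage];
   return s != NULL && s->xfb_info != NULL;
}
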
@@ -3576,7 +3629,7 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
     *   3. Send 3D State SOL with SOL Enabled
     */
    if (intel_needs_workaround(device->info, 16011773973) &&
-       pipeline->uses_xfb)
+       gfx->shaders[gfx->streamout_stage]->xfb_info != NULL)
       anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
 
    anv_batch_emit_gfx_variable(batch, so_decl_list);
@@ -3597,7 +3650,7 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
    }
 
 #if GFX_VERx10 >= 125
-   if (device->vk.enabled_features.meshShader) {
+   if (device->vk.enabled_extensions.EXT_mesh_shader) {
       if (IS_DIRTY(MESH_CONTROL))
         anv_batch_emit_gfx(batch, GENX(3DSTATE_MESH_CONTROL), mesh_control);
 
@@ -3670,8 +3723,8 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
       anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_TOPOLOGY), vft);
 
    if (IS_DIRTY(VERTEX_INPUT)) {
-      genX(batch_emit_pipeline_vertex_input)(batch, device,
-                                             pipeline, dyn->vi);
+      genX(batch_emit_vertex_input)(batch, device,
+                                    gfx->shaders[MESA_SHADER_VERTEX], dyn->vi);
    }
 
    if (IS_DIRTY(TE))
@@ -3823,8 +3876,6 @@ genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_device *device = cmd_buffer->device;
    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
-   struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
    struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
 
    if (INTEL_DEBUG(DEBUG_REEMIT)) {
@@ -3863,7 +3914,7 @@ genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
    * it after.
    */
    if (intel_needs_workaround(device->info, 16011773973) &&
-       pipeline->uses_xfb &&
+       gfx->shaders[gfx->streamout_stage]->xfb_info != NULL &&
        BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
       BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_STREAMOUT);
    }
diff --git a/src/intel/vulkan/genX_shader.c b/src/intel/vulkan/genX_shader.c
index 3dbe549d244..0f13df1c744 100644
--- a/src/intel/vulkan/genX_shader.c
+++ b/src/intel/vulkan/genX_shader.c
@@ -569,31 +569,6 @@ emit_vs_shader(struct anv_batch *batch,
       vs.SoftwareExceptionEnable = false;
       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
 
-#if 0
-      /* TODO: move to shader binding */
-      if (GFX_VER == 9 && devinfo->gt == 4 &&
-          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-         /* On Sky Lake GT4, we have experienced some hangs related to the VS
-          * cache and tessellation. It is unknown exactly what is happening
-          * but the Haswell docs for the "VS Reference Count Full Force Miss
-          * Enable" field of the "Thread Mode" register refer to a HSW bug in
-          * which the VUE handle reference count would overflow resulting in
-          * internal reference counting bugs. My (Faith's) best guess is that
-          * this bug cropped back up on SKL GT4 when we suddenly had more
-          * threads in play than any previous gfx9 hardware.
-          *
-          * What we do know for sure is that setting this bit when
-          * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
-          * City when playing with DXVK (https://bugs.freedesktop.org/107280).
-          * Disabling the vertex cache with tessellation shaders should only
-          * have a minor performance impact as the tessellation shaders are
-          * likely generating and processing far more geometry than the vertex
-          * stage.
-          */
-         vs.VertexCacheDisable = true;
-      }
-#endif
-
       vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
       vs.VertexURBEntryReadOffset = 0;
       vs.DispatchGRFStartRegisterForURBData =
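
Note: the recurring pattern in this patch is that per-stage HW state now hangs off gfx->shaders[stage] rather than a monolithic pipeline, with device->physical->gfx_default supplying packed defaults for unbound stages. A minimal sketch of that lookup pattern, assuming anv_gfx_has_stage() is simply a NULL test on the stage's shader slot (its definition is not shown in this patch):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define SKETCH_NUM_STAGES 8   /* stand-in for the MESA_SHADER_* count */

struct sketch_shader {
   const uint32_t *cmd_data;  /* prepacked instruction DWords */
};

struct sketch_gfx_state {
   const struct sketch_shader *shaders[SKETCH_NUM_STAGES];
};

/* Presumed shape of anv_gfx_has_stage(): a stage is active iff a
 * shader object is bound to its slot. */
static bool
sketch_has_stage(const struct sketch_gfx_state *gfx, unsigned stage)
{
   return gfx->shaders[stage] != NULL;
}

/* Shader-provided DWords when the stage is bound, device defaults
 * otherwise (the gfx_default fallback used by the repack macros). */
static const uint32_t *
prepacked_or_default(const struct sketch_gfx_state *gfx, unsigned stage,
                     const uint32_t *default_dwords)
{
   return sketch_has_stage(gfx, stage) ?
          gfx->shaders[stage]->cmd_data : default_dwords;
}
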