anv: switch over to runtime pipelines

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34872>
Authored by Lionel Landwerlin on 2024-08-08 14:42:07 +03:00; committed by Marge Bot
parent 4d9dd5c3a2
commit e76ed91d3f
13 changed files with 697 additions and 529 deletions


@ -5,6 +5,8 @@
#include "anv_private.h"
#include "vk_common_entrypoints.h"
#include "compiler/nir/nir_builder.h"
static void
@ -293,7 +295,8 @@ astc_emu_flush_denorm_slice(struct anv_cmd_buffer *cmd_buffer,
set_writes);
VkDescriptorSet set = anv_descriptor_set_to_handle(&push_set.set);
anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE,
vk_common_CmdBindPipeline(cmd_buffer_,
VK_PIPELINE_BIND_POINT_COMPUTE,
astc_emu->pipeline);
VkPushConstantsInfoKHR push_info = {
@ -351,7 +354,9 @@ astc_emu_decompress_slice(struct anv_cmd_buffer *cmd_buffer,
return;
}
anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
vk_common_CmdBindPipeline(cmd_buffer_,
VK_PIPELINE_BIND_POINT_COMPUTE,
pipeline);
struct vk_texcompress_astc_write_descriptor_set writes;
vk_texcompress_astc_fill_write_descriptor_sets(astc_emu->texcompress,


@ -30,6 +30,7 @@
#include "anv_private.h"
#include "anv_measure.h"
#include "vk_common_entrypoints.h"
#include "vk_util.h"
/** \file anv_cmd_buffer.c
@ -435,17 +436,16 @@ set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer,
}
static void
anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer,
anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipeline_state,
struct anv_pipeline *pipeline,
uint32_t ray_queries,
VkShaderStageFlags stages)
{
struct anv_device *device = cmd_buffer->device;
uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
uint64_t ray_shadow_size =
align64(brw_rt_ray_queries_shadow_stacks_size(device->info,
pipeline->ray_queries),
align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
4096);
if (ray_shadow_size > 0 &&
(!cmd_buffer->state.ray_query_shadow_bo ||
@ -497,112 +497,6 @@ anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer,
pipeline_state->push_constants_data_dirty = true;
}
/**
* This function computes the changes between two pipelines and flags the
* dirty HW state appropriately.
*/
static void
anv_cmd_buffer_flush_pipeline_hw_state(struct anv_cmd_buffer *cmd_buffer,
struct anv_graphics_pipeline *old_pipeline,
struct anv_graphics_pipeline *new_pipeline)
{
struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
#define diff_fix_state(bit, name) \
do { \
/* Fixed states should always have matching sizes */ \
assert(old_pipeline == NULL || \
old_pipeline->name.len == new_pipeline->name.len); \
/* Don't bother memcmp if the state is already dirty */ \
if (!BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##bit) && \
(old_pipeline == NULL || \
memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
&new_pipeline->batch_data[new_pipeline->name.offset], \
4 * new_pipeline->name.len) != 0)) \
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit); \
} while (0)
#define diff_var_state(bit, name) \
do { \
/* Don't bother memcmp if the state is already dirty */ \
/* Also if the new state is empty, avoid marking dirty */ \
if (!BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##bit) && \
new_pipeline->name.len != 0 && \
(old_pipeline == NULL || \
old_pipeline->name.len != new_pipeline->name.len || \
memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
&new_pipeline->batch_data[new_pipeline->name.offset], \
4 * new_pipeline->name.len) != 0)) \
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit); \
} while (0)
#define assert_identical(bit, name) \
do { \
/* Fixed states should always have matching sizes */ \
assert(old_pipeline == NULL || \
old_pipeline->name.len == new_pipeline->name.len); \
assert(old_pipeline == NULL || \
memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
&new_pipeline->batch_data[new_pipeline->name.offset], \
4 * new_pipeline->name.len) == 0); \
} while (0)
#define assert_empty(name) assert(new_pipeline->name.len == 0)
/* Compare all states, including partially packed ones; the dynamic part is
* left at 0 but the static part could still change.
*
* We avoid comparing protected packets as all the fields but the scratch
* surface are identical. We just need to select the right one at emission.
*/
diff_fix_state(VF_SGVS, final.vf_sgvs);
if (cmd_buffer->device->info->ver >= 11)
diff_fix_state(VF_SGVS_2, final.vf_sgvs_2);
diff_fix_state(VF_COMPONENT_PACKING, final.vf_component_packing);
diff_fix_state(VS, final.vs);
diff_fix_state(HS, final.hs);
diff_fix_state(DS, final.ds);
diff_fix_state(WM, partial.wm);
diff_fix_state(STREAMOUT, partial.so);
diff_fix_state(GS, partial.gs);
diff_fix_state(TE, partial.te);
diff_fix_state(PS, partial.ps);
diff_fix_state(PS_EXTRA, partial.ps_extra);
if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
diff_fix_state(TASK_CONTROL, final.task_control);
diff_fix_state(TASK_SHADER, final.task_shader);
diff_fix_state(TASK_REDISTRIB, final.task_redistrib);
diff_fix_state(MESH_CONTROL, final.mesh_control);
diff_fix_state(MESH_SHADER, final.mesh_shader);
diff_fix_state(MESH_DISTRIB, final.mesh_distrib);
diff_fix_state(CLIP_MESH, final.clip_mesh);
} else {
assert_empty(final.task_control);
assert_empty(final.task_shader);
assert_empty(final.task_redistrib);
assert_empty(final.mesh_control);
assert_empty(final.mesh_shader);
assert_empty(final.mesh_distrib);
assert_empty(final.clip_mesh);
}
/* States that can vary in length */
diff_var_state(VF_SGVS_INSTANCING, final.vf_sgvs_instancing);
diff_var_state(SO_DECL_LIST, final.so_decl_list);
#undef diff_fix_state
#undef diff_var_state
#undef assert_identical
#undef assert_empty
/* We're not diffing the following:
* - anv_graphics_pipeline::vertex_input_data
* - anv_graphics_pipeline::final::vf_instancing
*
* since they are tracked by the runtime.
*/
}
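
The diffing above only re-emits a packed HW state when its dwords actually changed between the old and the new pipeline. A minimal standalone sketch of that idea, with hypothetical types rather than the anv ones:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical span of pre-packed dwords inside a pipeline/shader blob. */
struct packed_span {
   uint32_t offset; /* in dwords */
   uint32_t len;    /* in dwords */
};

/* Return true when the packed dwords differ between the previously bound
 * object and the new one (or when there was no previous object), i.e. when
 * the corresponding HW state needs to be re-emitted.
 */
static bool
needs_reemit(const uint32_t *old_data, const struct packed_span *old_span,
             const uint32_t *new_data, const struct packed_span *new_span)
{
   if (old_data == NULL)
      return true;

   /* Variable-length states may change size; fixed ones never do. */
   if (old_span->len != new_span->len)
      return true;

   return memcmp(&old_data[old_span->offset],
                 &new_data[new_span->offset],
                 4 * new_span->len) != 0;
}

int main(void)
{
   const uint32_t old_data[] = { 0x1, 0x2, 0x3 };
   const uint32_t new_data[] = { 0x1, 0x2, 0x4 };
   const struct packed_span span = { .offset = 0, .len = 3 };

   /* Last dword differs, so the state would be flagged dirty. */
   return needs_reemit(old_data, &span, new_data, &span) ? 0 : 1;
}
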
static enum anv_cmd_dirty_bits
get_pipeline_dirty_stages(struct anv_device *device,
struct anv_graphics_pipeline *old_pipeline,
@ -636,7 +530,7 @@ get_pipeline_dirty_stages(struct anv_device *device,
static void
update_push_descriptor_flags(struct anv_cmd_pipeline_state *state,
struct anv_shader_bin **shaders,
struct anv_shader ** const shaders,
uint32_t shader_count)
{
state->push_buffer_stages = 0;
@ -646,7 +540,7 @@ update_push_descriptor_flags(struct anv_cmd_pipeline_state *state,
if (shaders[i] == NULL)
continue;
VkShaderStageFlags stage = mesa_to_vk_shader_stage(shaders[i]->stage);
VkShaderStageFlags stage = mesa_to_vk_shader_stage(shaders[i]->vk.stage);
if (shaders[i]->push_desc_info.used_descriptors)
state->push_descriptor_stages |= stage;
@ -656,145 +550,6 @@ update_push_descriptor_flags(struct anv_cmd_pipeline_state *state,
}
}
void anv_CmdBindPipeline(
VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
VkPipeline _pipeline)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
struct anv_cmd_pipeline_state *state;
VkShaderStageFlags stages = 0;
switch (pipelineBindPoint) {
case VK_PIPELINE_BIND_POINT_COMPUTE: {
if (cmd_buffer->state.compute.base.pipeline == pipeline)
return;
struct anv_compute_pipeline *compute_pipeline =
anv_pipeline_to_compute(pipeline);
cmd_buffer->state.compute.shader = compute_pipeline->cs;
cmd_buffer->state.compute.pipeline_dirty = true;
set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE,
&compute_pipeline->cs->bind_map);
state = &cmd_buffer->state.compute.base;
stages = VK_SHADER_STAGE_COMPUTE_BIT;
update_push_descriptor_flags(state, &compute_pipeline->cs, 1);
break;
}
case VK_PIPELINE_BIND_POINT_GRAPHICS: {
struct anv_graphics_pipeline *new_pipeline =
anv_pipeline_to_graphics(pipeline);
/* Apply the non-dynamic state from the pipeline */
vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
&new_pipeline->dynamic_state);
if (cmd_buffer->state.gfx.base.pipeline == pipeline)
return;
struct anv_graphics_pipeline *old_pipeline =
cmd_buffer->state.gfx.base.pipeline == NULL ? NULL :
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
cmd_buffer->state.gfx.dirty |=
get_pipeline_dirty_stages(cmd_buffer->device,
old_pipeline, new_pipeline);
STATIC_ASSERT(sizeof(cmd_buffer->state.gfx.shaders) ==
sizeof(new_pipeline->base.shaders));
memcpy(cmd_buffer->state.gfx.shaders,
new_pipeline->base.shaders,
sizeof(cmd_buffer->state.gfx.shaders));
cmd_buffer->state.gfx.active_stages = pipeline->active_stages;
anv_foreach_stage(stage, new_pipeline->base.base.active_stages) {
set_dirty_for_bind_map(cmd_buffer, stage,
&new_pipeline->base.shaders[stage]->bind_map);
}
state = &cmd_buffer->state.gfx.base;
stages = new_pipeline->base.base.active_stages;
update_push_descriptor_flags(state,
new_pipeline->base.shaders,
ARRAY_SIZE(new_pipeline->base.shaders));
/* When the pipeline is using independent states and dynamic buffers,
* this will trigger an update of anv_push_constants::dynamic_base_index
* & anv_push_constants::dynamic_offsets.
*/
struct anv_push_constants *push =
&cmd_buffer->state.gfx.base.push_constants;
struct anv_pipeline_sets_layout *layout = &new_pipeline->base.base.layout;
if (layout->independent_sets && layout->num_dynamic_buffers > 0) {
bool modified = false;
for (uint32_t s = 0; s < layout->num_sets; s++) {
if (layout->set_layouts[s] == NULL)
continue;
assert(layout->dynamic_offset_start[s] < MAX_DYNAMIC_BUFFERS);
if (layout->set_layouts[s]->vk.dynamic_descriptor_count > 0 &&
(push->desc_surface_offsets[s] & ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK) !=
layout->dynamic_offset_start[s]) {
push->desc_surface_offsets[s] &= ~ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK;
push->desc_surface_offsets[s] |= (layout->dynamic_offset_start[s] &
ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
modified = true;
}
}
if (modified) {
cmd_buffer->state.push_constants_dirty |= stages;
state->push_constants_data_dirty = true;
}
}
cmd_buffer->state.gfx.vs_source_hash = new_pipeline->vs_source_hash;
cmd_buffer->state.gfx.fs_source_hash = new_pipeline->fs_source_hash;
cmd_buffer->state.gfx.instance_multiplier = new_pipeline->instance_multiplier;
anv_cmd_buffer_flush_pipeline_hw_state(cmd_buffer, old_pipeline, new_pipeline);
break;
}
case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
if (cmd_buffer->state.rt.base.pipeline == pipeline)
return;
cmd_buffer->state.rt.pipeline_dirty = true;
struct anv_ray_tracing_pipeline *rt_pipeline =
anv_pipeline_to_ray_tracing(pipeline);
if (rt_pipeline->stack_size > 0) {
anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer,
rt_pipeline->stack_size);
}
state = &cmd_buffer->state.rt.base;
state->push_buffer_stages = pipeline->use_push_descriptor_buffer;
state->push_descriptor_stages = pipeline->use_push_descriptor_buffer;
state->push_descriptor_index = pipeline->layout.push_descriptor_set_index;
break;
}
default:
UNREACHABLE("invalid bind point");
break;
}
state->pipeline = pipeline;
if (pipeline->ray_queries > 0)
anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages);
}
static struct anv_cmd_pipeline_state *
anv_cmd_buffer_get_pipeline_layout_state(struct anv_cmd_buffer *cmd_buffer,
VkPipelineBindPoint bind_point,
@ -1519,20 +1274,37 @@ void anv_CmdPushDescriptorSetWithTemplate2KHR(
NULL, NULL);
}
void anv_CmdSetRayTracingPipelineStackSizeKHR(
VkCommandBuffer commandBuffer,
uint32_t pipelineStackSize)
void
anv_cmd_buffer_set_rt_state(struct vk_command_buffer *vk_cmd_buffer,
VkDeviceSize scratch_size,
uint32_t ray_queries)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
struct anv_cmd_buffer *cmd_buffer =
container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
rt->scratch_size = MAX2(rt->scratch_size, scratch_size);
if (ray_queries > 0) {
anv_cmd_buffer_set_rt_query_buffer(cmd_buffer, &rt->base, ray_queries,
ANV_RT_STAGE_BITS);
}
}
void
anv_cmd_buffer_set_stack_size(struct vk_command_buffer *vk_cmd_buffer,
VkDeviceSize stack_size)
{
struct anv_cmd_buffer *cmd_buffer =
container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
struct anv_device *device = cmd_buffer->device;
struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
if (anv_batch_has_error(&cmd_buffer->batch))
return;
uint32_t stack_ids_per_dss = 2048; /* TODO */
unsigned stack_size_log2 = util_logbase2_ceil(pipelineStackSize);
unsigned stack_size_log2 = util_logbase2_ceil(stack_size);
if (stack_size_log2 < 10)
stack_size_log2 = 10;
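
The clamp above rounds the requested stack size up to a power of two of at least 1KB. A small standalone sketch of that rounding, using a plain log2-ceil helper (Mesa's util_logbase2_ceil() behaves the same for non-zero inputs):

#include <stdint.h>
#include <stdio.h>

/* ceil(log2(x)) for x > 0 */
static unsigned
log2_ceil(uint32_t x)
{
   unsigned bits = 0;
   while ((1u << bits) < x)
      bits++;
   return bits;
}

int main(void)
{
   const uint32_t requested[] = { 600, 1024, 1500, 4096 };

   for (unsigned i = 0; i < 4; i++) {
      unsigned log2sz = log2_ceil(requested[i]);
      if (log2sz < 10)   /* never allocate less than 1KB per stack */
         log2sz = 10;
      printf("requested %u bytes -> %u bytes per stack\n",
             (unsigned)requested[i], 1u << log2sz);
   }
   return 0;
}
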
@ -1585,7 +1357,7 @@ anv_cmd_buffer_save_state(struct anv_cmd_buffer *cmd_buffer,
&cmd_buffer->state.compute.base;
if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE)
state->pipeline = pipe_state->pipeline;
state->shader = &cmd_buffer->state.compute.shader->vk;
if (state->flags & ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0)
state->descriptor_set[0] = pipe_state->descriptors[0];
@ -1614,11 +1386,11 @@ anv_cmd_buffer_restore_state(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipe_state = &cmd_buffer->state.compute.base;
if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE) {
if (state->pipeline) {
anv_CmdBindPipeline(cmd_buffer_, bind_point,
anv_pipeline_to_handle(state->pipeline));
if (state->shader) {
mesa_shader_stage stage = MESA_SHADER_COMPUTE;
anv_cmd_buffer_bind_shaders(&cmd_buffer->vk, 1, &stage, &state->shader);
} else {
pipe_state->pipeline = NULL;
cmd_buffer->state.compute.shader = NULL;
}
}
@ -1693,3 +1465,285 @@ anv_cmd_dispatch_unaligned(VkCommandBuffer commandBuffer,
anv_genX(cmd_buffer->device->info, cmd_dispatch_unaligned)
(commandBuffer, invocations_x, invocations_y, invocations_z);
}
static void
bind_compute_shader(struct anv_cmd_buffer *cmd_buffer,
struct anv_shader *shader)
{
struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
cmd_buffer->state.compute.shader = shader;
if (shader == NULL)
return;
cmd_buffer->state.compute.pipeline_dirty = true;
set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, &shader->bind_map);
update_push_descriptor_flags(&comp_state->base,
&cmd_buffer->state.compute.shader, 1);
if (shader->vk.ray_queries > 0) {
assert(cmd_buffer->device->info->verx10 >= 125);
anv_cmd_buffer_set_rt_query_buffer(cmd_buffer, &comp_state->base,
shader->vk.ray_queries,
VK_SHADER_STAGE_COMPUTE_BIT);
}
}
static void
bind_graphics_shaders(struct anv_cmd_buffer *cmd_buffer,
struct anv_shader *new_shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT])
{
struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
uint32_t ray_queries = 0;
static const enum anv_cmd_dirty_bits mesa_stage_to_dirty_bit[] = {
[MESA_SHADER_VERTEX] = ANV_CMD_DIRTY_VS,
[MESA_SHADER_TESS_CTRL] = ANV_CMD_DIRTY_HS,
[MESA_SHADER_TESS_EVAL] = ANV_CMD_DIRTY_DS,
[MESA_SHADER_GEOMETRY] = ANV_CMD_DIRTY_GS,
[MESA_SHADER_TASK] = ANV_CMD_DIRTY_TASK,
[MESA_SHADER_MESH] = ANV_CMD_DIRTY_MESH,
[MESA_SHADER_FRAGMENT] = ANV_CMD_DIRTY_PS,
};
gfx->active_stages = 0;
gfx->instance_multiplier = 0;
mesa_shader_stage new_streamout_stage = -1;
/* Find the last pre-rasterization stage */
for (uint32_t i = 0; i < ANV_GRAPHICS_SHADER_STAGE_COUNT; i++) {
mesa_shader_stage s = ANV_GRAPHICS_SHADER_STAGE_COUNT - i - 1;
if (new_shaders[s] == NULL)
continue;
assert(gfx->instance_multiplier == 0 ||
gfx->instance_multiplier == new_shaders[s]->instance_multiplier);
gfx->active_stages |= mesa_to_vk_shader_stage(s);
gfx->instance_multiplier = new_shaders[s]->instance_multiplier;
if (s == MESA_SHADER_FRAGMENT ||
s == MESA_SHADER_TASK ||
s == MESA_SHADER_TESS_CTRL)
continue;
new_streamout_stage = MAX2(new_streamout_stage, s);
}
for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
struct anv_shader *shader = new_shaders[s];
if (shader != NULL) {
gfx->active_stages |= mesa_to_vk_shader_stage(s);
ray_queries = MAX2(ray_queries, shader->vk.ray_queries);
if (gfx->shaders[s] != shader)
set_dirty_for_bind_map(cmd_buffer, s, &shader->bind_map);
}
if (gfx->shaders[s] != shader)
gfx->dirty |= mesa_stage_to_dirty_bit[s];
else
continue;
#define diff_fix_state(bit, name) \
do { \
/* Fixed states should always have matching sizes */ \
assert(gfx->shaders[s] == NULL || \
gfx->shaders[s]->name.len == shader->name.len); \
/* Don't bother memcmp if the state is already dirty */ \
if (!BITSET_TEST(hw_state->pack_dirty, \
ANV_GFX_STATE_##bit) && \
(gfx->shaders[s] == NULL || \
memcmp(&gfx->shaders[s]->cmd_data[ \
gfx->shaders[s]->name.offset], \
&shader->cmd_data[ \
shader->name.offset], \
4 * shader->name.len) != 0)) \
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit); \
} while (0)
#define diff_var_state(bit, name) \
do { \
/* Don't bother memcmp if the state is already dirty */ \
/* Also if the new state is empty, avoid marking dirty */ \
if (!BITSET_TEST(hw_state->pack_dirty, \
ANV_GFX_STATE_##bit) && \
shader->name.len != 0 && \
(gfx->shaders[s] == NULL || \
gfx->shaders[s]->name.len != shader->name.len || \
memcmp(&gfx->shaders[s]->cmd_data[ \
gfx->shaders[s]->name.offset], \
&shader->cmd_data[shader->name.offset], \
4 * shader->name.len) != 0)) \
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit); \
} while (0)
#define diff_fix_state_stage(bit, name, old_stage) \
do { \
/* Fixed states should always have matching sizes */ \
assert(old_stage == MESA_SHADER_NONE || \
gfx->shaders[old_stage] == NULL || \
gfx->shaders[old_stage]->name.len == shader->name.len); \
/* Don't bother memcmp if the state is already dirty */ \
if (!BITSET_TEST(hw_state->pack_dirty, \
ANV_GFX_STATE_##bit) && \
(old_stage == MESA_SHADER_NONE || \
gfx->shaders[old_stage] == NULL || \
memcmp(&gfx->shaders[old_stage]->cmd_data[ \
gfx->shaders[old_stage]->name.offset], \
&shader->cmd_data[ \
shader->name.offset], \
4 * shader->name.len) != 0)) \
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit); \
} while (0)
#define diff_var_state_stage(bit, name, old_stage) \
do { \
/* Don't bother memcmp if the state is already dirty */ \
/* Also if the new state is empty, avoid marking dirty */ \
if (!BITSET_TEST(hw_state->pack_dirty, \
ANV_GFX_STATE_##bit) && \
shader->name.len != 0 && \
(gfx->shaders[old_stage] == NULL || \
gfx->shaders[old_stage]->name.len != shader->name.len || \
memcmp(&gfx->shaders[old_stage]->cmd_data[ \
gfx->shaders[old_stage]->name.offset], \
&shader->cmd_data[shader->name.offset], \
4 * shader->name.len) != 0)) \
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit); \
} while (0)
switch (s) {
case MESA_SHADER_VERTEX:
if (shader != NULL) {
diff_fix_state(VS, vs.vs);
diff_fix_state(VF_SGVS, vs.vf_sgvs);
if (cmd_buffer->device->info->ver >= 11)
diff_fix_state(VF_SGVS_2, vs.vf_sgvs_2);
diff_fix_state(VF_COMPONENT_PACKING, vs.vf_component_packing);
diff_var_state(VF_SGVS_INSTANCING, vs.vf_sgvs_instancing);
gfx->vs_source_hash = shader->prog_data->source_hash;
} else {
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_VS);
}
break;
case MESA_SHADER_TESS_CTRL:
if (shader != NULL)
diff_fix_state(HS, hs.hs);
else
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_HS);
break;
case MESA_SHADER_TESS_EVAL:
if (shader != NULL) {
diff_fix_state(DS, ds.ds);
diff_fix_state(TE, ds.te);
} else {
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_DS);
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_TE);
}
break;
case MESA_SHADER_GEOMETRY:
if (shader != NULL)
diff_fix_state(GS, gs.gs);
else
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_GS);
break;
case MESA_SHADER_MESH:
if (shader != NULL) {
diff_fix_state(MESH_CONTROL, ms.control);
diff_fix_state(MESH_SHADER, ms.shader);
diff_fix_state(MESH_DISTRIB, ms.distrib);
diff_fix_state(CLIP_MESH, ms.clip);
} else {
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_MESH_CONTROL);
}
break;
case MESA_SHADER_TASK:
if (shader != NULL) {
diff_fix_state(TASK_CONTROL, ts.control);
diff_fix_state(TASK_SHADER, ts.shader);
diff_fix_state(TASK_REDISTRIB, ts.redistrib);
} else {
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_TASK_CONTROL);
}
break;
case MESA_SHADER_FRAGMENT:
if (shader != NULL) {
diff_fix_state(WM, ps.wm);
diff_fix_state(PS, ps.ps);
diff_fix_state(PS_EXTRA, ps.ps_extra);
gfx->fs_source_hash = shader->prog_data->source_hash;
} else {
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_PS_EXTRA);
}
break;
default:
UNREACHABLE("Invalid shader stage");
}
/* Only diff these fields on the streamout stage */
if (s == new_streamout_stage) {
diff_fix_state_stage(STREAMOUT, so, gfx->streamout_stage);
diff_var_state_stage(SO_DECL_LIST, so_decl_list, gfx->streamout_stage);
}
gfx->shaders[s] = shader;
}
gfx->streamout_stage = new_streamout_stage;
#undef diff_fix_state
#undef diff_var_state
#undef diff_fix_state_stage
#undef diff_var_state_stage
update_push_descriptor_flags(&gfx->base,
cmd_buffer->state.gfx.shaders,
ARRAY_SIZE(cmd_buffer->state.gfx.shaders));
if (ray_queries > 0) {
assert(cmd_buffer->device->info->verx10 >= 125);
anv_cmd_buffer_set_rt_query_buffer(cmd_buffer, &gfx->base, ray_queries,
cmd_buffer->state.gfx.active_stages);
}
}
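
The reverse scan at the top of bind_graphics_shaders() picks the last bound pre-rasterization geometry stage as the owner of the streamout state. A standalone sketch of that selection, with a hypothetical stage enum rather than mesa_shader_stage:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stage ordering, lowest to highest. */
enum stage {
   STAGE_VERTEX,
   STAGE_TESS_CTRL,
   STAGE_TESS_EVAL,
   STAGE_GEOMETRY,
   STAGE_TASK,
   STAGE_MESH,
   STAGE_FRAGMENT,
   STAGE_COUNT,
};

/* Return the last bound pre-rasterization stage that can own streamout
 * state, or -1 if none is bound.  Fragment, task and tessellation control
 * shaders never drive streamout.
 */
static int
find_streamout_stage(const bool bound[STAGE_COUNT])
{
   int result = -1;
   for (int s = STAGE_COUNT - 1; s >= 0; s--) {
      if (!bound[s])
         continue;
      if (s == STAGE_FRAGMENT || s == STAGE_TASK || s == STAGE_TESS_CTRL)
         continue;
      if (result < s)
         result = s;
   }
   return result;
}

int main(void)
{
   bool bound[STAGE_COUNT] = { false };
   bound[STAGE_VERTEX] = true;
   bound[STAGE_GEOMETRY] = true;
   bound[STAGE_FRAGMENT] = true;
   printf("streamout stage = %d (expect %d)\n",
          find_streamout_stage(bound), STAGE_GEOMETRY);
   return 0;
}
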
void
anv_cmd_buffer_bind_shaders(struct vk_command_buffer *vk_cmd_buffer,
uint32_t stage_count,
const mesa_shader_stage *stages,
struct vk_shader ** const vk_shaders)
{
struct anv_shader ** const shaders = (struct anv_shader ** const)vk_shaders;
struct anv_cmd_buffer *cmd_buffer =
container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
/* Append any scratch surface used by the shaders */
for (uint32_t i = 0; i < stage_count; i++) {
if (shaders[i] != NULL) {
anv_reloc_list_append(cmd_buffer->batch.relocs,
&shaders[i]->relocs);
}
}
struct anv_shader *cs_shader = cmd_buffer->state.compute.shader;
struct anv_shader *gfx_shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
memcpy(gfx_shaders, cmd_buffer->state.gfx.shaders, sizeof(gfx_shaders));
for (uint32_t i = 0; i < stage_count; i++) {
if (mesa_shader_stage_is_compute(stages[i]))
cs_shader = shaders[i];
else
gfx_shaders[stages[i]] = shaders[i];
}
if (cs_shader != cmd_buffer->state.compute.shader)
bind_compute_shader(cmd_buffer, cs_shader);
if (memcmp(gfx_shaders, cmd_buffer->state.gfx.shaders, sizeof(gfx_shaders)))
bind_graphics_shaders(cmd_buffer, gfx_shaders);
}
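
anv_cmd_buffer_bind_shaders() folds the incoming (stage, shader) pairs into the tracked compute slot and graphics array, and only takes the rebind paths when something actually changed. A standalone sketch of that dispatch, with hypothetical types:

#include <stdio.h>
#include <string.h>

#define GFX_STAGE_COUNT 7
#define STAGE_COMPUTE   GFX_STAGE_COUNT  /* compute tracked outside the array */

struct shader { int id; };               /* stand-in for the driver shader */

struct cmd_state {
   struct shader *cs;                    /* bound compute shader */
   struct shader *gfx[GFX_STAGE_COUNT];  /* bound graphics shaders per stage */
};

static void
bind_shaders(struct cmd_state *state, unsigned count,
             const unsigned *stages, struct shader **shaders)
{
   struct shader *new_cs = state->cs;
   struct shader *new_gfx[GFX_STAGE_COUNT];
   memcpy(new_gfx, state->gfx, sizeof(new_gfx));

   /* Fold the incoming binds into local copies of the tracked state. */
   for (unsigned i = 0; i < count; i++) {
      if (stages[i] == STAGE_COMPUTE)
         new_cs = shaders[i];
      else
         new_gfx[stages[i]] = shaders[i];
   }

   /* Only take the (expensive) rebind paths when something changed. */
   if (new_cs != state->cs) {
      state->cs = new_cs;
      printf("rebinding compute shader %d\n", new_cs ? new_cs->id : -1);
   }
   if (memcmp(new_gfx, state->gfx, sizeof(new_gfx)) != 0) {
      memcpy(state->gfx, new_gfx, sizeof(state->gfx));
      printf("rebinding graphics shaders\n");
   }
}

int main(void)
{
   struct cmd_state state = { 0 };
   struct shader vs = { .id = 1 }, fs = { .id = 2 }, cs = { .id = 3 };

   unsigned stages[] = { 0 /* vertex */, 6 /* fragment */, STAGE_COMPUTE };
   struct shader *shaders[] = { &vs, &fs, &cs };
   bind_shaders(&state, 3, stages, shaders);

   /* Binding the same shaders again changes nothing and rebinds nothing. */
   bind_shaders(&state, 3, stages, shaders);
   return 0;
}
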


@ -31,6 +31,7 @@
#include "anv_private.h"
#include "anv_measure.h"
#include "anv_shader.h"
#include "anv_slab_bo.h"
#include "util/u_debug.h"
#include "util/os_file.h"
@ -380,6 +381,8 @@ VkResult anv_CreateDevice(
if (result != VK_SUCCESS)
goto fail_alloc;
device->vk.shader_ops = &anv_device_shader_ops;
if (INTEL_DEBUG(DEBUG_BATCH) || INTEL_DEBUG(DEBUG_BATCH_STATS)) {
for (unsigned i = 0; i < physical_device->queue.family_count; i++) {
struct intel_batch_decode_ctx *decoder = &device->decoder[i];


@ -223,7 +223,7 @@ uint32_t
genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipe_state,
const VkShaderStageFlags dirty,
const struct anv_shader_bin **shaders,
const struct anv_shader **shaders,
uint32_t num_shaders);
void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);


@ -173,17 +173,29 @@ anv_pipeline_finish(struct anv_pipeline *pipeline,
vk_object_base_finish(&pipeline->vk.base);
}
VKAPI_ATTR void VKAPI_CALL
vk_common_DestroyPipeline(VkDevice _device,
VkPipeline _pipeline,
const VkAllocationCallbacks *pAllocator);
void anv_DestroyPipeline(
VkDevice _device,
VkPipeline _pipeline,
const VkAllocationCallbacks* pAllocator)
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
VK_FROM_HANDLE(vk_pipeline, vk_pipeline, _pipeline);
if (!pipeline)
if (!vk_pipeline)
return;
if (vk_pipeline->ops != NULL) {
vk_common_DestroyPipeline(_device, _pipeline, pAllocator);
return;
}
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
ANV_RMV(resource_destroy, device, pipeline);
switch (pipeline->type) {
@ -2851,6 +2863,7 @@ anv_compute_pipeline_create(struct anv_device *device,
return pipeline->base.batch.status;
}
#if 0
VkResult anv_CreateComputePipelines(
VkDevice _device,
VkPipelineCache pipelineCache,
@ -2885,6 +2898,7 @@ VkResult anv_CreateComputePipelines(
return result;
}
#endif
static uint32_t
get_vs_input_elements(const struct brw_vs_prog_data *vs_prog_data)
@ -3343,6 +3357,7 @@ anv_graphics_pipeline_create(struct anv_device *device,
return pipeline->base.base.batch.status;
}
#if 0
VkResult anv_CreateGraphicsPipelines(
VkDevice _device,
VkPipelineCache pipelineCache,
@ -3388,6 +3403,7 @@ VkResult anv_CreateGraphicsPipelines(
return result;
}
#endif
static bool
should_remat_cb(nir_instr *instr, void *data)
@ -4083,6 +4099,7 @@ anv_ray_tracing_pipeline_create(
return pipeline->base.batch.status;
}
#if 0
VkResult
anv_CreateRayTracingPipelinesKHR(
VkDevice _device,
@ -4491,3 +4508,4 @@ anv_GetRayTracingShaderGroupStackSizeKHR(
return brw_bs_prog_data_const(bin->prog_data)->max_stack_size;
}
#endif


@ -1224,7 +1224,6 @@ struct anv_shader {
struct anv_state kernel;
const struct brw_stage_prog_data *prog_data;
uint32_t prog_data_size;
struct brw_compile_stats stats[3];
uint32_t num_stats;
@ -2186,6 +2185,11 @@ struct anv_gfx_dynamic_state {
uint32_t PrimitiveTopologyType;
} vft;
/* 3DSTATE_VS */
struct {
bool VertexCacheDisable;
} vs;
/* 3DSTATE_VIEWPORT_STATE_POINTERS_CC */
struct {
uint32_t count;
@ -4422,7 +4426,7 @@ struct anv_cmd_graphics_state {
struct anv_cmd_pipeline_state base;
/* Shaders bound */
struct anv_shader_bin *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
struct anv_shader *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
/* Bitfield of valid entries in the shaders array */
VkShaderStageFlags active_stages;
@ -4436,6 +4440,9 @@ struct anv_cmd_graphics_state {
bool kill_pixel;
bool uses_xfb;
/* Shader stage in base.shaders[] responsible for streamout */
mesa_shader_stage streamout_stage;
/* Render pass information */
VkRenderingFlags rendering_flags;
VkRect2D render_area;
@ -4530,7 +4537,7 @@ struct anv_cmd_graphics_state {
struct anv_cmd_compute_state {
struct anv_cmd_pipeline_state base;
struct anv_shader_bin *shader;
struct anv_shader *shader;
bool pipeline_dirty;
@ -4551,6 +4558,8 @@ struct anv_cmd_ray_tracing_state {
struct brw_rt_scratch_layout layout;
} scratch;
VkDeviceSize scratch_size;
uint32_t debug_marker_count;
uint32_t num_tlas;
uint32_t num_blas;
@ -5022,6 +5031,12 @@ void
anv_cmd_buffer_update_pending_query_bits(struct anv_cmd_buffer *cmd_buffer,
enum anv_pipe_bits flushed_bits);
void
anv_cmd_buffer_bind_shaders(struct vk_command_buffer *cmd_buffer,
uint32_t stage_count,
const mesa_shader_stage *stages,
struct vk_shader ** const shaders);
/**
* An allocation tied to a command buffer.
*
@ -5083,7 +5098,7 @@ enum anv_cmd_saved_state_flags {
struct anv_cmd_saved_state {
uint32_t flags;
struct anv_pipeline *pipeline;
struct vk_shader *shader;
struct anv_descriptor_set *descriptor_set[MAX_SETS];
uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];
};
@ -5444,7 +5459,6 @@ struct anv_graphics_pipeline {
4 * _cmd_state->len); \
} while (0)
struct anv_compute_pipeline {
struct anv_pipeline base;
@ -6484,6 +6498,15 @@ anv_cmd_flush_buffer_write_cp(VkCommandBuffer cmd_buffer);
VkResult
anv_cmd_buffer_ensure_rcs_companion(struct anv_cmd_buffer *cmd_buffer);
void
anv_cmd_buffer_set_rt_state(struct vk_command_buffer *vk_cmd_buffer,
VkDeviceSize scratch_size,
uint32_t ray_queries);
void
anv_cmd_buffer_set_stack_size(struct vk_command_buffer *vk_cmd_buffer,
VkDeviceSize stack_size);
bool
anv_can_hiz_clear_image(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,


@ -1886,5 +1886,8 @@ struct vk_device_shader_ops anv_device_shader_ops = {
.deserialize = anv_shader_deserialize,
.write_rt_shader_group = anv_write_rt_shader_group,
.write_rt_shader_group_replay_handle = anv_write_rt_shader_group_replay_handle,
.cmd_bind_shaders = anv_cmd_buffer_bind_shaders,
.cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
.cmd_set_rt_state = anv_cmd_buffer_set_rt_state,
.cmd_set_stack_size = anv_cmd_buffer_set_stack_size,
};
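
The common runtime now drives the driver entirely through this ops table. A generic standalone sketch of the pattern (hypothetical types, not the vk_device_shader_ops definition):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical ops table: the common code calls through these hooks and the
 * driver fills them in with its implementations at device creation.
 */
struct shader_ops {
   void (*cmd_bind_shaders)(void *cmd_buffer, uint32_t count);
   void (*cmd_set_stack_size)(void *cmd_buffer, uint64_t stack_size);
};

static void
drv_bind_shaders(void *cmd_buffer, uint32_t count)
{
   (void)cmd_buffer;
   printf("driver: binding %u shader(s)\n", count);
}

static void
drv_set_stack_size(void *cmd_buffer, uint64_t stack_size)
{
   (void)cmd_buffer;
   printf("driver: RT stack size %llu\n", (unsigned long long)stack_size);
}

static const struct shader_ops drv_shader_ops = {
   .cmd_bind_shaders = drv_bind_shaders,
   .cmd_set_stack_size = drv_set_stack_size,
};

int main(void)
{
   /* The runtime only ever sees the table. */
   const struct shader_ops *ops = &drv_shader_ops;
   ops->cmd_bind_shaders(NULL, 2);
   ops->cmd_set_stack_size(NULL, 4096);
   return 0;
}
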


@ -2121,7 +2121,7 @@ emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
static VkResult
emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipe_state,
const struct anv_shader_bin *shader,
const struct anv_shader *shader,
struct anv_state *bt_state)
{
uint32_t state_offset;
@ -2153,7 +2153,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
/* Color attachment binding */
assert(shader->stage == MESA_SHADER_FRAGMENT);
assert(shader->vk.stage == MESA_SHADER_FRAGMENT);
uint32_t index = binding->index < MAX_RTS ?
cmd_buffer->state.gfx.color_output_mapping[binding->index] :
binding->index;
@ -2268,7 +2268,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
static VkResult
emit_samplers(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipe_state,
const struct anv_shader_bin *shader,
const struct anv_shader *shader,
struct anv_state *state)
{
const struct anv_pipeline_bind_map *map = &shader->bind_map;
@ -2312,7 +2312,7 @@ uint32_t
genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipe_state,
const VkShaderStageFlags dirty,
const struct anv_shader_bin **shaders,
const struct anv_shader **shaders,
uint32_t num_shaders)
{
VkShaderStageFlags flushed = 0;
@ -2322,7 +2322,7 @@ genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
if (!shaders[i])
continue;
mesa_shader_stage stage = shaders[i]->stage;
mesa_shader_stage stage = shaders[i]->vk.stage;
VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
if ((vk_stage & dirty) == 0)
continue;
@ -2361,7 +2361,7 @@ genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
if (!shaders[i])
continue;
mesa_shader_stage stage = shaders[i]->stage;
mesa_shader_stage stage = shaders[i]->vk.stage;
result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
&cmd_buffer->state.samplers[stage]);


@ -105,13 +105,11 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
struct anv_device *device = cmd_buffer->device;
struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;
struct anv_compute_pipeline *pipeline =
anv_pipeline_to_compute(comp_state->base.pipeline);
assert(comp_state->shader);
genX(cmd_buffer_config_l3)(cmd_buffer,
pipeline->cs->prog_data->total_shared > 0 ?
comp_state->shader->prog_data->total_shared > 0 ?
device->l3_slm_config : device->l3_config);
genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
@ -127,7 +125,7 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
*/
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
if (cmd_buffer->state.compute.pipeline_dirty) {
if (comp_state->pipeline_dirty) {
#if GFX_VERx10 < 125
/* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
*
@ -143,13 +141,28 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif
anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
#define anv_batch_emit_cs(batch, cmd, field) ({ \
void *__dst = anv_batch_emit_dwords( \
batch, __anv_cmd_length(cmd)); \
memcpy(__dst, \
&comp_state->shader->cmd_data[ \
comp_state->shader->field.offset], \
4 * __anv_cmd_length(cmd)); \
VG(VALGRIND_CHECK_MEM_IS_DEFINED( \
__dst, __anv_cmd_length(cmd) * 4)); \
__dst; \
})
#if GFX_VERx10 >= 125
const struct brw_cs_prog_data *prog_data = get_cs_prog_data(comp_state);
genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
#else
anv_batch_emit_cs(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), cs.gfx9.vfe);
#endif
#undef anv_batch_emit_cs
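
With runtime pipelines the compute HW commands are pre-packed into the shader's cmd_data and copied straight into the batch at flush time instead of replaying a pipeline batch. A standalone sketch of such a copy, with a hypothetical batch/shader layout:

#include <stdint.h>
#include <string.h>

struct batch {
   uint32_t *next; /* next free dword in the command buffer */
};

struct packed_cmd {
   uint32_t offset; /* dword offset into cmd_data */
   uint32_t len;    /* length in dwords */
};

/* Copy a command that was packed at shader compile/bind time straight into
 * the batch; nothing is re-packed at dispatch time.
 */
static uint32_t *
emit_prepacked(struct batch *batch, const uint32_t *cmd_data,
               const struct packed_cmd *cmd)
{
   uint32_t *dst = batch->next;
   memcpy(dst, &cmd_data[cmd->offset], 4 * cmd->len);
   batch->next += cmd->len;
   return dst;
}

int main(void)
{
   const uint32_t cmd_data[] = { 0x0, 0x1, 0x2, 0x3, 0x4 };
   uint32_t batch_mem[16];
   struct batch batch = { .next = batch_mem };
   const struct packed_cmd vfe = { .offset = 1, .len = 4 };

   emit_prepacked(&batch, cmd_data, &vfe);
   return (batch.next - batch_mem) == 4 ? 0 : 1;
}
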
/* Changing the pipeline affects the push constants layout (different
* amount of cross/per thread allocations). The allocation is also
* bounded to just the amount consumed by the pipeline (see
@ -179,7 +192,7 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
cmd_buffer,
&cmd_buffer->state.compute.base,
VK_SHADER_STAGE_COMPUTE_BIT,
(const struct anv_shader_bin **)&comp_state->shader, 1);
(const struct anv_shader **)&comp_state->shader, 1);
cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
#if GFX_VERx10 < 125
@ -194,7 +207,7 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
struct anv_state state =
anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
pipeline->gfx9.interface_descriptor_data,
comp_state->shader->cs.gfx9.idd,
GENX(INTERFACE_DESCRIPTOR_DATA_length),
64);
@ -439,7 +452,7 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
&cmd_buffer->batch,
GENX(EXECUTE_INDIRECT_DISPATCH_length),
GENX(EXECUTE_INDIRECT_DISPATCH_body_start) / 32,
anv_pipeline_to_compute(comp_state->base.pipeline)->gfx125.compute_walker_body,
comp_state->shader->cs.gfx125.compute_walker_body,
GENX(EXECUTE_INDIRECT_DISPATCH),
.PredicateEnable = predicate,
.MaxCount = 1,
@ -520,7 +533,7 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
&cmd_buffer->batch,
GENX(COMPUTE_WALKER_length),
GENX(COMPUTE_WALKER_body_start) / 32,
anv_pipeline_to_compute(comp_state->base.pipeline)->gfx125.compute_walker_body,
comp_state->shader->cs.gfx125.compute_walker_body,
GENX(COMPUTE_WALKER),
.IndirectParameterEnable = !anv_address_is_null(indirect_addr),
.PredicateEnable = predicate,
@ -1051,8 +1064,6 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
{
struct anv_device *device = cmd_buffer->device;
struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
struct anv_ray_tracing_pipeline *pipeline =
anv_pipeline_to_ray_tracing(rt->base.pipeline);
if (INTEL_DEBUG(DEBUG_RT_NO_TRACE))
return;
@ -1211,18 +1222,18 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
*/
btd.PerDSSMemoryBackedBufferSize = 6;
btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
if (pipeline->base.scratch_size > 0) {
if (rt->scratch_size > 0) {
struct anv_bo *scratch_bo =
anv_scratch_pool_alloc(device,
&device->scratch_pool,
MESA_SHADER_COMPUTE,
pipeline->base.scratch_size);
rt->scratch_size);
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
scratch_bo);
uint32_t scratch_surf =
anv_scratch_pool_get_surf(cmd_buffer->device,
&device->scratch_pool,
pipeline->base.scratch_size);
rt->scratch_size);
btd.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
}
#if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
@ -1234,7 +1245,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
#endif
}
genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);
genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, rt->scratch_size);
const struct brw_cs_prog_data *cs_prog_data =
brw_cs_prog_data_const(device->rt_trampoline->prog_data);
@ -1273,7 +1284,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
.ThreadGroupIDZDimension = global_size[2],
.ExecutionMask = 0xff,
.EmitInlineParameter = true,
.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0),
.PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
#if GFX_VER >= 30
/* HSD 14016252163 */
.DispatchWalkOrder = cs_prog_data->uses_sampler ? MortonWalk : LinearWalk,


@ -162,7 +162,7 @@ cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
static struct anv_address
get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
const struct anv_shader_bin *shader,
const struct anv_shader *shader,
const struct anv_push_range *range)
{
struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
@ -242,10 +242,10 @@ get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
*/
static uint32_t
get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
const struct anv_shader_bin *shader,
const struct anv_shader *shader,
const struct anv_push_range *range)
{
assert(shader->stage != MESA_SHADER_COMPUTE);
assert(shader->vk.stage != MESA_SHADER_COMPUTE);
const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
switch (range->set) {
case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
@ -443,7 +443,7 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
if (!anv_gfx_has_stage(gfx, stage))
continue;
const struct anv_shader_bin *shader = gfx->shaders[stage];
const struct anv_shader *shader = gfx->shaders[stage];
if (shader->prog_data->robust_ubo_ranges) {
const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
struct anv_push_constants *push = &gfx->base.push_constants;
@ -509,7 +509,7 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
struct anv_address buffers[4] = {};
if (anv_gfx_has_stage(gfx, stage)) {
const struct anv_shader_bin *shader = gfx->shaders[stage];
const struct anv_shader *shader = gfx->shaders[stage];
const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
/* We have to gather buffer addresses as a second step because the
@ -593,7 +593,7 @@ get_mesh_task_push_addr64(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_graphics_state *gfx,
mesa_shader_stage stage)
{
const struct anv_shader_bin *shader = gfx->shaders[stage];
const struct anv_shader *shader = gfx->shaders[stage];
const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
if (bind_map->push_ranges[0].length == 0)
return 0;
@ -645,31 +645,50 @@ cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
ALWAYS_INLINE static void
cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
const struct anv_graphics_pipeline *pipeline)
struct anv_cmd_graphics_state *gfx,
const struct vk_dynamic_graphics_state *dyn)
{
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
if (!anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT))
return;
UNUSED bool need_rt_flush = false;
for (uint32_t rt = 0; rt < pipeline->num_color_outputs; rt++) {
/* No writes going to this render target so it won't affect the RT cache
*/
if (pipeline->color_output_mapping[rt] == ANV_COLOR_OUTPUT_UNUSED)
continue;
/* Count the number of color attachments in the binding table */
const struct anv_pipeline_bind_map *bind_map =
&gfx->shaders[MESA_SHADER_FRAGMENT]->bind_map;
/* No change */
if (cmd_buffer->state.gfx.color_output_mapping[rt] ==
pipeline->color_output_mapping[rt])
continue;
cmd_buffer->state.gfx.color_output_mapping[rt] =
pipeline->color_output_mapping[rt];
need_rt_flush = true;
cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
/* Build a map of fragment color output to attachment */
uint8_t rt_to_att[MAX_RTS];
memset(rt_to_att, ANV_COLOR_OUTPUT_DISABLED, MAX_RTS);
for (uint32_t i = 0; i < MAX_RTS; i++) {
if (dyn->cal.color_map[i] != MESA_VK_ATTACHMENT_UNUSED)
rt_to_att[dyn->cal.color_map[i]] = i;
}
/* For each fragment shader output that is not unused, apply the remapping
* to pipeline->color_output_mapping
*/
UNUSED bool need_rt_flush = false;
for (unsigned rt = 0; rt < MIN2(bind_map->surface_count, MAX_RTS); rt++) {
if (bind_map->surface_to_descriptor[rt].set !=
ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
break;
uint32_t index = bind_map->surface_to_descriptor[rt].index;
if (index == ANV_COLOR_OUTPUT_UNUSED)
continue;
if (index == ANV_COLOR_OUTPUT_DISABLED &&
gfx->color_output_mapping[rt] != index) {
gfx->color_output_mapping[rt] = index;
need_rt_flush = true;
} else if (gfx->color_output_mapping[rt] != rt_to_att[rt]) {
gfx->color_output_mapping[rt] = rt_to_att[rt];
need_rt_flush = true;
}
}
#if GFX_VER >= 11
if (need_rt_flush) {
cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
#if GFX_VER >= 11
/* The PIPE_CONTROL command description says:
*
* "Whenever a Binding Table Index (BTI) used by a Render Target Message
@ -689,8 +708,8 @@ cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
"change RT due to shader outputs");
}
#endif
}
}
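
The remapping above first builds an inverse of the dynamic color attachment map so each fragment output can be pointed at the right attachment slot. A standalone sketch of inverting such a small index map, with hypothetical constants:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_RTS     8
#define UNUSED_SLOT 0xff

/* Invert a small index map: given map[i] = j (or UNUSED_SLOT), produce
 * inverse[j] = i so lookups can go the other way in O(1).
 */
static void
invert_map(const uint8_t map[MAX_RTS], uint8_t inverse[MAX_RTS])
{
   memset(inverse, UNUSED_SLOT, MAX_RTS);
   for (unsigned i = 0; i < MAX_RTS; i++) {
      if (map[i] != UNUSED_SLOT)
         inverse[map[i]] = (uint8_t)i;
   }
}

int main(void)
{
   /* e.g. slot 0 -> 2, slot 1 -> 0, everything else unused */
   uint8_t map[MAX_RTS];
   memset(map, UNUSED_SLOT, MAX_RTS);
   map[0] = 2;
   map[1] = 0;

   uint8_t inverse[MAX_RTS];
   invert_map(map, inverse);
   for (unsigned i = 0; i < 3; i++)
      printf("inverse[%u] = %u\n", i, (unsigned)inverse[i]);
   return 0;
}
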
ALWAYS_INLINE static void
@ -750,8 +769,6 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_device *device = cmd_buffer->device;
struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(gfx->base.pipeline);
const struct vk_dynamic_graphics_state *dyn =
&cmd_buffer->vk.dynamic_graphics_state;
@ -772,16 +789,16 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
*
* Apply task URB workaround when switching from task to primitive.
*/
if (anv_pipeline_is_primitive(pipeline)) {
if (!anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
genX(apply_task_urb_workaround)(cmd_buffer);
} else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
} else if (anv_gfx_has_stage(gfx, MESA_SHADER_TASK)) {
cmd_buffer->state.gfx.used_task_shader = true;
}
}
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP) ||
(cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PS))
cmd_buffer_maybe_flush_rt_writes(cmd_buffer, pipeline);
cmd_buffer_maybe_flush_rt_writes(cmd_buffer, gfx, dyn);
/* Apply any pending pipeline flushes we may have. We want to apply them
* now because, if any of those flushes are for things like push constants,
@ -887,17 +904,29 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
/* If the pipeline changed, we may need to re-allocate push constant space
* in the URB.
*/
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PUSH_CONSTANT_SHADERS) {
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PUSH_CONSTANT_SHADERS)
cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
#if GFX_VERx10 < 125
if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_VS |
ANV_CMD_DIRTY_HS |
ANV_CMD_DIRTY_DS |
ANV_CMD_DIRTY_GS |
ANV_CMD_DIRTY_PS)) {
for (unsigned s = 0; s <= MESA_SHADER_FRAGMENT; s++) {
if (gfx->shaders[s] == NULL)
continue;
/* Also add the relocations (scratch buffers) */
VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
pipeline->base.base.batch.relocs);
&gfx->shaders[s]->relocs);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
}
}
#endif
/* Render targets live in the same binding table as fragment descriptors */
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
@ -916,7 +945,7 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
cmd_buffer,
&cmd_buffer->state.gfx.base,
descriptors_dirty,
(const struct anv_shader_bin **)gfx->shaders,
(const struct anv_shader **)gfx->shaders,
ARRAY_SIZE(gfx->shaders));
cmd_buffer->state.descriptors_dirty &= ~dirty;
}
@ -989,23 +1018,13 @@ anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
ALWAYS_INLINE static void
cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
{
UNUSED const struct anv_device *device = cmd_buffer->device;
UNUSED const struct anv_instance *instance =
device->physical->instance;
UNUSED const bool protected = cmd_buffer->vk.pool->flags &
VK_COMMAND_POOL_CREATE_PROTECTED_BIT;
UNUSED struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
UNUSED struct anv_device *device = cmd_buffer->device;
UNUSED struct anv_instance *instance = device->physical->instance;
#define DEBUG_SHADER_HASH(stage) do { \
if (unlikely( \
(instance->debug & ANV_DEBUG_SHADER_HASH) && \
anv_pipeline_has_stage(pipeline, stage))) { \
mi_store(&b, \
mi_mem32(device->workaround_address), \
mi_imm(pipeline->base.shaders[stage]-> \
prog_data->source_hash)); \
} \
} while (0)
UNUSED struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
UNUSED struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
struct mi_builder b;
if (unlikely(instance->debug & ANV_DEBUG_SHADER_HASH)) {
@ -1013,18 +1032,35 @@ cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
}
#define DEBUG_SHADER_HASH(stage) do { \
if (unlikely(instance->debug & ANV_DEBUG_SHADER_HASH)) { \
mi_store(&b, \
mi_mem32(device->workaround_address), \
mi_imm(gfx->shaders[stage]->prog_data->source_hash)); \
} \
} while (0)
#define anv_batch_emit_gfx(batch, cmd, name) ({ \
void *__dst = anv_batch_emit_dwords( \
batch, __anv_cmd_length(cmd)); \
memcpy(__dst, hw_state->packed.name, \
4 * __anv_cmd_length(cmd)); \
VG(VALGRIND_CHECK_MEM_IS_DEFINED( \
__dst, __anv_cmd_length(cmd) * 4)); \
__dst; \
})
#if INTEL_WA_16011107343_GFX_VER
if (intel_needs_workaround(cmd_buffer->device->info, 16011107343) &&
anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
anv_gfx_has_stage(gfx, MESA_SHADER_TESS_CTRL)) {
DEBUG_SHADER_HASH(MESA_SHADER_TESS_CTRL);
anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
final.hs, protected);
anv_batch_emit_gfx(&cmd_buffer->batch, GENX(3DSTATE_HS), hs);
}
#endif
#if INTEL_WA_22018402687_GFX_VER
if (intel_needs_workaround(cmd_buffer->device->info, 22018402687) &&
anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
DEBUG_SHADER_HASH(MESA_SHADER_TESS_EVAL);
/* Wa_22018402687:
* In any 3D enabled context, just before any Tessellation enabled
@ -1038,13 +1074,13 @@ cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
* said switch, as it matters at the HW level, and can be triggered even
* across processes, so we apply the Wa at all times.
*/
anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
final.ds, protected);
anv_batch_emit_gfx(&cmd_buffer->batch, GENX(3DSTATE_DS), ds);
}
#endif
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
#undef anv_batch_emit_gfx
#undef DEBUG_SHADER_HASH
}


@ -96,18 +96,10 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
ANV_STATE_NULL;
UNUSED uint32_t wa_insts_offset = 0;
#if INTEL_WA_16011107343_GFX_VER || INTEL_WA_22018402687_GFX_VER
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(gfx->base.pipeline);
#endif
#if INTEL_WA_16011107343_GFX_VER
if (wa_16011107343) {
memcpy(wa_insts_state.map + wa_insts_offset,
&pipeline->batch_data[
protected ?
pipeline->final.hs_protected.offset :
pipeline->final.hs.offset],
gfx->dyn_state.packed.hs,
GENX(3DSTATE_HS_length) * 4);
wa_insts_offset += GENX(3DSTATE_HS_length) * 4;
}
@ -116,10 +108,7 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
#if INTEL_WA_22018402687_GFX_VER
if (wa_22018402687) {
memcpy(wa_insts_state.map + wa_insts_offset,
&pipeline->batch_data[
protected ?
pipeline->final.ds_protected.offset :
pipeline->final.ds.offset],
gfx->dyn_state.packed.ds,
GENX(3DSTATE_DS_length) * 4);
wa_insts_offset += GENX(3DSTATE_DS_length) * 4;
}


@ -209,7 +209,7 @@ genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer,
if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
return;
if (gfx->uses_xfb) {
if (gfx->shaders[gfx->streamout_stage]->xfb_info != NULL) {
genX(cmd_buffer_set_preemption)(cmd_buffer, false);
return;
}
@ -417,10 +417,10 @@ want_stencil_pma_fix(const struct vk_dynamic_graphics_state *dyn,
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
* (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
*/
struct anv_shader_bin *fs_bin = gfx->shaders[MESA_SHADER_FRAGMENT];
struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
return kill_pixel(wm_prog_data, dyn) ||
has_ds_feedback_loop(&fs_bin->bind_map, dyn) ||
has_ds_feedback_loop(&fs->bind_map, dyn) ||
wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
}
@ -1012,21 +1012,21 @@ update_ps(struct anv_gfx_dynamic_state *hw_state,
return;
}
const struct anv_shader_bin *fs_bin = gfx->shaders[MESA_SHADER_FRAGMENT];
const struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
struct GENX(3DSTATE_PS) ps = {};
intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
MAX2(dyn->ms.rasterization_samples, 1),
hw_state->fs_msaa_flags);
SET(PS, ps.KernelStartPointer0,
fs_bin->kernel.offset +
fs->kernel.offset +
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
SET(PS, ps.KernelStartPointer1,
fs_bin->kernel.offset +
fs->kernel.offset +
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
#if GFX_VER < 20
SET(PS, ps.KernelStartPointer2,
fs_bin->kernel.offset +
fs->kernel.offset +
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
#endif
@ -1124,12 +1124,12 @@ update_ps_extra_kills_pixel(struct anv_gfx_dynamic_state *hw_state,
const struct vk_dynamic_graphics_state *dyn,
const struct anv_cmd_graphics_state *gfx)
{
struct anv_shader_bin *fs_bin = gfx->shaders[MESA_SHADER_FRAGMENT];
struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
wm_prog_data &&
(has_ds_feedback_loop(&fs_bin->bind_map, dyn) ||
(has_ds_feedback_loop(&fs->bind_map, dyn) ||
wm_prog_data->uses_kill),
FRAGMENT);
}
@ -2174,6 +2174,35 @@ update_tbimr_info(struct anv_gfx_dynamic_state *hw_state,
}
#endif
#if GFX_VERx10 == 90
ALWAYS_INLINE static void
update_vs(struct anv_gfx_dynamic_state *hw_state,
const struct anv_cmd_graphics_state *gfx,
const struct anv_device *device)
{
if (device->info->gt < 4)
return;
/* On Sky Lake GT4, we have experienced some hangs related to the VS cache
* and tessellation. It is unknown exactly what is happening but the
* Haswell docs for the "VS Reference Count Full Force Miss Enable" field
* of the "Thread Mode" register refer to a HSW bug in which the VUE handle
* reference count would overflow resulting in internal reference counting
* bugs. My (Faith's) best guess is that this bug cropped back up on SKL
* GT4 when we suddenly had more threads in play than any previous gfx9
* hardware.
*
* What we do know for sure is that setting this bit when tessellation
* shaders are in use fixes a GPU hang in Batman: Arkham City when playing
* with DXVK (https://bugs.freedesktop.org/107280). Disabling the vertex
* cache with tessellation shaders should only have a minor performance
* impact as the tessellation shaders are likely generating and processing
* far more geometry than the vertex stage.
*/
SET(VS, vs.VertexCacheDisable, anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL));
}
#endif
#if INTEL_WA_18019110168_GFX_VER
static inline unsigned
compute_mesh_provoking_vertex(const struct brw_mesh_prog_data *mesh_prog_data,
@ -2215,11 +2244,13 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
const struct anv_device *device,
const struct vk_dynamic_graphics_state *dyn,
struct anv_cmd_graphics_state *gfx,
const struct anv_graphics_pipeline *pipeline,
VkCommandBufferLevel cmd_buffer_level)
{
UNUSED bool fs_msaa_changed = false;
assert(gfx->shaders[gfx->streamout_stage] != NULL);
assert(gfx->instance_multiplier != 0);
/* Do this before update_fs_msaa_flags() for primitive_id_index */
if (gfx->dirty & ANV_CMD_DIRTY_ALL_SHADERS(device))
update_sbe(hw_state, gfx, device);
@ -2234,6 +2265,11 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
update_urb_config(hw_state, gfx, device);
#if GFX_VERx10 == 90
if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
update_vs(hw_state, gfx, device);
#endif
if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_FS_MSAA_FLAGS)) {
update_ps(hw_state, device, dyn, gfx);
@ -2482,8 +2518,7 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
static void
cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
struct anv_cmd_buffer *cmd_buffer,
const struct anv_cmd_graphics_state *gfx,
const struct anv_graphics_pipeline *pipeline)
const struct anv_cmd_graphics_state *gfx)
{
struct anv_device *device = cmd_buffer->device;
struct anv_instance *instance = device->physical->instance;
@ -2502,73 +2537,107 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
} while (0)
#define IS_DIRTY(name) BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##name)
#define anv_gfx_copy(field, cmd, source) ({ \
#define anv_gfx_copy(field, cmd, stage, source) ({ \
if (gfx->shaders[stage] != NULL) { \
assert(sizeof(hw_state->packed.field) >= \
4 * __anv_cmd_length(cmd)); \
assert((source).len == __anv_cmd_length(cmd)); \
assert((gfx->shaders[stage]->source).len == \
__anv_cmd_length(cmd)); \
memcpy(&hw_state->packed.field, \
&pipeline->batch_data[(source).offset], \
&gfx->shaders[stage]->cmd_data[ \
(gfx->shaders[stage]->source).offset], \
4 * __anv_cmd_length(cmd)); \
} else { \
anv_gfx_pack(field, cmd, __unused_name); \
} \
})
#define anv_gfx_copy_variable(field, source) ({ \
#define anv_gfx_copy_variable(field, stage, source) ({ \
if (gfx->shaders[stage] != NULL) { \
assert(sizeof(hw_state->packed.field) >= \
4 * (source).len); \
4 * gfx->shaders[stage]->source.len); \
memcpy(&hw_state->packed.field, \
&pipeline->batch_data[(source).offset], \
4 * (source).len); \
hw_state->packed.field##_len = (source).len; \
&gfx->shaders[stage]->cmd_data[ \
(gfx->shaders[stage]->source).offset], \
4 * gfx->shaders[stage]->source.len); \
hw_state->packed.field##_len = \
gfx->shaders[stage]->source.len; \
} \
})
#define anv_gfx_copy_protected(field, cmd, source) ({ \
#define anv_gfx_copy_protected(field, cmd, stage, source) ({ \
const bool __protected = (cmd_buffer->vk.pool->flags & \
VK_COMMAND_POOL_CREATE_PROTECTED_BIT); \
assert(sizeof(hw_state->packed.field) >= \
4 * __anv_cmd_length(cmd)); \
assert((source).len == __anv_cmd_length(cmd)); \
if (gfx->shaders[stage] != NULL) { \
assert((gfx->shaders[stage]->source).len == \
__anv_cmd_length(cmd)); \
memcpy(&hw_state->packed.field, \
&pipeline->batch_data[ \
&gfx->shaders[stage]->cmd_data[ \
__protected ? \
(source##_protected).offset : \
(source).offset], \
gfx->shaders[stage]->source##_protected.offset : \
gfx->shaders[stage]->source.offset], \
4 * __anv_cmd_length(cmd)); \
} else { \
memcpy(&hw_state->packed.field, \
device->physical->gfx_default.field, \
4 * __anv_cmd_length(cmd)); \
} \
})
#define anv_gfx_pack_merge(field, cmd, prepacked, name) \
for (struct cmd name = { 0 }, \
#define anv_gfx_pack_merge(field, cmd, stage, source, name) \
for (struct cmd name = (struct cmd) { 0 }, \
*_dst = (struct cmd *)hw_state->packed.field; \
__builtin_expect(_dst != NULL, 1); \
({ const struct anv_gfx_state_ptr *_cmd_state = &prepacked; \
({ \
uint32_t _partial[__anv_cmd_length(cmd)]; \
assert(_cmd_state->len == __anv_cmd_length(cmd)); \
assert(sizeof(hw_state->packed.field) >= \
4 * __anv_cmd_length(cmd)); \
__anv_cmd_pack(cmd)(NULL, _partial, &name); \
if (gfx->shaders[stage] != NULL) { \
const struct anv_gfx_state_ptr *_cmd_state = \
&gfx->shaders[stage]->source; \
assert(_cmd_state->len == __anv_cmd_length(cmd)); \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
assert((_partial[i] & \
(pipeline)->batch_data[ \
(prepacked).offset + i]) == 0); \
gfx->shaders[stage]->cmd_data[ \
_cmd_state->offset + i]) == 0); \
((uint32_t *)_dst)[i] = _partial[i] | \
(pipeline)->batch_data[_cmd_state->offset + i]; \
gfx->shaders[stage]->cmd_data[_cmd_state->offset + i]; \
} \
} else { \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
assert((_partial[i] & \
device->physical->gfx_default.field[i]) == 0); \
((uint32_t *)_dst)[i] = _partial[i] | \
device->physical->gfx_default.field[i]; \
} \
} \
_dst = NULL; \
}))
#define anv_gfx_pack_merge_protected(field, cmd, prepacked, name) \
for (struct cmd name = { 0 }, \
#define anv_gfx_pack_merge_protected(field, cmd, stage, source, name) \
for (struct cmd name = (struct cmd) { 0 }, \
*_dst = (struct cmd *)hw_state->packed.field; \
__builtin_expect(_dst != NULL, 1); \
({ const struct anv_gfx_state_ptr *_cmd_state = \
(cmd_buffer->vk.pool->flags & \
VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ? \
&prepacked##_protected : &prepacked; \
({ \
uint32_t _partial[__anv_cmd_length(cmd)]; \
assert(_cmd_state->len == __anv_cmd_length(cmd)); \
assert(sizeof(hw_state->packed.field) >= \
4 * __anv_cmd_length(cmd)); \
__anv_cmd_pack(cmd)(NULL, _partial, &name); \
const struct anv_gfx_state_ptr *_cmd_state = \
gfx->shaders[stage] != NULL ? \
((cmd_buffer->vk.pool->flags & \
VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ? \
&gfx->shaders[stage]->source##_protected : \
&gfx->shaders[stage]->source) : \
NULL; \
assert(_cmd_state == NULL || \
_cmd_state->len == __anv_cmd_length(cmd)); \
const uint32_t *_inst_data = \
gfx->shaders[stage] != NULL ? \
&gfx->shaders[stage]->cmd_data[_cmd_state->offset] : \
device->physical->gfx_default.field; \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
assert((_partial[i] & \
(pipeline)->batch_data[ \
(prepacked).offset + i]) == 0); \
((uint32_t *)_dst)[i] = _partial[i] | \
(pipeline)->batch_data[_cmd_state->offset + i]; \
assert((_partial[i] & _inst_data[i]) == 0); \
((uint32_t *)_dst)[i] = _partial[i] | _inst_data[i]; \
} \
_dst = NULL; \
}))
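
These merge macros rely on the shader's pre-packed dwords and the dynamically packed dwords never setting the same bits, so the final command is simply the bitwise OR of the two, with an assert guarding the invariant. A standalone sketch of that merge, for a hypothetical 4-dword command:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CMD_LEN 4

/* Merge the fields packed at bind time (shader-dependent) with the fields
 * packed at draw time (dynamic state).  The two halves must not overlap;
 * the assert catches a field accidentally packed on both sides.
 */
static void
merge_packed(uint32_t out[CMD_LEN],
             const uint32_t prepacked[CMD_LEN],
             const uint32_t dynamic[CMD_LEN])
{
   for (unsigned i = 0; i < CMD_LEN; i++) {
      assert((prepacked[i] & dynamic[i]) == 0);
      out[i] = prepacked[i] | dynamic[i];
   }
}

int main(void)
{
   const uint32_t prepacked[CMD_LEN] = { 0x70000000, 0x0000ffff, 0, 0 };
   const uint32_t dynamic[CMD_LEN]   = { 0x00000003, 0xffff0000, 0, 1 };
   uint32_t cmd[CMD_LEN];

   merge_packed(cmd, prepacked, dynamic);
   for (unsigned i = 0; i < CMD_LEN; i++)
      printf("dw%u = 0x%08x\n", i, cmd[i]);
   return 0;
}
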
@ -2624,19 +2693,19 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
#endif
if (IS_DIRTY(VF_SGVS))
anv_gfx_copy(vf_sgvs, GENX(3DSTATE_VF_SGVS), pipeline->final.vf_sgvs);
anv_gfx_copy(vf_sgvs, GENX(3DSTATE_VF_SGVS), MESA_SHADER_VERTEX, vs.vf_sgvs);
#if GFX_VER >= 11
if (IS_DIRTY(VF_SGVS_2))
anv_gfx_copy(vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), pipeline->final.vf_sgvs_2);
anv_gfx_copy(vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), MESA_SHADER_VERTEX, vs.vf_sgvs_2);
#endif
if (IS_DIRTY(VF_SGVS_INSTANCING))
anv_gfx_copy_variable(vf_sgvs_instancing, pipeline->final.vf_sgvs_instancing);
anv_gfx_copy_variable(vf_sgvs_instancing, MESA_SHADER_VERTEX, vs.vf_sgvs_instancing);
if (instance->vf_component_packing && IS_DIRTY(VF_COMPONENT_PACKING)) {
anv_gfx_copy(vf_component_packing, GENX(3DSTATE_VF_COMPONENT_PACKING),
pipeline->final.vf_component_packing);
MESA_SHADER_VERTEX, vs.vf_component_packing);
}
if (IS_DIRTY(INDEX_BUFFER)) {
@ -2655,7 +2724,7 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
if (IS_DIRTY(STREAMOUT)) {
anv_gfx_pack_merge(so, GENX(3DSTATE_STREAMOUT),
pipeline->partial.so, so) {
gfx->streamout_stage, so, so) {
SET(so, so, RenderingDisable);
SET(so, so, RenderStreamSelect);
SET(so, so, ReorderMode);
@ -2664,7 +2733,7 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
}
if (IS_DIRTY(SO_DECL_LIST))
anv_gfx_copy_variable(so_decl_list, pipeline->final.so_decl_list);
anv_gfx_copy_variable(so_decl_list, gfx->streamout_stage, so_decl_list);
if (IS_DIRTY(CLIP)) {
anv_gfx_pack(clip, GENX(3DSTATE_CLIP), clip) {
@ -2886,7 +2955,8 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
if (IS_DIRTY(TE)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
anv_gfx_pack_merge(te, GENX(3DSTATE_TE), pipeline->partial.te, te) {
anv_gfx_pack_merge(te, GENX(3DSTATE_TE),
MESA_SHADER_TESS_EVAL, ds.te, te) {
SET(te, te, OutputTopology);
#if GFX_VERx10 >= 125
SET(te, te, TessellationDistributionMode);
@ -2986,7 +3056,8 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
}
if (IS_DIRTY(WM)) {
anv_gfx_pack_merge(wm, GENX(3DSTATE_WM), pipeline->partial.wm, wm) {
anv_gfx_pack_merge(wm, GENX(3DSTATE_WM),
MESA_SHADER_FRAGMENT, ps.wm, wm) {
SET(wm, wm, LineStippleEnable);
SET(wm, wm, BarycentricInterpolationMode);
}
@ -3079,12 +3150,12 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
}
#if GFX_VERx10 >= 125
if (device->vk.enabled_features.meshShader) {
if (device->vk.enabled_extensions.EXT_mesh_shader) {
if (IS_DIRTY(MESH_CONTROL)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
anv_gfx_copy_protected(mesh_control,
GENX(3DSTATE_MESH_CONTROL),
pipeline->final.mesh_control);
MESA_SHADER_MESH, ms.control);
} else {
anv_gfx_pack(mesh_control, GENX(3DSTATE_MESH_CONTROL), mc);
}
@ -3092,8 +3163,9 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
if (IS_DIRTY(TASK_CONTROL)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_TASK)) {
anv_gfx_copy_protected(task_control, GENX(3DSTATE_TASK_CONTROL),
pipeline->final.task_control);
anv_gfx_copy_protected(task_control,
GENX(3DSTATE_TASK_CONTROL),
MESA_SHADER_TASK, ts.control);
} else {
anv_gfx_pack(task_control, GENX(3DSTATE_TASK_CONTROL), tc);
}
@ -3101,70 +3173,58 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
if (IS_DIRTY(MESH_SHADER)) {
anv_gfx_copy(mesh_shader, GENX(3DSTATE_MESH_SHADER),
pipeline->final.mesh_shader);
MESA_SHADER_MESH, ms.shader);
}
if (IS_DIRTY(MESH_DISTRIB)) {
anv_gfx_copy(mesh_distrib, GENX(3DSTATE_MESH_DISTRIB),
pipeline->final.mesh_distrib);
MESA_SHADER_MESH, ms.distrib);
}
if (IS_DIRTY(CLIP_MESH)) {
anv_gfx_copy(clip_mesh, GENX(3DSTATE_CLIP_MESH),
pipeline->final.clip_mesh);
MESA_SHADER_MESH, ms.clip);
}
if (IS_DIRTY(TASK_SHADER)) {
anv_gfx_copy(task_shader, GENX(3DSTATE_TASK_SHADER),
pipeline->final.task_shader);
MESA_SHADER_TASK, ts.shader);
}
if (IS_DIRTY(TASK_REDISTRIB)) {
anv_gfx_copy(task_redistrib, GENX(3DSTATE_TASK_REDISTRIB),
pipeline->final.task_redistrib);
MESA_SHADER_TASK, ts.redistrib);
}
}
#endif /* GFX_VERx10 >= 125 */
if (IS_DIRTY(VS)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_VERTEX)) {
anv_gfx_copy_protected(vs, GENX(3DSTATE_VS), pipeline->final.vs);
} else {
anv_gfx_pack(vs, GENX(3DSTATE_VS), vs);
#if GFX_VERx10 == 90
anv_gfx_pack_merge_protected(vs, GENX(3DSTATE_VS),
MESA_SHADER_VERTEX, vs.vs, vs) {
SET(vs, vs, VertexCacheDisable);
}
#else
anv_gfx_copy_protected(vs, GENX(3DSTATE_VS), MESA_SHADER_VERTEX, vs.vs);
#endif
}
if (IS_DIRTY(HS)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_CTRL)) {
anv_gfx_copy_protected(hs, GENX(3DSTATE_HS), pipeline->final.hs);
} else {
anv_gfx_pack(hs, GENX(3DSTATE_HS), hs);
}
}
if (IS_DIRTY(HS))
anv_gfx_copy_protected(hs, GENX(3DSTATE_HS), MESA_SHADER_TESS_CTRL, hs.hs);
if (IS_DIRTY(DS)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
anv_gfx_copy_protected(ds, GENX(3DSTATE_DS), pipeline->final.ds);
} else {
anv_gfx_pack(ds, GENX(3DSTATE_DS), ds);
}
}
if (IS_DIRTY(DS))
anv_gfx_copy_protected(ds, GENX(3DSTATE_DS), MESA_SHADER_TESS_EVAL, ds.ds);
if (IS_DIRTY(GS)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_GEOMETRY)) {
anv_gfx_pack_merge_protected(gs, GENX(3DSTATE_GS),
pipeline->partial.gs, gs) {
MESA_SHADER_GEOMETRY, gs.gs, gs) {
SET(gs, gs, ReorderMode);
}
} else {
anv_gfx_pack(gs, GENX(3DSTATE_GS), gs);
}
}
if (IS_DIRTY(PS)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT)) {
anv_gfx_pack_merge_protected(ps, GENX(3DSTATE_PS),
pipeline->partial.ps, ps) {
MESA_SHADER_FRAGMENT, ps.ps, ps) {
SET(ps, ps, KernelStartPointer0);
SET(ps, ps, KernelStartPointer1);
SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
@ -3187,15 +3247,12 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
#endif
SET(ps, ps, PositionXYOffsetSelect);
}
} else {
anv_gfx_pack(ps, GENX(3DSTATE_PS), ps);
}
}
if (IS_DIRTY(PS_EXTRA)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT)) {
anv_gfx_pack_merge(ps_extra, GENX(3DSTATE_PS_EXTRA),
pipeline->partial.ps_extra, pse) {
MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
SET(pse, ps_extra, PixelShaderHasUAV);
SET(pse, ps_extra, PixelShaderIsPerSample);
#if GFX_VER >= 11
@ -3213,7 +3270,7 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
* change through pre-rasterization shader) or if we notice a change.
*/
anv_gfx_pack_merge(ps_extra_dep, GENX(3DSTATE_PS_EXTRA),
pipeline->partial.ps_extra, pse) {
MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
SET(pse, ps_extra, PixelShaderHasUAV);
SET(pse, ps_extra, PixelShaderIsPerSample);
#if GFX_VER >= 11
@ -3269,15 +3326,13 @@ genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
cmd_buffer->device,
&cmd_buffer->vk.dynamic_graphics_state,
&cmd_buffer->state.gfx,
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline),
cmd_buffer->vk.level);
vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
cmd_buffer_repack_gfx_state(&cmd_buffer->state.gfx.dyn_state,
cmd_buffer,
&cmd_buffer->state.gfx,
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline));
&cmd_buffer->state.gfx);
}
static void
@ -3431,8 +3486,6 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
struct anv_device *device = cmd_buffer->device;
struct anv_instance *instance = device->physical->instance;
struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(gfx->base.pipeline);
const struct vk_dynamic_graphics_state *dyn =
&cmd_buffer->vk.dynamic_graphics_state;
struct anv_push_constants *push_consts =
@ -3493,7 +3546,7 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);
if (mesh_prog_data) {
push_consts->gfx.fs_per_prim_remap_offset =
pipeline->base.shaders[MESA_SHADER_MESH]->kernel.offset +
gfx->shaders[MESA_SHADER_MESH]->kernel.offset +
mesh_prog_data->wa_18019110168_mapping_offset;
}
@ -3576,7 +3629,7 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
* 3. Send 3D State SOL with SOL Enabled
*/
if (intel_needs_workaround(device->info, 16011773973) &&
pipeline->uses_xfb)
gfx->shaders[gfx->streamout_stage]->xfb_info != NULL)
anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
anv_batch_emit_gfx_variable(batch, so_decl_list);
@ -3597,7 +3650,7 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
}
#if GFX_VERx10 >= 125
if (device->vk.enabled_features.meshShader) {
if (device->vk.enabled_extensions.EXT_mesh_shader) {
if (IS_DIRTY(MESH_CONTROL))
anv_batch_emit_gfx(batch, GENX(3DSTATE_MESH_CONTROL), mesh_control);
@ -3670,8 +3723,8 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_TOPOLOGY), vft);
if (IS_DIRTY(VERTEX_INPUT)) {
genX(batch_emit_pipeline_vertex_input)(batch, device,
pipeline, dyn->vi);
genX(batch_emit_vertex_input)(batch, device,
gfx->shaders[MESA_SHADER_VERTEX], dyn->vi);
}
if (IS_DIRTY(TE))
@ -3823,8 +3876,6 @@ genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_device *device = cmd_buffer->device;
struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
struct anv_graphics_pipeline *pipeline =
anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
if (INTEL_DEBUG(DEBUG_REEMIT)) {
@ -3863,7 +3914,7 @@ genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
* it after.
*/
if (intel_needs_workaround(device->info, 16011773973) &&
pipeline->uses_xfb &&
gfx->shaders[gfx->streamout_stage]->xfb_info != NULL &&
BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_STREAMOUT);
}
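(Editor's note on the Wa_16011773973 handling just above: with runtime pipelines the check consults the bound streamout stage's shader, gfx->shaders[gfx->streamout_stage]->xfb_info, instead of a pipeline-level uses_xfb flag. A minimal hedged sketch of that condition, with hypothetical stand-in types:)

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-in: only the field the check cares about. */
struct xfb_shader {
   const void *xfb_info;   /* non-NULL when the stage writes transform feedback */
};

/* When the workaround applies, a dirty SO_DECL_LIST forces 3DSTATE_STREAMOUT
 * to be re-emitted as well (disabled before the declaration list, enabled
 * after it). */
static bool
needs_streamout_reemit(bool wa_16011773973_needed,
                       bool so_decl_list_dirty,
                       const struct xfb_shader *streamout_shader)
{
   return wa_16011773973_needed &&
          so_decl_list_dirty &&
          streamout_shader != NULL &&
          streamout_shader->xfb_info != NULL;
}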

View file

@ -569,31 +569,6 @@ emit_vs_shader(struct anv_batch *batch,
vs.SoftwareExceptionEnable = false;
vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
#if 0
/* TODO: move to shader binding */
if (GFX_VER == 9 && devinfo->gt == 4 &&
anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
/* On Sky Lake GT4, we have experienced some hangs related to the VS
* cache and tessellation. It is unknown exactly what is happening
* but the Haswell docs for the "VS Reference Count Full Force Miss
* Enable" field of the "Thread Mode" register refer to a HSW bug in
* which the VUE handle reference count would overflow resulting in
* internal reference counting bugs. My (Faith's) best guess is that
* this bug cropped back up on SKL GT4 when we suddenly had more
* threads in play than any previous gfx9 hardware.
*
* What we do know for sure is that setting this bit when
* tessellation shaders are in use fixes a GPU hang in Batman: Arkham
* City when playing with DXVK (https://bugs.freedesktop.org/107280).
* Disabling the vertex cache with tessellation shaders should only
* have a minor performance impact as the tessellation shaders are
* likely generating and processing far more geometry than the vertex
* stage.
*/
vs.VertexCacheDisable = true;
}
#endif
vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
vs.VertexURBEntryReadOffset = 0;
vs.DispatchGRFStartRegisterForURBData =