anv: split pipeline programming into instructions
The goal of this change is to move away from a single batch buffer
containing all kinds of pipeline instructions to a list of instructions
we can emit separately.
We will later implement pipeline diffing and finer state tracking,
which will allow fewer instructions to be emitted.
This changes the following things:
* instead of having a batch & partially packed instructions, move
  everything into the batch
* add a set of pointers into the batch that allow us to point at each
  instruction (almost... we group some of them, like the URB
  instructions, etc.)
At pipeline emission time, we just go through all of those pointers
and emit the instructions into the batch. No additional packing is
involved (see the sketch below).
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24536>
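To make the mechanism concrete before diving into the diff, here is a minimal, self-contained C sketch of the bookkeeping this change introduces. It mirrors anv_gfx_state_ptr, anv_gfx_pipeline_add() and anv_batch_emit_pipeline_state() from the patch below, but gfx_state_ptr, pipeline_batch, pipeline_add() and emit_recorded() are simplified stand-ins for illustration, not the real anv types or functions.

/* Minimal sketch: a pipeline-owned batch plus per-instruction {offset,len}
 * pointers into it (both in dwords). Simplified stand-ins, not anv code. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

struct gfx_state_ptr {
   uint16_t offset; /* in dwords, into the pipeline batch */
   uint16_t len;    /* in dwords */
};

struct pipeline_batch {
   uint32_t data[1024];
   uint32_t next;   /* write cursor, in dwords */
};

/* Record where an instruction (or group of instructions) starts in the
 * pipeline batch and reserve n_dwords for it; consecutive calls on the same
 * pointer extend the group, mirroring anv_gfx_pipeline_add(). */
uint32_t *
pipeline_add(struct pipeline_batch *batch, struct gfx_state_ptr *ptr,
             uint16_t n_dwords)
{
   assert(ptr->len == 0 || batch->next == (uint32_t)ptr->offset + ptr->len);
   if (ptr->len == 0)
      ptr->offset = batch->next;
   ptr->len += n_dwords;

   uint32_t *dst = &batch->data[batch->next];
   batch->next += n_dwords;
   return dst;
}

/* At flush time, replaying a recorded instruction is just a memcpy of
 * ptr->len dwords into the command buffer batch, which is what
 * anv_batch_emit_pipeline_state() boils down to. */
void
emit_recorded(uint32_t *cmd_buffer_dst, const struct pipeline_batch *batch,
              const struct gfx_state_ptr *ptr)
{
   if (ptr->len == 0)
      return;
   memcpy(cmd_buffer_dst, &batch->data[ptr->offset], 4 * ptr->len);
}

Instructions that still depend on dynamic state follow the same layout but are merged rather than copied: the freshly packed dynamic dwords are OR'ed with the dwords recorded at partial.<state>, which is what the reworked anv_batch_emit_merge() macro in the diff does.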
This commit is contained in:
parent 758540d741
commit 44656f98d5

6 changed files with 721 additions and 604 deletions
@@ -136,7 +136,7 @@ anv_reloc_list_clear(struct anv_reloc_list *list)
    memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
 }
 
-static VkResult
+VkResult
 anv_reloc_list_append(struct anv_reloc_list *list,
                       struct anv_reloc_list *other)
 {
@@ -96,8 +96,9 @@ void genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(emit_vertex_input)(struct anv_batch *batch,
                              uint32_t *vertex_element_dws,
-                             const struct anv_graphics_pipeline *pipeline,
-                             const struct vk_vertex_input_state *vi);
+                             struct anv_graphics_pipeline *pipeline,
+                             const struct vk_vertex_input_state *vi,
+                             bool emit_in_pipeline);
 
 enum anv_pipe_bits
 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
@@ -125,7 +126,7 @@ void genX(emit_l3_config)(struct anv_batch *batch,
 void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
                                 const struct intel_l3_config *cfg);
 
-void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
+void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
                                      bool enable);
@@ -1464,6 +1464,9 @@ anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo)
    return list->uses_relocs ? anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS;
 }
 
+VkResult anv_reloc_list_append(struct anv_reloc_list *list,
+                               struct anv_reloc_list *other);
+
 struct anv_batch_bo {
    /* Link in the anv_cmd_buffer.owned_batch_bos list */
    struct list_head                             link;
@@ -1603,14 +1606,16 @@ _anv_combine_address(struct anv_batch *batch, void *location,
            __dst;                                                      \
          })
 
-#define anv_batch_emit_merge(batch, cmd, prepacked, name)              \
+#define anv_batch_emit_merge(batch, cmd, pipeline, state, name)        \
    for (struct cmd name = { 0 },                                       \
         *_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd));   \
         __builtin_expect(_dst != NULL, 1);                             \
         ({ uint32_t _partial[__anv_cmd_length(cmd)];                   \
            __anv_cmd_pack(cmd)(batch, _partial, &name);                \
-           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++)        \
-              ((uint32_t *)_dst)[i] = _partial[i] | (prepacked)[i];    \
+           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {      \
+              ((uint32_t *)_dst)[i] = _partial[i] |                    \
+                 (pipeline)->batch_data[(pipeline)->state.offset + i]; \
+           }                                                           \
            VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
            _dst = NULL;                                                \
          }))
@@ -3515,6 +3520,12 @@ struct anv_graphics_lib_pipeline {
    bool                                         retain_shaders;
 };
 
+struct anv_gfx_state_ptr {
+   /* Both in dwords */
+   uint16_t  offset;
+   uint16_t  len;
+};
+
 /* The final graphics pipeline object has all the graphics state ready to be
  * programmed into HW packets (dynamic_state field) or fully baked in its
  * batch.
@@ -3564,7 +3575,7 @@ struct anv_graphics_pipeline {
     * this array only holds the svgs_count elements.
     */
    uint32_t                                     vertex_input_elems;
-   uint32_t                                     vertex_input_data[96];
+   uint32_t                                     vertex_input_data[2 * 31 /* MAX_VES + 2 internal */];
 
    enum brw_wm_msaa_flags                       fs_msaa_flags;
 
@@ -3575,25 +3586,75 @@ struct anv_graphics_pipeline {
 
    /* Fully backed instructions, ready to be emitted in the anv_cmd_buffer */
    struct {
-      uint32_t                                  hs[9];
-      uint32_t                                  ds[11];
+      struct anv_gfx_state_ptr                  urb;
+      struct anv_gfx_state_ptr                  vf_statistics;
+      struct anv_gfx_state_ptr                  vf_sgvs;
+      struct anv_gfx_state_ptr                  vf_sgvs_2;
+      struct anv_gfx_state_ptr                  vf_sgvs_instancing;
+      struct anv_gfx_state_ptr                  vf_instancing;
+      struct anv_gfx_state_ptr                  primitive_replication;
+      struct anv_gfx_state_ptr                  sbe;
+      struct anv_gfx_state_ptr                  sbe_swiz;
+      struct anv_gfx_state_ptr                  so_decl_list;
+      struct anv_gfx_state_ptr                  ms;
+      struct anv_gfx_state_ptr                  vs;
+      struct anv_gfx_state_ptr                  hs;
+      struct anv_gfx_state_ptr                  ds;
+      struct anv_gfx_state_ptr                  ps;
+      struct anv_gfx_state_ptr                  ps_extra;
+
+      struct anv_gfx_state_ptr                  task_control;
+      struct anv_gfx_state_ptr                  task_shader;
+      struct anv_gfx_state_ptr                  task_redistrib;
+      struct anv_gfx_state_ptr                  clip_mesh;
+      struct anv_gfx_state_ptr                  mesh_control;
+      struct anv_gfx_state_ptr                  mesh_shader;
+      struct anv_gfx_state_ptr                  mesh_distrib;
+      struct anv_gfx_state_ptr                  sbe_mesh;
    } final;
 
    /* Pre packed CS instructions & structures that need to be merged later
     * with dynamic state.
     */
    struct {
-      uint32_t                                  clip[4];
-      uint32_t                                  sf[4];
-      uint32_t                                  raster[5];
-      uint32_t                                  wm[2];
-      uint32_t                                  streamout_state[5];
-      uint32_t                                  gs[10];
-      uint32_t                                  te[4];
-      uint32_t                                  vfg[4];
+      struct anv_gfx_state_ptr                  clip;
+      struct anv_gfx_state_ptr                  sf;
+      struct anv_gfx_state_ptr                  raster;
+      struct anv_gfx_state_ptr                  wm;
+      struct anv_gfx_state_ptr                  so;
+      struct anv_gfx_state_ptr                  gs;
+      struct anv_gfx_state_ptr                  te;
+      struct anv_gfx_state_ptr                  vfg;
    } partial;
 };
 
+#define anv_batch_merge_pipeline_state(batch, dwords0, pipeline, state) \
+   do {                                                                \
+      uint32_t *dw;                                                    \
+                                                                       \
+      assert(ARRAY_SIZE(dwords0) == (pipeline)->state.len);            \
+      dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0));        \
+      if (!dw)                                                         \
+         break;                                                        \
+      for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++)               \
+         dw[i] = (dwords0)[i] |                                        \
+                 (pipeline)->batch_data[(pipeline)->state.offset + i]; \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));  \
+   } while (0)
+
+#define anv_batch_emit_pipeline_state(batch, pipeline, state)          \
+   do {                                                                \
+      if ((pipeline)->state.len == 0)                                  \
+         break;                                                        \
+      uint32_t *dw;                                                    \
+      dw = anv_batch_emit_dwords((batch), (pipeline)->state.len);      \
+      if (!dw)                                                         \
+         break;                                                        \
+      memcpy(dw, &(pipeline)->batch_data[(pipeline)->state.offset],    \
+             4 * (pipeline)->state.len);                               \
+   } while (0)
+
+
 struct anv_compute_pipeline {
    struct anv_pipeline                          base;
 
@ -2994,10 +2994,7 @@ genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
|
|||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
|
||||
return;
|
||||
|
||||
uint32_t *dw =
|
||||
anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_HS_length),
|
||||
GENX(3DSTATE_HS));
|
||||
memcpy(dw, &pipeline->final.hs, sizeof(pipeline->final.hs));
|
||||
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static void
|
||||
|
|
@ -3022,10 +3019,7 @@ genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
|
|||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
|
||||
return;
|
||||
|
||||
uint32_t *dw =
|
||||
anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_DS_length),
|
||||
GENX(3DSTATE_DS));
|
||||
memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
|
||||
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -3224,13 +3218,22 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
|
|||
}
|
||||
}
|
||||
|
||||
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
|
||||
anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.base.batch);
|
||||
if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
|
||||
genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
|
||||
|
||||
/* If the pipeline changed, we may need to re-allocate push constant
|
||||
* space in the URB.
|
||||
/* If the pipeline changed, we may need to re-allocate push constant space
|
||||
* in the URB.
|
||||
*/
|
||||
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
|
||||
cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
|
||||
|
||||
/* Also add the relocations (scratch buffers) */
|
||||
VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
|
||||
pipeline->base.base.batch.relocs);
|
||||
if (result != VK_SUCCESS) {
|
||||
anv_batch_set_error(&cmd_buffer->batch, result);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Render targets live in the same binding table as fragment descriptors */
|
||||
|
|
@ -3274,8 +3277,9 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
|
|||
dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
|
||||
}
|
||||
|
||||
if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
|
||||
genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
|
||||
/* When we're done, there is no more dirty gfx state. */
|
||||
vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
|
||||
cmd_buffer->state.gfx.dirty = 0;
|
||||
}
|
||||
|
||||
#include "genX_cmd_draw_generated_indirect.h"
|
||||
|
|
|
|||
|
|
@ -215,15 +215,12 @@ genX(cmd_emit_te)(struct anv_cmd_buffer *cmd_buffer)
|
|||
|
||||
if (!tes_prog_data ||
|
||||
!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
|
||||
uint32_t *dw =
|
||||
anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_TE_length),
|
||||
GENX(3DSTATE_TE));
|
||||
memcpy(dw, &pipeline->partial.te, sizeof(pipeline->partial.te));
|
||||
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.te);
|
||||
return;
|
||||
}
|
||||
|
||||
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
|
||||
pipeline->partial.te, te) {
|
||||
pipeline, partial.te, te) {
|
||||
if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
|
||||
te.OutputTopology = tes_prog_data->output_topology;
|
||||
} else {
|
||||
|
|
@ -244,14 +241,14 @@ genX(emit_gs)(struct anv_cmd_buffer *cmd_buffer)
|
|||
{
|
||||
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
|
||||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), gs);
|
||||
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.gs);
|
||||
return;
|
||||
}
|
||||
|
||||
const struct vk_dynamic_graphics_state *dyn =
|
||||
&cmd_buffer->vk.dynamic_graphics_state;
|
||||
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS),
|
||||
pipeline->partial.gs, gs) {
|
||||
pipeline, partial.gs, gs) {
|
||||
switch (dyn->rs.provoking_vertex) {
|
||||
case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
|
||||
gs.ReorderMode = LEADING;
|
||||
|
|
@ -463,7 +460,7 @@ cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
|
|||
return;
|
||||
|
||||
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
|
||||
pipeline->partial.clip, clip) {
|
||||
pipeline, partial.clip, clip) {
|
||||
/* Take dynamic primitive topology in to account with
|
||||
* 3DSTATE_CLIP::ViewportXYClipTestEnable
|
||||
*/
|
||||
|
|
@ -532,7 +529,7 @@ cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
|
|||
genX(streamout_prologue)(cmd_buffer);
|
||||
|
||||
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
|
||||
pipeline->partial.streamout_state, so) {
|
||||
pipeline, partial.so, so) {
|
||||
so.RenderingDisable = dyn->rs.rasterizer_discard_enable;
|
||||
so.RenderStreamSelect = dyn->rs.rasterization_stream;
|
||||
#if INTEL_NEEDS_WA_18022508906
|
||||
|
|
@@ -802,13 +799,58 @@ cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
    }
 }
 
+#define cmd_buffer_emit_pipeline_state(batch, pipeline, state)         \
+   do {                                                                \
+      if ((pipeline)->state.len == 0)                                  \
+         break;                                                        \
+      void *dw = anv_batch_emit_dwords(batch, (pipeline)->state.len);  \
+      if (!dw)                                                         \
+         break;                                                        \
+      memcpy(dw,                                                       \
+             &(pipeline)->batch_data[(pipeline)->state.offset],        \
+             4 * (pipeline)->state.len);                               \
+   } while (0)
+
+
 void
-genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
+genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
    struct vk_dynamic_graphics_state *dyn =
       &cmd_buffer->vk.dynamic_graphics_state;
 
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
+      struct anv_batch *batch = &cmd_buffer->batch;
+
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.urb);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ms);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.primitive_replication);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_instancing);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_instancing);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_2);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.hs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ds);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_statistics);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.so_decl_list);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_swiz);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps_extra);
+
+      if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_control);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_shader);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_redistrib);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.clip_mesh);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_control);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_shader);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_distrib);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_mesh);
+      }
+   }
+
    cmd_buffer_emit_clip(cmd_buffer);
 
    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
@ -865,7 +907,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
|||
} else {
|
||||
/* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
|
||||
genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
|
||||
pipeline, dyn->vi);
|
||||
pipeline, dyn->vi, false /* emit_in_pipeline */);
|
||||
/* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
|
||||
memcpy(p + 1 + 2 * pipeline->vs_input_elements,
|
||||
pipeline->vertex_input_data,
|
||||
|
|
@ -896,7 +938,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
|||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ||
|
||||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
|
||||
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
|
||||
pipeline->partial.sf, sf) {
|
||||
pipeline, partial.sf, sf) {
|
||||
ANV_SETUP_PROVOKING_VERTEX(sf, dyn->rs.provoking_vertex);
|
||||
|
||||
sf.LineWidth = dyn->rs.line.width;
|
||||
|
|
@ -978,7 +1020,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
|||
vk_rasterization_state_depth_clip_enable(&dyn->rs);
|
||||
|
||||
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
|
||||
pipeline->partial.raster, raster) {
|
||||
pipeline, partial.raster, raster) {
|
||||
raster.APIMode = api_mode;
|
||||
raster.DXMultisampleRasterizationEnable = msaa_raster_enable;
|
||||
raster.AntialiasingEnable = aa_enable;
|
||||
|
|
@ -1120,7 +1162,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
|||
if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
|
||||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
|
||||
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
|
||||
pipeline->partial.vfg, vfg) {
|
||||
pipeline, partial.vfg, vfg) {
|
||||
vfg.ListCutIndexEnable = dyn->ia.primitive_restart_enable;
|
||||
}
|
||||
}
|
||||
|
|
@ -1141,7 +1183,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
|||
* threads.
|
||||
*/
|
||||
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
|
||||
pipeline->partial.wm, wm) {
|
||||
pipeline, partial.wm, wm) {
|
||||
wm.ForceThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
|
||||
(pipeline->force_fragment_thread_dispatch ||
|
||||
anv_cmd_buffer_all_color_write_masked(cmd_buffer)) ?
|
||||
|
|
@ -1365,8 +1407,4 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
|
|||
ccp.ColorCalcStatePointerValid = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* When we're done, there is no more dirty gfx state. */
|
||||
vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
|
||||
cmd_buffer->state.gfx.dirty = 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -36,6 +36,52 @@
 #include "vk_log.h"
 #include "vk_render_pass.h"
 
+static inline struct anv_batch *
+anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
+                     struct anv_gfx_state_ptr *ptr,
+                     uint32_t n_dwords)
+{
+   struct anv_batch *batch = &pipeline->base.base.batch;
+
+   assert(ptr->len == 0 ||
+          (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
+   if (ptr->len == 0)
+      ptr->offset = (batch->next - batch->start) / 4;
+   ptr->len += n_dwords;
+
+   return batch;
+}
+
+#define anv_pipeline_emit(pipeline, state, cmd, name)                  \
+   for (struct cmd name = { __anv_cmd_header(cmd) },                   \
+        *_dst = anv_batch_emit_dwords(                                 \
+           anv_gfx_pipeline_add(pipeline,                              \
+                                &(pipeline)->state,                    \
+                                __anv_cmd_length(cmd)),                \
+           __anv_cmd_length(cmd));                                     \
+        __builtin_expect(_dst != NULL, 1);                             \
+        ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch,           \
+                               _dst, &name);                           \
+           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
+           _dst = NULL;                                                \
+         }))
+
+#define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({            \
+   void *__dst = anv_batch_emit_dwords(                                \
+      anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n);       \
+   if (__dst) {                                                        \
+      struct cmd __template = {                                        \
+         __anv_cmd_header(cmd),                                        \
+         .DWordLength = n - __anv_cmd_length_bias(cmd),                \
+         __VA_ARGS__                                                   \
+      };                                                               \
+      __anv_cmd_pack(cmd)(&pipeline->base.base.batch,                  \
+                          __dst, &__template);                         \
+   }                                                                   \
+   __dst;                                                              \
+})
+
+
 static uint32_t
 vertex_element_comp_control(enum isl_format format, unsigned comp)
 {
@ -91,8 +137,9 @@ vertex_element_comp_control(enum isl_format format, unsigned comp)
|
|||
void
|
||||
genX(emit_vertex_input)(struct anv_batch *batch,
|
||||
uint32_t *vertex_element_dws,
|
||||
const struct anv_graphics_pipeline *pipeline,
|
||||
const struct vk_vertex_input_state *vi)
|
||||
struct anv_graphics_pipeline *pipeline,
|
||||
const struct vk_vertex_input_state *vi,
|
||||
bool emit_in_pipeline)
|
||||
{
|
||||
const struct anv_device *device = pipeline->base.base.device;
|
||||
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
|
||||
|
|
@ -169,6 +216,18 @@ genX(emit_vertex_input)(struct anv_batch *batch,
|
|||
* that controls instancing. On Haswell and prior, that's part of
|
||||
* VERTEX_BUFFER_STATE which we emit later.
|
||||
*/
|
||||
if (emit_in_pipeline) {
|
||||
anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
|
||||
bool per_instance = vi->bindings[binding].input_rate ==
|
||||
VK_VERTEX_INPUT_RATE_INSTANCE;
|
||||
uint32_t divisor = vi->bindings[binding].divisor *
|
||||
pipeline->instance_multiplier;
|
||||
|
||||
vfi.InstancingEnable = per_instance;
|
||||
vfi.VertexElementIndex = slot;
|
||||
vfi.InstanceDataStepRate = per_instance ? divisor : 1;
|
||||
}
|
||||
} else {
|
||||
anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
|
||||
bool per_instance = vi->bindings[binding].input_rate ==
|
||||
VK_VERTEX_INPUT_RATE_INSTANCE;
|
||||
|
|
@ -181,21 +240,20 @@ genX(emit_vertex_input)(struct anv_batch *batch,
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit_vertex_input(struct anv_graphics_pipeline *pipeline,
|
||||
const struct vk_graphics_pipeline_state *state,
|
||||
const struct vk_vertex_input_state *vi)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
|
||||
/* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
|
||||
* everything in gfx8_cmd_buffer.c
|
||||
*/
|
||||
if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
|
||||
genX(emit_vertex_input)(batch,
|
||||
genX(emit_vertex_input)(NULL,
|
||||
pipeline->vertex_input_data,
|
||||
pipeline, vi);
|
||||
pipeline, vi, true /* emit_in_pipeline */);
|
||||
}
|
||||
|
||||
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
|
||||
|
|
@ -207,6 +265,7 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
|
|||
assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
|
||||
uint32_t slot_offset =
|
||||
pipeline->vertex_input_elems - pipeline->svgs_count;
|
||||
|
||||
if (needs_svgs_elem) {
|
||||
#if GFX_VER < 11
|
||||
/* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
|
||||
|
|
@ -243,7 +302,8 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
|
|||
&element);
|
||||
slot_offset++;
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
|
||||
anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
|
||||
GENX(3DSTATE_VF_INSTANCING), vfi) {
|
||||
vfi.VertexElementIndex = id_slot;
|
||||
}
|
||||
}
|
||||
|
|
@ -268,13 +328,14 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
|
|||
&element);
|
||||
slot_offset++;
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
|
||||
anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
|
||||
GENX(3DSTATE_VF_INSTANCING), vfi) {
|
||||
vfi.VertexElementIndex = drawid_slot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
|
||||
anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
|
||||
sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
|
||||
sgvs.VertexIDComponentNumber = 2;
|
||||
sgvs.VertexIDElementOffset = id_slot;
|
||||
|
|
@ -284,7 +345,7 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
|
|||
}
|
||||
|
||||
#if GFX_VER >= 11
|
||||
anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs) {
|
||||
anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
|
||||
/* gl_BaseVertex */
|
||||
sgvs.XP0Enable = vs_prog_data->uses_firstvertex;
|
||||
sgvs.XP0SourceSelect = XP0_PARAMETER;
|
||||
|
|
@ -306,14 +367,12 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
|
|||
|
||||
#if GFX_VERx10 >= 125
|
||||
struct anv_device *device = pipeline->base.base.device;
|
||||
struct GENX(3DSTATE_VFG) vfg = {
|
||||
GENX(3DSTATE_VFG_header),
|
||||
anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
|
||||
/* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE*/
|
||||
.DistributionMode =
|
||||
vfg.DistributionMode =
|
||||
anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT :
|
||||
RR_FREE,
|
||||
.DistributionGranularity = BatchLevelGranularity,
|
||||
};
|
||||
RR_FREE;
|
||||
vfg.DistributionGranularity = BatchLevelGranularity;
|
||||
/* Wa_14014890652 */
|
||||
if (intel_device_info_is_dg2(device->info))
|
||||
vfg.GranularityThresholdDisable = 1;
|
||||
|
|
@ -331,7 +390,7 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
|
|||
vfg.PatchBatchSizeScale = 1;
|
||||
/* 192 control points for PATCHLIST_3 */
|
||||
vfg.PatchBatchSizeMultiplier = 31;
|
||||
GENX(3DSTATE_VFG_pack)(NULL, pipeline->partial.vfg, &vfg);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -375,7 +434,6 @@ static void
|
|||
emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
|
||||
enum intel_urb_deref_block_size *deref_block_size)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
const struct intel_device_info *devinfo = pipeline->base.base.device->info;
|
||||
|
||||
const struct brw_task_prog_data *task_prog_data =
|
||||
|
|
@ -390,12 +448,12 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
|
|||
|
||||
/* Zero out the primitive pipeline URB allocations. */
|
||||
for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
|
||||
anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
|
||||
anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
|
||||
urb._3DCommandSubOpcode += i;
|
||||
}
|
||||
}
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
|
||||
anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
|
||||
if (task_prog_data) {
|
||||
urb.TASKURBEntryAllocationSize = alloc.task_entry_size_64b - 1;
|
||||
urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
|
||||
|
|
@ -405,7 +463,7 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
|
|||
}
|
||||
}
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
|
||||
anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
|
||||
urb.MESHURBEntryAllocationSize = alloc.mesh_entry_size_64b - 1;
|
||||
urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
|
||||
urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
|
||||
|
|
@ -437,53 +495,73 @@ emit_urb_setup(struct anv_graphics_pipeline *pipeline,
|
|||
entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
|
||||
}
|
||||
|
||||
genX(emit_urb_setup)(pipeline->base.base.device,
|
||||
&pipeline->base.base.batch,
|
||||
struct anv_device *device = pipeline->base.base.device;
|
||||
const struct intel_device_info *devinfo = device->info;
|
||||
|
||||
unsigned entries[4];
|
||||
unsigned start[4];
|
||||
bool constrained;
|
||||
intel_get_urb_config(devinfo,
|
||||
pipeline->base.base.l3_config,
|
||||
pipeline->base.base.active_stages, entry_size,
|
||||
deref_block_size);
|
||||
pipeline->base.base.active_stages &
|
||||
VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
|
||||
pipeline->base.base.active_stages &
|
||||
VK_SHADER_STAGE_GEOMETRY_BIT,
|
||||
entry_size, entries, start, deref_block_size,
|
||||
&constrained);
|
||||
|
||||
for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
|
||||
anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
|
||||
urb._3DCommandSubOpcode += i;
|
||||
urb.VSURBStartingAddress = start[i];
|
||||
urb.VSURBEntryAllocationSize = entry_size[i] - 1;
|
||||
urb.VSNumberofURBEntries = entries[i];
|
||||
}
|
||||
}
|
||||
#if GFX_VERx10 >= 125
|
||||
if (device->vk.enabled_extensions.EXT_mesh_shader) {
|
||||
anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
|
||||
anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
static void
|
||||
emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
|
||||
|
||||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
|
||||
anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe);
|
||||
anv_batch_emit(batch, GENX(3DSTATE_SBE_SWIZ), sbe);
|
||||
anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
|
||||
anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
|
||||
#if GFX_VERx10 >= 125
|
||||
if (anv_pipeline_is_mesh(pipeline))
|
||||
anv_batch_emit(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh);
|
||||
anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
struct GENX(3DSTATE_SBE) sbe = {
|
||||
GENX(3DSTATE_SBE_header),
|
||||
anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
|
||||
anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {
|
||||
|
||||
/* TODO(mesh): Figure out cases where we need attribute swizzling. See also
|
||||
* calculate_urb_setup() and related functions.
|
||||
*/
|
||||
.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline),
|
||||
.PointSpriteTextureCoordinateOrigin = UPPERLEFT,
|
||||
.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
|
||||
.ConstantInterpolationEnable = wm_prog_data->flat_inputs,
|
||||
};
|
||||
sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
|
||||
sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
|
||||
sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
|
||||
sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
|
||||
|
||||
for (unsigned i = 0; i < 32; i++)
|
||||
sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
|
||||
|
||||
/* On Broadwell, they broke 3DSTATE_SBE into two packets */
|
||||
struct GENX(3DSTATE_SBE_SWIZ) swiz = {
|
||||
GENX(3DSTATE_SBE_SWIZ_header),
|
||||
};
|
||||
|
||||
if (anv_pipeline_is_primitive(pipeline)) {
|
||||
const struct brw_vue_map *fs_input_map =
|
||||
&anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
|
||||
|
||||
int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
|
||||
int first_slot =
|
||||
brw_compute_first_urb_slot_required(wm_prog_data->inputs,
|
||||
fs_input_map);
|
||||
assert(first_slot % 2 == 0);
|
||||
unsigned urb_entry_read_offset = first_slot / 2;
|
||||
|
|
@ -511,12 +589,12 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
|
|||
const int slot = fs_input_map->varying_to_slot[attr];
|
||||
|
||||
if (slot == -1) {
|
||||
/* This attribute does not exist in the VUE--that means that the
|
||||
* vertex shader did not write to it. It could be that it's a
|
||||
* regular varying read by the fragment shader but not written by
|
||||
* the vertex shader or it's gl_PrimitiveID. In the first case the
|
||||
* value is undefined, in the second it needs to be
|
||||
* gl_PrimitiveID.
|
||||
/* This attribute does not exist in the VUE--that means that
|
||||
* the vertex shader did not write to it. It could be that it's
|
||||
* a regular varying read by the fragment shader but not
|
||||
* written by the vertex shader or it's gl_PrimitiveID. In the
|
||||
* first case the value is undefined, in the second it needs to
|
||||
* be gl_PrimitiveID.
|
||||
*/
|
||||
swiz.Attribute[input_index].ConstantSource = PRIM_ID;
|
||||
swiz.Attribute[input_index].ComponentOverrideX = true;
|
||||
|
|
@ -526,16 +604,16 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
|
|||
continue;
|
||||
}
|
||||
|
||||
/* We have to subtract two slots to account for the URB entry output
|
||||
* read offset in the VS and GS stages.
|
||||
/* We have to subtract two slots to account for the URB entry
|
||||
* output read offset in the VS and GS stages.
|
||||
*/
|
||||
const int source_attr = slot - 2 * urb_entry_read_offset;
|
||||
assert(source_attr >= 0 && source_attr < 32);
|
||||
max_source_attr = MAX2(max_source_attr, source_attr);
|
||||
/* The hardware can only do overrides on 16 overrides at a time, and the
|
||||
* other up to 16 have to be lined up so that the input index = the
|
||||
* output index. We'll need to do some tweaking to make sure that's the
|
||||
* case.
|
||||
/* The hardware can only do overrides on 16 overrides at a time,
|
||||
* and the other up to 16 have to be lined up so that the input
|
||||
* index = the output index. We'll need to do some tweaking to
|
||||
* make sure that's the case.
|
||||
*/
|
||||
if (input_index < 16)
|
||||
swiz.Attribute[input_index].SourceAttribute = source_attr;
|
||||
|
|
@ -565,7 +643,8 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
|
|||
assert(anv_pipeline_is_mesh(pipeline));
|
||||
#if GFX_VERx10 >= 125
|
||||
const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
|
||||
anv_batch_emit(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh) {
|
||||
anv_pipeline_emit(pipeline, final.sbe_mesh,
|
||||
GENX(3DSTATE_SBE_MESH), sbe_mesh) {
|
||||
const struct brw_mue_map *mue = &mesh_prog_data->map;
|
||||
|
||||
assert(mue->per_vertex_header_size_dw % 8 == 0);
|
||||
|
|
@ -573,9 +652,9 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
|
|||
sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
|
||||
|
||||
/* Clip distance array is passed in the per-vertex header so that
|
||||
* it can be consumed by the HW. If user wants to read it in the FS,
|
||||
* adjust the offset and length to cover it. Conveniently it is at
|
||||
* the end of the per-vertex header, right before per-vertex
|
||||
* it can be consumed by the HW. If user wants to read it in the
|
||||
* FS, adjust the offset and length to cover it. Conveniently it
|
||||
* is at the end of the per-vertex header, right before per-vertex
|
||||
* attributes.
|
||||
*
|
||||
* Note that FS attribute reading must be aware that the clip
|
||||
|
|
@ -594,13 +673,15 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
|
|||
}
|
||||
|
||||
assert(mue->per_primitive_header_size_dw % 8 == 0);
|
||||
sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8;
|
||||
sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
|
||||
sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
|
||||
mue->per_primitive_header_size_dw / 8;
|
||||
sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
|
||||
DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
|
||||
|
||||
/* Just like with clip distances, if Primitive Shading Rate,
|
||||
* Viewport Index or Layer is read back in the FS, adjust
|
||||
* the offset and length to cover the Primitive Header, where
|
||||
* PSR, Viewport Index & Layer are stored.
|
||||
* Viewport Index or Layer is read back in the FS, adjust the
|
||||
* offset and length to cover the Primitive Header, where PSR,
|
||||
* Viewport Index & Layer are stored.
|
||||
*/
|
||||
if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
|
||||
wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
|
||||
|
|
@ -613,16 +694,8 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t *dw = anv_batch_emit_dwords(batch, GENX(3DSTATE_SBE_length));
|
||||
if (!dw)
|
||||
return;
|
||||
GENX(3DSTATE_SBE_pack)(batch, dw, &sbe);
|
||||
|
||||
dw = anv_batch_emit_dwords(batch, GENX(3DSTATE_SBE_SWIZ_length));
|
||||
if (!dw)
|
||||
return;
|
||||
GENX(3DSTATE_SBE_SWIZ_pack)(batch, dw, &swiz);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the final polygon mode for rasterization
|
||||
|
|
@ -729,10 +802,7 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline,
|
|||
const struct vk_render_pass_state *rp,
|
||||
enum intel_urb_deref_block_size urb_deref_block_size)
|
||||
{
|
||||
struct GENX(3DSTATE_SF) sf = {
|
||||
GENX(3DSTATE_SF_header),
|
||||
};
|
||||
|
||||
anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
|
||||
sf.ViewportTransformEnable = true;
|
||||
sf.StatisticsEnable = true;
|
||||
sf.VertexSubPixelPrecisionSelect = _8Bit;
|
||||
|
|
@ -759,11 +829,9 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline,
|
|||
sf.PointWidthSource = State;
|
||||
sf.PointWidth = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
struct GENX(3DSTATE_RASTER) raster = {
|
||||
GENX(3DSTATE_RASTER_header),
|
||||
};
|
||||
|
||||
anv_pipeline_emit(pipeline, partial.raster, GENX(3DSTATE_RASTER), raster) {
|
||||
/* For details on 3DSTATE_RASTER multisample state, see the BSpec table
|
||||
* "Multisample Modes State".
|
||||
*/
|
||||
|
|
@ -775,17 +843,14 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline,
|
|||
raster.ForceMultisampling = false;
|
||||
|
||||
raster.ScissorRectangleEnable = true;
|
||||
|
||||
GENX(3DSTATE_SF_pack)(NULL, pipeline->partial.sf, &sf);
|
||||
GENX(3DSTATE_RASTER_pack)(NULL, pipeline->partial.raster, &raster);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit_ms_state(struct anv_graphics_pipeline *pipeline,
|
||||
const struct vk_multisample_state *ms)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
|
||||
anv_pipeline_emit(pipeline, final.ms, GENX(3DSTATE_MULTISAMPLE), ms) {
|
||||
ms.NumberofMultisamples = __builtin_ffs(pipeline->rasterization_samples) - 1;
|
||||
|
||||
ms.PixelLocation = CENTER;
|
||||
|
|
@ -862,10 +927,7 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
|
|||
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
|
||||
(void) wm_prog_data;
|
||||
|
||||
struct GENX(3DSTATE_CLIP) clip = {
|
||||
GENX(3DSTATE_CLIP_header),
|
||||
};
|
||||
|
||||
anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
|
||||
clip.ClipEnable = true;
|
||||
clip.StatisticsEnable = true;
|
||||
clip.EarlyCullEnable = true;
|
||||
|
|
@ -884,8 +946,8 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
|
|||
|
||||
/* From the Vulkan 1.0.45 spec:
|
||||
*
|
||||
* "If the last active vertex processing stage shader entry point's
|
||||
* interface does not include a variable decorated with
|
||||
* "If the last active vertex processing stage shader entry
|
||||
* point's interface does not include a variable decorated with
|
||||
* ViewportIndex, then the first viewport is used."
|
||||
*/
|
||||
if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
|
||||
|
|
@ -919,14 +981,13 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
|
|||
|
||||
clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
|
||||
wm_prog_data->uses_nonperspective_interp_modes : 0;
|
||||
|
||||
GENX(3DSTATE_CLIP_pack)(NULL, pipeline->partial.clip, &clip);
|
||||
}
|
||||
|
||||
#if GFX_VERx10 >= 125
|
||||
if (anv_pipeline_is_mesh(pipeline)) {
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
|
||||
anv_batch_emit(batch, GENX(3DSTATE_CLIP_MESH), clip_mesh) {
|
||||
anv_pipeline_emit(pipeline, final.clip_mesh,
|
||||
GENX(3DSTATE_CLIP_MESH), clip_mesh) {
|
||||
clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
|
||||
clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
|
||||
clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
|
||||
|
|
@ -939,8 +1000,6 @@ static void
|
|||
emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
|
||||
const struct vk_rasterization_state *rs)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
const struct anv_device *device = pipeline->base.base.device;
|
||||
const struct brw_vue_prog_data *prog_data =
|
||||
anv_pipeline_get_last_vue_prog_data(pipeline);
|
||||
const struct brw_vue_map *vue_map = &prog_data->vue_map;
|
||||
|
|
@ -1034,16 +1093,8 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
|
|||
sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
|
||||
}
|
||||
|
||||
/* Wa_16011773973:
|
||||
* If SOL is enabled and SO_DECL state has to be programmed,
|
||||
* 1. Send 3D State SOL state with SOL disabled
|
||||
* 2. Send SO_DECL NP state
|
||||
* 3. Send 3D State SOL with SOL Enabled
|
||||
*/
|
||||
if (intel_device_info_is_dg2(device->info))
|
||||
anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
|
||||
|
||||
uint32_t *dw = anv_batch_emitn(batch, 3 + 2 * max_decls,
|
||||
uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
|
||||
3 + 2 * max_decls,
|
||||
GENX(3DSTATE_SO_DECL_LIST),
|
||||
.StreamtoBufferSelects0 = sbs[0],
|
||||
.StreamtoBufferSelects1 = sbs[1],
|
||||
|
|
@ -1063,17 +1114,9 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
|
|||
.Stream3Decl = so_decl[3][i],
|
||||
});
|
||||
}
|
||||
|
||||
#if GFX_VERx10 == 125
|
||||
/* Wa_14015946265: Send PC with CS stall after SO_DECL. */
|
||||
genX(batch_emit_pipe_control)(batch, device->info, ANV_PIPE_CS_STALL_BIT);
|
||||
#endif
|
||||
}
|
||||
|
||||
struct GENX(3DSTATE_STREAMOUT) so = {
|
||||
GENX(3DSTATE_STREAMOUT_header),
|
||||
};
|
||||
|
||||
anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
|
||||
if (xfb_info) {
|
||||
pipeline->uses_xfb = true;
|
||||
|
||||
|
|
@ -1102,8 +1145,7 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
|
|||
so.Stream3VertexReadOffset = urb_entry_read_offset;
|
||||
so.Stream3VertexReadLength = urb_entry_read_length - 1;
|
||||
}
|
||||
|
||||
GENX(3DSTATE_STREAMOUT_pack)(NULL, pipeline->partial.streamout_state, &so);
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
|
|
@ -1158,7 +1200,6 @@ get_scratch_surf(struct anv_pipeline *pipeline,
|
|||
static void
|
||||
emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
const struct intel_device_info *devinfo = pipeline->base.base.device->info;
|
||||
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
|
||||
const struct anv_shader_bin *vs_bin =
|
||||
|
|
@ -1166,7 +1207,7 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
|
|||
|
||||
assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_VS), vs) {
|
||||
anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
|
||||
vs.Enable = true;
|
||||
vs.StatisticsEnable = true;
|
||||
vs.KernelStartPointer = vs_bin->kernel.offset;
|
||||
|
|
@ -1237,11 +1278,9 @@ static void
|
|||
emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
|
||||
const struct vk_tessellation_state *ts)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
|
||||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
|
||||
anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
|
||||
anv_batch_emit(batch, GENX(3DSTATE_DS), ds);
|
||||
anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
|
||||
anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1254,10 +1293,7 @@ emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
|
|||
const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
|
||||
const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
|
||||
|
||||
struct GENX(3DSTATE_HS) hs = {
|
||||
GENX(3DSTATE_HS_header),
|
||||
};
|
||||
|
||||
anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs) {
|
||||
hs.Enable = true;
|
||||
hs.StatisticsEnable = true;
|
||||
hs.KernelStartPointer = tcs_bin->kernel.offset;
|
||||
|
|
@ -1306,14 +1342,9 @@ emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
|
|||
|
||||
hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
|
||||
hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
|
||||
|
||||
STATIC_ASSERT(ARRAY_SIZE(pipeline->final.hs) == GENX(3DSTATE_HS_length));
|
||||
GENX(3DSTATE_HS_pack)(&pipeline->base.base.batch, pipeline->final.hs, &hs);
|
||||
|
||||
struct GENX(3DSTATE_DS) ds = {
|
||||
GENX(3DSTATE_DS_header),
|
||||
};
|
||||
|
||||
anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds) {
|
||||
ds.Enable = true;
|
||||
ds.StatisticsEnable = true;
|
||||
ds.KernelStartPointer = tes_bin->kernel.offset;
|
||||
|
|
@ -1356,18 +1387,7 @@ emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
|
|||
ds.ScratchSpaceBasePointer =
|
||||
get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
|
||||
#endif
|
||||
|
||||
/* Wa_14019750404:
|
||||
* See genX(emit_ds)().
|
||||
* We need to both emit 3DSTATE_DS now, and before each 3DPRIMITIVE, so
|
||||
* we pack it to have it later, and memcpy into the current batch.
|
||||
*/
|
||||
STATIC_ASSERT(ARRAY_SIZE(pipeline->final.ds) == GENX(3DSTATE_DS_length));
|
||||
GENX(3DSTATE_DS_pack)(&pipeline->base.base.batch, pipeline->final.ds, &ds);
|
||||
|
||||
uint32_t *dw =
|
||||
anv_batch_emitn(batch, GENX(3DSTATE_DS_length), GENX(3DSTATE_DS));
|
||||
memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
|
||||
}
|
||||
}
|
||||
|
||||
static UNUSED bool
|
||||
|
|
@ -1391,10 +1411,7 @@ geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
|
|||
static void
|
||||
emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
|
||||
{
|
||||
struct GENX(3DSTATE_TE) te = {
|
||||
GENX(3DSTATE_TE_header),
|
||||
};
|
||||
|
||||
anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
|
||||
if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
|
||||
const struct brw_tes_prog_data *tes_prog_data =
|
||||
get_tes_prog_data(pipeline);
|
||||
|
|
@ -1430,24 +1447,23 @@ emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
|
|||
te.LocalBOPAccumulatorThreshold = 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
GENX(3DSTATE_TE_pack)(NULL, pipeline->partial.te, &te);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
|
||||
{
|
||||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
|
||||
anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
|
||||
return;
|
||||
}
|
||||
|
||||
const struct intel_device_info *devinfo = pipeline->base.base.device->info;
|
||||
const struct anv_shader_bin *gs_bin =
|
||||
pipeline->base.shaders[MESA_SHADER_GEOMETRY];
|
||||
|
||||
struct GENX(3DSTATE_GS) gs = {
|
||||
GENX(3DSTATE_GS_header),
|
||||
};
|
||||
|
||||
if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
|
||||
const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
|
||||
|
||||
anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs) {
|
||||
gs.Enable = true;
|
||||
gs.StatisticsEnable = true;
|
||||
gs.KernelStartPointer = gs_bin->kernel.offset;
|
||||
|
|
@ -1493,8 +1509,6 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
|
|||
get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
|
||||
#endif
|
||||
}
|
||||
|
||||
GENX(3DSTATE_GS_pack)(&pipeline->base.base.batch, pipeline->partial.gs, &gs);
|
||||
}
|
||||
|
||||
static bool
|
||||
|
|
@ -1514,9 +1528,7 @@ emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
|
|||
{
|
||||
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
|
||||
|
||||
struct GENX(3DSTATE_WM) wm = {
|
||||
GENX(3DSTATE_WM_header),
|
||||
};
|
||||
anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
|
||||
wm.StatisticsEnable = true;
|
||||
wm.LineEndCapAntialiasingRegionWidth = _05pixels;
|
||||
wm.LineAntialiasingRegionWidth = _10pixels;
|
||||
|
|
@ -1555,8 +1567,7 @@ emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
|
|||
wm_prog_data_barycentric_modes(wm_prog_data,
|
||||
pipeline->fs_msaa_flags);
|
||||
}
|
||||
|
||||
GENX(3DSTATE_WM_pack)(NULL, pipeline->partial.wm, &wm);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -1564,21 +1575,19 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
|
|||
const struct vk_multisample_state *ms,
|
||||
const struct vk_color_blend_state *cb)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
UNUSED const struct intel_device_info *devinfo =
|
||||
pipeline->base.base.device->info;
|
||||
const struct anv_shader_bin *fs_bin =
|
||||
pipeline->base.shaders[MESA_SHADER_FRAGMENT];
|
||||
|
||||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
|
||||
anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
|
||||
}
|
||||
anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps);
|
||||
return;
|
||||
}
|
||||
|
||||
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
|
||||
anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps) {
|
||||
intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data,
|
||||
ms != NULL ? ms->rasterization_samples : 1,
|
||||
pipeline->fs_msaa_flags);
|
||||
|
|
@ -1629,15 +1638,14 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
|
|||
const struct vk_rasterization_state *rs,
|
||||
const struct vk_render_pass_state *rp)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
|
||||
|
||||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
|
||||
anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), ps);
|
||||
anv_pipeline_emit(pipeline, final.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
|
||||
return;
|
||||
}
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), ps) {
|
||||
anv_pipeline_emit(pipeline, final.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
|
||||
ps.PixelShaderValid = true;
|
||||
ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
|
||||
ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
|
||||
|
|
@ -1689,8 +1697,8 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
|
|||
static void
|
||||
emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
|
||||
anv_pipeline_emit(pipeline, final.vf_statistics,
|
||||
GENX(3DSTATE_VF_STATISTICS), vfs) {
|
||||
vfs.StatisticsEnable = true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1733,10 +1741,9 @@ static void
|
|||
emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
|
||||
const struct vk_render_pass_state *rp)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
|
||||
if (anv_pipeline_is_mesh(pipeline)) {
|
||||
anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
|
||||
anv_pipeline_emit(pipeline, final.primitive_replication,
|
||||
GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1745,14 +1752,16 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
|
|||
|
||||
assert(replication_count >= 1);
|
||||
if (replication_count == 1) {
|
||||
anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
|
||||
anv_pipeline_emit(pipeline, final.primitive_replication,
|
||||
GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
|
||||
return;
|
||||
}
|
||||
|
||||
assert(replication_count == util_bitcount(rp->view_mask));
|
||||
assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
|
||||
anv_pipeline_emit(pipeline, final.primitive_replication,
|
||||
GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
|
||||
pr.ReplicaMask = (1 << replication_count) - 1;
|
||||
pr.ReplicationCount = replication_count - 1;
|
||||
|
||||
|
|
@ -1769,18 +1778,19 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
|
|||
static void
|
||||
emit_task_state(struct anv_graphics_pipeline *pipeline)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
assert(anv_pipeline_is_mesh(pipeline));
|
||||
|
||||
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
|
||||
anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
|
||||
anv_pipeline_emit(pipeline, final.task_control,
|
||||
GENX(3DSTATE_TASK_CONTROL), zero);
|
||||
return;
|
||||
}
|
||||
|
||||
const struct anv_shader_bin *task_bin =
|
||||
pipeline->base.shaders[MESA_SHADER_TASK];
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), tc) {
|
||||
anv_pipeline_emit(pipeline, final.task_control,
|
||||
GENX(3DSTATE_TASK_CONTROL), tc) {
|
||||
tc.TaskShaderEnable = true;
|
||||
tc.ScratchSpaceBuffer =
|
||||
get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin);
|
||||
|
|
@ -1792,7 +1802,8 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
|
|||
const struct brw_cs_dispatch_info task_dispatch =
|
||||
brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_TASK_SHADER), task) {
|
||||
anv_pipeline_emit(pipeline, final.task_shader,
|
||||
GENX(3DSTATE_TASK_SHADER), task) {
|
||||
task.KernelStartPointer = task_bin->kernel.offset;
|
||||
task.SIMDSize = task_dispatch.simd_size / 16;
|
||||
task.MessageSIMD = task.SIMDSize;
|
||||
|
|
@ -1818,7 +1829,8 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
|
|||
}
|
||||
|
||||
/* Recommended values from "Task and Mesh Distribution Programming". */
|
||||
anv_batch_emit(batch, GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
|
||||
anv_pipeline_emit(pipeline, final.task_redistrib,
|
||||
GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
|
||||
redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
|
||||
redistrib.SmallTaskThreshold = 1; /* 2^N */
|
||||
redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
|
||||
|
|
@ -1830,12 +1842,12 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
|
|||
static void
|
||||
emit_mesh_state(struct anv_graphics_pipeline *pipeline)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
assert(anv_pipeline_is_mesh(pipeline));
|
||||
|
||||
const struct anv_shader_bin *mesh_bin = pipeline->base.shaders[MESA_SHADER_MESH];
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mc) {
|
||||
anv_pipeline_emit(pipeline, final.mesh_control,
|
||||
GENX(3DSTATE_MESH_CONTROL), mc) {
|
||||
mc.MeshShaderEnable = true;
|
||||
mc.ScratchSpaceBuffer =
|
||||
get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin);
|
||||
|
|
@ -1864,7 +1876,8 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
|
|||
unreachable("invalid index format");
|
||||
}
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_MESH_SHADER), mesh) {
|
||||
anv_pipeline_emit(pipeline, final.mesh_shader,
|
||||
GENX(3DSTATE_MESH_SHADER), mesh) {
|
||||
mesh.KernelStartPointer = mesh_bin->kernel.offset;
|
||||
mesh.SIMDSize = mesh_dispatch.simd_size / 16;
|
||||
mesh.MessageSIMD = mesh.SIMDSize;
|
||||
|
|
@ -1897,7 +1910,8 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
|
|||
}
|
||||
|
||||
/* Recommended values from "Task and Mesh Distribution Programming". */
|
||||
anv_batch_emit(batch, GENX(3DSTATE_MESH_DISTRIB), distrib) {
|
||||
anv_pipeline_emit(pipeline, final.mesh_distrib,
|
||||
GENX(3DSTATE_MESH_DISTRIB), distrib) {
|
||||
distrib.DistributionMode = MESH_RR_FREE;
|
||||
distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
|
||||
distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
|
||||
|
|
@ -1909,7 +1923,6 @@ void
|
|||
genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
|
||||
const struct vk_graphics_pipeline_state *state)
|
||||
{
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
enum intel_urb_deref_block_size urb_deref_block_size;
|
||||
emit_urb_setup(pipeline, &urb_deref_block_size);
|
||||
|
||||
|
|
@ -1940,10 +1953,10 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
|
|||
const struct anv_device *device = pipeline->base.base.device;
|
||||
/* Disable Mesh. */
|
||||
if (device->vk.enabled_extensions.EXT_mesh_shader) {
|
||||
struct anv_batch *batch = &pipeline->base.base.batch;
|
||||
|
||||
anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
|
||||
anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
|
||||
anv_pipeline_emit(pipeline, final.mesh_control,
|
||||
GENX(3DSTATE_MESH_CONTROL), zero);
|
||||
anv_pipeline_emit(pipeline, final.task_control,
|
||||
GENX(3DSTATE_TASK_CONTROL), zero);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
|
|
@ -1952,7 +1965,7 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
|
|||
/* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable
|
||||
* and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1.
|
||||
*/
|
||||
anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) {}
|
||||
anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);
|
||||
|
||||
#if GFX_VERx10 >= 125
|
||||
emit_task_state(pipeline);
|