anv: split pipeline programming into instructions

The goal of this change is to move away from a single batch buffer
containing all kinds of pipeline instructions to a list of instructions
we can emit separately.

We will later implement pipeline diffing and finer state tracking that
will allow fewer instructions to be emitted.

This changes the following things:

   * instead of having a batch & partially packed instructions, move
     everything into the batch

   * add a set of pointers in the batch that allow us to point to each
     instruction (almost... we group some, like the URB instructions,
     etc...).

At pipeline emission time, we just go through all of those pointers
and emit the instruction into the batch. No additional packing is
involved.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24536>
This commit is contained in:
Lionel Landwerlin 2023-08-01 12:20:19 +03:00 committed by Marge Bot
parent 758540d741
commit 44656f98d5
6 changed files with 721 additions and 604 deletions

View file

@ -136,7 +136,7 @@ anv_reloc_list_clear(struct anv_reloc_list *list)
memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
}
static VkResult
VkResult
anv_reloc_list_append(struct anv_reloc_list *list,
struct anv_reloc_list *other)
{

View file

@ -96,8 +96,9 @@ void genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer);
void genX(emit_vertex_input)(struct anv_batch *batch,
uint32_t *vertex_element_dws,
const struct anv_graphics_pipeline *pipeline,
const struct vk_vertex_input_state *vi);
struct anv_graphics_pipeline *pipeline,
const struct vk_vertex_input_state *vi,
bool emit_in_pipeline);
enum anv_pipe_bits
genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
@ -125,7 +126,7 @@ void genX(emit_l3_config)(struct anv_batch *batch,
void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
const struct intel_l3_config *cfg);
void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
bool enable);

View file

@ -1464,6 +1464,9 @@ anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo)
return list->uses_relocs ? anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS;
}
VkResult anv_reloc_list_append(struct anv_reloc_list *list,
struct anv_reloc_list *other);
struct anv_batch_bo {
/* Link in the anv_cmd_buffer.owned_batch_bos list */
struct list_head link;
@ -1603,14 +1606,16 @@ _anv_combine_address(struct anv_batch *batch, void *location,
__dst; \
})
#define anv_batch_emit_merge(batch, cmd, prepacked, name) \
#define anv_batch_emit_merge(batch, cmd, pipeline, state, name) \
for (struct cmd name = { 0 }, \
*_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd)); \
__builtin_expect(_dst != NULL, 1); \
({ uint32_t _partial[__anv_cmd_length(cmd)]; \
__anv_cmd_pack(cmd)(batch, _partial, &name); \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) \
((uint32_t *)_dst)[i] = _partial[i] | (prepacked)[i]; \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
((uint32_t *)_dst)[i] = _partial[i] | \
(pipeline)->batch_data[(pipeline)->state.offset + i]; \
} \
VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
_dst = NULL; \
}))
@ -3515,6 +3520,12 @@ struct anv_graphics_lib_pipeline {
bool retain_shaders;
};
/* Locates one pre-packed instruction (or group of instructions) inside
 * anv_graphics_pipeline::batch_data. A len of 0 means the state was not
 * packed for this pipeline, in which case nothing is emitted (see
 * anv_batch_emit_pipeline_state).
 */
struct anv_gfx_state_ptr {
/* Both in dwords */
uint16_t offset;
uint16_t len;
};
/* The final graphics pipeline object has all the graphics state ready to be
* programmed into HW packets (dynamic_state field) or fully baked in its
* batch.
@ -3564,7 +3575,7 @@ struct anv_graphics_pipeline {
* this array only holds the svgs_count elements.
*/
uint32_t vertex_input_elems;
uint32_t vertex_input_data[96];
uint32_t vertex_input_data[2 * 31 /* MAX_VES + 2 internal */];
enum brw_wm_msaa_flags fs_msaa_flags;
@ -3575,25 +3586,75 @@ struct anv_graphics_pipeline {
/* Fully backed instructions, ready to be emitted in the anv_cmd_buffer */
struct {
uint32_t hs[9];
uint32_t ds[11];
struct anv_gfx_state_ptr urb;
struct anv_gfx_state_ptr vf_statistics;
struct anv_gfx_state_ptr vf_sgvs;
struct anv_gfx_state_ptr vf_sgvs_2;
struct anv_gfx_state_ptr vf_sgvs_instancing;
struct anv_gfx_state_ptr vf_instancing;
struct anv_gfx_state_ptr primitive_replication;
struct anv_gfx_state_ptr sbe;
struct anv_gfx_state_ptr sbe_swiz;
struct anv_gfx_state_ptr so_decl_list;
struct anv_gfx_state_ptr ms;
struct anv_gfx_state_ptr vs;
struct anv_gfx_state_ptr hs;
struct anv_gfx_state_ptr ds;
struct anv_gfx_state_ptr ps;
struct anv_gfx_state_ptr ps_extra;
struct anv_gfx_state_ptr task_control;
struct anv_gfx_state_ptr task_shader;
struct anv_gfx_state_ptr task_redistrib;
struct anv_gfx_state_ptr clip_mesh;
struct anv_gfx_state_ptr mesh_control;
struct anv_gfx_state_ptr mesh_shader;
struct anv_gfx_state_ptr mesh_distrib;
struct anv_gfx_state_ptr sbe_mesh;
} final;
/* Pre packed CS instructions & structures that need to be merged later
* with dynamic state.
*/
struct {
uint32_t clip[4];
uint32_t sf[4];
uint32_t raster[5];
uint32_t wm[2];
uint32_t streamout_state[5];
uint32_t gs[10];
uint32_t te[4];
uint32_t vfg[4];
struct anv_gfx_state_ptr clip;
struct anv_gfx_state_ptr sf;
struct anv_gfx_state_ptr raster;
struct anv_gfx_state_ptr wm;
struct anv_gfx_state_ptr so;
struct anv_gfx_state_ptr gs;
struct anv_gfx_state_ptr te;
struct anv_gfx_state_ptr vfg;
} partial;
};
/* Emit into `batch` the OR of the caller-packed dwords0 with the pipeline's
 * pre-packed dwords for `state` (a struct anv_gfx_state_ptr field naming a
 * span inside pipeline->batch_data). The caller must have packed dwords0 to
 * the same length as the pre-packed state (asserted below). Breaks out
 * silently when batch space allocation fails (anv_batch_emit_dwords
 * returning NULL presumably marks the batch as errored — verify).
 */
#define anv_batch_merge_pipeline_state(batch, dwords0, pipeline, state) \
do { \
uint32_t *dw; \
\
assert(ARRAY_SIZE(dwords0) == (pipeline)->state.len); \
dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0)); \
if (!dw) \
break; \
for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++) \
dw[i] = (dwords0)[i] | \
(pipeline)->batch_data[(pipeline)->state.offset + i]; \
VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4)); \
} while (0)
/* Copy the fully pre-packed dwords for `state` (a struct anv_gfx_state_ptr
 * field naming a span inside pipeline->batch_data) straight into `batch`.
 * No-op when the state was not packed for this pipeline (len == 0), and
 * breaks out silently when batch space allocation fails.
 */
#define anv_batch_emit_pipeline_state(batch, pipeline, state) \
do { \
if ((pipeline)->state.len == 0) \
break; \
uint32_t *dw; \
dw = anv_batch_emit_dwords((batch), (pipeline)->state.len); \
if (!dw) \
break; \
memcpy(dw, &(pipeline)->batch_data[(pipeline)->state.offset], \
4 * (pipeline)->state.len); \
} while (0)
struct anv_compute_pipeline {
struct anv_pipeline base;

View file

@ -2994,10 +2994,7 @@ genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
return;
uint32_t *dw =
anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_HS_length),
GENX(3DSTATE_HS));
memcpy(dw, &pipeline->final.hs, sizeof(pipeline->final.hs));
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
}
ALWAYS_INLINE static void
@ -3022,10 +3019,7 @@ genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
return;
uint32_t *dw =
anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_DS_length),
GENX(3DSTATE_DS));
memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
#endif
}
@ -3224,13 +3218,22 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
}
}
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.base.batch);
if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
/* If the pipeline changed, we may need to re-allocate push constant
* space in the URB.
*/
/* If the pipeline changed, we may need to re-allocate push constant space
* in the URB.
*/
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
/* Also add the relocations (scratch buffers) */
VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
pipeline->base.base.batch.relocs);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
}
/* Render targets live in the same binding table as fragment descriptors */
@ -3274,8 +3277,9 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
}
if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
/* When we're done, there is no more dirty gfx state. */
vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
cmd_buffer->state.gfx.dirty = 0;
}
#include "genX_cmd_draw_generated_indirect.h"

View file

@ -215,15 +215,12 @@ genX(cmd_emit_te)(struct anv_cmd_buffer *cmd_buffer)
if (!tes_prog_data ||
!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
uint32_t *dw =
anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_TE_length),
GENX(3DSTATE_TE));
memcpy(dw, &pipeline->partial.te, sizeof(pipeline->partial.te));
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.te);
return;
}
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
pipeline->partial.te, te) {
pipeline, partial.te, te) {
if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
te.OutputTopology = tes_prog_data->output_topology;
} else {
@ -244,14 +241,14 @@ genX(emit_gs)(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), gs);
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.gs);
return;
}
const struct vk_dynamic_graphics_state *dyn =
&cmd_buffer->vk.dynamic_graphics_state;
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS),
pipeline->partial.gs, gs) {
pipeline, partial.gs, gs) {
switch (dyn->rs.provoking_vertex) {
case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
gs.ReorderMode = LEADING;
@ -463,7 +460,7 @@ cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
return;
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
pipeline->partial.clip, clip) {
pipeline, partial.clip, clip) {
/* Take dynamic primitive topology in to account with
* 3DSTATE_CLIP::ViewportXYClipTestEnable
*/
@ -532,7 +529,7 @@ cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
genX(streamout_prologue)(cmd_buffer);
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
pipeline->partial.streamout_state, so) {
pipeline, partial.so, so) {
so.RenderingDisable = dyn->rs.rasterizer_discard_enable;
so.RenderStreamSelect = dyn->rs.rasterization_stream;
#if INTEL_NEEDS_WA_18022508906
@ -802,13 +799,58 @@ cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
}
}
/* Copy the fully pre-packed dwords for `state` from pipeline->batch_data
 * into `batch`; no-op when the state was not packed (len == 0), silent
 * break on batch allocation failure.
 *
 * NOTE(review): this duplicates anv_batch_emit_pipeline_state (declared in
 * the ANV private header in this same change, modulo the void* vs uint32_t*
 * local) — consider dropping this local copy in favor of the shared macro.
 */
#define cmd_buffer_emit_pipeline_state(batch, pipeline, state) \
do { \
if ((pipeline)->state.len == 0) \
break; \
void *dw = anv_batch_emit_dwords(batch, (pipeline)->state.len); \
if (!dw) \
break; \
memcpy(dw, \
&(pipeline)->batch_data[(pipeline)->state.offset], \
4 * (pipeline)->state.len); \
} while (0)
void
genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
struct vk_dynamic_graphics_state *dyn =
&cmd_buffer->vk.dynamic_graphics_state;
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
struct anv_batch *batch = &cmd_buffer->batch;
cmd_buffer_emit_pipeline_state(batch, pipeline, final.urb);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.ms);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.primitive_replication);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_instancing);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_instancing);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_2);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vs);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.hs);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.ds);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_statistics);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.so_decl_list);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_swiz);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps_extra);
if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_control);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_shader);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_redistrib);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.clip_mesh);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_control);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_shader);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_distrib);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_mesh);
}
}
cmd_buffer_emit_clip(cmd_buffer);
if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
@ -865,7 +907,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
} else {
/* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
pipeline, dyn->vi);
pipeline, dyn->vi, false /* emit_in_pipeline */);
/* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
memcpy(p + 1 + 2 * pipeline->vs_input_elements,
pipeline->vertex_input_data,
@ -896,7 +938,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
pipeline->partial.sf, sf) {
pipeline, partial.sf, sf) {
ANV_SETUP_PROVOKING_VERTEX(sf, dyn->rs.provoking_vertex);
sf.LineWidth = dyn->rs.line.width;
@ -978,7 +1020,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
vk_rasterization_state_depth_clip_enable(&dyn->rs);
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
pipeline->partial.raster, raster) {
pipeline, partial.raster, raster) {
raster.APIMode = api_mode;
raster.DXMultisampleRasterizationEnable = msaa_raster_enable;
raster.AntialiasingEnable = aa_enable;
@ -1120,7 +1162,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
pipeline->partial.vfg, vfg) {
pipeline, partial.vfg, vfg) {
vfg.ListCutIndexEnable = dyn->ia.primitive_restart_enable;
}
}
@ -1141,7 +1183,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
* threads.
*/
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
pipeline->partial.wm, wm) {
pipeline, partial.wm, wm) {
wm.ForceThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
(pipeline->force_fragment_thread_dispatch ||
anv_cmd_buffer_all_color_write_masked(cmd_buffer)) ?
@ -1365,8 +1407,4 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
ccp.ColorCalcStatePointerValid = true;
}
}
/* When we're done, there is no more dirty gfx state. */
vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
cmd_buffer->state.gfx.dirty = 0;
}

File diff suppressed because it is too large Load diff