anv: split pipeline programming into instructions

The goal of this change is to move away from a single batch buffer
containing all kinds of pipeline instructions to a list of instructions
we can emit separately.

We will later implement pipeline diffing and finer state tracking that
will allow fewer instructions to be emitted.

This changes the following things:

   * instead of having a batch & partially packed instructions, move
     everything into the batch

   * add a set of pointers in the batch that allow us to point to each
     instruction (almost... we group some, like the URB instructions,
     etc...).

At pipeline emission time, we just go through all of those pointers
and emit the instruction into the batch. No additional packing is
involved.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24536>
This commit is contained in:
Lionel Landwerlin 2023-08-01 12:20:19 +03:00 committed by Marge Bot
parent 758540d741
commit 44656f98d5
6 changed files with 721 additions and 604 deletions

View file

@ -136,7 +136,7 @@ anv_reloc_list_clear(struct anv_reloc_list *list)
memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
}
static VkResult
VkResult
anv_reloc_list_append(struct anv_reloc_list *list,
struct anv_reloc_list *other)
{

View file

@ -96,8 +96,9 @@ void genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer);
void genX(emit_vertex_input)(struct anv_batch *batch,
uint32_t *vertex_element_dws,
const struct anv_graphics_pipeline *pipeline,
const struct vk_vertex_input_state *vi);
struct anv_graphics_pipeline *pipeline,
const struct vk_vertex_input_state *vi,
bool emit_in_pipeline);
enum anv_pipe_bits
genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
@ -125,7 +126,7 @@ void genX(emit_l3_config)(struct anv_batch *batch,
void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
const struct intel_l3_config *cfg);
void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
bool enable);

View file

@ -1464,6 +1464,9 @@ anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo)
return list->uses_relocs ? anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS;
}
VkResult anv_reloc_list_append(struct anv_reloc_list *list,
struct anv_reloc_list *other);
struct anv_batch_bo {
/* Link in the anv_cmd_buffer.owned_batch_bos list */
struct list_head link;
@ -1603,14 +1606,16 @@ _anv_combine_address(struct anv_batch *batch, void *location,
__dst; \
})
#define anv_batch_emit_merge(batch, cmd, prepacked, name) \
#define anv_batch_emit_merge(batch, cmd, pipeline, state, name) \
for (struct cmd name = { 0 }, \
*_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd)); \
__builtin_expect(_dst != NULL, 1); \
({ uint32_t _partial[__anv_cmd_length(cmd)]; \
__anv_cmd_pack(cmd)(batch, _partial, &name); \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) \
((uint32_t *)_dst)[i] = _partial[i] | (prepacked)[i]; \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
((uint32_t *)_dst)[i] = _partial[i] | \
(pipeline)->batch_data[(pipeline)->state.offset + i]; \
} \
VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
_dst = NULL; \
}))
@ -3515,6 +3520,12 @@ struct anv_graphics_lib_pipeline {
bool retain_shaders;
};
/* Locates one pre-packed instruction (or group of instructions) inside
 * anv_graphics_pipeline::batch_data. A len of 0 means the state was not
 * packed for this pipeline, in which case nothing is emitted (see
 * anv_batch_emit_pipeline_state).
 */
struct anv_gfx_state_ptr {
/* Both in dwords */
uint16_t offset;
uint16_t len;
};
/* The final graphics pipeline object has all the graphics state ready to be
* programmed into HW packets (dynamic_state field) or fully baked in its
* batch.
@ -3564,7 +3575,7 @@ struct anv_graphics_pipeline {
* this array only holds the svgs_count elements.
*/
uint32_t vertex_input_elems;
uint32_t vertex_input_data[96];
uint32_t vertex_input_data[2 * 31 /* MAX_VES + 2 internal */];
enum brw_wm_msaa_flags fs_msaa_flags;
@ -3575,25 +3586,75 @@ struct anv_graphics_pipeline {
/* Fully backed instructions, ready to be emitted in the anv_cmd_buffer */
struct {
uint32_t hs[9];
uint32_t ds[11];
struct anv_gfx_state_ptr urb;
struct anv_gfx_state_ptr vf_statistics;
struct anv_gfx_state_ptr vf_sgvs;
struct anv_gfx_state_ptr vf_sgvs_2;
struct anv_gfx_state_ptr vf_sgvs_instancing;
struct anv_gfx_state_ptr vf_instancing;
struct anv_gfx_state_ptr primitive_replication;
struct anv_gfx_state_ptr sbe;
struct anv_gfx_state_ptr sbe_swiz;
struct anv_gfx_state_ptr so_decl_list;
struct anv_gfx_state_ptr ms;
struct anv_gfx_state_ptr vs;
struct anv_gfx_state_ptr hs;
struct anv_gfx_state_ptr ds;
struct anv_gfx_state_ptr ps;
struct anv_gfx_state_ptr ps_extra;
struct anv_gfx_state_ptr task_control;
struct anv_gfx_state_ptr task_shader;
struct anv_gfx_state_ptr task_redistrib;
struct anv_gfx_state_ptr clip_mesh;
struct anv_gfx_state_ptr mesh_control;
struct anv_gfx_state_ptr mesh_shader;
struct anv_gfx_state_ptr mesh_distrib;
struct anv_gfx_state_ptr sbe_mesh;
} final;
/* Pre packed CS instructions & structures that need to be merged later
* with dynamic state.
*/
struct {
uint32_t clip[4];
uint32_t sf[4];
uint32_t raster[5];
uint32_t wm[2];
uint32_t streamout_state[5];
uint32_t gs[10];
uint32_t te[4];
uint32_t vfg[4];
struct anv_gfx_state_ptr clip;
struct anv_gfx_state_ptr sf;
struct anv_gfx_state_ptr raster;
struct anv_gfx_state_ptr wm;
struct anv_gfx_state_ptr so;
struct anv_gfx_state_ptr gs;
struct anv_gfx_state_ptr te;
struct anv_gfx_state_ptr vfg;
} partial;
};
/* Emit into `batch` the OR of the caller-packed dwords0 with the pipeline's
 * pre-packed dwords for `state` (a struct anv_gfx_state_ptr field naming a
 * span inside pipeline->batch_data). The caller must have packed dwords0 to
 * the same length as the pre-packed state (asserted below). Breaks out
 * silently when batch space allocation fails (anv_batch_emit_dwords
 * returning NULL presumably marks the batch as errored — verify).
 */
#define anv_batch_merge_pipeline_state(batch, dwords0, pipeline, state) \
do { \
uint32_t *dw; \
\
assert(ARRAY_SIZE(dwords0) == (pipeline)->state.len); \
dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0)); \
if (!dw) \
break; \
for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++) \
dw[i] = (dwords0)[i] | \
(pipeline)->batch_data[(pipeline)->state.offset + i]; \
VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4)); \
} while (0)
/* Copy the fully pre-packed dwords for `state` (a struct anv_gfx_state_ptr
 * field naming a span inside pipeline->batch_data) straight into `batch`.
 * No-op when the state was not packed for this pipeline (len == 0), and
 * breaks out silently when batch space allocation fails.
 */
#define anv_batch_emit_pipeline_state(batch, pipeline, state) \
do { \
if ((pipeline)->state.len == 0) \
break; \
uint32_t *dw; \
dw = anv_batch_emit_dwords((batch), (pipeline)->state.len); \
if (!dw) \
break; \
memcpy(dw, &(pipeline)->batch_data[(pipeline)->state.offset], \
4 * (pipeline)->state.len); \
} while (0)
struct anv_compute_pipeline {
struct anv_pipeline base;

View file

@ -2994,10 +2994,7 @@ genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
return;
uint32_t *dw =
anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_HS_length),
GENX(3DSTATE_HS));
memcpy(dw, &pipeline->final.hs, sizeof(pipeline->final.hs));
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
}
ALWAYS_INLINE static void
@ -3022,10 +3019,7 @@ genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
return;
uint32_t *dw =
anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_DS_length),
GENX(3DSTATE_DS));
memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
#endif
}
@ -3224,13 +3218,22 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
}
}
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.base.batch);
if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
/* If the pipeline changed, we may need to re-allocate push constant
* space in the URB.
*/
/* If the pipeline changed, we may need to re-allocate push constant space
* in the URB.
*/
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
/* Also add the relocations (scratch buffers) */
VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
pipeline->base.base.batch.relocs);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
}
/* Render targets live in the same binding table as fragment descriptors */
@ -3274,8 +3277,9 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
}
if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
/* When we're done, there is no more dirty gfx state. */
vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
cmd_buffer->state.gfx.dirty = 0;
}
#include "genX_cmd_draw_generated_indirect.h"

View file

@ -215,15 +215,12 @@ genX(cmd_emit_te)(struct anv_cmd_buffer *cmd_buffer)
if (!tes_prog_data ||
!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
uint32_t *dw =
anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_TE_length),
GENX(3DSTATE_TE));
memcpy(dw, &pipeline->partial.te, sizeof(pipeline->partial.te));
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.te);
return;
}
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
pipeline->partial.te, te) {
pipeline, partial.te, te) {
if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
te.OutputTopology = tes_prog_data->output_topology;
} else {
@ -244,14 +241,14 @@ genX(emit_gs)(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), gs);
anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.gs);
return;
}
const struct vk_dynamic_graphics_state *dyn =
&cmd_buffer->vk.dynamic_graphics_state;
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS),
pipeline->partial.gs, gs) {
pipeline, partial.gs, gs) {
switch (dyn->rs.provoking_vertex) {
case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
gs.ReorderMode = LEADING;
@ -463,7 +460,7 @@ cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
return;
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
pipeline->partial.clip, clip) {
pipeline, partial.clip, clip) {
/* Take dynamic primitive topology in to account with
* 3DSTATE_CLIP::ViewportXYClipTestEnable
*/
@ -532,7 +529,7 @@ cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
genX(streamout_prologue)(cmd_buffer);
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
pipeline->partial.streamout_state, so) {
pipeline, partial.so, so) {
so.RenderingDisable = dyn->rs.rasterizer_discard_enable;
so.RenderStreamSelect = dyn->rs.rasterization_stream;
#if INTEL_NEEDS_WA_18022508906
@ -802,13 +799,58 @@ cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
}
}
/* Copy the fully pre-packed dwords for `state` from pipeline->batch_data
 * into `batch`; no-op when the state was not packed (len == 0), silent
 * break on batch allocation failure.
 *
 * NOTE(review): this duplicates anv_batch_emit_pipeline_state (declared in
 * the ANV private header in this same change, modulo the void* vs uint32_t*
 * local) — consider dropping this local copy in favor of the shared macro.
 */
#define cmd_buffer_emit_pipeline_state(batch, pipeline, state) \
do { \
if ((pipeline)->state.len == 0) \
break; \
void *dw = anv_batch_emit_dwords(batch, (pipeline)->state.len); \
if (!dw) \
break; \
memcpy(dw, \
&(pipeline)->batch_data[(pipeline)->state.offset], \
4 * (pipeline)->state.len); \
} while (0)
void
genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
struct vk_dynamic_graphics_state *dyn =
&cmd_buffer->vk.dynamic_graphics_state;
if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
struct anv_batch *batch = &cmd_buffer->batch;
cmd_buffer_emit_pipeline_state(batch, pipeline, final.urb);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.ms);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.primitive_replication);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_instancing);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_instancing);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_2);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vs);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.hs);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.ds);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_statistics);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.so_decl_list);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_swiz);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps_extra);
if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_control);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_shader);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_redistrib);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.clip_mesh);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_control);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_shader);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_distrib);
cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_mesh);
}
}
cmd_buffer_emit_clip(cmd_buffer);
if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
@ -865,7 +907,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
} else {
/* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
pipeline, dyn->vi);
pipeline, dyn->vi, false /* emit_in_pipeline */);
/* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
memcpy(p + 1 + 2 * pipeline->vs_input_elements,
pipeline->vertex_input_data,
@ -896,7 +938,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
pipeline->partial.sf, sf) {
pipeline, partial.sf, sf) {
ANV_SETUP_PROVOKING_VERTEX(sf, dyn->rs.provoking_vertex);
sf.LineWidth = dyn->rs.line.width;
@ -978,7 +1020,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
vk_rasterization_state_depth_clip_enable(&dyn->rs);
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
pipeline->partial.raster, raster) {
pipeline, partial.raster, raster) {
raster.APIMode = api_mode;
raster.DXMultisampleRasterizationEnable = msaa_raster_enable;
raster.AntialiasingEnable = aa_enable;
@ -1120,7 +1162,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
pipeline->partial.vfg, vfg) {
pipeline, partial.vfg, vfg) {
vfg.ListCutIndexEnable = dyn->ia.primitive_restart_enable;
}
}
@ -1141,7 +1183,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
* threads.
*/
anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
pipeline->partial.wm, wm) {
pipeline, partial.wm, wm) {
wm.ForceThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
(pipeline->force_fragment_thread_dispatch ||
anv_cmd_buffer_all_color_write_masked(cmd_buffer)) ?
@ -1365,8 +1407,4 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
ccp.ColorCalcStatePointerValid = true;
}
}
/* When we're done, there is no more dirty gfx state. */
vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
cmd_buffer->state.gfx.dirty = 0;
}

File diff suppressed because it is too large Load diff