diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 2f8039e0bc0..73f7b29cf3f 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -136,7 +136,7 @@ anv_reloc_list_clear(struct anv_reloc_list *list)
       memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
 }
 
-static VkResult
+VkResult
 anv_reloc_list_append(struct anv_reloc_list *list,
                       struct anv_reloc_list *other)
 {
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 2779a8500a6..e7ba1205022 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -96,8 +96,9 @@ void genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(emit_vertex_input)(struct anv_batch *batch,
                              uint32_t *vertex_element_dws,
-                             const struct anv_graphics_pipeline *pipeline,
-                             const struct vk_vertex_input_state *vi);
+                             struct anv_graphics_pipeline *pipeline,
+                             const struct vk_vertex_input_state *vi,
+                             bool emit_in_pipeline);
 
 enum anv_pipe_bits
 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
@@ -125,7 +126,7 @@ void genX(emit_l3_config)(struct anv_batch *batch,
 void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
                                 const struct intel_l3_config *cfg);
 
-void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
+void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
                                      bool enable);
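Context for the anv_private.h changes that follow: instead of keeping pre-packed instructions in fixed-size per-packet arrays (`uint32_t hs[9]`, `clip[4]`, ...), the pipeline now records an `(offset, len)` pair — `struct anv_gfx_state_ptr` — pointing into a single `batch_data` dword buffer, and `anv_batch_emit_merge()` ORs those recorded dwords with freshly packed dynamic state. Below is a minimal standalone sketch of that OR-merge idea; all names here are illustrative stand-ins, not the driver's actual types:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the driver's types. */
struct gfx_state_ptr {
   uint16_t offset; /* in dwords, into batch_data */
   uint16_t len;    /* in dwords */
};

struct pipeline {
   uint32_t batch_data[64];  /* packed at pipeline-creation time */
   struct gfx_state_ptr te;  /* where a 3DSTATE_TE-like span lives */
};

/* OR the pre-packed static fields with dwords packed from dynamic state.
 * This only works because the static and dynamic fields never overlap. */
static void
emit_merged(uint32_t *dst, const uint32_t *dyn_dw, const struct pipeline *p,
            struct gfx_state_ptr s)
{
   for (uint32_t i = 0; i < s.len; i++)
      dst[i] = dyn_dw[i] | p->batch_data[s.offset + i];
}

int main(void)
{
   struct pipeline p = { .te = { .offset = 4, .len = 4 } };
   p.batch_data[5] = 0x00000003;                 /* static fields */
   uint32_t dyn_dw[4] = { 0, 0x00000010, 0, 0 }; /* dynamic fields */
   uint32_t out[4];
   emit_merged(out, dyn_dw, &p, p.te);
   assert(out[1] == 0x13);
   printf("dword1 = 0x%02x\n", out[1]);
   return 0;
}
```

The 16-bit offset/len pair keeps the per-packet bookkeeping at 4 bytes regardless of packet size, which is what lets the `final`/`partial` structs below grow to two dozen entries cheaply.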
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 2122165dd3b..777c3983c85 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1464,6 +1464,9 @@ anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo)
    return list->uses_relocs ? anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS;
 }
 
+VkResult anv_reloc_list_append(struct anv_reloc_list *list,
+                               struct anv_reloc_list *other);
+
 struct anv_batch_bo {
    /* Link in the anv_cmd_buffer.owned_batch_bos list */
    struct list_head link;
@@ -1603,14 +1606,16 @@ _anv_combine_address(struct anv_batch *batch, void *location,
            __dst;                                                      \
         })
 
-#define anv_batch_emit_merge(batch, cmd, prepacked, name)              \
+#define anv_batch_emit_merge(batch, cmd, pipeline, state, name)        \
    for (struct cmd name = { 0 },                                       \
         *_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd));   \
         __builtin_expect(_dst != NULL, 1);                             \
         ({ uint32_t _partial[__anv_cmd_length(cmd)];                   \
            __anv_cmd_pack(cmd)(batch, _partial, &name);                \
-           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++)        \
-              ((uint32_t *)_dst)[i] = _partial[i] | (prepacked)[i];    \
+           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {      \
+              ((uint32_t *)_dst)[i] = _partial[i] |                    \
+                 (pipeline)->batch_data[(pipeline)->state.offset + i]; \
+           }                                                           \
            VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
            _dst = NULL;                                                \
         }))
@@ -3515,6 +3520,12 @@ struct anv_graphics_lib_pipeline {
    bool retain_shaders;
 };
 
+struct anv_gfx_state_ptr {
+   /* Both in dwords */
+   uint16_t offset;
+   uint16_t len;
+};
+
 /* The final graphics pipeline object has all the graphics state ready to be
  * programmed into HW packets (dynamic_state field) or fully baked in its
  * batch.
@@ -3564,7 +3575,7 @@ struct anv_graphics_pipeline {
     * this array only holds the svgs_count elements.
     */
    uint32_t vertex_input_elems;
-   uint32_t vertex_input_data[96];
+   uint32_t vertex_input_data[2 * 31 /* MAX_VES + 2 internal */];
 
    enum brw_wm_msaa_flags fs_msaa_flags;
 
@@ -3575,25 +3586,75 @@ struct anv_graphics_pipeline {
 
    /* Fully baked instructions, ready to be emitted in the anv_cmd_buffer */
    struct {
-      uint32_t hs[9];
-      uint32_t ds[11];
+      struct anv_gfx_state_ptr urb;
+      struct anv_gfx_state_ptr vf_statistics;
+      struct anv_gfx_state_ptr vf_sgvs;
+      struct anv_gfx_state_ptr vf_sgvs_2;
+      struct anv_gfx_state_ptr vf_sgvs_instancing;
+      struct anv_gfx_state_ptr vf_instancing;
+      struct anv_gfx_state_ptr primitive_replication;
+      struct anv_gfx_state_ptr sbe;
+      struct anv_gfx_state_ptr sbe_swiz;
+      struct anv_gfx_state_ptr so_decl_list;
+      struct anv_gfx_state_ptr ms;
+      struct anv_gfx_state_ptr vs;
+      struct anv_gfx_state_ptr hs;
+      struct anv_gfx_state_ptr ds;
+      struct anv_gfx_state_ptr ps;
+      struct anv_gfx_state_ptr ps_extra;
+
+      struct anv_gfx_state_ptr task_control;
+      struct anv_gfx_state_ptr task_shader;
+      struct anv_gfx_state_ptr task_redistrib;
+      struct anv_gfx_state_ptr clip_mesh;
+      struct anv_gfx_state_ptr mesh_control;
+      struct anv_gfx_state_ptr mesh_shader;
+      struct anv_gfx_state_ptr mesh_distrib;
+      struct anv_gfx_state_ptr sbe_mesh;
    } final;
 
    /* Pre packed CS instructions & structures that need to be merged later
    * with dynamic state.
    */
    struct {
-      uint32_t clip[4];
-      uint32_t sf[4];
-      uint32_t raster[5];
-      uint32_t wm[2];
-      uint32_t streamout_state[5];
-      uint32_t gs[10];
-      uint32_t te[4];
-      uint32_t vfg[4];
+      struct anv_gfx_state_ptr clip;
+      struct anv_gfx_state_ptr sf;
+      struct anv_gfx_state_ptr raster;
+      struct anv_gfx_state_ptr wm;
+      struct anv_gfx_state_ptr so;
+      struct anv_gfx_state_ptr gs;
+      struct anv_gfx_state_ptr te;
+      struct anv_gfx_state_ptr vfg;
    } partial;
 };
 
+#define anv_batch_merge_pipeline_state(batch, dwords0, pipeline, state) \
+   do {                                                                 \
+      uint32_t *dw;                                                     \
+                                                                        \
+      assert(ARRAY_SIZE(dwords0) == (pipeline)->state.len);             \
+      dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0));         \
+      if (!dw)                                                          \
+         break;                                                         \
+      for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++)                \
+         dw[i] = (dwords0)[i] |                                         \
+                 (pipeline)->batch_data[(pipeline)->state.offset + i];  \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));   \
+   } while (0)
+
+#define anv_batch_emit_pipeline_state(batch, pipeline, state)           \
+   do {                                                                 \
+      if ((pipeline)->state.len == 0)                                   \
+         break;                                                         \
+      uint32_t *dw;                                                     \
+      dw = anv_batch_emit_dwords((batch), (pipeline)->state.len);       \
+      if (!dw)                                                          \
+         break;                                                         \
+      memcpy(dw, &(pipeline)->batch_data[(pipeline)->state.offset],     \
+             4 * (pipeline)->state.len);                                \
+   } while (0)
+
+
 struct anv_compute_pipeline {
    struct anv_pipeline base;
 
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 6157e551cf7..86ab22a0392 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2994,10 +2994,7 @@ genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
       return;
 
-   uint32_t *dw =
-      anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_HS_length),
-                      GENX(3DSTATE_HS));
-   memcpy(dw, &pipeline->final.hs, sizeof(pipeline->final.hs));
+   anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
 }
 
 ALWAYS_INLINE static void
@@ -3022,10 +3019,7 @@ genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
       return;
 
-   uint32_t *dw =
-      anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_DS_length),
-                      GENX(3DSTATE_DS));
-   memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
+   anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
 #endif
 }
 
@@ -3224,13 +3218,22 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
       }
    }
 
-   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
-      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.base.batch);
+   if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
+      genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
 
-      /* If the pipeline changed, we may need to re-allocate push constant
-       * space in the URB.
-       */
+   /* If the pipeline changed, we may need to re-allocate push constant space
+    * in the URB.
+    */
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
       cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
+
+      /* Also add the relocations (scratch buffers) */
+      VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
+                                              pipeline->base.base.batch.relocs);
+      if (result != VK_SUCCESS) {
+         anv_batch_set_error(&cmd_buffer->batch, result);
+         return;
+      }
    }
 
    /* Render targets live in the same binding table as fragment descriptors */
@@ -3274,8 +3277,9 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
                               dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
    }
 
-   if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty)
-      genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
+   /* When we're done, there is no more dirty gfx state. */
+   vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
+   cmd_buffer->state.gfx.dirty = 0;
 }
 
 #include "genX_cmd_draw_generated_indirect.h"
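The relocation handling above is new: since pipeline instructions are no longer copied wholesale with `anv_batch_emit_batch()` (which carried relocations along implicitly), the flush path must append the pipeline batch's relocation list explicitly. In the relocation-free path this mostly means merging BO dependency bitsets; here is a hedged sketch of that merge, assuming a bitset-based dependency list like the one hinted at by `list->deps` / `dep_words` in the first hunk — the real `anv_reloc_list_append()` internals are not shown in this diff:

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef uint64_t BITSET_WORD; /* assumption: 64-bit bitset words */

struct reloc_list {
   uint32_t dep_words;
   BITSET_WORD *deps; /* one bit per BO the batch depends on */
};

/* Merge "other" into "list", growing the bitset if needed. Mirrors what
 * an anv_reloc_list_append()-style helper must do; error handling is
 * simplified to exit() for the sketch. */
static void
reloc_list_append(struct reloc_list *list, const struct reloc_list *other)
{
   if (other->dep_words > list->dep_words) {
      BITSET_WORD *new_deps =
         realloc(list->deps, other->dep_words * sizeof(BITSET_WORD));
      if (!new_deps)
         exit(1);
      memset(new_deps + list->dep_words, 0,
             (other->dep_words - list->dep_words) * sizeof(BITSET_WORD));
      list->deps = new_deps;
      list->dep_words = other->dep_words;
   }
   for (uint32_t i = 0; i < other->dep_words; i++)
      list->deps[i] |= other->deps[i];
}

int main(void)
{
   struct reloc_list a = { 1, calloc(1, sizeof(BITSET_WORD)) };
   a.deps[0] = 0x1;
   BITSET_WORD b_bits[2] = { 0x4, 0x8 };
   struct reloc_list b = { 2, b_bits };
   reloc_list_append(&a, &b);
   return (a.deps[0] == 0x5 && a.deps[1] == 0x8) ? 0 : 1;
}
```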
diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c
index 3d4ec697751..8a3031686e2 100644
--- a/src/intel/vulkan/genX_gfx_state.c
+++ b/src/intel/vulkan/genX_gfx_state.c
@@ -215,15 +215,12 @@ genX(cmd_emit_te)(struct anv_cmd_buffer *cmd_buffer)
    if (!tes_prog_data ||
        !anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-      uint32_t *dw =
-         anv_batch_emitn(&cmd_buffer->batch, GENX(3DSTATE_TE_length),
-                         GENX(3DSTATE_TE));
-      memcpy(dw, &pipeline->partial.te, sizeof(pipeline->partial.te));
+      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.te);
       return;
    }
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
-                        pipeline->partial.te, te) {
+                        pipeline, partial.te, te) {
       if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
          te.OutputTopology = tes_prog_data->output_topology;
       } else {
@@ -244,14 +241,14 @@ genX(emit_gs)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), gs);
+      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, partial.gs);
       return;
    }
 
    const struct vk_dynamic_graphics_state *dyn =
       &cmd_buffer->vk.dynamic_graphics_state;
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS),
-                        pipeline->partial.gs, gs) {
+                        pipeline, partial.gs, gs) {
       switch (dyn->rs.provoking_vertex) {
       case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
          gs.ReorderMode = LEADING;
@@ -463,7 +460,7 @@ cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
       return;
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
-                        pipeline->partial.clip, clip) {
+                        pipeline, partial.clip, clip) {
       /* Take dynamic primitive topology into account with
       * 3DSTATE_CLIP::ViewportXYClipTestEnable
       */
@@ -532,7 +529,7 @@ cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
 
    genX(streamout_prologue)(cmd_buffer);
 
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
-                        pipeline->partial.streamout_state, so) {
+                        pipeline, partial.so, so) {
       so.RenderingDisable = dyn->rs.rasterizer_discard_enable;
       so.RenderStreamSelect = dyn->rs.rasterization_stream;
 #if INTEL_NEEDS_WA_18022508906
@@ -802,13 +799,58 @@ cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
    }
 }
 
+#define cmd_buffer_emit_pipeline_state(batch, pipeline, state)          \
+   do {                                                                 \
+      if ((pipeline)->state.len == 0)                                   \
+         break;                                                         \
+      void *dw = anv_batch_emit_dwords(batch, (pipeline)->state.len);   \
+      if (!dw)                                                          \
+         break;                                                         \
+      memcpy(dw,                                                        \
+             &(pipeline)->batch_data[(pipeline)->state.offset],         \
+             4 * (pipeline)->state.len);                                \
+   } while (0)
+
+
 void
-genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
+genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
    struct vk_dynamic_graphics_state *dyn =
       &cmd_buffer->vk.dynamic_graphics_state;
 
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
+      struct anv_batch *batch = &cmd_buffer->batch;
+
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.urb);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ms);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.primitive_replication);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_instancing);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_instancing);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_sgvs_2);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.hs);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ds);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.vf_statistics);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.so_decl_list);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_swiz);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps);
+      cmd_buffer_emit_pipeline_state(batch, pipeline, final.ps_extra);
+
+      if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_control);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_shader);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.task_redistrib);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.clip_mesh);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_control);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_shader);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.mesh_distrib);
+         cmd_buffer_emit_pipeline_state(batch, pipeline, final.sbe_mesh);
+      }
+   }
+
    cmd_buffer_emit_clip(cmd_buffer);
 
    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
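The replay side shown above is deliberately simple: when `ANV_CMD_DIRTY_PIPELINE` is set, each recorded `final.*` span is memcpy'd into the command buffer, and a zero-length span (a packet the pipeline never recorded, e.g. mesh state without `EXT_mesh_shader`) is silently skipped, which is what makes the unconditional call list safe. A compact, self-contained sketch of that replay step (illustrative names):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct gfx_state_ptr { uint16_t offset, len; }; /* both in dwords */

/* Copy one recorded span into the live batch; returns the new write
 * pointer. len == 0 means the packet was never recorded, so the call
 * degenerates to a no-op. */
static uint32_t *
emit_pipeline_state(uint32_t *next, const uint32_t *batch_data,
                    struct gfx_state_ptr s)
{
   if (s.len == 0)
      return next;
   memcpy(next, &batch_data[s.offset], sizeof(uint32_t) * s.len);
   return next + s.len;
}

int main(void)
{
   uint32_t batch_data[8] = { 0x78250003, 1, 2, 3 }; /* fake packed packet */
   struct gfx_state_ptr hs = { 0, 4 };
   struct gfx_state_ptr task = { 0, 0 }; /* never recorded: skipped */
   uint32_t batch[16], *next = batch;
   next = emit_pipeline_state(next, batch_data, hs);
   next = emit_pipeline_state(next, batch_data, task);
   printf("emitted %td dwords\n", next - batch);
   return 0;
}
```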
@@ -865,7 +907,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
       } else {
          /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input.
          */
          genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
-                                 pipeline, dyn->vi);
+                                 pipeline, dyn->vi, false /* emit_in_pipeline */);
 
          /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
          memcpy(p + 1 + 2 * pipeline->vs_input_elements,
                 pipeline->vertex_input_data,
@@ -896,7 +938,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
-                           pipeline->partial.sf, sf) {
+                           pipeline, partial.sf, sf) {
          ANV_SETUP_PROVOKING_VERTEX(sf, dyn->rs.provoking_vertex);
 
          sf.LineWidth = dyn->rs.line.width;
@@ -978,7 +1020,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
          vk_rasterization_state_depth_clip_enable(&dyn->rs);
 
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
-                           pipeline->partial.raster, raster) {
+                           pipeline, partial.raster, raster) {
          raster.APIMode = api_mode;
          raster.DXMultisampleRasterizationEnable = msaa_raster_enable;
          raster.AntialiasingEnable = aa_enable;
@@ -1120,7 +1162,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
-                           pipeline->partial.vfg, vfg) {
+                           pipeline, partial.vfg, vfg) {
          vfg.ListCutIndexEnable = dyn->ia.primitive_restart_enable;
       }
    }
@@ -1141,7 +1183,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
    * threads.
    */
    anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
-                        pipeline->partial.wm, wm) {
+                        pipeline, partial.wm, wm) {
       wm.ForceThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
                                      (pipeline->force_fragment_thread_dispatch ||
                                       anv_cmd_buffer_all_color_write_masked(cmd_buffer)) ?
@@ -1365,8 +1407,4 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
          ccp.ColorCalcStatePointerValid = true;
       }
    }
-
-   /* When we're done, there is no more dirty gfx state. */
-   vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
-   cmd_buffer->state.gfx.dirty = 0;
 }
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 8174ab54b8f..b7f4dbab315 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -36,6 +36,52 @@
 #include "vk_log.h"
 #include "vk_render_pass.h"
 
+static inline struct anv_batch *
+anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
+                     struct anv_gfx_state_ptr *ptr,
+                     uint32_t n_dwords)
+{
+   struct anv_batch *batch = &pipeline->base.base.batch;
+
+   assert(ptr->len == 0 ||
+          (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
+   if (ptr->len == 0)
+      ptr->offset = (batch->next - batch->start) / 4;
+   ptr->len += n_dwords;
+
+   return batch;
+}
+
+#define anv_pipeline_emit(pipeline, state, cmd, name)                   \
+   for (struct cmd name = { __anv_cmd_header(cmd) },                    \
+        *_dst = anv_batch_emit_dwords(                                  \
+           anv_gfx_pipeline_add(pipeline,                               \
+                                &(pipeline)->state,                     \
+                                __anv_cmd_length(cmd)),                 \
+           __anv_cmd_length(cmd));                                      \
+        __builtin_expect(_dst != NULL, 1);                              \
+        ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch,            \
+                               _dst, &name);                            \
+           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
+           _dst = NULL;                                                 \
+         }))
+
+#define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({             \
+   void *__dst = anv_batch_emit_dwords(                                 \
+      anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n);        \
+   if (__dst) {                                                         \
+      struct cmd __template = {                                         \
+         __anv_cmd_header(cmd),                                         \
+         .DWordLength = n - __anv_cmd_length_bias(cmd),                 \
+         __VA_ARGS__                                                    \
+      };                                                                \
+      __anv_cmd_pack(cmd)(&pipeline->base.base.batch,                   \
+                          __dst, &__template);                          \
+   }                                                                    \
+   __dst;                                                               \
+   })
+
+
 static uint32_t
 vertex_element_comp_control(enum isl_format format, unsigned comp)
 {
@@ -91,8 +137,9 @@ vertex_element_comp_control(enum isl_format format, unsigned comp)
 void
 genX(emit_vertex_input)(struct anv_batch *batch,
                         uint32_t *vertex_element_dws,
-                        const struct anv_graphics_pipeline *pipeline,
-                        const struct vk_vertex_input_state *vi)
+                        struct anv_graphics_pipeline *pipeline,
+                        const struct vk_vertex_input_state *vi,
+                        bool emit_in_pipeline)
 {
    const struct anv_device *device = pipeline->base.base.device;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
@@ -169,15 +216,28 @@ genX(emit_vertex_input)(struct anv_batch *batch,
       * that controls instancing. On Haswell and prior, that's part of
       * VERTEX_BUFFER_STATE which we emit later.
       */
-      anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
-         bool per_instance = vi->bindings[binding].input_rate ==
-                             VK_VERTEX_INPUT_RATE_INSTANCE;
-         uint32_t divisor = vi->bindings[binding].divisor *
-                            pipeline->instance_multiplier;
+      if (emit_in_pipeline) {
+         anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
+            bool per_instance = vi->bindings[binding].input_rate ==
+                                VK_VERTEX_INPUT_RATE_INSTANCE;
+            uint32_t divisor = vi->bindings[binding].divisor *
+                               pipeline->instance_multiplier;
 
-         vfi.InstancingEnable = per_instance;
-         vfi.VertexElementIndex = slot;
-         vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+            vfi.InstancingEnable = per_instance;
+            vfi.VertexElementIndex = slot;
+            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+         }
+      } else {
+         anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+            bool per_instance = vi->bindings[binding].input_rate ==
+                                VK_VERTEX_INPUT_RATE_INSTANCE;
+            uint32_t divisor = vi->bindings[binding].divisor *
+                               pipeline->instance_multiplier;
+
+            vfi.InstancingEnable = per_instance;
+            vfi.VertexElementIndex = slot;
+            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+         }
+      }
    }
 }
 
@@ -187,15 +247,13 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                   const struct vk_graphics_pipeline_state *state,
                   const struct vk_vertex_input_state *vi)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-
    /* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
    * everything in gfx8_cmd_buffer.c
    */
    if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
-      genX(emit_vertex_input)(batch,
+      genX(emit_vertex_input)(NULL,
                               pipeline->vertex_input_data,
-                              pipeline, vi);
+                              pipeline, vi, true /* emit_in_pipeline */);
    }
 
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
@@ -207,6 +265,7 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
    assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
    uint32_t slot_offset =
       pipeline->vertex_input_elems - pipeline->svgs_count;
+
    if (needs_svgs_elem) {
 #if GFX_VER < 11
       /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
@@ -243,7 +302,8 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                                      &element);
       slot_offset++;
 
-      anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+      anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+                        GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = id_slot;
      }
   }
@@ -268,13 +328,14 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                                      &element);
      slot_offset++;
 
-      anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+      anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+                        GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = drawid_slot;
      }
   }
 }
 
-   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
+   anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
      sgvs.VertexIDComponentNumber = 2;
      sgvs.VertexIDElementOffset = id_slot;
@@ -284,7 +345,7 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
   }
 
 #if GFX_VER >= 11
-   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs) {
+   anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
      /* gl_BaseVertex */
      sgvs.XP0Enable = vs_prog_data->uses_firstvertex;
      sgvs.XP0SourceSelect = XP0_PARAMETER;
@@ -306,32 +367,30 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
 
 #if GFX_VERx10 >= 125
    struct anv_device *device = pipeline->base.base.device;
-   struct GENX(3DSTATE_VFG) vfg = {
-      GENX(3DSTATE_VFG_header),
+   anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
      /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE*/
-      .DistributionMode =
+      vfg.DistributionMode =
         anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT :
-                                                                  RR_FREE,
-      .DistributionGranularity = BatchLevelGranularity,
-   };
-   /* Wa_14014890652 */
-   if (intel_device_info_is_dg2(device->info))
-      vfg.GranularityThresholdDisable = 1;
-   /* 192 vertices for TRILIST_ADJ */
-   vfg.ListNBatchSizeScale = 0;
-   /* Batch size of 384 vertices */
-   vfg.List3BatchSizeScale = 2;
-   /* Batch size of 128 vertices */
-   vfg.List2BatchSizeScale = 1;
-   /* Batch size of 128 vertices */
-   vfg.List1BatchSizeScale = 2;
-   /* Batch size of 256 vertices for STRIP topologies */
-   vfg.StripBatchSizeScale = 3;
-   /* 192 control points for PATCHLIST_3 */
-   vfg.PatchBatchSizeScale = 1;
-   /* 192 control points for PATCHLIST_3 */
-   vfg.PatchBatchSizeMultiplier = 31;
-   GENX(3DSTATE_VFG_pack)(NULL, pipeline->partial.vfg, &vfg);
+                                                                  RR_FREE;
+      vfg.DistributionGranularity = BatchLevelGranularity;
+      /* Wa_14014890652 */
+      if (intel_device_info_is_dg2(device->info))
+         vfg.GranularityThresholdDisable = 1;
+      /* 192 vertices for TRILIST_ADJ */
+      vfg.ListNBatchSizeScale = 0;
+      /* Batch size of 384 vertices */
+      vfg.List3BatchSizeScale = 2;
+      /* Batch size of 128 vertices */
+      vfg.List2BatchSizeScale = 1;
+      /* Batch size of 128 vertices */
+      vfg.List1BatchSizeScale = 2;
+      /* Batch size of 256 vertices for STRIP topologies */
+      vfg.StripBatchSizeScale = 3;
+      /* 192 control points for PATCHLIST_3 */
+      vfg.PatchBatchSizeScale = 1;
+      /* 192 control points for PATCHLIST_3 */
+      vfg.PatchBatchSizeMultiplier = 31;
+   }
 #endif
 }
 
@@ -375,7 +434,6 @@ static void
 emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
                     enum intel_urb_deref_block_size *deref_block_size)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
 
    const struct brw_task_prog_data *task_prog_data =
@@ -390,12 +448,12 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
    /* Zero out the primitive pipeline URB allocations. */
    for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
-      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
      }
   }
 
-   anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
+   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
      if (task_prog_data) {
         urb.TASKURBEntryAllocationSize   = alloc.task_entry_size_64b - 1;
         urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
@@ -405,7 +463,7 @@ emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
      }
   }
 
-   anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
+   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
      urb.MESHURBEntryAllocationSize   = alloc.mesh_entry_size_64b - 1;
      urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
      urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
@@ -437,192 +495,207 @@ emit_urb_setup(struct anv_graphics_pipeline *pipeline,
      entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
   }
 
-   genX(emit_urb_setup)(pipeline->base.base.device,
-                        &pipeline->base.base.batch,
+   struct anv_device *device = pipeline->base.base.device;
+   const struct intel_device_info *devinfo = device->info;
+
+   unsigned entries[4];
+   unsigned start[4];
+   bool constrained;
+   intel_get_urb_config(devinfo, pipeline->base.base.l3_config,
-                        pipeline->base.base.active_stages, entry_size,
-                        deref_block_size);
+                        pipeline->base.base.active_stages &
+                        VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
+                        pipeline->base.base.active_stages &
+                        VK_SHADER_STAGE_GEOMETRY_BIT,
+                        entry_size, entries, start, deref_block_size,
+                        &constrained);
+
+   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
+         urb._3DCommandSubOpcode      += i;
+         urb.VSURBStartingAddress      = start[i];
+         urb.VSURBEntryAllocationSize  = entry_size[i] - 1;
+         urb.VSNumberofURBEntries      = entries[i];
+      }
+   }
+#if GFX_VERx10 >= 125
+   if (device->vk.enabled_extensions.EXT_mesh_shader) {
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
+      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
+   }
+#endif
+
 }
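All of the `anv_pipeline_emit()` calls above funnel through `anv_gfx_pipeline_add()`, which grows the named span and asserts that successive packets accounted to the same span (like the 3DSTATE_URB_* sequence just emitted into `final.urb`) land contiguously in the pipeline batch. A standalone sketch of that bookkeeping invariant, with hypothetical simplified types:

```c
#include <assert.h>
#include <stdint.h>

struct gfx_state_ptr { uint16_t offset, len; };

struct pipeline_batch {
   uint32_t data[256];
   uint32_t next; /* write cursor, in dwords */
};

/* Reserve n dwords and account them to *ptr. Either the span is empty
 * (first packet: record the offset) or the cursor must sit exactly at
 * the span's current end -- interleaving two spans would corrupt both,
 * which is what the assert in anv_gfx_pipeline_add() catches. */
static uint32_t *
gfx_pipeline_add(struct pipeline_batch *b, struct gfx_state_ptr *ptr,
                 uint32_t n)
{
   assert(ptr->len == 0 || b->next == (uint32_t)ptr->offset + ptr->len);
   if (ptr->len == 0)
      ptr->offset = b->next;
   ptr->len += n;
   uint32_t *dst = &b->data[b->next];
   b->next += n;
   return dst;
}

int main(void)
{
   struct pipeline_batch b = { .next = 0 };
   struct gfx_state_ptr urb = { 0, 0 };
   for (int i = 0; i < 4; i++)
      gfx_pipeline_add(&b, &urb, 2); /* four back-to-back URB packets */
   assert(urb.offset == 0 && urb.len == 8);
   return 0;
}
```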
 
 static void
 emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe);
-      anv_batch_emit(batch, GENX(3DSTATE_SBE_SWIZ), sbe);
+      anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
+      anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
 #if GFX_VERx10 >= 125
       if (anv_pipeline_is_mesh(pipeline))
-         anv_batch_emit(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh);
+         anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
 #endif
       return;
    }
 
-   struct GENX(3DSTATE_SBE) sbe = {
-      GENX(3DSTATE_SBE_header),
+   anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
+      anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {
+
       /* TODO(mesh): Figure out cases where we need attribute swizzling. See also
       * calculate_urb_setup() and related functions.
       */
-      .AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline),
-      .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
-      .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
-      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
-   };
+      sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
+      sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
+      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
 
-   for (unsigned i = 0; i < 32; i++)
-      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+      for (unsigned i = 0; i < 32; i++)
+         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
 
-   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
-   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
-      GENX(3DSTATE_SBE_SWIZ_header),
-   };
+      if (anv_pipeline_is_primitive(pipeline)) {
+         const struct brw_vue_map *fs_input_map =
+            &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
 
-   if (anv_pipeline_is_primitive(pipeline)) {
-      const struct brw_vue_map *fs_input_map =
-         &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
+         int first_slot =
+            brw_compute_first_urb_slot_required(wm_prog_data->inputs,
+                                                fs_input_map);
+         assert(first_slot % 2 == 0);
+         unsigned urb_entry_read_offset = first_slot / 2;
+         int max_source_attr = 0;
+         for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
+            uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
+            int input_index = wm_prog_data->urb_setup[attr];
 
-      int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
-                                                           fs_input_map);
-      assert(first_slot % 2 == 0);
-      unsigned urb_entry_read_offset = first_slot / 2;
-      int max_source_attr = 0;
-      for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
-         uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
-         int input_index = wm_prog_data->urb_setup[attr];
+            assert(0 <= input_index);
 
-         assert(0 <= input_index);
-
-         /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
-          * VUE header
-          */
-         if (attr == VARYING_SLOT_VIEWPORT ||
-             attr == VARYING_SLOT_LAYER ||
-             attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
-            continue;
-         }
-
-         if (attr == VARYING_SLOT_PNTC) {
-            sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
-            continue;
-         }
-
-         const int slot = fs_input_map->varying_to_slot[attr];
-
-         if (slot == -1) {
-            /* This attribute does not exist in the VUE--that means that the
-             * vertex shader did not write to it. It could be that it's a
-             * regular varying read by the fragment shader but not written by
-             * the vertex shader or it's gl_PrimitiveID. In the first case the
-             * value is undefined, in the second it needs to be
-             * gl_PrimitiveID.
+            /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
+             * VUE header
              */
-            swiz.Attribute[input_index].ConstantSource = PRIM_ID;
-            swiz.Attribute[input_index].ComponentOverrideX = true;
-            swiz.Attribute[input_index].ComponentOverrideY = true;
-            swiz.Attribute[input_index].ComponentOverrideZ = true;
-            swiz.Attribute[input_index].ComponentOverrideW = true;
-            continue;
+            if (attr == VARYING_SLOT_VIEWPORT ||
+                attr == VARYING_SLOT_LAYER ||
+                attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
+               continue;
+            }
+
+            if (attr == VARYING_SLOT_PNTC) {
+               sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
+               continue;
+            }
+
+            const int slot = fs_input_map->varying_to_slot[attr];
+
+            if (slot == -1) {
+               /* This attribute does not exist in the VUE--that means that
+                * the vertex shader did not write to it. It could be that it's
+                * a regular varying read by the fragment shader but not
+                * written by the vertex shader or it's gl_PrimitiveID. In the
+                * first case the value is undefined, in the second it needs to
+                * be gl_PrimitiveID.
+                */
+               swiz.Attribute[input_index].ConstantSource = PRIM_ID;
+               swiz.Attribute[input_index].ComponentOverrideX = true;
+               swiz.Attribute[input_index].ComponentOverrideY = true;
+               swiz.Attribute[input_index].ComponentOverrideZ = true;
+               swiz.Attribute[input_index].ComponentOverrideW = true;
+               continue;
+            }
 
+            /* We have to subtract two slots to account for the URB entry
+             * output read offset in the VS and GS stages.
+             */
+            const int source_attr = slot - 2 * urb_entry_read_offset;
+            assert(source_attr >= 0 && source_attr < 32);
+            max_source_attr = MAX2(max_source_attr, source_attr);
+            /* The hardware can only do overrides on 16 overrides at a time,
+             * and the other up to 16 have to be lined up so that the input
+             * index = the output index. We'll need to do some tweaking to
+             * make sure that's the case.
+             */
+            if (input_index < 16)
+               swiz.Attribute[input_index].SourceAttribute = source_attr;
+            else
+               assert(source_attr == input_index);
         }
 
-         /* We have to subtract two slots to account for the URB entry output
-          * read offset in the VS and GS stages.
-          */
-         const int source_attr = slot - 2 * urb_entry_read_offset;
-         assert(source_attr >= 0 && source_attr < 32);
-         max_source_attr = MAX2(max_source_attr, source_attr);
-         /* The hardware can only do overrides on 16 overrides at a time, and the
-          * other up to 16 have to be lined up so that the input index = the
-          * output index. We'll need to do some tweaking to make sure that's the
-          * case.
-          */
-         if (input_index < 16)
-            swiz.Attribute[input_index].SourceAttribute = source_attr;
-         else
-            assert(source_attr == input_index);
-      }
+            sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
+            sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
+            sbe.ForceVertexURBEntryReadOffset = true;
+            sbe.ForceVertexURBEntryReadLength = true;
 
-      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
-      sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
-      sbe.ForceVertexURBEntryReadOffset = true;
-      sbe.ForceVertexURBEntryReadLength = true;
-
-      /* Ask the hardware to supply PrimitiveID if the fragment shader
-       * reads it but a previous stage didn't write one.
+            /* Ask the hardware to supply PrimitiveID if the fragment shader
+             * reads it but a previous stage didn't write one.
+             */
+            if ((wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
+                fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
+               sbe.PrimitiveIDOverrideAttributeSelect =
+                  wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
+               sbe.PrimitiveIDOverrideComponentX = true;
+               sbe.PrimitiveIDOverrideComponentY = true;
+               sbe.PrimitiveIDOverrideComponentZ = true;
+               sbe.PrimitiveIDOverrideComponentW = true;
+               pipeline->primitive_id_override = true;
+            }
+         } else {
+            assert(anv_pipeline_is_mesh(pipeline));
 #if GFX_VERx10 >= 125
-      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      anv_batch_emit(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh) {
-         const struct brw_mue_map *mue = &mesh_prog_data->map;
+            const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+            anv_pipeline_emit(pipeline, final.sbe_mesh,
+                              GENX(3DSTATE_SBE_MESH), sbe_mesh) {
+               const struct brw_mue_map *mue = &mesh_prog_data->map;
 
-         assert(mue->per_vertex_header_size_dw % 8 == 0);
-         sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
-         sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
+               assert(mue->per_vertex_header_size_dw % 8 == 0);
+               sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
+               sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
 
-         /* Clip distance array is passed in the per-vertex header so that
-          * it can be consumed by the HW. If user wants to read it in the FS,
-          * adjust the offset and length to cover it. Conveniently it is at
-          * the end of the per-vertex header, right before per-vertex
-          * attributes.
-          *
-          * Note that FS attribute reading must be aware that the clip
-          * distances have fixed position.
-          */
-         if (mue->per_vertex_header_size_dw > 8 &&
-             (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
-              wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
-            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
-            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+               /* Clip distance array is passed in the per-vertex header so that
+                * it can be consumed by the HW. If user wants to read it in the
+                * FS, adjust the offset and length to cover it. Conveniently it
+                * is at the end of the per-vertex header, right before per-vertex
+                * attributes.
+                *
+                * Note that FS attribute reading must be aware that the clip
+                * distances have fixed position.
+                */
+               if (mue->per_vertex_header_size_dw > 8 &&
+                   (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
+                    wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
+                  sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+                  sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+               }
+
+               if (mue->user_data_in_vertex_header) {
+                  sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+                  sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+               }
+
+               assert(mue->per_primitive_header_size_dw % 8 == 0);
+               sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
+                  mue->per_primitive_header_size_dw / 8;
+               sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
+                  DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
+
+               /* Just like with clip distances, if Primitive Shading Rate,
+                * Viewport Index or Layer is read back in the FS, adjust the
+                * offset and length to cover the Primitive Header, where PSR,
+                * Viewport Index & Layer are stored.
+                */
+               if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
+                   wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
+                   wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
+                   mue->user_data_in_primitive_header) {
+                  assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
+                  sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
+                  sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
+               }
+            }
 
-         if (mue->user_data_in_vertex_header) {
-            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
-            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
-         }
-
-         assert(mue->per_primitive_header_size_dw % 8 == 0);
-         sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8;
-         sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
-
-         /* Just like with clip distances, if Primitive Shading Rate,
-          * Viewport Index or Layer is read back in the FS, adjust
-          * the offset and length to cover the Primitive Header, where
-          * PSR, Viewport Index & Layer are stored.
-          */
-         if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
-             wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
-             wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
-             mue->user_data_in_primitive_header) {
-            assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
-            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
-            sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
-         }
-      }
 #endif
+         }
+      }
   }
-
-   uint32_t *dw = anv_batch_emit_dwords(batch, GENX(3DSTATE_SBE_length));
-   if (!dw)
-      return;
-   GENX(3DSTATE_SBE_pack)(batch, dw, &sbe);
-
-   dw = anv_batch_emit_dwords(batch, GENX(3DSTATE_SBE_SWIZ_length));
-   if (!dw)
-      return;
-   GENX(3DSTATE_SBE_SWIZ_pack)(batch, dw, &swiz);
 }
 
 /** Returns the final polygon mode for rasterization
@@ -729,63 +802,55 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline,
               const struct vk_render_pass_state *rp,
               enum intel_urb_deref_block_size urb_deref_block_size)
 {
-   struct GENX(3DSTATE_SF) sf = {
-      GENX(3DSTATE_SF_header),
-   };
-
-   sf.ViewportTransformEnable = true;
-   sf.StatisticsEnable = true;
-   sf.VertexSubPixelPrecisionSelect = _8Bit;
-   sf.AALineDistanceMode = true;
+   anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
+      sf.ViewportTransformEnable = true;
+      sf.StatisticsEnable = true;
+      sf.VertexSubPixelPrecisionSelect = _8Bit;
+      sf.AALineDistanceMode = true;
 
 #if GFX_VER >= 12
-   sf.DerefBlockSize = urb_deref_block_size;
+      sf.DerefBlockSize = urb_deref_block_size;
 #endif
 
-   bool point_from_shader;
-   if (anv_pipeline_is_primitive(pipeline)) {
-      const struct brw_vue_prog_data *last_vue_prog_data =
-         anv_pipeline_get_last_vue_prog_data(pipeline);
-      point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
-   } else {
-      assert(anv_pipeline_is_mesh(pipeline));
-      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
+      bool point_from_shader;
+      if (anv_pipeline_is_primitive(pipeline)) {
+         const struct brw_vue_prog_data *last_vue_prog_data =
+            anv_pipeline_get_last_vue_prog_data(pipeline);
+         point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
+      } else {
+         assert(anv_pipeline_is_mesh(pipeline));
+         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+         point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
+      }
+
+      if (point_from_shader) {
+         sf.PointWidthSource = Vertex;
+      } else {
+         sf.PointWidthSource = State;
+         sf.PointWidth = 1.0;
+      }
   }
 
-   if (point_from_shader) {
-      sf.PointWidthSource = Vertex;
-   } else {
-      sf.PointWidthSource = State;
-      sf.PointWidth = 1.0;
+   anv_pipeline_emit(pipeline, partial.raster, GENX(3DSTATE_RASTER), raster) {
+      /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
+       * "Multisample Modes State".
+       */
+      /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
+       * computations. If we ever set this bit to a different value, they will
+       * need to be updated accordingly.
+       */
+      raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
+      raster.ForceMultisampling = false;
+
+      raster.ScissorRectangleEnable = true;
   }
-
-   struct GENX(3DSTATE_RASTER) raster = {
-      GENX(3DSTATE_RASTER_header),
-   };
-
-   /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
-    * "Multisample Modes State".
-    */
-   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
-    * computations. If we ever set this bit to a different value, they will
-    * need to be updated accordingly.
-    */
-   raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
-   raster.ForceMultisampling = false;
-
-   raster.ScissorRectangleEnable = true;
-
-   GENX(3DSTATE_SF_pack)(NULL, pipeline->partial.sf, &sf);
-   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->partial.raster, &raster);
 }
 
 static void
 emit_ms_state(struct anv_graphics_pipeline *pipeline,
               const struct vk_multisample_state *ms)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-   anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
+   anv_pipeline_emit(pipeline, final.ms, GENX(3DSTATE_MULTISAMPLE), ms) {
      ms.NumberofMultisamples =
         __builtin_ffs(pipeline->rasterization_samples) - 1;
      ms.PixelLocation = CENTER;
@@ -862,71 +927,67 @@ emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
    (void) wm_prog_data;
 
-   struct GENX(3DSTATE_CLIP) clip = {
-      GENX(3DSTATE_CLIP_header),
-   };
+   anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
+      clip.ClipEnable = true;
+      clip.StatisticsEnable = true;
+      clip.EarlyCullEnable = true;
+      clip.GuardbandClipTestEnable = true;
 
-   clip.ClipEnable = true;
-   clip.StatisticsEnable = true;
-   clip.EarlyCullEnable = true;
-   clip.GuardbandClipTestEnable = true;
+      clip.VertexSubPixelPrecisionSelect = _8Bit;
+      clip.ClipMode = CLIPMODE_NORMAL;
 
-   clip.VertexSubPixelPrecisionSelect = _8Bit;
-   clip.ClipMode = CLIPMODE_NORMAL;
+      clip.MinimumPointWidth = 0.125;
+      clip.MaximumPointWidth = 255.875;
 
-   clip.MinimumPointWidth = 0.125;
-   clip.MaximumPointWidth = 255.875;
+      /* TODO(mesh): Multiview. */
+      if (anv_pipeline_is_primitive(pipeline)) {
+         const struct brw_vue_prog_data *last =
+            anv_pipeline_get_last_vue_prog_data(pipeline);
 
-   /* TODO(mesh): Multiview. */
-   if (anv_pipeline_is_primitive(pipeline)) {
-      const struct brw_vue_prog_data *last =
-         anv_pipeline_get_last_vue_prog_data(pipeline);
+         /* From the Vulkan 1.0.45 spec:
+          *
+          *    "If the last active vertex processing stage shader entry
+          *    point's interface does not include a variable decorated with
+          *    ViewportIndex, then the first viewport is used."
+          */
+         if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
+            clip.MaximumVPIndex = vp->viewport_count > 0 ?
+                                  vp->viewport_count - 1 : 0;
+         } else {
+            clip.MaximumVPIndex = 0;
+         }
 
-      /* From the Vulkan 1.0.45 spec:
-       *
-       *    "If the last active vertex processing stage shader entry point's
-       *    interface does not include a variable decorated with
-       *    ViewportIndex, then the first viewport is used."
-       */
-      if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
-         clip.MaximumVPIndex = vp->viewport_count > 0 ?
-                               vp->viewport_count - 1 : 0;
-      } else {
-         clip.MaximumVPIndex = 0;
-      }
+         /* From the Vulkan 1.0.45 spec:
+          *
+          *    "If the last active vertex processing stage shader entry point's
+          *    interface does not include a variable decorated with Layer, then
+          *    the first layer is used."
+          */
+         clip.ForceZeroRTAIndexEnable =
+            !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
 
-      /* From the Vulkan 1.0.45 spec:
-       *
-       *    "If the last active vertex processing stage shader entry point's
-       *    interface does not include a variable decorated with Layer, then
-       *    the first layer is used."
-       */
-      clip.ForceZeroRTAIndexEnable =
-         !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
+      } else if (anv_pipeline_is_mesh(pipeline)) {
+         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+         if (vp && vp->viewport_count > 0 &&
+             mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
+            clip.MaximumVPIndex = vp->viewport_count - 1;
+         } else {
+            clip.MaximumVPIndex = 0;
+         }
 
-   } else if (anv_pipeline_is_mesh(pipeline)) {
-      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      if (vp && vp->viewport_count > 0 &&
-          mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
-         clip.MaximumVPIndex = vp->viewport_count - 1;
-      } else {
-         clip.MaximumVPIndex = 0;
-      }
-
-      clip.ForceZeroRTAIndexEnable =
+         clip.ForceZeroRTAIndexEnable =
            mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
+      }
+
+      clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
+         wm_prog_data->uses_nonperspective_interp_modes : 0;
   }
 
-   clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
-      wm_prog_data->uses_nonperspective_interp_modes : 0;
-
-   GENX(3DSTATE_CLIP_pack)(NULL, pipeline->partial.clip, &clip);
-
 #if GFX_VERx10 >= 125
    if (anv_pipeline_is_mesh(pipeline)) {
-      struct anv_batch *batch = &pipeline->base.base.batch;
      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
-      anv_batch_emit(batch, GENX(3DSTATE_CLIP_MESH), clip_mesh) {
+      anv_pipeline_emit(pipeline, final.clip_mesh,
+                        GENX(3DSTATE_CLIP_MESH), clip_mesh) {
         clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
         clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
         clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
@@ -939,8 +1000,6 @@ static void
 emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
                        const struct vk_rasterization_state *rs)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-   const struct anv_device *device = pipeline->base.base.device;
    const struct brw_vue_prog_data *prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);
    const struct brw_vue_map *vue_map = &prog_data->vue_map;
@@ -1034,25 +1093,17 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
         sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
      }
 
-      /* Wa_16011773973:
-       * If SOL is enabled and SO_DECL state has to be programmed,
-       *    1. Send 3D State SOL state with SOL disabled
-       *    2. Send SO_DECL NP state
-       *    3. Send 3D State SOL with SOL Enabled
-       */
-      if (intel_device_info_is_dg2(device->info))
-         anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
-
-      uint32_t *dw = anv_batch_emitn(batch, 3 + 2 * max_decls,
-                                     GENX(3DSTATE_SO_DECL_LIST),
-                                     .StreamtoBufferSelects0 = sbs[0],
-                                     .StreamtoBufferSelects1 = sbs[1],
-                                     .StreamtoBufferSelects2 = sbs[2],
-                                     .StreamtoBufferSelects3 = sbs[3],
-                                     .NumEntries0 = decls[0],
-                                     .NumEntries1 = decls[1],
-                                     .NumEntries2 = decls[2],
-                                     .NumEntries3 = decls[3]);
+      uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
+                                        3 + 2 * max_decls,
+                                        GENX(3DSTATE_SO_DECL_LIST),
+                                        .StreamtoBufferSelects0 = sbs[0],
+                                        .StreamtoBufferSelects1 = sbs[1],
+                                        .StreamtoBufferSelects2 = sbs[2],
+                                        .StreamtoBufferSelects3 = sbs[3],
+                                        .NumEntries0 = decls[0],
+                                        .NumEntries1 = decls[1],
+                                        .NumEntries2 = decls[2],
+                                        .NumEntries3 = decls[3]);
 
      for (int i = 0; i < max_decls; i++) {
         GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
                                  &(struct GENX(SO_DECL_ENTRY)) {
                                     .Stream0Decl = so_decl[0][i],
                                     .Stream1Decl = so_decl[1][i],
                                     .Stream2Decl = so_decl[2][i],
                                     .Stream3Decl = so_decl[3][i],
                                  });
     }
-
-#if GFX_VERx10 == 125
-      /* Wa_14015946265: Send PC with CS stall after SO_DECL. */
-      genX(batch_emit_pipe_control)(batch, device->info, ANV_PIPE_CS_STALL_BIT);
-#endif
   }
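3DSTATE_SO_DECL_LIST is the one variable-length packet recorded through this path, hence `anv_pipeline_emitn()` above: it reserves `n` dwords in the pipeline batch and patches the packet's DWordLength field (hardware length fields exclude a fixed bias). A sketch of the length patching with a made-up opcode and an assumed bias of 2 — both are placeholders, not real hardware values:

```c
#include <assert.h>
#include <stdint.h>

#define LENGTH_BIAS 2 /* assumption: length excludes the first two dwords */

/* Emit a fake variable-length packet: one header dword carrying the
 * patched DWordLength, followed by the SO_DECL payload slots. */
static void
emit_so_decl_list(uint32_t *dw, uint32_t max_decls)
{
   const uint32_t n = 3 + 2 * max_decls;        /* header + entry dwords */
   dw[0] = (0x1701u << 16) | (n - LENGTH_BIAS); /* hypothetical opcode */
   dw[1] = dw[2] = 0;          /* StreamtoBufferSelects / NumEntries */
   for (uint32_t i = 0; i < 2 * max_decls; i++)
      dw[3 + i] = 0;           /* SO_DECL_ENTRY pairs, packed separately */
}

int main(void)
{
   uint32_t batch[3 + 2 * 4];
   emit_so_decl_list(batch, 4);
   assert((batch[0] & 0xffff) == 3 + 2 * 4 - LENGTH_BIAS);
   return 0;
}
```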
 
-   struct GENX(3DSTATE_STREAMOUT) so = {
-      GENX(3DSTATE_STREAMOUT_header),
-   };
+   anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
+      if (xfb_info) {
+         pipeline->uses_xfb = true;
 
-   if (xfb_info) {
-      pipeline->uses_xfb = true;
+         so.SOFunctionEnable = true;
+         so.SOStatisticsEnable = true;
 
-      so.SOFunctionEnable = true;
-      so.SOStatisticsEnable = true;
+         so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
+         so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
+         so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
+         so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
 
-      so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
-      so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
-      so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
-      so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
+         int urb_entry_read_offset = 0;
+         int urb_entry_read_length =
+            (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
 
-      int urb_entry_read_offset = 0;
-      int urb_entry_read_length =
-         (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
-
-      /* We always read the whole vertex. This could be reduced at some
-       * point by reading less and offsetting the register index in the
-       * SO_DECLs.
-       */
-      so.Stream0VertexReadOffset = urb_entry_read_offset;
-      so.Stream0VertexReadLength = urb_entry_read_length - 1;
-      so.Stream1VertexReadOffset = urb_entry_read_offset;
-      so.Stream1VertexReadLength = urb_entry_read_length - 1;
-      so.Stream2VertexReadOffset = urb_entry_read_offset;
-      so.Stream2VertexReadLength = urb_entry_read_length - 1;
-      so.Stream3VertexReadOffset = urb_entry_read_offset;
-      so.Stream3VertexReadLength = urb_entry_read_length - 1;
+         /* We always read the whole vertex. This could be reduced at some
+          * point by reading less and offsetting the register index in the
+          * SO_DECLs.
+          */
+         so.Stream0VertexReadOffset = urb_entry_read_offset;
+         so.Stream0VertexReadLength = urb_entry_read_length - 1;
+         so.Stream1VertexReadOffset = urb_entry_read_offset;
+         so.Stream1VertexReadLength = urb_entry_read_length - 1;
+         so.Stream2VertexReadOffset = urb_entry_read_offset;
+         so.Stream2VertexReadLength = urb_entry_read_length - 1;
+         so.Stream3VertexReadOffset = urb_entry_read_offset;
+         so.Stream3VertexReadLength = urb_entry_read_length - 1;
+      }
   }
-
-   GENX(3DSTATE_STREAMOUT_pack)(NULL, pipeline->partial.streamout_state, &so);
 }
 
 static uint32_t
@@ -1158,7 +1200,6 @@ get_scratch_surf(struct anv_pipeline *pipeline,
 static void
 emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
    const struct anv_shader_bin *vs_bin =
@@ -1166,7 +1207,7 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
 
    assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
 
-   anv_batch_emit(batch, GENX(3DSTATE_VS), vs) {
+   anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
      vs.Enable = true;
      vs.StatisticsEnable = true;
      vs.KernelStartPointer = vs_bin->kernel.offset;
@@ -1237,11 +1278,9 @@ static void
 emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
                    const struct vk_tessellation_state *ts)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-
    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-      anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
-      anv_batch_emit(batch, GENX(3DSTATE_DS), ds);
+      anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
+      anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
      return;
   }
 
@@ -1254,120 +1293,101 @@ emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
    const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
    const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
 
-   struct GENX(3DSTATE_HS) hs = {
-      GENX(3DSTATE_HS_header),
-   };
-
-   hs.Enable = true;
-   hs.StatisticsEnable = true;
-   hs.KernelStartPointer = tcs_bin->kernel.offset;
-   /* Wa_1606682166 */
-   hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
-   hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
+   anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs) {
+      hs.Enable = true;
+      hs.StatisticsEnable = true;
+      hs.KernelStartPointer = tcs_bin->kernel.offset;
+      /* Wa_1606682166 */
+      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
+      hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
 
 #if GFX_VER >= 12
-      /* Wa_1604578095:
-       *
-       * Hang occurs when the number of max threads is less than 2 times
-       * the number of instance count. The number of max threads must be
-       * more than 2 times the number of instance count.
+      */
+      assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
 #endif
 
-   hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
-   hs.IncludeVertexHandles = true;
-   hs.InstanceCount = tcs_prog_data->instances - 1;
+      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+      hs.IncludeVertexHandles = true;
+      hs.InstanceCount = tcs_prog_data->instances - 1;
 
-   hs.VertexURBEntryReadLength = 0;
-   hs.VertexURBEntryReadOffset = 0;
-   hs.DispatchGRFStartRegisterForURBData =
-      tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
+      hs.VertexURBEntryReadLength = 0;
+      hs.VertexURBEntryReadOffset = 0;
+      hs.DispatchGRFStartRegisterForURBData =
+         tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
 #if GFX_VER >= 12
-   hs.DispatchGRFStartRegisterForURBData5 =
-      tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
+      hs.DispatchGRFStartRegisterForURBData5 =
+         tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
 #endif
 
 #if GFX_VERx10 >= 125
-   hs.ScratchSpaceBuffer =
-      get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
+      hs.ScratchSpaceBuffer =
+         get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
 #else
-   hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
-   hs.ScratchSpaceBasePointer =
-      get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
+      hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
+      hs.ScratchSpaceBasePointer =
+         get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
 #endif
 
 #if GFX_VER == 12
-   /* Patch Count threshold specifies the maximum number of patches that
-    * will be accumulated before a thread dispatch is forced.
-    */
-   hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
+      /* Patch Count threshold specifies the maximum number of patches that
+       * will be accumulated before a thread dispatch is forced.
+       */
+      hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
 #endif
 
-   hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
-   hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
-
-   STATIC_ASSERT(ARRAY_SIZE(pipeline->final.hs) == GENX(3DSTATE_HS_length));
-   GENX(3DSTATE_HS_pack)(&pipeline->base.base.batch, pipeline->final.hs, &hs);
-
-   struct GENX(3DSTATE_DS) ds = {
-      GENX(3DSTATE_DS_header),
+      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
+      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
   };
 
-   ds.Enable = true;
-   ds.StatisticsEnable = true;
-   ds.KernelStartPointer = tes_bin->kernel.offset;
-   /* Wa_1606682166 */
-   ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
-   ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
-   ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
+   anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds) {
+      ds.Enable = true;
+      ds.StatisticsEnable = true;
+      ds.KernelStartPointer = tes_bin->kernel.offset;
+      /* Wa_1606682166 */
+      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
+      ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
+      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
 
-   ds.ComputeWCoordinateEnable =
-      tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+      ds.ComputeWCoordinateEnable =
+         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
 
-   ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
-   ds.PatchURBEntryReadOffset = 0;
-   ds.DispatchGRFStartRegisterForURBData =
-      tes_prog_data->base.base.dispatch_grf_start_reg;
+      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
+      ds.PatchURBEntryReadOffset = 0;
+      ds.DispatchGRFStartRegisterForURBData =
+         tes_prog_data->base.base.dispatch_grf_start_reg;
 
 #if GFX_VER < 11
-   ds.DispatchMode =
-      tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
-         DISPATCH_MODE_SIMD8_SINGLE_PATCH :
-         DISPATCH_MODE_SIMD4X2;
+      ds.DispatchMode =
+         tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
+            DISPATCH_MODE_SIMD8_SINGLE_PATCH :
+            DISPATCH_MODE_SIMD4X2;
 #else
-   assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
-   ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
+      assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
+      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
 #endif
 
-   ds.UserClipDistanceClipTestEnableBitmask =
-      tes_prog_data->base.clip_distance_mask;
-   ds.UserClipDistanceCullTestEnableBitmask =
-      tes_prog_data->base.cull_distance_mask;
+      ds.UserClipDistanceClipTestEnableBitmask =
+         tes_prog_data->base.clip_distance_mask;
+      ds.UserClipDistanceCullTestEnableBitmask =
+         tes_prog_data->base.cull_distance_mask;
 
 #if GFX_VER >= 12
-   ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
+      ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
 #endif
 #if GFX_VERx10 >= 125
-   ds.ScratchSpaceBuffer =
-      get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
+      ds.ScratchSpaceBuffer =
+         get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
 #else
-   ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
-   ds.ScratchSpaceBasePointer =
-      get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
+      ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
+      ds.ScratchSpaceBasePointer =
+         get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
 #endif
-
-   /* Wa_14019750404:
-    * See genX(emit_ds)().
-    * We need to both emit 3DSTATE_DS now, and before each 3DPRIMITIVE, so
-    * we pack it to have it later, and memcpy into the current batch.
-    */
-   STATIC_ASSERT(ARRAY_SIZE(pipeline->final.ds) == GENX(3DSTATE_DS_length));
-   GENX(3DSTATE_DS_pack)(&pipeline->base.base.batch, pipeline->final.ds, &ds);
-
-   uint32_t *dw =
-      anv_batch_emitn(batch, GENX(3DSTATE_DS_length), GENX(3DSTATE_DS));
-   memcpy(dw, &pipeline->final.ds, sizeof(pipeline->final.ds));
+   }
 }

 static UNUSED bool
@@ -1391,63 +1411,59 @@ geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
 static void
 emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
 {
-   struct GENX(3DSTATE_TE) te = {
-      GENX(3DSTATE_TE_header),
-   };
+   anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
+      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
+         const struct brw_tes_prog_data *tes_prog_data =
+            get_tes_prog_data(pipeline);

-   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-      const struct brw_tes_prog_data *tes_prog_data =
-         get_tes_prog_data(pipeline);
-
-      te.Partitioning = tes_prog_data->partitioning;
-      te.TEDomain = tes_prog_data->domain;
-      te.TEEnable = true;
-      te.MaximumTessellationFactorOdd = 63.0;
-      te.MaximumTessellationFactorNotOdd = 64.0;
+         te.Partitioning = tes_prog_data->partitioning;
+         te.TEDomain = tes_prog_data->domain;
+         te.TEEnable = true;
+         te.MaximumTessellationFactorOdd = 63.0;
+         te.MaximumTessellationFactorNotOdd = 64.0;
 #if GFX_VERx10 >= 125
-      const struct anv_device *device = pipeline->base.base.device;
-      if (intel_needs_workaround(device->info, 22012699309))
-         te.TessellationDistributionMode = TEDMODE_RR_STRICT;
-      else
-         te.TessellationDistributionMode = TEDMODE_RR_FREE;
+         const struct anv_device *device = pipeline->base.base.device;
+         if (intel_needs_workaround(device->info, 22012699309))
+            te.TessellationDistributionMode = TEDMODE_RR_STRICT;
+         else
+            te.TessellationDistributionMode = TEDMODE_RR_FREE;

-      if (intel_needs_workaround(device->info, 14015055625)) {
-         /* Wa_14015055625:
-          *
-          * Disable Tessellation Distribution when primitive Id is enabled.
-          */
-         if (pipeline->primitive_id_override ||
-             geom_or_tess_prim_id_used(pipeline))
-            te.TessellationDistributionMode = TEDMODE_OFF;
-      }
+         if (intel_needs_workaround(device->info, 14015055625)) {
+            /* Wa_14015055625:
+             *
+             * Disable Tessellation Distribution when primitive Id is enabled.
+             */
+            if (pipeline->primitive_id_override ||
+                geom_or_tess_prim_id_used(pipeline))
+               te.TessellationDistributionMode = TEDMODE_OFF;
+         }

-      te.TessellationDistributionLevel = TEDLEVEL_PATCH;
-      /* 64_TRIANGLES */
-      te.SmallPatchThreshold = 3;
-      /* 1K_TRIANGLES */
-      te.TargetBlockSize = 8;
-      /* 1K_TRIANGLES */
-      te.LocalBOPAccumulatorThreshold = 1;
+         te.TessellationDistributionLevel = TEDLEVEL_PATCH;
+         /* 64_TRIANGLES */
+         te.SmallPatchThreshold = 3;
+         /* 1K_TRIANGLES */
+         te.TargetBlockSize = 8;
+         /* 1K_TRIANGLES */
+         te.LocalBOPAccumulatorThreshold = 1;
 #endif
+      }
    }
-
-   GENX(3DSTATE_TE_pack)(NULL, pipeline->partial.te, &te);
 }

 static void
 emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
 {
+   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
+      anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
+      return;
+   }
+
    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
    const struct anv_shader_bin *gs_bin =
       pipeline->base.shaders[MESA_SHADER_GEOMETRY];
+   const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);

-   struct GENX(3DSTATE_GS) gs = {
-      GENX(3DSTATE_GS_header),
-   };
-
-   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
-      const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
-
+   anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs) {
       gs.Enable = true;
       gs.StatisticsEnable = true;
       gs.KernelStartPointer = gs_bin->kernel.offset;
@@ -1493,8 +1509,6 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
       get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
 #endif
    }
-
-   GENX(3DSTATE_GS_pack)(&pipeline->base.base.batch, pipeline->partial.gs, &gs);
 }

 static bool
@@ -1514,49 +1528,46 @@ emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
 {
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

-   struct GENX(3DSTATE_WM) wm = {
-      GENX(3DSTATE_WM_header),
-   };
-   wm.StatisticsEnable = true;
-   wm.LineEndCapAntialiasingRegionWidth = _05pixels;
-   wm.LineAntialiasingRegionWidth = _10pixels;
-   wm.PointRasterizationRule = RASTRULE_UPPER_LEFT;
+   anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
+      wm.StatisticsEnable = true;
+      wm.LineEndCapAntialiasingRegionWidth = _05pixels;
+      wm.LineAntialiasingRegionWidth = _10pixels;
+      wm.PointRasterizationRule = RASTRULE_UPPER_LEFT;

-   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      if (wm_prog_data->early_fragment_tests) {
+      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
+         if (wm_prog_data->early_fragment_tests) {
            wm.EarlyDepthStencilControl = EDSC_PREPS;
-      } else if (wm_prog_data->has_side_effects) {
-         wm.EarlyDepthStencilControl = EDSC_PSEXEC;
-      } else {
-         wm.EarlyDepthStencilControl = EDSC_NORMAL;
+         } else if (wm_prog_data->has_side_effects) {
+            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
+         } else {
+            wm.EarlyDepthStencilControl = EDSC_NORMAL;
+         }
+
+         /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
+          * doesn't take into account KillPixels when no depth or stencil
+          * writes are enabled. In order for occlusion queries to work
+          * correctly with no attachments, we need to force-enable PS thread
+          * dispatch.
+          *
+          * The BDW docs are pretty clear that this bit isn't validated
+          * and probably shouldn't be used in production:
+          *
+          * "This must always be set to Normal. This field should not be
+          * tested for functional validation."
+          *
+          * Unfortunately, however, the other mechanism we have for doing this
+          * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
+          * Given two bad options, we choose the one which works.
+          */
+         pipeline->force_fragment_thread_dispatch =
+            wm_prog_data->has_side_effects ||
+            wm_prog_data->uses_kill;
+
+         wm.BarycentricInterpolationMode =
+            wm_prog_data_barycentric_modes(wm_prog_data,
+                                           pipeline->fs_msaa_flags);
       }
-
-      /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
-       * doesn't take into account KillPixels when no depth or stencil
-       * writes are enabled. In order for occlusion queries to work
-       * correctly with no attachments, we need to force-enable PS thread
-       * dispatch.
-       *
-       * The BDW docs are pretty clear that that this bit isn't validated
-       * and probably shouldn't be used in production:
-       *
-       * "This must always be set to Normal. This field should not be
-       * tested for functional validation."
-       *
-       * Unfortunately, however, the other mechanism we have for doing this
-       * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
-       * Given two bad options, we choose the one which works.
-       */
-      pipeline->force_fragment_thread_dispatch =
-         wm_prog_data->has_side_effects ||
-         wm_prog_data->uses_kill;
-
-      wm.BarycentricInterpolationMode =
-         wm_prog_data_barycentric_modes(wm_prog_data,
-                                        pipeline->fs_msaa_flags);
    }
-
-   GENX(3DSTATE_WM_pack)(NULL, pipeline->partial.wm, &wm);
 }

 static void
@@ -1564,21 +1575,19 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
                 const struct vk_multisample_state *ms,
                 const struct vk_color_blend_state *cb)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    UNUSED const struct intel_device_info *devinfo =
       pipeline->base.base.device->info;
    const struct anv_shader_bin *fs_bin =
       pipeline->base.shaders[MESA_SHADER_FRAGMENT];

    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
-      }
+      anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps);
       return;
    }

    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

-   anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
+   anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps) {
       intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data,
                                   ms != NULL ? ms->rasterization_samples : 1,
                                   pipeline->fs_msaa_flags);
@@ -1629,15 +1638,14 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
                      const struct vk_rasterization_state *rs,
                      const struct vk_render_pass_state *rp)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-      anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), ps);
+      anv_pipeline_emit(pipeline, final.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
       return;
    }

-   anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), ps) {
+   anv_pipeline_emit(pipeline, final.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
       ps.PixelShaderValid = true;
       ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
       ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
@@ -1689,8 +1697,8 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
 static void
 emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-   anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
+   anv_pipeline_emit(pipeline, final.vf_statistics,
+                     GENX(3DSTATE_VF_STATISTICS), vfs) {
       vfs.StatisticsEnable = true;
    }
 }
@@ -1733,10 +1741,9 @@ static void
 emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
                                    const struct vk_render_pass_state *rp)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
-
    if (anv_pipeline_is_mesh(pipeline)) {
-      anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+      anv_pipeline_emit(pipeline, final.primitive_replication,
+                        GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
       return;
    }
@@ -1745,14 +1752,16 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
    assert(replication_count >= 1);
    if (replication_count == 1) {
-      anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+      anv_pipeline_emit(pipeline, final.primitive_replication,
+                        GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
       return;
    }

    assert(replication_count == util_bitcount(rp->view_mask));
    assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);

-   anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
+   anv_pipeline_emit(pipeline, final.primitive_replication,
+                     GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
       pr.ReplicaMask = (1 << replication_count) - 1;
       pr.ReplicationCount = replication_count - 1;
@@ -1769,18 +1778,19 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
 static void
 emit_task_state(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    assert(anv_pipeline_is_mesh(pipeline));

    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
-      anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
+      anv_pipeline_emit(pipeline, final.task_control,
+                        GENX(3DSTATE_TASK_CONTROL), zero);
       return;
    }

    const struct anv_shader_bin *task_bin =
       pipeline->base.shaders[MESA_SHADER_TASK];

-   anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), tc) {
+   anv_pipeline_emit(pipeline, final.task_control,
+                     GENX(3DSTATE_TASK_CONTROL), tc) {
       tc.TaskShaderEnable = true;
       tc.ScratchSpaceBuffer =
         get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin);
@@ -1792,7 +1802,8 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
    const struct brw_cs_dispatch_info task_dispatch =
       brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);

-   anv_batch_emit(batch, GENX(3DSTATE_TASK_SHADER), task) {
+   anv_pipeline_emit(pipeline, final.task_shader,
+                     GENX(3DSTATE_TASK_SHADER), task) {
       task.KernelStartPointer = task_bin->kernel.offset;
       task.SIMDSize = task_dispatch.simd_size / 16;
       task.MessageSIMD = task.SIMDSize;
@@ -1818,7 +1829,8 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)
    }

    /* Recommended values from "Task and Mesh Distribution Programming". */
-   anv_batch_emit(batch, GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
+   anv_pipeline_emit(pipeline, final.task_redistrib,
+                     GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
       redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
       redistrib.SmallTaskThreshold = 1; /* 2^N */
       redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
@@ -1830,12 +1842,12 @@ static void
 emit_mesh_state(struct anv_graphics_pipeline *pipeline)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    assert(anv_pipeline_is_mesh(pipeline));

    const struct anv_shader_bin *mesh_bin =
       pipeline->base.shaders[MESA_SHADER_MESH];

-   anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mc) {
+   anv_pipeline_emit(pipeline, final.mesh_control,
+                     GENX(3DSTATE_MESH_CONTROL), mc) {
       mc.MeshShaderEnable = true;
       mc.ScratchSpaceBuffer =
          get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin);
@@ -1864,7 +1876,8 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
       unreachable("invalid index format");
    }

-   anv_batch_emit(batch, GENX(3DSTATE_MESH_SHADER), mesh) {
+   anv_pipeline_emit(pipeline, final.mesh_shader,
+                     GENX(3DSTATE_MESH_SHADER), mesh) {
       mesh.KernelStartPointer = mesh_bin->kernel.offset;
       mesh.SIMDSize = mesh_dispatch.simd_size / 16;
       mesh.MessageSIMD = mesh.SIMDSize;
@@ -1897,7 +1910,8 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)
    }

    /* Recommended values from "Task and Mesh Distribution Programming". */
-   anv_batch_emit(batch, GENX(3DSTATE_MESH_DISTRIB), distrib) {
+   anv_pipeline_emit(pipeline, final.mesh_distrib,
+                     GENX(3DSTATE_MESH_DISTRIB), distrib) {
       distrib.DistributionMode = MESH_RR_FREE;
       distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
       distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
@@ -1909,7 +1923,6 @@ void
 genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
                              const struct vk_graphics_pipeline_state *state)
 {
-   struct anv_batch *batch = &pipeline->base.base.batch;
    enum intel_urb_deref_block_size urb_deref_block_size;
    emit_urb_setup(pipeline, &urb_deref_block_size);
@@ -1940,10 +1953,10 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
       const struct anv_device *device = pipeline->base.base.device;
       /* Disable Mesh. */
       if (device->vk.enabled_extensions.EXT_mesh_shader) {
-         struct anv_batch *batch = &pipeline->base.base.batch;
-
-         anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
-         anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
+         anv_pipeline_emit(pipeline, final.mesh_control,
+                           GENX(3DSTATE_MESH_CONTROL), zero);
+         anv_pipeline_emit(pipeline, final.task_control,
+                           GENX(3DSTATE_TASK_CONTROL), zero);
       }
 #endif
    } else {
@@ -1952,7 +1965,7 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
       /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable
       * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1.
       */
-      anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) {}
+      anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);

 #if GFX_VERx10 >= 125
       emit_task_state(pipeline);
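
Note: the anv_pipeline_emit() macro this patch relies on everywhere is not part of this excerpt. Given the anv_gfx_state_ptr offset/len pairs it must fill and the pipeline batch_data dword array that anv_batch_emit_merge()/anv_batch_emit_pipeline_state() later read back, a minimal sketch of the shape it presumably takes is below; the anv_gfx_pipeline_add() helper and the gfx_state_count field are hypothetical names, not taken from the patch:

   /* Sketch only: reserve n_dwords in the pipeline's batch_data array,
    * record their location in the given anv_gfx_state_ptr, and hand back
    * a pointer for the genxml pack function to write into.
    * gfx_state_count is an assumed running-offset field.
    */
   static inline void *
   anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
                        struct anv_gfx_state_ptr *ptr,
                        uint32_t n_dwords)
   {
      ptr->offset = pipeline->gfx_state_count;
      ptr->len = n_dwords;
      pipeline->gfx_state_count += n_dwords;
      return &pipeline->batch_data[ptr->offset];
   }

   /* Same for-loop trick as anv_batch_emit(): the block following the
    * macro invocation fills in the template struct, and the pack into
    * batch_data happens in the loop's increment expression.
    */
   #define anv_pipeline_emit(pipeline, state, cmd, name)                  \
      for (struct cmd name = { __anv_cmd_header(cmd) },                   \
           *_dst = anv_gfx_pipeline_add((pipeline), &(pipeline)->state,   \
                                        __anv_cmd_length(cmd));           \
           __builtin_expect(_dst != NULL, 1);                             \
           ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch,           \
                                  (void *)_dst, &name);                   \
              _dst = NULL;                                                \
           }))

With this shape, an invocation with no block, e.g. anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps);, packs a defaults-only packet, which is why the shader-disabled paths above can drop their explicit empty blocks.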