mesa/src/intel/vulkan/genX_gfx_state.c
Lionel Landwerlin 4e8a25cf6f anv: remove use of emit_apply_pipe_flushes() in various helpers
For a bunch of workarounds and special cases we want PIPE_CONTROL not
RESOURCE_BARRIER. We want emit_apply_pipe_flushes() to be mostly for
application barriers.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38707>
2025-12-15 08:25:31 +00:00

4206 lines
168 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include "anv_private.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"
#include "common/intel_guardband.h"
#include "common/intel_tiled_render.h"
#include "compiler/intel_prim.h"
#include "genX_mi_builder.h"
/* Pack a hardware command into the hw_state->packed.<field> storage.
 *
 * Meant to be used as a statement prefix followed by a body:
 *
 *    anv_gfx_pack(field, cmd, name) { name.Foo = ...; }
 *
 * The for-loop declares a `struct cmd name` initialized with the command
 * header and a `_dst` pointer to the destination. The body runs while `_dst`
 * is non-NULL; the loop's increment expression then packs `name` into the
 * destination through __anv_cmd_pack(cmd) and resets `_dst` to NULL, which
 * terminates the loop.
 *
 * A variable named `hw_state` must be in scope at the use site. The assert
 * verifies that the destination is at least 4 * __anv_cmd_length(cmd) bytes.
 */
#define anv_gfx_pack(field, cmd, name) \
for (struct cmd name = { __anv_cmd_header(cmd) }, \
*_dst = (struct cmd *)hw_state->packed.field; \
__builtin_expect(_dst != NULL, 1); \
({ \
assert(sizeof(hw_state->packed.field) >= \
4 * __anv_cmd_length(cmd)); \
__anv_cmd_pack(cmd)(NULL, _dst, &name); \
_dst = NULL; \
}))
static const uint32_t vk_to_intel_blend[] = {
[VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO,
[VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE,
[VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR,
[VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR,
[VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR,
[VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR,
[VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA,
[VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA,
[VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA,
[VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA,
[VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR,
[VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
[VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA,
[VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
[VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE,
[VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR,
[VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR,
[VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA,
[VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA,
};
/* Lookup table translating a VkBlendOp (array index) into the matching
 * BLENDFUNCTION_* value.
 */
static const uint32_t vk_to_intel_blend_op[] = {
   /* Arithmetic operations */
   [VK_BLEND_OP_ADD]              = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT]         = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,

   /* Component-wise selection */
   [VK_BLEND_OP_MAX]              = BLENDFUNCTION_MAX,
   [VK_BLEND_OP_MIN]              = BLENDFUNCTION_MIN,
};
/* Lookup table translating a VkCullModeFlags value (array index) into the
 * matching CULLMODE_* value.
 */
static const uint32_t vk_to_intel_cullmode[] = {
   [VK_CULL_MODE_FRONT_AND_BACK] = CULLMODE_BOTH,
   [VK_CULL_MODE_BACK_BIT]       = CULLMODE_BACK,
   [VK_CULL_MODE_FRONT_BIT]      = CULLMODE_FRONT,
   [VK_CULL_MODE_NONE]           = CULLMODE_NONE,
};
/* Lookup table translating a VkPolygonMode (array index) into the matching
 * FILL_MODE_* value.
 */
static const uint32_t vk_to_intel_fillmode[] = {
   [VK_POLYGON_MODE_POINT] = FILL_MODE_POINT,
   [VK_POLYGON_MODE_LINE]  = FILL_MODE_WIREFRAME,
   [VK_POLYGON_MODE_FILL]  = FILL_MODE_SOLID,
};
/* Lookup table translating a VkFrontFace (array index) into a raw 0/1 value.
 *
 * NOTE(review): presumably this is the encoding of a hardware front-winding
 * field; no named constant is used here, so confirm against the consumer.
 */
static const uint32_t vk_to_intel_front_face[] = {
   [VK_FRONT_FACE_CLOCKWISE]         = 0,
   [VK_FRONT_FACE_COUNTER_CLOCKWISE] = 1,
};
/* Lookup table translating a VkLogicOp (array index) into the matching
 * LOGICOP_* value.
 */
static const uint32_t vk_to_intel_logic_op[] = {
   /* Constant results and plain copies */
   [VK_LOGIC_OP_CLEAR]         = LOGICOP_CLEAR,
   [VK_LOGIC_OP_SET]           = LOGICOP_SET,
   [VK_LOGIC_OP_COPY]          = LOGICOP_COPY,
   [VK_LOGIC_OP_COPY_INVERTED] = LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_NO_OP]         = LOGICOP_NOOP,
   [VK_LOGIC_OP_INVERT]        = LOGICOP_INVERT,

   /* AND family */
   [VK_LOGIC_OP_AND]           = LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE]   = LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_AND_INVERTED]  = LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NAND]          = LOGICOP_NAND,

   /* OR family */
   [VK_LOGIC_OP_OR]            = LOGICOP_OR,
   [VK_LOGIC_OP_OR_REVERSE]    = LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_OR_INVERTED]   = LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NOR]           = LOGICOP_NOR,

   /* XOR family */
   [VK_LOGIC_OP_XOR]           = LOGICOP_XOR,
   [VK_LOGIC_OP_EQUIVALENT]    = LOGICOP_EQUIV,
};
/* Lookup table translating a VkCompareOp (array index) into the matching
 * PREFILTEROP_* value.
 */
static const uint32_t vk_to_intel_compare_op[] = {
   /* Unconditional results */
   [VK_COMPARE_OP_NEVER]            = PREFILTEROP_NEVER,
   [VK_COMPARE_OP_ALWAYS]           = PREFILTEROP_ALWAYS,

   /* Equality */
   [VK_COMPARE_OP_EQUAL]            = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_NOT_EQUAL]        = PREFILTEROP_NOTEQUAL,

   /* Ordering */
   [VK_COMPARE_OP_LESS]             = PREFILTEROP_LESS,
   [VK_COMPARE_OP_LESS_OR_EQUAL]    = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_GREATER]          = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GEQUAL,
};
/* Lookup table translating a VkStencilOp (array index) into the matching
 * STENCILOP_* value.
 */
static const uint32_t vk_to_intel_stencil_op[] = {
   /* Direct value operations */
   [VK_STENCIL_OP_KEEP]                = STENCILOP_KEEP,
   [VK_STENCIL_OP_ZERO]                = STENCILOP_ZERO,
   [VK_STENCIL_OP_REPLACE]             = STENCILOP_REPLACE,
   [VK_STENCIL_OP_INVERT]              = STENCILOP_INVERT,

   /* Clamping increment / decrement */
   [VK_STENCIL_OP_INCREMENT_AND_CLAMP] = STENCILOP_INCRSAT,
   [VK_STENCIL_OP_DECREMENT_AND_CLAMP] = STENCILOP_DECRSAT,

   /* Wrapping increment / decrement */
   [VK_STENCIL_OP_INCREMENT_AND_WRAP]  = STENCILOP_INCR,
   [VK_STENCIL_OP_DECREMENT_AND_WRAP]  = STENCILOP_DECR,
};
/* Lookup table translating a VkPrimitiveTopology (array index) into the
 * matching _3DPRIM_* value. Only the topologies listed here have an entry.
 */
static const uint32_t vk_to_intel_primitive_type[] = {
   /* Points */
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST]                    = _3DPRIM_POINTLIST,

   /* Lines */
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST]                     = _3DPRIM_LINELIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP]                    = _3DPRIM_LINESTRIP,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY]      = _3DPRIM_LINELIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,

   /* Triangles */
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST]                 = _3DPRIM_TRILIST,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP]                = _3DPRIM_TRISTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN]                  = _3DPRIM_TRIFAN,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY]  = _3DPRIM_TRILIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};
/* Translate a VkIndexType into the matching INDEX_* value.
 *
 * Only the 8-, 16- and 32-bit index types are handled; any other value hits
 * UNREACHABLE().
 */
static uint32_t vk_to_intel_index_type(VkIndexType type)
{
   if (type == VK_INDEX_TYPE_UINT8_KHR)
      return INDEX_BYTE;

   if (type == VK_INDEX_TYPE_UINT16)
      return INDEX_WORD;

   if (type == VK_INDEX_TYPE_UINT32)
      return INDEX_DWORD;

   UNREACHABLE("invalid index type");
}
/**
 * Emit the command sequence for Wa_16014912113 into \p batch.
 *
 * The body is compiled out unless INTEL_NEEDS_WA_16014912113 is set. When
 * active, and provided \p urb_cfg has a non-zero size at index 0, one URB
 * packet is emitted for every stage index from 0 to MESA_SHADER_GEOMETRY
 * using the start/size values of \p urb_cfg, with 256 entries for index 0
 * and no entries for the others. A PIPE_CONTROL with HDCPipelineFlushEnable
 * follows.
 */
void
genX(batch_emit_wa_16014912113)(struct anv_batch *batch,
                                const struct intel_urb_config *urb_cfg)
{
#if INTEL_NEEDS_WA_16014912113
   /* Nothing to emit when the config has no size at index 0. */
   if (urb_cfg->size[0] == 0)
      return;

   for (int stage = 0; stage <= MESA_SHADER_GEOMETRY; stage++) {
      /* Only the first stage is given any entries. */
      const unsigned num_entries = (stage == 0) ? 256 : 0;

#if GFX_VER >= 12
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
         /* The VS packet is retargeted by offsetting its sub-opcode. */
         urb._3DCommandSubOpcode       += stage;
         urb.VSURBEntryAllocationSize   = urb_cfg->size[stage] - 1;
         urb.VSURBStartingAddressSlice0 = urb_cfg->start[stage];
         urb.VSURBStartingAddressSliceN = urb_cfg->start[stage];
         urb.VSNumberofURBEntriesSlice0 = num_entries;
         urb.VSNumberofURBEntriesSliceN = num_entries;
      }
#else
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         /* The VS packet is retargeted by offsetting its sub-opcode. */
         urb._3DCommandSubOpcode     += stage;
         urb.VSURBStartingAddress     = urb_cfg->start[stage];
         urb.VSURBEntryAllocationSize = urb_cfg->size[stage] - 1;
         urb.VSNumberofURBEntries     = num_entries;
      }
#endif
   }

   anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.HDCPipelineFlushEnable = true;
   }
#endif
}
/**
 * Adjust object preemption before a draw, according to transform feedback
 * usage.
 *
 * Only compiled in when INTEL_WA_16013994831_GFX_VER is set, and only acts on
 * devices that need Wa_16013994831: preemption is disabled when the shader
 * bound at gfx->streamout_stage has XFB info, and re-enabled otherwise if
 * the command buffer state reports it as currently disabled.
 */
static void
genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_cmd_graphics_state *gfx)
{
#if INTEL_WA_16013994831_GFX_VER
   if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
      return;

   const bool uses_xfb =
      gfx->shaders[gfx->streamout_stage]->xfb_info != NULL;

   if (uses_xfb) {
      /* Wa_16013994831 - no preemption while streamout is in use. */
      genX(cmd_buffer_set_preemption)(cmd_buffer, false);
   } else if (!cmd_buffer->state.gfx.object_preemption) {
      /* XFB is not used by the current pipeline: turn preemption back on. */
      genX(cmd_buffer_set_preemption)(cmd_buffer, true);
   }
#endif
}
#if GFX_VER >= 12 && GFX_VER < 30
/* Compute the offset of the CPS_STATE entries matching a fragment shading
 * rate state, relative to the start of device->cps_states.
 *
 * The entry index is built from the fragment width/height (each 1, 2 or 4,
 * mapped to 0..2) and, on GFX_VERx10 >= 125, from the two combiner ops as
 * well. Index 0 is reserved for the "disabled" entry, hence the leading 1.
 * Each index covers MAX_VIEWPORTS packed CPS_STATE structures.
 */
static uint32_t
get_cps_state_offset(const struct anv_device *device,
                     const struct vk_fragment_shading_rate_state *fsr)
{
   /* Maps a fragment dimension in pixels to a dense index. */
   static const uint32_t size_index[] = {
      [1] = 0,
      [2] = 1,
      [4] = 2,
   };
   const uint32_t width_idx  = size_index[fsr->fragment_size.width];
   const uint32_t height_idx = size_index[fsr->fragment_size.height];

   /* 1 skips the disabled entry; width is the major axis of the 3x3 grid. */
   uint32_t offset = 1 + width_idx * 3 + height_idx;

#if GFX_VERx10 >= 125
   /* Each combiner op pair selects its own 3x3 grid of sizes. */
   offset += fsr->combiner_ops[0] * 5 * 3 * 3 +
             fsr->combiner_ops[1] * 3 * 3;
#endif

   /* Convert the entry index into a byte offset (length is in dwords). */
   offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4;

   return device->cps_states.offset + offset;
}
#endif /* GFX_VER >= 12 && GFX_VER < 30 */
#if GFX_VER >= 30
/* Translate a coarse pixel dimension (1, 2 or 4) into the matching CPSIZE_*
 * value. Any other input hits UNREACHABLE().
 */
static uint32_t
get_cps_size(uint32_t size)
{
   if (size == 1)
      return CPSIZE_1;

   if (size == 2)
      return CPSIZE_2;

   if (size == 4)
      return CPSIZE_4;

   UNREACHABLE("Invalid size");
}
/* Lookup table translating a VkFragmentShadingRateCombinerOpKHR (array
 * index) into the matching CPS_COMB_OP_* value.
 */
static const uint32_t vk_to_intel_shading_rate_combiner_op[] = {
   /* Pick one of the two inputs */
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR]    = CPS_COMB_OP_PASSTHROUGH,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = CPS_COMB_OP_OVERRIDE,

   /* Combine both inputs */
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR]     = CPS_COMB_OP_RELATIVE,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR]     = CPS_COMB_OP_LOW_QUALITY,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR]     = CPS_COMB_OP_HIGH_QUALITY,
};
#endif
static bool
has_ds_feedback_loop(const struct anv_pipeline_bind_map *bind_map,
const struct vk_dynamic_graphics_state *dyn)
{
if (BITSET_IS_EMPTY(bind_map->input_attachments))
return false;
const unsigned depth_att = dyn->ial.depth_att == MESA_VK_ATTACHMENT_NO_INDEX ?
MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS : dyn->ial.depth_att;
const unsigned stencil_att = dyn->ial.stencil_att == MESA_VK_ATTACHMENT_NO_INDEX ?
MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS : dyn->ial.stencil_att;
return
(dyn->feedback_loops & (VK_IMAGE_ASPECT_DEPTH_BIT |
VK_IMAGE_ASPECT_STENCIL_BIT)) != 0 ||
(dyn->ial.depth_att != MESA_VK_ATTACHMENT_UNUSED &&
BITSET_TEST(bind_map->input_attachments, depth_att)) ||
(dyn->ial.stencil_att != MESA_VK_ATTACHMENT_UNUSED &&
BITSET_TEST(bind_map->input_attachments, stencil_att));
}
/* Report whether fragments can be discarded after the fragment shader was
 * dispatched: the shader uses kill, writes an output mask, or
 * alpha-to-coverage is enabled in the dynamic state.
 */
static bool
kill_pixel(const struct brw_wm_prog_data *wm_prog_data,
           const struct vk_dynamic_graphics_state *dyn)
{
   if (wm_prog_data->uses_kill)
      return true;

   if (wm_prog_data->uses_omask)
      return true;

   return dyn->ms.alpha_to_coverage_enable;
}
/**
 * Decide whether the stencil PMA fix can be enabled for the current state.
 *
 * Always returns false when GFX_VER > 9. Otherwise evaluates the STC_PMA_OPT
 * signal quoted from the PRM below using the command buffer graphics state,
 * the dynamic state and the depth/stencil state.
 */
UNUSED static bool
want_stencil_pma_fix(const struct vk_dynamic_graphics_state *dyn,
                     const struct anv_cmd_graphics_state *gfx,
                     const struct vk_depth_stencil_state *ds)
{
   if (GFX_VER > 9)
      return false;
   assert(GFX_VER == 9);

   /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
    *
    *    Clearing this bit will force the STC cache to wait for pending
    *    retirement of pixels at the HZ-read stage and do the STC-test for
    *    Non-promoted, R-computed and Computed depth modes instead of
    *    postponing the STC-test to RCPFE.
    *
    *    STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                  3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
    *
    *    STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                   (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *                    3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
    *
    *    COMP_STC_EN = STC_TEST_EN &&
    *                  3DSTATE_PS_EXTRA::PixelShaderComputesStencil
    *
    *    SW parses the pipeline states to generate the following logical
    *    signal indicating if PMA FIX can be enabled.
    *
    *    STC_PMA_OPT =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       !(3DSTATE_WM::EDSC_Mode == 2) &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *       (COMP_STC_EN || STC_WRITE_EN) &&
    *       ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *         3DSTATE_WM::ForceKillPix == ON ||
    *         3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *         3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *         3DSTATE_PS_BLEND::AlphaTestEnable ||
    *         3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    *        (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
    *
    * The ForceThreadDispatch and ForceSampleCount terms are treated as always
    * true and are not checked. The 3DSTATE_WM_HZ_OP term is treated as
    * trivially true as well, since anv_pipeline is never used for HiZ
    * operations.
    */

   /* SURFACE_TYPE != NULL && HIZ Enable: the fix is only enabled when HiZ is
    * known for certain to be on. Leaving the fix off when that is unknown
    * does no harm.
    */
   if (!gfx->hiz_enabled)
      return false;

   /* HiZ cannot be known to be enabled without a depth attachment. */
   ASSERTED const struct anv_image_view *d_iview = gfx->depth_att.iview;
   assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);

   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
   struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
   if (fs == NULL)
      return false;

   /* !(3DSTATE_WM::EDSC_Mode == 2) */
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* STC_TEST_EN, STC_WRITE_EN and COMP_STC_EN as defined in the PRM quote */
   const bool stencil_test = ds->stencil.test_enable;
   const bool stencil_write = ds->stencil.write_enable;
   const bool computes_stencil =
      stencil_test && wm_prog_data->computed_stencil;

   /* COMP_STC_EN || STC_WRITE_EN */
   if (!computes_stencil && !stencil_write)
      return false;

   /* Last term of the formula: something may kill pixels after dispatch or
    * the shader computes depth. A depth/stencil feedback loop is accepted
    * here too, although the PRM formula does not list it.
    */
   if (kill_pixel(wm_prog_data, dyn))
      return true;

   if (has_ds_feedback_loop(&fs->bind_map, dyn))
      return true;

   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
}
/* Report whether antialiased rasterization applies: true only when lines
 * are rasterized with the rectangular-smooth line mode.
 */
static inline bool
anv_rasterization_aa_mode(VkPolygonMode raster_mode,
                          VkLineRasterizationModeKHR line_mode)
{
   const bool draws_lines = raster_mode == VK_POLYGON_MODE_LINE;
   const bool smooth_lines =
      line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR;

   return draws_lines && smooth_lines;
}
/* Resolve VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR into a concrete mode:
 * rectangular when more than one rasterization sample is used, Bresenham
 * otherwise. Any other mode is returned unchanged.
 */
static inline VkLineRasterizationModeKHR
anv_line_rasterization_mode(VkLineRasterizationModeKHR line_mode,
                            unsigned rasterization_samples)
{
   if (line_mode != VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR)
      return line_mode;

   return rasterization_samples > 1 ?
          VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR :
          VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
}
/** Returns the final polygon mode for rasterization
 *
 * The kind of primitive reaching the rasterizer is taken from the last stage
 * able to change it, checked in this order: mesh shader, geometry shader,
 * tessellation evaluation shader, and finally the input primitive topology.
 * Points and lines force VK_POLYGON_MODE_POINT / VK_POLYGON_MODE_LINE;
 * for triangles (and other polygons) the requested polygon_mode is returned.
 */
static inline VkPolygonMode
anv_raster_polygon_mode(const struct anv_cmd_graphics_state *gfx,
                        VkPolygonMode polygon_mode,
                        VkPrimitiveTopology primitive_topology)
{
   /* Mesh pipelines: the mesh shader declares its output primitive type. */
   if (gfx->shaders[MESA_SHADER_MESH] != NULL) {
      switch (get_gfx_mesh_prog_data(gfx)->primitive_type) {
      case MESA_PRIM_TRIANGLES:
         return polygon_mode;
      case MESA_PRIM_LINES:
         return VK_POLYGON_MODE_LINE;
      case MESA_PRIM_POINTS:
         return VK_POLYGON_MODE_POINT;
      default:
         UNREACHABLE("invalid primitive type for mesh");
      }
   }

   /* Geometry shader: use its output topology. */
   if (gfx->shaders[MESA_SHADER_GEOMETRY] != NULL) {
      switch (get_gfx_gs_prog_data(gfx)->output_topology) {
      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         return polygon_mode;

      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         return VK_POLYGON_MODE_LINE;

      case _3DPRIM_POINTLIST:
         return VK_POLYGON_MODE_POINT;
      }
      UNREACHABLE("Unsupported GS output topology");
   }

   /* Tessellation: merge the TCS and TES info to get the output topology. */
   if (gfx->shaders[MESA_SHADER_TESS_EVAL] != NULL) {
      struct brw_tess_info tess_info =
         brw_merge_tess_info(get_gfx_tcs_prog_data(gfx)->tess_info,
                             get_gfx_tes_prog_data(gfx)->tess_info);

      switch (brw_tess_info_output_topology(tess_info)) {
      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return polygon_mode;
      case INTEL_TESS_OUTPUT_TOPOLOGY_LINE:
         return VK_POLYGON_MODE_LINE;
      case INTEL_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;
      default:
         UNREACHABLE("Unsupported TCS output topology");
      }
   }

   /* No stage alters the primitives: rely on the input topology. */
   switch (primitive_topology) {
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
      return polygon_mode;

   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
      return VK_POLYGON_MODE_LINE;

   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
      return VK_POLYGON_MODE_POINT;

   default:
      UNREACHABLE("Unsupported primitive topology");
   }
}
/* Report whether a Vulkan blend factor reads the second source color
 * (any of the SRC1 factors).
 */
static inline bool
anv_is_dual_src_blend_factor(VkBlendFactor factor)
{
   switch (factor) {
   case VK_BLEND_FACTOR_SRC1_COLOR:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
   case VK_BLEND_FACTOR_SRC1_ALPHA:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
      return true;
   default:
      return false;
   }
}
/* Report whether all four blend factors of an attachment (source and
 * destination, color and alpha) are dual-source (SRC1) factors.
 *
 * NOTE(review): the factors are combined with a logical AND, so an equation
 * using SRC1 in only some of its factors is not reported. Confirm against
 * the callers that this, rather than "any factor uses SRC1", is intended.
 */
static inline bool
anv_is_dual_src_blend_equation(const struct vk_color_blend_attachment_state *cb)
{
   if (!anv_is_dual_src_blend_factor(cb->src_color_blend_factor))
      return false;

   if (!anv_is_dual_src_blend_factor(cb->dst_color_blend_factor))
      return false;

   if (!anv_is_dual_src_blend_factor(cb->src_alpha_blend_factor))
      return false;

   return anv_is_dual_src_blend_factor(cb->dst_alpha_blend_factor);
}
/* Select the API mode and MSAA rasterization enable for the given
 * rasterization setup.
 *
 * raster_mode is the final polygon mode, line_mode the resolved line
 * rasterization mode and line_width the line width. The results are written
 * to *api_mode and *msaa_rasterization_enable.
 */
static void
anv_rasterization_mode(VkPolygonMode raster_mode,
                       VkLineRasterizationModeKHR line_mode,
                       float line_width,
                       uint32_t *api_mode,
                       bool *msaa_rasterization_enable)
{
   /* Everything but lines uses the same fixed configuration. */
   if (raster_mode != VK_POLYGON_MODE_LINE) {
      *api_mode = DX101;
      *msaa_rasterization_enable = true;
      return;
   }

   /* Unfortunately, configuring our line rasterization hardware on gfx8
    * and later is rather painful. Instead of giving us bits to tell the
    * hardware what line mode to use like we had on gfx7, we now have an
    * arcane combination of API Mode and MSAA enable bits which do things
    * in a table which are expected to magically put the hardware into the
    * right mode for your API. Sadly, Vulkan isn't any of the APIs the
    * hardware people thought of so nothing works the way you want it to.
    *
    * Look at the table titled "Multisample Rasterization Modes" in Vol 7
    * of the Skylake PRM for more details.
    */
   switch (line_mode) {
   case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
      *api_mode = DX101;
#if GFX_VER <= 9
      /* Prior to ICL, the algorithm the HW uses to draw wide lines doesn't
       * quite match what the CTS expects, at least for rectangular lines.
       * MSAA rasterization is therefore only kept for lines narrower than
       * 1 + 1/128 pixel; wider lines get it disabled, which makes the HW
       * draw parallelograms instead, which work well enough.
       */
      *msaa_rasterization_enable = line_width < 1.0078125;
#else
      *msaa_rasterization_enable = true;
#endif
      return;

   case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
   case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
      *api_mode = DX9OGL;
      *msaa_rasterization_enable = false;
      return;

   default:
      UNREACHABLE("Unsupported line rasterization mode");
   }
}
/* Report whether a hardware blend factor is one of the four SRC1 factors. */
static bool
is_src1_blend_factor(enum GENX(3D_Color_Buffer_Blend_Factor) factor)
{
   switch (factor) {
   case BLENDFACTOR_SRC1_COLOR:
   case BLENDFACTOR_SRC1_ALPHA:
   case BLENDFACTOR_INV_SRC1_COLOR:
   case BLENDFACTOR_INV_SRC1_ALPHA:
      return true;
   default:
      return false;
   }
}
#if GFX_VERx10 == 125
/**
 * Compute the dimensions of the current rendering area.
 *
 * The result starts as the far corner of gfx->render_area (offset + extent)
 * and is then grown to cover the extent of every bound color, depth and
 * stencil attachment view. Returns true when both dimensions are non-zero.
 */
UNUSED static bool
calculate_render_area(const struct anv_cmd_graphics_state *gfx,
                      unsigned *width, unsigned *height)
{
   *width = gfx->render_area.offset.x + gfx->render_area.extent.width;
   *height = gfx->render_area.offset.y + gfx->render_area.extent.height;

   /* Color attachments */
   for (unsigned a = 0; a < gfx->color_att_count; a++) {
      const struct anv_image_view *const c_view = gfx->color_att[a].iview;
      if (c_view == NULL)
         continue;

      *width = MAX2(*width, c_view->vk.extent.width);
      *height = MAX2(*height, c_view->vk.extent.height);
   }

   /* Depth, then stencil */
   const struct anv_image_view *const ds_views[] = {
      gfx->depth_att.iview,
      gfx->stencil_att.iview,
   };
   for (unsigned v = 0; v < sizeof(ds_views) / sizeof(ds_views[0]); v++) {
      if (ds_views[v] == NULL)
         continue;

      *width = MAX2(*width, ds_views[v]->vk.extent.width);
      *height = MAX2(*height, ds_views[v]->vk.extent.height);
   }

   return *width && *height;
}
/* Calculate TBIMR tiling parameters adequate for the current pipeline
* setup. Return true if TBIMR should be enabled.
*/
UNUSED static bool
calculate_tile_dimensions(const struct anv_device *device,
const struct anv_cmd_graphics_state *gfx,
const struct intel_l3_config *l3_config,
unsigned fb_width, unsigned fb_height,
unsigned *tile_width, unsigned *tile_height)
{
assert(GFX_VER == 12);
const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;
unsigned pixel_size = 0;
/* Perform a rough calculation of the tile cache footprint of the
* pixel pipeline, approximating it as the sum of the amount of
* memory used per pixel by every render target, depth, stencil and
* auxiliary surfaces bound to the pipeline.
*/
for (uint32_t i = 0; i < gfx->color_att_count; i++) {
const struct anv_attachment *att = &gfx->color_att[i];
if (att->iview) {
const struct anv_image *image = att->iview->image;
const unsigned p = anv_image_aspect_to_plane(image,
VK_IMAGE_ASPECT_COLOR_BIT);
const struct anv_image_plane *plane = &image->planes[p];
pixel_size += intel_calculate_surface_pixel_size(
&plane->primary_surface.isl);
if (isl_aux_usage_has_mcs(att->aux_usage))
pixel_size += intel_calculate_surface_pixel_size(
&plane->aux_surface.isl);
if (isl_aux_usage_has_ccs(att->aux_usage))
pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
&plane->primary_surface.isl),
aux_scale);
}
}
const struct anv_image_view *const z_view = gfx->depth_att.iview;
if (z_view) {
const struct anv_image *image = z_view->image;
assert(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
const unsigned p = anv_image_aspect_to_plane(image,
VK_IMAGE_ASPECT_DEPTH_BIT);
const struct anv_image_plane *plane = &image->planes[p];
pixel_size += intel_calculate_surface_pixel_size(
&plane->primary_surface.isl);
if (isl_aux_usage_has_hiz(image->planes[p].aux_usage))
pixel_size += intel_calculate_surface_pixel_size(
&plane->aux_surface.isl);
if (isl_aux_usage_has_ccs(image->planes[p].aux_usage))
pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
&plane->primary_surface.isl),
aux_scale);
}
const struct anv_image_view *const s_view = gfx->depth_att.iview;
if (s_view && s_view != z_view) {
const struct anv_image *image = s_view->image;
assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
const unsigned p = anv_image_aspect_to_plane(image,
VK_IMAGE_ASPECT_STENCIL_BIT);
const struct anv_image_plane *plane = &image->planes[p];
pixel_size += intel_calculate_surface_pixel_size(
&plane->primary_surface.isl);
}
if (!pixel_size)
return false;
/* Compute a tile layout that allows reasonable utilization of the
* tile cache based on the per-pixel cache footprint estimated
* above.
*/
intel_calculate_tile_dimensions(device->info, l3_config,
32, 32, fb_width, fb_height,
pixel_size, tile_width, tile_height);
/* Perform TBIMR tile passes only if the framebuffer covers more
* than a single tile.
*/
return *tile_width < fb_width || *tile_height < fb_height;
}
#endif
/* Read a field of the `hw_state` variable in scope at the use site. */
#define GET(field) hw_state->field
/* Store `value` into hw_state->field and, only if the stored value actually
 * changes, mark ANV_GFX_STATE_<bit> in hw_state->pack_dirty.
 *
 * `value` is evaluated once, converted to the field's type before the
 * comparison. Requires a variable named `hw_state` in scope.
 */
#define SET(bit, field, value) \
do { \
__typeof(hw_state->field) __v = value; \
if (hw_state->field != __v) { \
hw_state->field = __v; \
BITSET_SET(hw_state->pack_dirty, \
ANV_GFX_STATE_##bit); \
} \
} while (0)
/* Same as SET(), except that when no shader is bound for
 * MESA_SHADER_<stage> the value is stored without touching the dirty bits.
 *
 * Requires variables named `hw_state` and `gfx` in scope.
 */
#define SET_STAGE(bit, field, value, stage) \
do { \
__typeof(hw_state->field) __v = value; \
if (gfx->shaders[MESA_SHADER_##stage] == NULL) { \
hw_state->field = __v; \
break; \
} \
if (hw_state->field != __v) { \
hw_state->field = __v; \
BITSET_SET(hw_state->pack_dirty, \
ANV_GFX_STATE_##bit); \
} \
} while (0)
/* Program the three provoking vertex selects of hw_state-><cmd> according
 * to a VkProvokingVertexModeEXT value, marking ANV_GFX_STATE_<bit> dirty
 * through SET() for every field that changes.
 *
 * Expands to a bare switch statement; any mode other than first/last vertex
 * hits UNREACHABLE().
 *
 * The last line of the macro deliberately carries no trailing backslash: a
 * continuation there splices whatever line follows into the macro body.
 */
#define SETUP_PROVOKING_VERTEX(bit, cmd, mode)                         \
   switch (mode) {                                                     \
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:                     \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     0);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       1);         \
      break;                                                           \
   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:                      \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     1);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       2);         \
      break;                                                           \
   default:                                                            \
      UNREACHABLE("Invalid provoking vertex mode");                    \
   }

/* Variant of SETUP_PROVOKING_VERTEX() for commands that also have a
 * TriangleStripOddProvokingVertexSelect field. The other three selects are
 * set to 0 in last-vertex mode and to 0/0/1 in first-vertex mode; the odd
 * strip select is 0 for first-vertex and 1 for last-vertex.
 *
 * As above, the final line has no trailing backslash on purpose.
 */
#define SETUP_PROVOKING_VERTEX_FSB(bit, cmd, mode)                     \
   switch (mode) {                                                     \
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:                     \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     0);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       1);         \
      SET(bit, cmd.TriangleStripOddProvokingVertexSelect,  0);         \
      break;                                                           \
   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:                      \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     0);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       0);         \
      SET(bit, cmd.TriangleStripOddProvokingVertexSelect,  1);         \
      break;                                                           \
   default:                                                            \
      UNREACHABLE("Invalid provoking vertex mode");                    \
   }
ALWAYS_INLINE static void
update_urb_config(struct anv_gfx_dynamic_state *hw_state,
const struct anv_cmd_graphics_state *gfx,
const struct anv_device *device)
{
struct intel_urb_config new_cfg = { 0 };
#if GFX_VERx10 >= 125
if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
const struct brw_task_prog_data *task_prog_data =
get_gfx_task_prog_data(gfx);
const struct brw_mesh_prog_data *mesh_prog_data =
get_gfx_mesh_prog_data(gfx);
intel_get_mesh_urb_config(device->info, device->l3_config,
task_prog_data ? task_prog_data->map.size_dw : 0,
mesh_prog_data->map.size / 4, &new_cfg);
} else
#endif
{
for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
const struct brw_vue_prog_data *prog_data = anv_gfx_has_stage(gfx, i) ?
(const struct brw_vue_prog_data *) gfx->shaders[i]->prog_data :
NULL;
new_cfg.size[i] = prog_data ? prog_data->urb_entry_size : 1;
}
UNUSED bool constrained;
intel_get_urb_config(device->info, device->l3_config,
anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL),
anv_gfx_has_stage(gfx, MESA_SHADER_GEOMETRY),
&new_cfg, &constrained);
}
#if GFX_VER >= 12
SET(SF, sf.DerefBlockSize, new_cfg.deref_block_size);
#endif
for (int s = 0; s <= MESA_SHADER_MESH; s++) {
SET(URB, urb_cfg.size[s], new_cfg.size[s]);
SET(URB, urb_cfg.start[s], new_cfg.start[s]);
SET(URB, urb_cfg.entries[s], new_cfg.entries[s]);
}
}
ALWAYS_INLINE static void
update_fs_msaa_flags(struct anv_gfx_dynamic_state *hw_state,
const struct vk_dynamic_graphics_state *dyn,
const struct anv_cmd_graphics_state *gfx)
{
const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
if (!wm_prog_data)
return;
/* If we have any dynamic bits here, we might need to update the value
* in the push constant for the shader.
*/
if (!brw_wm_prog_data_is_dynamic(wm_prog_data))
return;
const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);
enum intel_msaa_flags fs_msaa_flags =
intel_fs_msaa_flags((struct intel_fs_params) {
.shader_sample_shading = wm_prog_data->sample_shading,
.shader_min_sample_shading = wm_prog_data->min_sample_shading,
.state_sample_shading = wm_prog_data->api_sample_shading,
.rasterization_samples = dyn->ms.rasterization_samples,
.coarse_pixel = !vk_fragment_shading_rate_is_disabled(&dyn->fsr),
.alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
.provoking_vertex_last = dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT,
.first_vue_slot = hw_state->first_vue_slot,
.primitive_id_index = hw_state->primitive_id_index,
.per_primitive_remapping = mesh_prog_data &&
mesh_prog_data->map.wa_18019110168_active,
});
SET(FS_MSAA_FLAGS, fs_msaa_flags, fs_msaa_flags);
}
static bool
sbe_primitive_id_override(const struct anv_cmd_graphics_state *gfx)
{
const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
if (!wm_prog_data)
return false;
if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
const struct brw_mesh_prog_data *mesh_prog_data =
get_gfx_mesh_prog_data(gfx);
const struct brw_mue_map *mue = &mesh_prog_data->map;
return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
mue->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_ID] == -1;
}
const struct intel_vue_map *vue_map = get_gfx_last_vue_map(gfx);
return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
(vue_map->slots_valid & VARYING_BIT_PRIMITIVE_ID) == 0;
}
/* Recompute the SBE / SBE_SWIZ (and, on GFX_VERx10 >= 125, SBE_MESH) state
 * from the bound fragment shader and the last pre-rasterization stage.
 *
 * Does nothing when no fragment program data is available. Besides the
 * SET()-tracked fields, this also refreshes hw_state->primitive_id_index and
 * hw_state->first_vue_slot.
 */
ALWAYS_INLINE static void
update_sbe(struct anv_gfx_dynamic_state *hw_state,
           const struct anv_cmd_graphics_state *gfx,
           const struct anv_device *device)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   if (wm_prog_data == NULL)
      return;

   const struct brw_mesh_prog_data *mesh_prog_data =
      get_gfx_mesh_prog_data(gfx);
   const struct intel_vue_map *vue_map = get_gfx_last_vue_map(gfx);
   const bool uses_mesh = mesh_prog_data != NULL;

   uint32_t read_offset, read_length, num_varyings, flat_inputs;
   brw_compute_sbe_per_vertex_urb_read(
      vue_map, uses_mesh,
      uses_mesh ? mesh_prog_data->map.wa_18019110168_active : false,
      wm_prog_data,
      &read_offset, &read_length, &num_varyings,
      &hw_state->primitive_id_index, &flat_inputs);

   /* The read offset is expressed in pairs of slots. */
   hw_state->first_vue_slot = read_offset * 2;

   /* As far as we can test, 3DSTATE_SBE & 3DSTATE_SBE_SWIZ has no effect when
    * the pipeline is using Mesh. We still fill the instruction for now, but
    * in the future we might want to completely avoid its emission.
    */
   SET(SBE, sbe.AttributeSwizzleEnable, !uses_mesh);
   SET(SBE, sbe.PointSpriteTextureCoordinateOrigin, UPPERLEFT);
   SET(SBE, sbe.NumberofSFOutputAttributes, num_varyings);
   SET(SBE, sbe.ConstantInterpolationEnable, flat_inputs);
   SET(SBE, sbe.VertexAttributesBypass, wm_prog_data->vertex_attributes_bypass);

   if (!uses_mesh) {
      for (uint8_t a = 0; a < wm_prog_data->urb_setup_attribs_count; a++) {
         const gl_varying_slot varying = wm_prog_data->urb_setup_attribs[a];
         const int fs_input = wm_prog_data->urb_setup[varying];

         assert(0 <= fs_input);

         /* Point coordinates are generated rather than read from the URB. */
         if (varying == VARYING_SLOT_PNTC) {
            SET(SBE, sbe.PointSpriteTextureCoordinateEnable, 1 << fs_input);
            continue;
         }

         /* Skip inputs the previous stage does not write. */
         const int vue_slot = vue_map->varying_to_slot[varying];
         if (vue_slot == -1)
            continue;

         /* We have to subtract two slots to account for the URB entry output
          * read offset in the VS and GS stages.
          */
         const int source_attr = vue_slot - 2 * read_offset;
         assert(source_attr >= 0 && source_attr < 32);

         /* The hardware can only apply overrides to 16 attributes at a time;
          * the remaining ones (up to 16) must already be lined up so that
          * the input index equals the output index.
          */
         if (fs_input >= 16) {
            assert(source_attr == fs_input);
         } else {
            SET(SBE_SWIZ,
                sbe_swiz.Attribute[fs_input].SourceAttribute,
                source_attr);
         }
      }

      SET(SBE, sbe.VertexURBEntryReadOffset, read_offset);
      SET(SBE, sbe.VertexURBEntryReadLength, read_length);
   }

   /* Ask the hardware to supply PrimitiveID if the fragment shader reads it
    * but a previous stage didn't write one.
    */
   const bool override_prim_id = sbe_primitive_id_override(gfx);
   SET(SBE, sbe.PrimitiveIDOverrideAttributeSelect,
       override_prim_id ? wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID] : 0);
   SET(SBE, sbe.PrimitiveIDOverrideComponentX, override_prim_id);
   SET(SBE, sbe.PrimitiveIDOverrideComponentY, override_prim_id);
   SET(SBE, sbe.PrimitiveIDOverrideComponentZ, override_prim_id);
   SET(SBE, sbe.PrimitiveIDOverrideComponentW, override_prim_id);

#if GFX_VERx10 >= 125
   if (uses_mesh) {
      SET(SBE_MESH, sbe_mesh.PerVertexURBEntryOutputReadOffset, read_offset);
      SET(SBE_MESH, sbe_mesh.PerVertexURBEntryOutputReadLength, read_length);

      uint32_t prim_offset, prim_length;
      brw_compute_sbe_per_primitive_urb_read(wm_prog_data->per_primitive_inputs,
                                             wm_prog_data->num_per_primitive_inputs,
                                             &mesh_prog_data->map,
                                             &prim_offset,
                                             &prim_length);

      SET(SBE_MESH, sbe_mesh.PerPrimitiveURBEntryOutputReadOffset, prim_offset);
      SET(SBE_MESH, sbe_mesh.PerPrimitiveURBEntryOutputReadLength, prim_length);
   }
#endif
}
ALWAYS_INLINE static void
update_ps(struct anv_gfx_dynamic_state *hw_state,
const struct anv_device *device,
const struct vk_dynamic_graphics_state *dyn,
const struct anv_cmd_graphics_state *gfx)
{
const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
if (!wm_prog_data) {
#if GFX_VER < 20
SET(PS, ps._8PixelDispatchEnable, false);
SET(PS, ps._16PixelDispatchEnable, false);
SET(PS, ps._32PixelDispatchEnable, false);
#else
SET(PS, ps.Kernel0Enable, false);
SET(PS, ps.Kernel1Enable, false);
#endif
return;
}
const struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
struct GENX(3DSTATE_PS) ps = {};
intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
MAX2(dyn->ms.rasterization_samples, 1),
hw_state->fs_msaa_flags);
SET(PS, ps.KernelStartPointer0,
fs->kernel.offset +
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
SET(PS, ps.KernelStartPointer1,
fs->kernel.offset +
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
#if GFX_VER < 20
SET(PS, ps.KernelStartPointer2,
fs->kernel.offset +
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
#endif
SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData0,
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0));
SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData1,
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1));
#if GFX_VER < 20
SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData2,
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2));
#endif
#if GFX_VER < 20
SET(PS, ps._8PixelDispatchEnable, ps._8PixelDispatchEnable);
SET(PS, ps._16PixelDispatchEnable, ps._16PixelDispatchEnable);
SET(PS, ps._32PixelDispatchEnable, ps._32PixelDispatchEnable);
#else
SET(PS, ps.Kernel0Enable, ps.Kernel0Enable);
SET(PS, ps.Kernel1Enable, ps.Kernel1Enable);
SET(PS, ps.Kernel0SIMDWidth, ps.Kernel0SIMDWidth);
SET(PS, ps.Kernel1SIMDWidth, ps.Kernel1SIMDWidth);
SET(PS, ps.Kernel0PolyPackingPolicy, ps.Kernel0PolyPackingPolicy);
SET(PS, ps.Kernel0MaximumPolysperThread, ps.Kernel0MaximumPolysperThread);
#endif
SET(PS, ps.PositionXYOffsetSelect,
!wm_prog_data->uses_pos_offset ? POSOFFSET_NONE :
brw_wm_prog_data_is_persample(wm_prog_data,
hw_state->fs_msaa_flags) ?
POSOFFSET_SAMPLE : POSOFFSET_CENTROID);
}
ALWAYS_INLINE static void
update_ps_extra_wm(struct anv_gfx_dynamic_state *hw_state,
const struct anv_cmd_graphics_state *gfx)
{
const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
if (!wm_prog_data)
return;
UNUSED const bool uses_coarse_pixel =
brw_wm_prog_data_is_coarse(wm_prog_data, hw_state->fs_msaa_flags);
uint32_t InputCoverageMaskState = ICMS_NONE;
assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
if (!wm_prog_data->uses_sample_mask)
InputCoverageMaskState = ICMS_NONE;
else if (wm_prog_data->post_depth_coverage)
InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
else
InputCoverageMaskState = ICMS_NORMAL;
SET(PS_EXTRA, ps_extra.InputCoverageMaskState, InputCoverageMaskState);
SET(PS_EXTRA, ps_extra.PixelShaderIsPerSample,
brw_wm_prog_data_is_persample(wm_prog_data,
hw_state->fs_msaa_flags));
#if GFX_VER >= 11
SET(PS_EXTRA, ps_extra.PixelShaderIsPerCoarsePixel, uses_coarse_pixel);
#endif
#if GFX_VERx10 >= 125
/* TODO: We should only require this when the last geometry shader uses a
* fragment shading rate that is not constant.
*/
SET(PS_EXTRA, ps_extra.EnablePSDependencyOnCPsizeChange, uses_coarse_pixel);
#endif
SET(WM, wm.BarycentricInterpolationMode,
wm_prog_data_barycentric_modes(wm_prog_data, hw_state->fs_msaa_flags));
#if INTEL_WA_18038825448_GFX_VER
SET(WA_18038825448, coarse_state, uses_coarse_pixel ?
ANV_COARSE_PIXEL_STATE_ENABLED :
ANV_COARSE_PIXEL_STATE_DISABLED);
#endif
}
ALWAYS_INLINE static void
update_ps_extra_has_uav(struct anv_gfx_dynamic_state *hw_state,
const struct anv_cmd_graphics_state *gfx)
{
const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
/* Force fragment shader execution if occlusion queries are active to
* ensure PS_DEPTH_COUNT is correct. Otherwise a fragment shader with
* discard and no render target setup could be increment PS_DEPTH_COUNT if
* the HW internally decides to not run the shader because it has already
* established that depth-test is passing.
*/
SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV,
wm_prog_data && (wm_prog_data->has_side_effects ||
gfx->n_occlusion_queries > 0),
FRAGMENT);
}
/* Update 3DSTATE_PS_EXTRA::PixelShaderKillsPixel for the fragment stage.
 *
 * The bit is set when fragment shader program data is bound and either
 * has_ds_feedback_loop() reports a feedback loop for the shader's bind map
 * or the shader uses kill.
 */
ALWAYS_INLINE static void
update_ps_extra_kills_pixel(struct anv_gfx_dynamic_state *hw_state,
                            const struct vk_dynamic_graphics_state *dyn,
                            const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   struct anv_shader *frag_shader = gfx->shaders[MESA_SHADER_FRAGMENT];

   /* frag_shader is only dereferenced once wm_prog_data is known to be
    * non-NULL.
    */
   SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
             wm_prog_data &&
             (has_ds_feedback_loop(&frag_shader->bind_map, dyn) ||
              wm_prog_data->uses_kill),
             FRAGMENT);
}
#if GFX_VERx10 >= 125
/* Return true when any bound TCS, TES or GS program has
 * include_primitive_id set.
 */
ALWAYS_INLINE static bool
geom_or_tess_prim_id_used(const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_tcs_prog_data *tcs = get_gfx_tcs_prog_data(gfx);
   const struct brw_tes_prog_data *tes = get_gfx_tes_prog_data(gfx);
   const struct brw_gs_prog_data *gs = get_gfx_gs_prog_data(gfx);

   if (tcs != NULL && tcs->include_primitive_id)
      return true;

   if (tes != NULL && tes->include_primitive_id)
      return true;

   return gs != NULL && gs->include_primitive_id;
}
/* Update the 3DSTATE_VFG distribution mode and granularity. */
ALWAYS_INLINE static void
update_vfg_distribution_mode(struct anv_gfx_dynamic_state *hw_state,
                             const struct anv_device *device,
                             const struct anv_cmd_graphics_state *gfx)
{
   /* Free round-robin is only selected before Gfx20 and without a
    * tessellation evaluation stage; everything else is strict.
    */
   const bool free_distribution =
      GFX_VER < 20 && !anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL);
   SET(VFG, vfg.DistributionMode, free_distribution ? RR_FREE : RR_STRICT);

   /* Wa_14019166699: use instance level granularity whenever a primitive ID
    * is consumed (SBE override or geometry/tessellation stages).
    */
   bool needs_instance_granularity = false;
   if (intel_needs_workaround(device->info, 14019166699)) {
      needs_instance_granularity =
         sbe_primitive_id_override(gfx) || geom_or_tess_prim_id_used(gfx);
   }
   SET(VFG, vfg.DistributionGranularity, needs_instance_granularity ?
                                         InstanceLevelGranularity :
                                         BatchLevelGranularity);

#if INTEL_WA_14014851047_GFX_VER
   SET(VFG, vfg.GranularityThresholdDisable,
       intel_needs_workaround(device->info, 14014851047));
#endif
}
/* Mirror the dynamic primitive restart enable into
 * 3DSTATE_VFG::ListCutIndexEnable.
 */
ALWAYS_INLINE static void
update_vfg_list_cut_index(struct anv_gfx_dynamic_state *hw_state,
                          const struct vk_dynamic_graphics_state *dyn)
{
   const bool restart_enabled = dyn->ia.primitive_restart_enable;

   SET(VFG, vfg.ListCutIndexEnable, restart_enabled);
}
#endif
/* Update the 3DSTATE_STREAMOUT rendering controls from the dynamic
 * rasterization state and, for Wa_18022508906, the number of active
 * occlusion queries.
 */
ALWAYS_INLINE static void
update_streamout(struct anv_gfx_dynamic_state *hw_state,
                 const struct vk_dynamic_graphics_state *dyn,
                 const struct anv_cmd_graphics_state *gfx)
{
   SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable);
   SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream);

#if INTEL_NEEDS_WA_18022508906
   /* Wa_18022508906 :
    *
    * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
    *
    * SOL_INT::Render_Enable =
    *   (3DSTATE_STREAMOUT::Force_Rending == Force_On) ||
    *   (
    *     (3DSTATE_STREAMOUT::Force_Rending != Force_Off) &&
    *     !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
    *     !3DSTATE_STREAMOUT::API_Render_Disable &&
    *     (
    *       3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
    *       3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
    *       3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
    *       3DSTATE_PS_EXTRA::PS_Valid ||
    *       3DSTATE_WM::Legacy Depth_Buffer_Clear ||
    *       3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
    *       3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
    *     )
    *   )
    *
    * If SOL_INT::Render_Enable is false, the SO stage will not forward any
    * topologies down the pipeline, which is not what we want for occlusion
    * queries.
    *
    * Here we force rendering to get SOL_INT::Render_Enable when occlusion
    * queries are active.
    */
   /* Reads back the RenderingDisable value written just above. */
   const bool force_rendering =
      !GET(so.RenderingDisable) && gfx->n_occlusion_queries > 0;
   SET(STREAMOUT, so.ForceRendering, force_rendering ? Force_on : 0);
#endif
}
/* Program the provoking vertex selection in SF and CLIP, and the matching
 * reorder mode in STREAMOUT and GS, from the dynamic provoking vertex mode.
 */
ALWAYS_INLINE static void
update_provoking_vertex(struct anv_gfx_dynamic_state *hw_state,
                        const struct vk_dynamic_graphics_state *dyn,
                        const struct anv_cmd_graphics_state *gfx)
{
#if GFX_VERx10 >= 200
   /* In order to respect the table indicated by Vulkan 1.4.312,
    * 28.9. Barycentric Interpolation, we need to program the provoking
    * vertex state differently depending on whether we need to set
    * vertex_attributes_bypass or not.
    *
    * At this point we only deal with full pipelines, so if we don't have
    * a wm_prog_data, there is no fragment shader and none of this matters.
    */
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   const bool attributes_bypass =
      wm_prog_data != NULL && wm_prog_data->vertex_attributes_bypass;

   if (attributes_bypass) {
      SETUP_PROVOKING_VERTEX_FSB(SF, sf, dyn->rs.provoking_vertex);
      SETUP_PROVOKING_VERTEX_FSB(CLIP, clip, dyn->rs.provoking_vertex);
   } else {
      /* If we are not setting vertex attributes bypass, we can just use
       * the same macro as older generations. There's one bit missing from
       * it, but that one is only used for the case above and ignored
       * otherwise, so we can pretend it doesn't exist here.
       */
      SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
      SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);
   }
#else
   SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
   SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);
#endif

   /* First-vertex convention maps to LEADING, last-vertex to TRAILING. */
   uint32_t reorder_mode = 0;
   switch (dyn->rs.provoking_vertex) {
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
      reorder_mode = LEADING;
      break;
   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
      reorder_mode = TRAILING;
      break;
   default:
      UNREACHABLE("Invalid provoking vertex mode");
   }

   SET(STREAMOUT, so.ReorderMode, reorder_mode);
   SET_STAGE(GS, gs.ReorderMode, reorder_mode, GEOMETRY);
}
/* Select the 3DSTATE_VF_TOPOLOGY primitive type: a patch list sized by the
 * dynamic patch control point count when a tessellation evaluation shader
 * is bound, otherwise the translation of the dynamic primitive topology.
 */
ALWAYS_INLINE static void
update_topology(struct anv_gfx_dynamic_state *hw_state,
                const struct vk_dynamic_graphics_state *dyn,
                const struct anv_cmd_graphics_state *gfx)
{
   uint32_t topology;

   if (gfx->shaders[MESA_SHADER_TESS_EVAL] != NULL)
      topology = _3DPRIM_PATCHLIST(dyn->ts.patch_control_points);
   else
      topology = vk_to_intel_primitive_type[dyn->ia.primitive_topology];

   SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, topology);
}
#if GFX_VER >= 11
/* Update the coarse pixel shading state from the dynamic fragment shading
 * rate. Which fields are programmed depends on the hardware generation.
 */
ALWAYS_INLINE static void
update_cps(struct anv_gfx_dynamic_state *hw_state,
           const struct anv_device *device,
           const struct vk_dynamic_graphics_state *dyn)
{
#if GFX_VER >= 30
   /* Gfx30+: combiner opcodes and coarse pixel sizes are programmed
    * directly.
    */
   SET(CPS, coarse_pixel.CPSizeCombiner0Opcode,
       vk_to_intel_shading_rate_combiner_op[dyn->fsr.combiner_ops[0]]);
   SET(CPS, coarse_pixel.CPSizeCombiner1Opcode,
       vk_to_intel_shading_rate_combiner_op[dyn->fsr.combiner_ops[1]]);
   SET(CPS, coarse_pixel.CPSizeX,
       get_cps_size(dyn->fsr.fragment_size.width));
   SET(CPS, coarse_pixel.CPSizeY,
       get_cps_size(dyn->fsr.fragment_size.height));
#elif GFX_VER >= 12
   /* Gfx12+: point at the state entry returned by get_cps_state_offset(). */
   SET(CPS, cps.CoarsePixelShadingStateArrayPointer,
       get_cps_state_offset(device, &dyn->fsr));
#else
   /* Gfx11: constant mode with the fragment size as minimum CP size. */
   STATIC_ASSERT(GFX_VER == 11);
   SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width);
   SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height);
   SET(CPS, cps.CoarsePixelShadingMode, CPS_MODE_CONSTANT);
#endif
}
#endif
ALWAYS_INLINE static void
update_ds(struct anv_gfx_dynamic_state *hw_state,
const struct anv_cmd_graphics_state *gfx)
{
const struct brw_tes_prog_data *tes_prog_data = get_gfx_tes_prog_data(gfx);
if (tes_prog_data) {
struct brw_tess_info tess_info =
brw_merge_tess_info(get_gfx_tcs_prog_data(gfx)->tess_info,
tes_prog_data->tess_info);
SET(DS, ds.ComputeWCoordinateEnable,
brw_tess_info_domain(tess_info) == INTEL_TESS_DOMAIN_TRI);
}
}
/* Update the 3DSTATE_TE fields from the merged TCS/TES tessellation info,
 * the dynamic tessellation domain origin and the device workarounds.
 *
 * Without a TES only the output topology is written (OUTPUT_POINT).
 */
ALWAYS_INLINE static void
update_te(struct anv_gfx_dynamic_state *hw_state,
          const struct anv_device *device,
          const struct vk_dynamic_graphics_state *dyn,
          const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_tes_prog_data *tes_prog_data = get_gfx_tes_prog_data(gfx);

   if (tes_prog_data == NULL) {
      SET(TE, te.OutputTopology, OUTPUT_POINT);
      return;
   }

   struct brw_tess_info tess_info =
      brw_merge_tess_info(get_gfx_tcs_prog_data(gfx)->tess_info,
                          tes_prog_data->tess_info);

   SET(TE, te.TEDomain, brw_tess_info_domain(tess_info));

#if GFX_VER >= 12
   SET(TE, te.PatchHeaderLayout,
       tess_info.primitive_mode == TESS_PRIMITIVE_TRIANGLES ?
       REVERSED_TRI_INSIDE_SEPARATE : REVERSED);
#endif

   SET(TE, te.Partitioning, brw_tess_info_partitioning(tess_info));

   enum intel_tess_output_topology output_topology =
      brw_tess_info_output_topology(tess_info);
   if (dyn->ts.domain_origin != VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
      /* When the origin is upper-left, we have to flip the winding order.
       * Non-triangle topologies are left untouched.
       */
      if (output_topology == OUTPUT_TRI_CCW)
         output_topology = OUTPUT_TRI_CW;
      else if (output_topology == OUTPUT_TRI_CW)
         output_topology = OUTPUT_TRI_CCW;
   }
   SET(TE, te.OutputTopology, output_topology);

#if GFX_VERx10 >= 125
   uint32_t distrib_mode = TEDMODE_RR_FREE;
   if (intel_needs_workaround(device->info, 22012699309))
      distrib_mode = TEDMODE_RR_STRICT;

   /* Wa_14015055625:
    *
    * Disable Tessellation Distribution when primitive Id is enabled.
    */
   if (intel_needs_workaround(device->info, 14015055625) &&
       (sbe_primitive_id_override(gfx) || geom_or_tess_prim_id_used(gfx))) {
      distrib_mode = TEDMODE_OFF;
   }

   /* Debug feature for hang analysis */
   if (!device->physical->instance->enable_te_distribution)
      distrib_mode = TEDMODE_OFF;

   SET(TE, te.TessellationDistributionMode, distrib_mode);
#endif
}
/* Update 3DSTATE_PRIMITIVE_REPLICATION from the number of position slots of
 * the last VUE map and the view mask.
 *
 * When there is no VUE map, or it reports no position slot, replication is
 * programmed as fully disabled (empty mask, zero count).
 */
ALWAYS_INLINE static void
update_primitive_replication(struct anv_gfx_dynamic_state *hw_state,
                             const struct anv_cmd_graphics_state *gfx)
{
   const struct intel_vue_map *vue_map = get_gfx_last_vue_map(gfx);
   const uint32_t count = vue_map ? vue_map->num_pos_slots : 0;

   SET(PRIMITIVE_REPLICATION, pr.ReplicaMask, (1u << count) - 1);

   /* The field holds "count - 1". Guard the count == 0 case so the unsigned
    * subtraction does not wrap around to UINT32_MAX and end up programmed
    * into the instruction.
    */
   SET(PRIMITIVE_REPLICATION, pr.ReplicationCount, count > 0 ? count - 1 : 0);

   if (count) {
      /* One RTAI offset per view enabled in the view mask, in bit order. */
      int i = 0;
      u_foreach_bit(view_index, gfx->view_mask) {
         SET(PRIMITIVE_REPLICATION, pr.RTAIOffset[i], view_index);
         i++;
      }
   }
}
/* Copy the dynamic line width into 3DSTATE_SF::LineWidth. */
ALWAYS_INLINE static void
update_line_width(struct anv_gfx_dynamic_state *hw_state,
                  const struct vk_dynamic_graphics_state *dyn)
{
   SET(SF,
       sf.LineWidth,
       dyn->rs.line.width);
}
ALWAYS_INLINE static void
update_sf_point_width_source(struct anv_gfx_dynamic_state *hw_state,
const struct anv_cmd_graphics_state *gfx)
{
SET(SF, sf.PointWidthSource,
(get_gfx_last_vue_map(gfx)->slots_valid & VARYING_BIT_PSIZ) ?
Vertex : State);
}
/* Update 3DSTATE_SF::LegacyGlobalDepthBiasEnable from the dynamic depth bias
 * representation.
 */
ALWAYS_INLINE static void
update_sf_global_depth_bias(struct anv_gfx_dynamic_state *hw_state,
                            const struct vk_dynamic_graphics_state *dyn)
{
   /**
    * From the Vulkan Spec:
    *
    *    "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth bias
    *     representation is a factor of constant r equal to 1."
    *
    * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset:
    *
    *    "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer):
    *
    *     Bias = GlobalDepthOffsetConstant * r + GlobalDepthOffsetScale * MaxDepthSlope
    *
    *     Where r is the minimum representable value > 0 in the depth buffer
    *     format, converted to float32 (note: If state bit Legacy Global Depth
    *     Bias Enable is set, the r term will be forced to 1.0)"
    *
    * When VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT is set, enable
    * LegacyGlobalDepthBiasEnable.
    */
   const bool float_representation =
      dyn->rs.depth_bias.representation ==
      VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT;

   SET(SF, sf.LegacyGlobalDepthBiasEnable, float_representation);
}
/* Select 3DSTATE_CLIP::APIMode: OGL when the dynamic viewport state asks for
 * a [-1, 1] depth clip range, D3D otherwise.
 */
ALWAYS_INLINE static void
update_clip_api_mode(struct anv_gfx_dynamic_state *hw_state,
                     const struct vk_dynamic_graphics_state *dyn)
{
   const bool negative_one_to_one = dyn->vp.depth_clip_negative_one_to_one;

   SET(CLIP, clip.APIMode, negative_one_to_one ? APIMODE_OGL : APIMODE_D3D);
}
/* Update 3DSTATE_CLIP::MaximumVPIndex from the dynamic viewport count. */
ALWAYS_INLINE static void
update_clip_max_viewport(struct anv_gfx_dynamic_state *hw_state,
                         const struct vk_dynamic_graphics_state *dyn)
{
   /* From the Vulkan 1.0.45 spec:
    *
    *    "If the last active vertex processing stage shader entry point's
    *     interface does not include a variable decorated with ViewportIndex,
    *     then the first viewport is used."
    *
    * This could mean that we might need to set the MaximumVPIndex based on
    * the pipeline's last stage, but if the last shader doesn't write the
    * viewport index and the VUE header is used, the compiler will force the
    * value to 0 (which is what the spec requires above). Otherwise it seems
    * like the HW should be pulling 0 if the VUE header is not present.
    *
    * Avoiding a check on the pipeline seems to prevent additional emissions
    * of 3DSTATE_CLIP which appear to impact performance on Assassin's Creed
    * Valhalla.
    */
   uint32_t max_vp_index = 0;
   if (dyn->vp.viewport_count > 0)
      max_vp_index = dyn->vp.viewport_count - 1;

   SET(CLIP, clip.MaximumVPIndex, max_vp_index);
}
/* Update 3DSTATE_RASTER, plus 3DSTATE_CLIP::ViewportXYClipTestEnable, from
 * the dynamic rasterization, multisample and input assembly state.
 */
ALWAYS_INLINE static void
update_clip_raster(struct anv_gfx_dynamic_state *hw_state,
                   const struct vk_dynamic_graphics_state *dyn,
                   const struct anv_cmd_graphics_state *gfx)
{
   /* Take dynamic primitive topology in to account with
    *    3DSTATE_RASTER::APIMode
    *    3DSTATE_RASTER::DXMultisampleRasterizationEnable
    *    3DSTATE_RASTER::AntialiasingEnable
    */
   const VkLineRasterizationModeKHR line_mode =
      anv_line_rasterization_mode(dyn->rs.line.mode,
                                  dyn->ms.rasterization_samples);
   const VkPolygonMode raster_mode =
      anv_raster_polygon_mode(gfx,
                              dyn->rs.polygon_mode,
                              dyn->ia.primitive_topology);

   uint32_t raster_api_mode = 0;
   bool dx_msaa_raster = false;
   anv_rasterization_mode(raster_mode,
                          line_mode, dyn->rs.line.width,
                          &raster_api_mode, &dx_msaa_raster);

   /* From the Broadwell PRM, Volume 2, documentation for 3DSTATE_RASTER,
    * "Antialiasing Enable":
    *
    *    "This field must be disabled if any of the render targets have integer
    *     (UINT or SINT) surface format."
    *
    * Additionally internal documentation for Gfx12+ states:
    *
    *    "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
    *     FORCED_SAMPLE_COUNT > 1."
    */
   const bool aa_forbidden =
      gfx->has_uint_rt || (GFX_VER >= 12 && gfx->samples > 1);
   const bool aa_enable =
      anv_rasterization_aa_mode(raster_mode, line_mode) && !aa_forbidden;

   const bool depth_clip_enable =
      vk_rasterization_state_depth_clip_enable(&dyn->rs);

   /* XY clip testing is only enabled when rasterizing filled polygons. */
   SET(CLIP, clip.ViewportXYClipTestEnable,
       raster_mode == VK_POLYGON_MODE_FILL);

   SET(RASTER, raster.APIMode, raster_api_mode);
   SET(RASTER, raster.DXMultisampleRasterizationEnable, dx_msaa_raster);
   SET(RASTER, raster.AntialiasingEnable, aa_enable);

   SET(RASTER, raster.CullMode, vk_to_intel_cullmode[dyn->rs.cull_mode]);
   SET(RASTER, raster.FrontWinding, vk_to_intel_front_face[dyn->rs.front_face]);

   /* Both faces use the same fill mode. */
   SET(RASTER, raster.FrontFaceFillMode, vk_to_intel_fillmode[dyn->rs.polygon_mode]);
   SET(RASTER, raster.BackFaceFillMode, vk_to_intel_fillmode[dyn->rs.polygon_mode]);

   /* A single depth bias enable drives all three primitive classes. */
   SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable);
   SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable);
   SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable);
   SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant_factor);
   SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope_factor);
   SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp);

   SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable);
   SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable);

   SET(RASTER, raster.ConservativeRasterizationEnable,
       dyn->rs.conservative_mode !=
       VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);

#if GFX_VERx10 >= 200
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   SET(RASTER, raster.LegacyBaryAssignmentDisable,
       wm_prog_data && wm_prog_data->vertex_attributes_bypass);
#endif
}
/* Update 3DSTATE_CLIP::ForceZeroRTAIndexEnable: the render target array
 * index is forced to zero unless the pre-rasterization stages write the
 * layer.
 */
ALWAYS_INLINE static void
update_clip_preraster_stages(struct anv_gfx_dynamic_state *hw_state,
                             const struct anv_cmd_graphics_state *gfx)
{
   bool layer_written;

   if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
      /* Mesh: the layer is written if it has a per-primitive offset. */
      layer_written =
         get_gfx_mesh_prog_data(gfx)->map.per_primitive_offsets[VARYING_SLOT_LAYER] >= 0;
   } else {
      /* Otherwise: look for the layer slot in the last VUE map. */
      layer_written =
         (get_gfx_last_vue_map(gfx)->slots_valid & VARYING_BIT_LAYER) != 0;
   }

   SET(CLIP, clip.ForceZeroRTAIndexEnable, !layer_written);
}
/* Update 3DSTATE_CLIP::NonPerspectiveBarycentricEnable from the fragment
 * shader; it is cleared when no fragment shader program data is bound.
 */
ALWAYS_INLINE static void
update_clip_non_perspective_barycentrics(struct anv_gfx_dynamic_state *hw_state,
                                         const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);

   if (wm_prog_data != NULL) {
      SET(CLIP, clip.NonPerspectiveBarycentricEnable,
          wm_prog_data->uses_nonperspective_interp_modes);
   } else {
      SET(CLIP, clip.NonPerspectiveBarycentricEnable, 0);
   }
}
/* Update 3DSTATE_MULTISAMPLE::NumberofMultisamples with the index of the
 * lowest set bit of the rasterization sample count, which is treated as at
 * least 1.
 */
ALWAYS_INLINE static void
update_multisample(struct anv_gfx_dynamic_state *hw_state,
                   const struct vk_dynamic_graphics_state *dyn)
{
   /* __builtin_ffs() is 1-based, hence the subtraction. */
   const int samples_bit_index =
      __builtin_ffs(MAX2(dyn->ms.rasterization_samples, 1)) - 1;

   SET(MULTISAMPLE, ms.NumberofMultisamples, samples_bit_index);
}
/* Update 3DSTATE_SAMPLE_MASK::SampleMask from the dynamic sample mask. */
ALWAYS_INLINE static void
update_sample_mask(struct anv_gfx_dynamic_state *hw_state,
                   const struct vk_dynamic_graphics_state *dyn)
{
   /* From the Vulkan 1.0 spec:
    *    If pSampleMask is NULL, it is treated as if the mask has all bits
    *    enabled, i.e. no coverage is removed from fragments.
    *
    * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
    */
   const uint32_t hw_sample_mask = dyn->ms.sample_mask & 0xffff;

   SET(SAMPLE_MASK, sm.SampleMask, hw_sample_mask);
}
/* Update 3DSTATE_WM_DEPTH_STENCIL from the dynamic depth/stencil state.
 *
 * The state is first run through vk_optimize_depth_stencil_state() using
 * the aspects of the bound depth and stencil attachments. The same optimized
 * copy feeds the Gfx9 PMA fix decision and the Wa_18019816803 tracking.
 */
ALWAYS_INLINE static void
update_wm_depth_stencil(struct anv_gfx_dynamic_state *hw_state,
                        const struct vk_dynamic_graphics_state *dyn,
                        const struct anv_cmd_graphics_state *gfx,
                        const struct anv_device *device)
{
   /* An aspect is present when its attachment has a defined format. */
   const VkImageAspectFlags ds_aspects =
      (gfx->depth_att.vk_format != VK_FORMAT_UNDEFINED ?
       VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
      (gfx->stencil_att.vk_format != VK_FORMAT_UNDEFINED ?
       VK_IMAGE_ASPECT_STENCIL_BIT : 0);

   struct vk_depth_stencil_state opt_ds = dyn->ds;
   vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true);

   /* Depth test. */
   SET(WM_DEPTH_STENCIL, wm_ds.DepthTestEnable, opt_ds.depth.test_enable);
   SET(WM_DEPTH_STENCIL, wm_ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
   SET(WM_DEPTH_STENCIL, wm_ds.DepthTestFunction,
       vk_to_intel_compare_op[opt_ds.depth.compare_op]);

   /* Stencil test, always programmed as double sided. */
   SET(WM_DEPTH_STENCIL, wm_ds.DoubleSidedStencilEnable, true);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilTestEnable, opt_ds.stencil.test_enable);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilBufferWriteEnable,
       opt_ds.stencil.write_enable);

   /* Front face stencil state. */
   SET(WM_DEPTH_STENCIL, wm_ds.StencilTestMask,
       opt_ds.stencil.front.compare_mask & 0xff);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilWriteMask,
       opt_ds.stencil.front.write_mask & 0xff);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilReferenceValue,
       opt_ds.stencil.front.reference & 0xff);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilFailOp,
       vk_to_intel_stencil_op[opt_ds.stencil.front.op.fail]);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilPassDepthPassOp,
       vk_to_intel_stencil_op[opt_ds.stencil.front.op.pass]);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilPassDepthFailOp,
       vk_to_intel_stencil_op[opt_ds.stencil.front.op.depth_fail]);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilTestFunction,
       vk_to_intel_compare_op[opt_ds.stencil.front.op.compare]);

   /* Back face stencil state. */
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilTestMask,
       opt_ds.stencil.back.compare_mask & 0xff);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilWriteMask,
       opt_ds.stencil.back.write_mask & 0xff);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilReferenceValue,
       opt_ds.stencil.back.reference & 0xff);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilFailOp,
       vk_to_intel_stencil_op[opt_ds.stencil.back.op.fail]);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilPassDepthPassOp,
       vk_to_intel_stencil_op[opt_ds.stencil.back.op.pass]);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilPassDepthFailOp,
       vk_to_intel_stencil_op[opt_ds.stencil.back.op.depth_fail]);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilTestFunction,
       vk_to_intel_compare_op[opt_ds.stencil.back.op.compare]);

#if GFX_VER == 9
   const bool pma = want_stencil_pma_fix(dyn, gfx, &opt_ds);
   SET(PMA_FIX, pma_fix, pma);
#endif

#if INTEL_WA_18019816803_GFX_VER
   if (intel_needs_workaround(device->info, 18019816803)) {
      /* Track whether either depth or stencil writes are enabled. */
      const bool writes_depth_or_stencil =
         opt_ds.depth.write_enable || opt_ds.stencil.write_enable;
      SET(WA_18019816803, ds_write_state, writes_depth_or_stencil);
   }
#endif
}
/* Update 3DSTATE_DEPTH_BOUNDS from the dynamic depth bounds test state. */
ALWAYS_INLINE static void
update_depth_bounds(struct anv_gfx_dynamic_state *hw_state,
                    const struct vk_dynamic_graphics_state *dyn)
{
   const bool bounds_test_enabled = dyn->ds.depth.bounds_test.enable;

   SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, bounds_test_enabled);

   /* Only look at updating the bounds if testing is enabled */
   if (!bounds_test_enabled)
      return;

   SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min);
   SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max);
}
/* Update 3DSTATE_LINE_STIPPLE and 3DSTATE_WM::LineStippleEnable from the
 * dynamic line stipple state.
 */
ALWAYS_INLINE static void
update_line_stipple(struct anv_gfx_dynamic_state *hw_state,
                    const struct vk_dynamic_graphics_state *dyn)
{
   SET(WM, wm.LineStippleEnable, dyn->rs.line.stipple.enable);

   SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern);
   SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor);

   /* The factor is clamped to at least 1 before taking its inverse, so a
    * zero factor cannot cause a division by zero.
    */
   SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount,
       1.0f / MAX2(1, dyn->rs.line.stipple.factor));
}
/* Update the 3DSTATE_VF primitive restart controls: the enable comes from
 * the dynamic input assembly state and the cut index from the type of the
 * bound index buffer.
 */
ALWAYS_INLINE static void
update_vf_restart(struct anv_gfx_dynamic_state *hw_state,
                  const struct vk_dynamic_graphics_state *dyn,
                  const struct anv_cmd_graphics_state *gfx)
{
   const bool restart_enabled = dyn->ia.primitive_restart_enable;

   SET(VF, vf.IndexedDrawCutIndexEnable, restart_enabled);
   SET(VF, vf.CutIndex, vk_index_to_restart(gfx->index_type));
}
ALWAYS_INLINE static void
update_blend_state(struct anv_gfx_dynamic_state *hw_state,
const struct vk_dynamic_graphics_state *dyn,
struct anv_cmd_graphics_state *gfx,
const struct anv_device *device,
bool has_fs_stage,
bool has_fs_dual_src)
{
const struct anv_instance *instance = device->physical->instance;
const uint8_t color_writes = dyn->cb.color_write_enables;
bool has_writeable_rt =
has_fs_stage &&
!anv_gfx_all_color_write_masked(gfx, dyn);
SET(BLEND_STATE, blend.AlphaToCoverageEnable,
dyn->ms.alpha_to_coverage_enable);
SET(BLEND_STATE, blend.AlphaToOneEnable,
dyn->ms.alpha_to_one_enable);
SET(BLEND_STATE, blend.ColorDitherEnable,
gfx->rendering_flags &
VK_RENDERING_ENABLE_LEGACY_DITHERING_BIT_EXT);
bool independent_alpha_blend = false;
/* Wa_14018912822, check if we set these during RT setup. */
bool color_blend_zero = false;
bool alpha_blend_zero = false;
uint32_t rt_0 = MESA_VK_ATTACHMENT_UNUSED;
for (uint32_t rt = 0; rt < MAX_RTS; rt++) {
if (gfx->color_output_mapping[rt] >= gfx->color_att_count) {
/* The Dual Source Blending documentation says:
*
* "If SRC1 is included in a src/dst blend factor and a DualSource RT
* Write message is not used, results are UNDEFINED."
*
* In practice, this results in hangs if we leave the Dual Source
* Blending enabled for the unused render targets. The easiest way to
* avoid it altogether is to completely disable the blending for them.
*/
SET(BLEND_STATE, blend.rts[rt].ColorBufferBlendEnable, false);
continue;
}
uint32_t att = gfx->color_output_mapping[rt];
if (att == 0)
rt_0 = att;
/* Disable anything above the current number of color attachments. */
bool write_disabled = (color_writes & BITFIELD_BIT(att)) == 0;
SET(BLEND_STATE, blend.rts[rt].WriteDisableAlpha,
write_disabled ||
(dyn->cb.attachments[att].write_mask &
VK_COLOR_COMPONENT_A_BIT) == 0);
SET(BLEND_STATE, blend.rts[rt].WriteDisableRed,
write_disabled ||
(dyn->cb.attachments[att].write_mask &
VK_COLOR_COMPONENT_R_BIT) == 0);
SET(BLEND_STATE, blend.rts[rt].WriteDisableGreen,
write_disabled ||
(dyn->cb.attachments[att].write_mask &
VK_COLOR_COMPONENT_G_BIT) == 0);
SET(BLEND_STATE, blend.rts[rt].WriteDisableBlue,
write_disabled ||
(dyn->cb.attachments[att].write_mask &
VK_COLOR_COMPONENT_B_BIT) == 0);
/* Vulkan specification 1.2.168, VkLogicOp:
*
* "Logical operations are controlled by the logicOpEnable and logicOp
* members of VkPipelineColorBlendStateCreateInfo. If logicOpEnable is
* VK_TRUE, then a logical operation selected by logicOp is applied
* between each color attachment and the fragments corresponding
* output value, and blending of all attachments is treated as if it
* were disabled."
*
* From the Broadwell PRM Volume 2d: Command Reference: Structures:
* BLEND_STATE_ENTRY:
*
* "Enabling LogicOp and Color Buffer Blending at the same time is
* UNDEFINED"
*
* The Vulkan spec also says:
* "Logical operations are not applied to floating-point or sRGB format
* color attachments."
* and
* "Any attachments using color formats for which logical operations
* are not supported simply pass through the color values unmodified."
*/
bool ignores_logic_op =
vk_format_is_float(gfx->color_att[att].vk_format) ||
vk_format_is_srgb(gfx->color_att[att].vk_format);
SET(BLEND_STATE, blend.rts[rt].LogicOpFunction,
vk_to_intel_logic_op[dyn->cb.logic_op]);
SET(BLEND_STATE, blend.rts[rt].LogicOpEnable,
dyn->cb.logic_op_enable && !ignores_logic_op);
SET(BLEND_STATE, blend.rts[rt].ColorClampRange, COLORCLAMP_RTFORMAT);
SET(BLEND_STATE, blend.rts[rt].PreBlendColorClampEnable, true);
SET(BLEND_STATE, blend.rts[rt].PostBlendColorClampEnable, true);
#if GFX_VER >= 30
SET(BLEND_STATE, blend.rts[rt].SimpleFloatBlendEnable, true);
#endif
/* Setup blend equation. */
SET(BLEND_STATE, blend.rts[rt].ColorBlendFunction,
vk_to_intel_blend_op[
dyn->cb.attachments[att].color_blend_op]);
SET(BLEND_STATE, blend.rts[rt].AlphaBlendFunction,
vk_to_intel_blend_op[
dyn->cb.attachments[att].alpha_blend_op]);
if (dyn->cb.attachments[att].src_color_blend_factor !=
dyn->cb.attachments[att].src_alpha_blend_factor ||
dyn->cb.attachments[att].dst_color_blend_factor !=
dyn->cb.attachments[att].dst_alpha_blend_factor ||
dyn->cb.attachments[att].color_blend_op !=
dyn->cb.attachments[att].alpha_blend_op)
independent_alpha_blend = true;
/* The Dual Source Blending documentation says:
*
* "If SRC1 is included in a src/dst blend factor and a DualSource RT
* Write message is not used, results are UNDEFINED. (This reflects the
* same restriction in DX APIs, where undefined results are produced if
* “o1” is not written by a PS there are no default values defined)."
*
* There is no way to gracefully fix this undefined situation so we just
* disable the blending to prevent possible issues.
*/
if (has_fs_stage && !has_fs_dual_src &&
anv_is_dual_src_blend_equation(&dyn->cb.attachments[att])) {
SET(BLEND_STATE, blend.rts[rt].ColorBufferBlendEnable, false);
} else {
SET(BLEND_STATE, blend.rts[rt].ColorBufferBlendEnable,
!dyn->cb.logic_op_enable &&
dyn->cb.attachments[att].blend_enable);
}
/* Our hardware applies the blend factor prior to the blend function
* regardless of what function is used. Technically, this means the
* hardware can do MORE than GL or Vulkan specify. However, it also
* means that, for MIN and MAX, we have to stomp the blend factor to ONE
* to make it a no-op.
*/
uint32_t SourceBlendFactor;
uint32_t DestinationBlendFactor;
uint32_t SourceAlphaBlendFactor;
uint32_t DestinationAlphaBlendFactor;
if (dyn->cb.attachments[att].color_blend_op == VK_BLEND_OP_MIN ||
dyn->cb.attachments[att].color_blend_op == VK_BLEND_OP_MAX) {
SourceBlendFactor = BLENDFACTOR_ONE;
DestinationBlendFactor = BLENDFACTOR_ONE;
} else {
SourceBlendFactor = vk_to_intel_blend[
dyn->cb.attachments[att].src_color_blend_factor];
DestinationBlendFactor = vk_to_intel_blend[
dyn->cb.attachments[att].dst_color_blend_factor];
}
if (dyn->cb.attachments[att].alpha_blend_op == VK_BLEND_OP_MIN ||
dyn->cb.attachments[att].alpha_blend_op == VK_BLEND_OP_MAX) {
SourceAlphaBlendFactor = BLENDFACTOR_ONE;
DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
} else {
SourceAlphaBlendFactor = vk_to_intel_blend[
dyn->cb.attachments[att].src_alpha_blend_factor];
DestinationAlphaBlendFactor = vk_to_intel_blend[
dyn->cb.attachments[att].dst_alpha_blend_factor];
}
/* Replace any Src1 blend factor by 1.0 if dual source blending is not
 * enabled.
 */
if (has_fs_stage && !has_fs_dual_src) {
if (is_src1_blend_factor(SourceBlendFactor))
SourceBlendFactor = BLENDFACTOR_ONE;
if (is_src1_blend_factor(DestinationBlendFactor))
DestinationBlendFactor = BLENDFACTOR_ONE;
}
if (instance->intel_enable_wa_14018912822 &&
intel_needs_workaround(device->info, 14018912822) &&
dyn->ms.rasterization_samples > 1) {
if (DestinationBlendFactor == BLENDFACTOR_ZERO) {
DestinationBlendFactor = BLENDFACTOR_CONST_COLOR;
color_blend_zero = true;
}
if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) {
DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA;
alpha_blend_zero = true;
}
}
SET(BLEND_STATE, blend.rts[rt].SourceBlendFactor, SourceBlendFactor);
SET(BLEND_STATE, blend.rts[rt].DestinationBlendFactor, DestinationBlendFactor);
SET(BLEND_STATE, blend.rts[rt].SourceAlphaBlendFactor, SourceAlphaBlendFactor);
SET(BLEND_STATE, blend.rts[rt].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor);
}
gfx->color_blend_zero = color_blend_zero;
gfx->alpha_blend_zero = alpha_blend_zero;
SET(BLEND_STATE, blend.IndependentAlphaBlendEnable, independent_alpha_blend);
if (rt_0 == MESA_VK_ATTACHMENT_UNUSED)
rt_0 = 0;
/* 3DSTATE_PS_BLEND to be consistent with the rest of the
* BLEND_STATE_ENTRY.
*/
SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt);
SET(PS_BLEND, ps_blend.ColorBufferBlendEnable,
GET(blend.rts[rt_0].ColorBufferBlendEnable));
SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor,
GET(blend.rts[rt_0].SourceAlphaBlendFactor));
SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor,
gfx->alpha_blend_zero ?
BLENDFACTOR_CONST_ALPHA :
GET(blend.rts[rt_0].DestinationAlphaBlendFactor));
SET(PS_BLEND, ps_blend.SourceBlendFactor,
GET(blend.rts[rt_0].SourceBlendFactor));
SET(PS_BLEND, ps_blend.DestinationBlendFactor,
gfx->color_blend_zero ?
BLENDFACTOR_CONST_COLOR :
GET(blend.rts[rt_0].DestinationBlendFactor));
SET(PS_BLEND, ps_blend.AlphaTestEnable, false);
SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable,
GET(blend.IndependentAlphaBlendEnable));
SET(PS_BLEND, ps_blend.AlphaToCoverageEnable,
dyn->ms.alpha_to_coverage_enable);
}
ALWAYS_INLINE static void
update_blend_constants(struct anv_gfx_dynamic_state *hw_state,
const struct vk_dynamic_graphics_state *dyn,
const struct anv_cmd_graphics_state *gfx)
{
SET(CC_STATE, cc.BlendConstantColorRed,
gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[0]);
SET(CC_STATE, cc.BlendConstantColorGreen,
gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[1]);
SET(CC_STATE, cc.BlendConstantColorBlue,
gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[2]);
SET(CC_STATE, cc.BlendConstantColorAlpha,
gfx->alpha_blend_zero ? 0.0f : dyn->cb.blend_constants[3]);
}
/**
 * Recompute, for every viewport of dyn->vp, the SF_CLIP_VIEWPORT values
 * (viewport matrix, clip guardband, min/max viewport extents) and the depth
 * range of the CC viewport, and store them in hw_state.
 *
 * SF_CLIP fields are only written when they differ from the value already
 * held in hw_state, in which case ANV_GFX_STATE_VIEWPORT_SF_CLIP is flagged
 * in hw_state->pack_dirty (see SET_VP below). This function only updates
 * hw_state.
 *
 * \param hw_state  driver side copy of the HW state being updated
 * \param dyn       Vulkan runtime dynamic state (viewports, scissors, depth
 *                  clamp settings)
 * \param gfx       command buffer graphics state, read for the render area
 * \param device    device, read for instance options and enabled extensions
 */
ALWAYS_INLINE static void
update_viewports(struct anv_gfx_dynamic_state *hw_state,
const struct vk_dynamic_graphics_state *dyn,
const struct anv_cmd_graphics_state *gfx,
const struct anv_device *device)
{
const struct anv_instance *instance = device->physical->instance;
const VkViewport *viewports = dyn->vp.viewports;
/* With a [-1, 1] clip space depth range, the depth scale and offset of the
 * viewport transform are both halved (see m22/m32 below).
 */
const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f;
for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
const VkViewport *vp = &viewports[i];
/* The gfx7 state struct has just the matrix and guardband fields, the
 * gfx8 struct adds the min/max viewport fields. */
struct GENX(SF_CLIP_VIEWPORT) sfv = {
.ViewportMatrixElementm00 = vp->width / 2,
.ViewportMatrixElementm11 = vp->height / 2,
.ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
.ViewportMatrixElementm30 = vp->x + vp->width / 2,
.ViewportMatrixElementm31 = vp->y + vp->height / 2,
.ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
(vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
.XMinClipGuardband = -1.0f,
.XMaxClipGuardband = 1.0f,
.YMinClipGuardband = -1.0f,
.YMaxClipGuardband = 1.0f,
.XMinViewPort = vp->x,
.XMaxViewPort = vp->x + vp->width - 1,
.YMinViewPort = MIN2(vp->y, vp->y + vp->height),
.YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
};
/* Fix depth test misrenderings by lowering translated depth range */
if (instance->lower_depth_range_rate != 1.0f)
sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
/* Start from the largest region considered (16384 x 16384) and shrink it
 * with the render area and the scissor.
 */
const uint32_t fb_size_max = 1 << 14;
uint32_t x_min = 0, x_max = fb_size_max;
uint32_t y_min = 0, y_max = fb_size_max;
/* If we have a valid renderArea, include that */
if (gfx->render_area.extent.width > 0 &&
gfx->render_area.extent.height > 0) {
x_min = MAX2(x_min, gfx->render_area.offset.x);
x_max = MIN2(x_max, gfx->render_area.offset.x +
gfx->render_area.extent.width);
y_min = MAX2(y_min, gfx->render_area.offset.y);
y_max = MIN2(y_max, gfx->render_area.offset.y +
gfx->render_area.extent.height);
}
/* The client is required to have enough scissors for whatever it
 * sets as ViewportIndex but it's possible that they've got more
 * viewports set from a previous command. Also, from the Vulkan
 * 1.3.207:
 *
 * "The application must ensure (using scissor if necessary) that
 * all rendering is contained within the render area."
 *
 * If the client doesn't set a scissor, that basically means it
 * guarantees everything is in-bounds already. If we end up using a
 * guardband of [-1, 1] in that case, there shouldn't be much loss.
 * It's theoretically possible that they could do all their clipping
 * with clip planes but that'd be a bit odd.
 */
if (i < dyn->vp.scissor_count) {
const VkRect2D *scissor = &dyn->vp.scissors[i];
x_min = MAX2(x_min, scissor->offset.x);
x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
y_min = MAX2(y_min, scissor->offset.y);
y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
}
/* Only bother calculating the guardband if our known render area is
 * less than the maximum size. Otherwise, it will calculate [-1, 1]
 * anyway but possibly with precision loss.
 */
if (x_min > 0 || x_max < fb_size_max ||
y_min > 0 || y_max < fb_size_max) {
intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
sfv.ViewportMatrixElementm00,
sfv.ViewportMatrixElementm11,
sfv.ViewportMatrixElementm30,
sfv.ViewportMatrixElementm31,
&sfv.XMinClipGuardband,
&sfv.XMaxClipGuardband,
&sfv.YMinClipGuardband,
&sfv.YMaxClipGuardband);
}
/* Copy one field of sfv into hw_state, flagging the given state as dirty
 * only if the stored value actually changes.
 */
#define SET_VP(bit, state, field) \
do { \
if (hw_state->state.field != sfv.field) { \
hw_state->state.field = sfv.field; \
BITSET_SET(hw_state->pack_dirty, \
ANV_GFX_STATE_##bit); \
} \
} while (0)
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort);
SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort);
#undef SET_VP
/* Depth range programmed in the CC viewport:
 *  - depth clamp disabled: [0, 1], or [-FLT_MAX, FLT_MAX] when
 *    VK_EXT_depth_range_unrestricted is enabled on the device;
 *  - depth clamp enabled: the viewport depth range (ordered so that
 *    min <= max), or the user provided clamp range when the clamp mode is
 *    VK_DEPTH_CLAMP_MODE_USER_DEFINED_RANGE_EXT.
 */
const bool depth_range_unrestricted =
device->vk.enabled_extensions.EXT_depth_range_unrestricted;
float min_depth_limit = depth_range_unrestricted ? -FLT_MAX : 0.0f;
float max_depth_limit = depth_range_unrestricted ? FLT_MAX : 1.0f;
float min_depth = dyn->rs.depth_clamp_enable ?
MIN2(vp->minDepth, vp->maxDepth) : min_depth_limit;
float max_depth = dyn->rs.depth_clamp_enable ?
MAX2(vp->minDepth, vp->maxDepth) : max_depth_limit;
if (dyn->rs.depth_clamp_enable &&
dyn->vp.depth_clamp_mode == VK_DEPTH_CLAMP_MODE_USER_DEFINED_RANGE_EXT) {
min_depth = dyn->vp.depth_clamp_range.minDepthClamp;
max_depth = dyn->vp.depth_clamp_range.maxDepthClamp;
}
SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth);
SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth);
}
/* If the HW state is already considered dirty or the previous
 * programmed viewport count is smaller than what we need, update the
 * viewport count and ensure the HW state is dirty. Otherwise if the
 * number of viewport programmed previously was larger than what we need
 * now, no need to reemit we can just keep the old programmed values.
 */
if (BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
}
/* Same count handling for the CC viewports. */
if (BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
hw_state->vp_cc.count < dyn->vp.viewport_count) {
hw_state->vp_cc.count = dyn->vp.viewport_count;
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_VIEWPORT_CC);
}
}
/**
 * Recompute the scissor rectangle of each of the dyn->vp.scissor_count
 * scissors and store it in hw_state.
 *
 * Each rectangle is the intersection of the API scissor with the viewport of
 * the same index; for primary command buffers it is additionally limited to
 * the render area. The max coordinates are inclusive. Scissors with a zero
 * width or height are replaced by a canonical empty rectangle.
 *
 * \param hw_state          driver side copy of the HW state being updated
 * \param dyn               Vulkan runtime dynamic state (scissors, viewports)
 * \param gfx               command buffer graphics state, read for the
 *                          render area
 * \param cmd_buffer_level  level of the command buffer being recorded; the
 *                          render area is only applied for primary ones
 */
ALWAYS_INLINE static void
update_scissors(struct anv_gfx_dynamic_state *hw_state,
const struct vk_dynamic_graphics_state *dyn,
const struct anv_cmd_graphics_state *gfx,
VkCommandBufferLevel cmd_buffer_level)
{
const VkRect2D *scissors = dyn->vp.scissors;
const VkViewport *viewports = dyn->vp.viewports;
for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
const VkRect2D *s = &scissors[i];
const VkViewport *vp = &viewports[i];
/* Upper bound used when clamping the min coordinates below. */
const int max = 0xffff;
/* Intersect the scissor with the viewport. The viewport height may be
 * negative, hence the MIN2/MAX2 on the Y bounds.
 */
uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
uint32_t x_min = MAX2(s->offset.x, vp->x);
int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
MAX2(vp->y, vp->y + vp->height) - 1);
int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
vp->x + vp->width - 1);
/* Keep the max coordinates within [0, INT16_MAX >> 1] (i.e. 16383). */
y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
/* Do this math using int64_t so overflow gets clamped correctly.
 *
 * NOTE(review): the casts below are to uint64_t rather than int64_t as
 * stated above; confirm which one is intended.
 */
if (cmd_buffer_level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
y_max = CLAMP((uint64_t) y_max, 0,
gfx->render_area.offset.y +
gfx->render_area.extent.height - 1);
x_max = CLAMP((uint64_t) x_max, 0,
gfx->render_area.offset.x +
gfx->render_area.extent.width - 1);
}
if (s->extent.width <= 0 || s->extent.height <= 0) {
/* Since xmax and ymax are inclusive, we have to have xmax < xmin or
 * ymax < ymin for empty clips. In case clip x, y, width height are
 * all 0, the clamps above produce 0 for xmin, ymin, xmax, ymax,
 * which isn't what we want. Just special case empty clips and
 * produce a canonical empty clip.
 */
SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
} else {
SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
}
}
/* If the HW state is already considered dirty or the previously programmed
 * scissor count is smaller than what we need, update the scissor count
 * and ensure the HW state is dirty. Otherwise, if the number of scissors
 * programmed previously was larger than what we need now, there is no need
 * to reemit, we can just keep the old programmed values.
 */
if (BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_SCISSOR) ||
hw_state->scissor.count < dyn->vp.scissor_count) {
hw_state->scissor.count = dyn->vp.scissor_count;
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_SCISSOR);
}
}
#if GFX_VERx10 == 125
/**
 * Update the TBIMR tile pass information tracked in hw_state.
 *
 * TBIMR is used only when it is enabled on the instance and both the render
 * area and the tile dimensions could be computed; otherwise
 * hw_state->use_tbimr is cleared and nothing else is touched.
 *
 * \param hw_state   driver side copy of the HW state being updated
 * \param device     device, read for the instance option and slice count
 * \param gfx        command buffer graphics state handed to the helpers
 * \param l3_config  L3 configuration handed to calculate_tile_dimensions()
 */
ALWAYS_INLINE static void
update_tbimr_info(struct anv_gfx_dynamic_state *hw_state,
                  const struct anv_device *device,
                  const struct anv_cmd_graphics_state *gfx,
                  const struct intel_l3_config *l3_config)
{
   unsigned fb_width, fb_height, tile_width, tile_height;

   /* The helpers are evaluated left to right and only as long as the
    * previous condition held, so the framebuffer size is always computed
    * before it is used for the tile dimensions.
    */
   const bool tbimr_usable =
      device->physical->instance->enable_tbimr &&
      calculate_render_area(gfx, &fb_width, &fb_height) &&
      calculate_tile_dimensions(device, gfx, l3_config,
                                fb_width, fb_height,
                                &tile_width, &tile_height);

   if (!tbimr_usable) {
      hw_state->use_tbimr = false;
      return;
   }

   /* Batch size following the recommendation of BSpec 68436 "TBIMR
    * Programming": 256 polygons for every pair of slices (rounded up),
    * i.e. 128 polygons per slice.
    */
   const unsigned slice_count = device->info->num_slices;
   const unsigned batch_size = DIV_ROUND_UP(slice_count, 2) * 256;

   const unsigned tile_rows = DIV_ROUND_UP(fb_height, tile_height);
   const unsigned tile_columns = DIV_ROUND_UP(fb_width, tile_width);

   SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleHeight, tile_height);
   SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleWidth, tile_width);
   SET(TBIMR_TILE_PASS_INFO, tbimr.VerticalTileCount, tile_rows);
   SET(TBIMR_TILE_PASS_INFO, tbimr.HorizontalTileCount, tile_columns);
   /* The batch size is programmed as log2(batch_size) - 5. */
   SET(TBIMR_TILE_PASS_INFO, tbimr.TBIMRBatchSize,
       util_logbase2(batch_size) - 5);
   SET(TBIMR_TILE_PASS_INFO, tbimr.TileBoxCheck, true);
   SET(TBIMR_TILE_PASS_INFO, use_tbimr, true);
}
#endif
#if GFX_VERx10 == 90
/**
 * Update the vertex cache disable bit of the VS state tracked in hw_state.
 *
 * Only devices whose GT level is 4 or above are affected; on those the
 * vertex cache is disabled whenever a tessellation evaluation shader is
 * bound. On smaller GT levels hw_state is left untouched.
 */
ALWAYS_INLINE static void
update_vs(struct anv_gfx_dynamic_state *hw_state,
          const struct anv_cmd_graphics_state *gfx,
          const struct anv_device *device)
{
   /* On Sky Lake GT4, we have experienced some hangs related to the VS cache
    * and tessellation. It is unknown exactly what is happening but the
    * Haswell docs for the "VS Reference Count Full Force Miss Enable" field
    * of the "Thread Mode" register refer to a HSW bug in which the VUE handle
    * reference count would overflow resulting in internal reference counting
    * bugs. My (Faith's) best guess is that this bug cropped back up on SKL
    * GT4 when we suddenly had more threads in play than any previous gfx9
    * hardware.
    *
    * What we do know for sure is that setting this bit when tessellation
    * shaders are in use fixes a GPU hang in Batman: Arkham City when playing
    * with DXVK (https://bugs.freedesktop.org/107280). Disabling the vertex
    * cache with tessellation shaders should only have a minor performance
    * impact as the tessellation shaders are likely generating and processing
    * far more geometry than the vertex stage.
    */
   const bool needs_vs_cache_wa = device->info->gt >= 4;

   if (needs_vs_cache_wa) {
      SET(VS, vs.VertexCacheDisable,
          anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL));
   }
}
#endif
#if INTEL_WA_18019110168_GFX_VER
/**
 * Return the index, within a primitive produced by the mesh shader, of the
 * provoking vertex.
 *
 * With the first-vertex convention this is always 0. With
 * VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT it is the index of the last
 * vertex of the primitive: 0 for points, 1 for lines, 2 for triangles and 3
 * for quads.
 */
static inline unsigned
compute_mesh_provoking_vertex(const struct brw_mesh_prog_data *mesh_prog_data,
                              const struct vk_dynamic_graphics_state *dyn)
{
   const bool provoking_is_last =
      dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
   unsigned last_vertex_index;

   switch (mesh_prog_data->primitive_type) {
   case MESA_PRIM_POINTS:
      last_vertex_index = 0;
      break;

   case MESA_PRIM_LINES:
   case MESA_PRIM_LINE_LOOP:
   case MESA_PRIM_LINE_STRIP:
   case MESA_PRIM_LINES_ADJACENCY:
   case MESA_PRIM_LINE_STRIP_ADJACENCY:
      last_vertex_index = 1;
      break;

   case MESA_PRIM_TRIANGLES:
   case MESA_PRIM_TRIANGLE_STRIP:
   case MESA_PRIM_TRIANGLE_FAN:
   case MESA_PRIM_TRIANGLES_ADJACENCY:
   case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
      last_vertex_index = 2;
      break;

   case MESA_PRIM_QUADS:
   case MESA_PRIM_QUAD_STRIP:
      last_vertex_index = 3;
      break;

   default:
      UNREACHABLE("invalid mesh primitive type");
   }

   return provoking_is_last ? last_vertex_index : 0;
}
#endif
/**
 * This function takes the vulkan runtime values & dirty states and updates
 * the values in anv_gfx_dynamic_state, flagging HW instructions for
 * reemission if the values are changing.
 *
 * Nothing is emitted in the batch buffer.
 *
 * Each update_*() helper below is only invoked when one of the inputs it
 * depends on is flagged dirty, either in gfx->dirty (driver state:
 * shaders, render targets, render area, ...) or in dyn->dirty (Vulkan
 * runtime dynamic state).
 *
 * \param hw_state          driver side copy of the HW state being updated
 * \param device            device, read for hardware info, workarounds and
 *                          enabled extensions
 * \param dyn               Vulkan runtime dynamic graphics state
 * \param gfx               command buffer graphics state; not const because
 *                          it is handed to helpers taking a non-const
 *                          pointer
 * \param cmd_buffer_level  level of the command buffer being recorded,
 *                          forwarded to update_scissors()
 */
static void
cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
const struct anv_device *device,
const struct vk_dynamic_graphics_state *dyn,
struct anv_cmd_graphics_state *gfx,
VkCommandBufferLevel cmd_buffer_level)
{
UNUSED bool fs_msaa_changed = false;
/* A shader must be bound for the streamout stage and the instance
 * multiplier must be non-zero at this point.
 */
assert(gfx->shaders[gfx->streamout_stage] != NULL);
assert(gfx->instance_multiplier != 0);
/* Do this before update_fs_msaa_flags() for primitive_id_index */
if (gfx->dirty & ANV_CMD_DIRTY_ALL_SHADERS(device))
update_sbe(hw_state, gfx, device);
if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))
update_fs_msaa_flags(hw_state, dyn, gfx);
if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
update_urb_config(hw_state, gfx, device);
#if GFX_VERx10 == 90
if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
update_vs(hw_state, gfx, device);
#endif
/* The PS state depends on the fragment shader and on the MSAA flags that
 * may have been flagged dirty just above.
 */
if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_FS_MSAA_FLAGS)) {
update_ps(hw_state, device, dyn, gfx);
update_ps_extra_wm(hw_state, gfx);
}
/* Before Gfx12.5 this is also refreshed when the occlusion query state
 * changes.
 */
if (gfx->dirty &
#if GFX_VERx10 >= 125
ANV_CMD_DIRTY_PS
#else
(ANV_CMD_DIRTY_PS | ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)
#endif
)
update_ps_extra_has_uav(hw_state, gfx);
if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE))
update_ps_extra_kills_pixel(hw_state, dyn, gfx);
if ((gfx->dirty & ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
update_streamout(hw_state, dyn, gfx);
if (
#if GFX_VERx10 >= 200
/* Xe2+ might need to update this if the FS changed */
(gfx->dirty & ANV_CMD_DIRTY_PS) ||
#endif
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX))
update_provoking_vertex(hw_state, dyn, gfx);
if ((gfx->dirty & ANV_CMD_DIRTY_DS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY))
update_topology(hw_state, dyn, gfx);
/* No value is computed here for the vertex input, the state is only
 * flagged for repacking.
 */
if ((gfx->dirty & ANV_CMD_DIRTY_VS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_VERTEX_INPUT);
#if GFX_VER >= 11
if (device->vk.enabled_extensions.KHR_fragment_shading_rate &&
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))
update_cps(hw_state, device, dyn);
#endif /* GFX_VER >= 11 */
if (gfx->dirty & (ANV_CMD_DIRTY_HS | ANV_CMD_DIRTY_DS))
update_ds(hw_state, gfx);
if (
#if GFX_VERx10 >= 125
(gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS) ||
#else
(gfx->dirty & (ANV_CMD_DIRTY_HS | ANV_CMD_DIRTY_DS)) ||
#endif
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN))
update_te(hw_state, device, dyn, gfx);
#if GFX_VER >= 12
if ((gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS) ||
(gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS))
update_primitive_replication(hw_state, gfx);
#endif
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
update_line_width(hw_state, dyn);
if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
update_sf_point_width_source(hw_state, gfx);
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS))
update_sf_global_depth_bias(hw_state, dyn);
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE))
update_clip_api_mode(hw_state, dyn);
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
update_clip_max_viewport(hw_state, dyn);
if ((gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS) ||
(gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE))
update_clip_raster(hw_state, dyn, gfx);
if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
update_clip_preraster_stages(hw_state, gfx);
if (gfx->dirty & ANV_CMD_DIRTY_PS)
update_clip_non_perspective_barycentrics(hw_state, gfx);
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES))
update_multisample(hw_state, dyn);
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK))
update_sample_mask(hw_state, dyn);
if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
#if GFX_VER == 9
/* For the PMA fix */
(gfx->dirty & ANV_CMD_DIRTY_PS) ||
#endif
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE))
update_wm_depth_stencil(hw_state, dyn, gfx, device);
#if GFX_VER >= 12
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS))
update_depth_bounds(hw_state, dyn);
#endif
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
update_line_stipple(hw_state, dyn);
if ((gfx->dirty & ANV_CMD_DIRTY_INDEX_TYPE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
update_vf_restart(hw_state, dyn, gfx);
/* Like the vertex input, the index buffer is only flagged for repacking. */
if ((gfx->dirty & ANV_CMD_DIRTY_INDEX_BUFFER) ||
(gfx->dirty & ANV_CMD_DIRTY_INDEX_TYPE))
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_INDEX_BUFFER);
#if GFX_VERx10 >= 125
if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
update_vfg_distribution_mode(hw_state, device, gfx);
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
update_vfg_list_cut_index(hw_state, dyn);
#endif
if (device->vk.enabled_extensions.EXT_sample_locations &&
(BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)))
BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_SAMPLE_PATTERN);
/* The blend state is told whether a fragment shader is bound and, if so,
 * whether that shader does dual source blending.
 */
if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
(gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
update_blend_state(hw_state, dyn, gfx, device,
wm_prog_data != NULL,
wm_prog_data != NULL ?
wm_prog_data->dual_src_blend : false);
}
/* NOTE(review): update_blend_constants() reads gfx->color_blend_zero and
 * gfx->alpha_blend_zero, which update_blend_state() above may have just
 * changed, yet it only runs when the blend constants themselves are dirty.
 * Confirm the CC_STATE is refreshed some other way when only those flags
 * change.
 */
if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
update_blend_constants(hw_state, dyn, gfx);
/* Viewports and scissors depend on each other (guardband computation,
 * scissor/viewport intersection) as well as on the render area, hence the
 * overlapping dirty conditions.
 */
if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLAMP_RANGE))
update_viewports(hw_state, dyn, gfx, device);
if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS))
update_scissors(hw_state, dyn, gfx, cmd_buffer_level);
#if GFX_VERx10 == 125
if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS))
update_tbimr_info(hw_state, device, gfx, device->l3_config);
#endif
#if INTEL_WA_14018283232_GFX_VER
/* Wa_14018283232: the toggle is set when the depth bounds test is enabled
 * and the bound fragment shader uses kill.
 */
if (intel_needs_workaround(device->info, 14018283232) &&
((gfx->dirty & ANV_CMD_DIRTY_PS) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE))) {
const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
SET(WA_14018283232, wa_14018283232_toggle,
dyn->ds.depth.bounds_test.enable &&
wm_prog_data &&
wm_prog_data->uses_kill);
}
#endif
#if INTEL_WA_14024997852_GFX_VER
/* Wa_14024997852: When Draw Cut Index or primitive id is enabled
 * and topology is tri list, we need to disable autostrip.
 *
 * Note that we do not take primitive id in to account because it
 * is mentioned only in xe2 clone of this wa and autostrip has been
 * disabled globally on xe2 (+xe3 a0) by kernel due to 14021490052
 * workaround.
 */
SET(WA_14024997852, autostrip_disabled,
hw_state->vft.PrimitiveTopologyType == _3DPRIM_TRILIST &&
dyn->ia.primitive_restart_enable);
#endif
/* If the pipeline uses a dynamic value of patch_control_points or the
 * tessellation domain is dynamic and either the pipeline change or the
 * dynamic value change, check the value and reemit if needed.
 */
const struct brw_tcs_prog_data *tcs_prog_data = get_gfx_tcs_prog_data(gfx);
const struct brw_tes_prog_data *tes_prog_data = get_gfx_tes_prog_data(gfx);
/* A TCS compiled with input_vertices == 0 is treated as having a dynamic
 * number of patch control points; a TES whose VUE map layout is not FIXED
 * is treated as dynamic too.
 */
const bool tcs_dynamic =
tcs_prog_data && tcs_prog_data->input_vertices == 0;
const bool tes_dynamic =
tes_prog_data && tes_prog_data->base.vue_map.layout != INTEL_VUE_LAYOUT_FIXED;
if ((tcs_dynamic || tes_dynamic) &&
((gfx->dirty & (ANV_CMD_DIRTY_HS | ANV_CMD_DIRTY_DS)) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS))) {
/* Both tessellation stages are expected to be bound together. */
assert(tcs_prog_data != NULL && tes_prog_data != NULL);
struct brw_tess_info tess_info =
brw_merge_tess_info(tcs_prog_data->tess_info,
tes_prog_data->tess_info);
SET(TESS_CONFIG, tess_config,
intel_tess_config(dyn->ts.patch_control_points,
tcs_prog_data->output_vertices,
brw_tess_info_domain(tess_info),
tcs_prog_data->base.vue_map.num_per_patch_slots,
tcs_prog_data->base.vue_map.num_per_vertex_slots,
tcs_prog_data->base.vue_map.builtins_slot_offset));
}
#if INTEL_WA_18019110168_GFX_VER
/* Wa_18019110168: only relevant when a mesh shader writing clip distances
 * is bound, and only recomputed when the mesh shader or the provoking
 * vertex mode changed.
 */
const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);
const bool mesh_provoking_vertex_update =
intel_needs_workaround(device->info, 18019110168) &&
mesh_prog_data &&
(mesh_prog_data->map.vue_map.slots_valid & (VARYING_BIT_CLIP_DIST0 |
VARYING_BIT_CLIP_DIST1)) &&
((gfx->dirty & ANV_CMD_DIRTY_MESH) ||
BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX));
if (mesh_provoking_vertex_update) {
SET(MESH_PROVOKING_VERTEX, mesh_provoking_vertex,
compute_mesh_provoking_vertex(
mesh_prog_data, dyn));
}
#endif
}
#undef GET
#undef SET
#undef SET_STAGE
#undef SETUP_PROVOKING_VERTEX
#if INTEL_WA_14024997852_GFX_VER
/**
 * Record in the command buffer's batch the register writes switching
 * autostrip on or off, preceded by a CS stall.
 *
 * \param cmd_buffer  command buffer whose batch receives the commands
 * \param enable      true to leave autostrip enabled, false to disable it
 *                    (all the "disable" fields are programmed to !enable)
 */
void
genX(setup_autostrip_state)(struct anv_cmd_buffer *cmd_buffer, bool enable)
{
   /* Every disable field written below is the inverse of the request. */
   const bool autostrip_off = !enable;

   /* Stall the command streamer ahead of the register writes. */
   genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                cmd_buffer->device->info,
                                cmd_buffer->state.current_pipeline,
                                ANV_PIPE_CS_STALL_BIT);

   /* Vertex fetch: the mask fields are always set alongside the values. */
   anv_batch_write_reg(&cmd_buffer->batch, GENX(VFL_SCRATCH_PAD), vfl) {
      vfl.AutostripDisableMask = true;
      vfl.PartialAutostripDisableMask = true;
      vfl.AutostripDisable = autostrip_off;
      vfl.PartialAutostripDisable = autostrip_off;
   }

   /* Tessellation engine and mesh shader. */
   anv_batch_write_reg(&cmd_buffer->batch, GENX(FF_MODE), ff) {
      ff.MeshShaderPartialAutostripDisable = autostrip_off;
      ff.MeshShaderAutostripDisable = autostrip_off;
      ff.TEAutostripDisable = autostrip_off;
   }
}
#endif /* INTEL_WA_14024997852_GFX_VER */
static void
cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
struct anv_cmd_buffer *cmd_buffer,
const struct anv_cmd_graphics_state *gfx)
{
struct anv_device *device = cmd_buffer->device;
struct anv_instance *instance = device->physical->instance;
#define INIT(category, name) \
.name = hw_state->category.name
#define SET(s, category, name) \
s.name = hw_state->category.name
#define SET_ARRAY(s, category, name) \
do { \
assert(sizeof(s.name) == \
sizeof(hw_state->category.name)); \
memcpy(&s.name, \
&hw_state->category.name, \
sizeof(s.name)); \
} while (0)
#define IS_DIRTY(name) BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##name)
#define anv_gfx_copy(field, cmd, stage, source) ({ \
if (gfx->shaders[stage] != NULL) { \
assert(sizeof(hw_state->packed.field) >= \
4 * __anv_cmd_length(cmd)); \
assert((gfx->shaders[stage]->source).len == \
__anv_cmd_length(cmd)); \
memcpy(&hw_state->packed.field, \
&gfx->shaders[stage]->cmd_data[ \
(gfx->shaders[stage]->source).offset], \
4 * __anv_cmd_length(cmd)); \
} else { \
anv_gfx_pack(field, cmd, __unused_name); \
} \
})
#define anv_gfx_copy_variable(field, stage, source) ({ \
if (gfx->shaders[stage] != NULL) { \
assert(sizeof(hw_state->packed.field) >= \
4 * gfx->shaders[stage]->source.len); \
memcpy(&hw_state->packed.field, \
&gfx->shaders[stage]->cmd_data[ \
(gfx->shaders[stage]->source).offset], \
4 * gfx->shaders[stage]->source.len); \
hw_state->packed.field##_len = \
gfx->shaders[stage]->source.len; \
} \
})
#define anv_gfx_copy_protected(field, cmd, stage, source) ({ \
const bool __protected = (cmd_buffer->vk.pool->flags & \
VK_COMMAND_POOL_CREATE_PROTECTED_BIT); \
assert(sizeof(hw_state->packed.field) >= \
4 * __anv_cmd_length(cmd)); \
if (gfx->shaders[stage] != NULL) { \
assert((gfx->shaders[stage]->source).len == \
__anv_cmd_length(cmd)); \
memcpy(&hw_state->packed.field, \
&gfx->shaders[stage]->cmd_data[ \
__protected ? \
gfx->shaders[stage]->source##_protected.offset : \
gfx->shaders[stage]->source.offset], \
4 * __anv_cmd_length(cmd)); \
} else { \
memcpy(&hw_state->packed.field, \
device->physical->gfx_default.field, \
4 * __anv_cmd_length(cmd)); \
} \
})
#define anv_gfx_pack_merge(field, cmd, stage, source, name) \
for (struct cmd name = (struct cmd) { 0 }, \
*_dst = (struct cmd *)hw_state->packed.field; \
__builtin_expect(_dst != NULL, 1); \
({ \
uint32_t _partial[__anv_cmd_length(cmd)]; \
assert(sizeof(hw_state->packed.field) >= \
4 * __anv_cmd_length(cmd)); \
__anv_cmd_pack(cmd)(NULL, _partial, &name); \
if (gfx->shaders[stage] != NULL) { \
const struct anv_gfx_state_ptr *_cmd_state = \
&gfx->shaders[stage]->source; \
assert(_cmd_state->len == __anv_cmd_length(cmd)); \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
assert((_partial[i] & \
gfx->shaders[stage]->cmd_data[ \
_cmd_state->offset + i]) == 0); \
((uint32_t *)_dst)[i] = _partial[i] | \
gfx->shaders[stage]->cmd_data[_cmd_state->offset + i]; \
} \
} else { \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
assert((_partial[i] & \
device->physical->gfx_default.field[i]) == 0); \
((uint32_t *)_dst)[i] = _partial[i] | \
device->physical->gfx_default.field[i]; \
} \
} \
_dst = NULL; \
}))
#define anv_gfx_pack_merge_protected(field, cmd, stage, source, name) \
for (struct cmd name = (struct cmd) { 0 }, \
*_dst = (struct cmd *)hw_state->packed.field; \
__builtin_expect(_dst != NULL, 1); \
({ \
uint32_t _partial[__anv_cmd_length(cmd)]; \
assert(sizeof(hw_state->packed.field) >= \
4 * __anv_cmd_length(cmd)); \
__anv_cmd_pack(cmd)(NULL, _partial, &name); \
const struct anv_gfx_state_ptr *_cmd_state = \
gfx->shaders[stage] != NULL ? \
((cmd_buffer->vk.pool->flags & \
VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ? \
&gfx->shaders[stage]->source##_protected : \
&gfx->shaders[stage]->source) : \
NULL; \
assert(_cmd_state == NULL || \
_cmd_state->len == __anv_cmd_length(cmd)); \
const uint32_t *_inst_data = \
gfx->shaders[stage] != NULL ? \
&gfx->shaders[stage]->cmd_data[_cmd_state->offset] : \
device->physical->gfx_default.field; \
for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
assert((_partial[i] & _inst_data[i]) == 0); \
((uint32_t *)_dst)[i] = _partial[i] | _inst_data[i]; \
} \
_dst = NULL; \
}))
if (IS_DIRTY(VF)) {
anv_gfx_pack(vf, GENX(3DSTATE_VF), vf) {
#if GFX_VERx10 >= 125
vf.GeometryDistributionEnable = instance->enable_vf_distribution;
#endif
vf.ComponentPackingEnable = instance->vf_component_packing;
SET(vf, vf, IndexedDrawCutIndexEnable);
SET(vf, vf, CutIndex);
}
}
if (IS_DIRTY(VF_TOPOLOGY)) {
anv_gfx_pack(vft, GENX(3DSTATE_VF_TOPOLOGY), vft) {
SET(vft, vft, PrimitiveTopologyType);
}
}
if (IS_DIRTY(VF_STATISTICS)) {
anv_gfx_pack(vfs, GENX(3DSTATE_VF_STATISTICS), vfs) {
vfs.StatisticsEnable = true;
}
}
#if GFX_VERx10 >= 125
if (IS_DIRTY(VFG)) {
anv_gfx_pack(vfg, GENX(3DSTATE_VFG), vfg) {
/* 192 vertices for TRILIST_ADJ */
vfg.ListNBatchSizeScale = 0;
/* Batch size of 384 vertices */
vfg.List3BatchSizeScale = 2;
/* Batch size of 128 vertices */
vfg.List2BatchSizeScale = 1;
/* Batch size of 128 vertices */
vfg.List1BatchSizeScale = 2;
/* Batch size of 256 vertices for STRIP topologies */
vfg.StripBatchSizeScale = 3;
/* 192 control points for PATCHLIST_3 */
vfg.PatchBatchSizeScale = 1;
/* 192 control points for PATCHLIST_3 */
vfg.PatchBatchSizeMultiplier = 31;
SET(vfg, vfg, DistributionGranularity);
SET(vfg, vfg, DistributionMode);
SET(vfg, vfg, GranularityThresholdDisable);
SET(vfg, vfg, ListCutIndexEnable);
}
}
#endif
if (IS_DIRTY(VF_SGVS))
anv_gfx_copy(vf_sgvs, GENX(3DSTATE_VF_SGVS), MESA_SHADER_VERTEX, vs.vf_sgvs);
#if GFX_VER >= 11
if (IS_DIRTY(VF_SGVS_2))
anv_gfx_copy(vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), MESA_SHADER_VERTEX, vs.vf_sgvs_2);
#endif
if (IS_DIRTY(VF_SGVS_INSTANCING))
anv_gfx_copy_variable(vf_sgvs_instancing, MESA_SHADER_VERTEX, vs.vf_sgvs_instancing);
if (instance->vf_component_packing && IS_DIRTY(VF_COMPONENT_PACKING)) {
anv_gfx_copy(vf_component_packing, GENX(3DSTATE_VF_COMPONENT_PACKING),
MESA_SHADER_VERTEX, vs.vf_component_packing);
}
if (IS_DIRTY(INDEX_BUFFER)) {
anv_gfx_pack(ib, GENX(3DSTATE_INDEX_BUFFER), ib) {
ib.IndexFormat = vk_to_intel_index_type(gfx->index_type);
ib.MOCS = gfx->index_addr == 0 ?
anv_mocs(device, NULL, ISL_SURF_USAGE_INDEX_BUFFER_BIT) :
gfx->index_mocs;
#if GFX_VER >= 12
ib.L3BypassDisable = true;
#endif
ib.BufferStartingAddress = anv_address_from_u64(gfx->index_addr);
ib.BufferSize = gfx->index_size;
}
}
if (IS_DIRTY(STREAMOUT)) {
anv_gfx_pack_merge(so, GENX(3DSTATE_STREAMOUT),
gfx->streamout_stage, so, so) {
SET(so, so, RenderingDisable);
SET(so, so, RenderStreamSelect);
SET(so, so, ReorderMode);
SET(so, so, ForceRendering);
}
}
if (IS_DIRTY(SO_DECL_LIST))
anv_gfx_copy_variable(so_decl_list, gfx->streamout_stage, so_decl_list);
if (IS_DIRTY(CLIP)) {
anv_gfx_pack(clip, GENX(3DSTATE_CLIP), clip) {
clip.ClipEnable = true;
clip.StatisticsEnable = true;
clip.EarlyCullEnable = true;
clip.GuardbandClipTestEnable = true;
clip.VertexSubPixelPrecisionSelect = _8Bit;
clip.ClipMode = CLIPMODE_NORMAL;
clip.MinimumPointWidth = 0.125;
clip.MaximumPointWidth = 255.875;
SET(clip, clip, APIMode);
SET(clip, clip, ViewportXYClipTestEnable);
SET(clip, clip, TriangleStripListProvokingVertexSelect);
SET(clip, clip, LineStripListProvokingVertexSelect);
SET(clip, clip, TriangleFanProvokingVertexSelect);
#if GFX_VERx10 >= 200
SET(clip, clip, TriangleStripOddProvokingVertexSelect);
#endif
SET(clip, clip, MaximumVPIndex);
SET(clip, clip, ForceZeroRTAIndexEnable);
SET(clip, clip, NonPerspectiveBarycentricEnable);
}
}
if (IS_DIRTY(VIEWPORT_SF_CLIP)) {
struct anv_state sf_clip_state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
hw_state->vp_sf_clip.count * 64, 64);
for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) {
struct GENX(SF_CLIP_VIEWPORT) sfv = {
INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00),
INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11),
INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22),
INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30),
INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31),
INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32),
INIT(vp_sf_clip.elem[i], XMinClipGuardband),
INIT(vp_sf_clip.elem[i], XMaxClipGuardband),
INIT(vp_sf_clip.elem[i], YMinClipGuardband),
INIT(vp_sf_clip.elem[i], YMaxClipGuardband),
INIT(vp_sf_clip.elem[i], XMinViewPort),
INIT(vp_sf_clip.elem[i], XMaxViewPort),
INIT(vp_sf_clip.elem[i], YMinViewPort),
INIT(vp_sf_clip.elem[i], YMaxViewPort),
};
GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
}
anv_gfx_pack(sf_clip, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
clip.SFClipViewportPointer = sf_clip_state.offset;
}
}
if (IS_DIRTY(VIEWPORT_CC)) {
hw_state->vp_cc.state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
hw_state->vp_cc.count * 8, 32);
for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) {
struct GENX(CC_VIEWPORT) cc_viewport = {
INIT(vp_cc.elem[i], MinimumDepth),
INIT(vp_cc.elem[i], MaximumDepth),
};
GENX(CC_VIEWPORT_pack)(NULL, hw_state->vp_cc.state.map + i * 8,
&cc_viewport);
}
anv_gfx_pack(cc_viewport,
GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
cc.CCViewportPointer = hw_state->vp_cc.state.offset;
}
}
if (IS_DIRTY(SCISSOR)) {
/* Wa_1409725701:
*
* "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
* stored as an array of up to 16 elements. The location of first
* element of the array, as specified by Pointer to SCISSOR_RECT,
* should be aligned to a 64-byte boundary."
*/
struct anv_state scissor_state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
hw_state->scissor.count * 8, 64);
for (uint32_t i = 0; i < hw_state->scissor.count; i++) {
struct GENX(SCISSOR_RECT) scissor = {
INIT(scissor.elem[i], ScissorRectangleYMin),
INIT(scissor.elem[i], ScissorRectangleXMin),
INIT(scissor.elem[i], ScissorRectangleYMax),
INIT(scissor.elem[i], ScissorRectangleXMax),
};
GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
}
anv_gfx_pack(scissor, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
ssp.ScissorRectPointer = scissor_state.offset;
}
}
if (IS_DIRTY(CPS)) {
#if GFX_VER >= 30
anv_gfx_pack(cps, GENX(3DSTATE_COARSE_PIXEL), coarse_pixel) {
coarse_pixel.DisableCPSPointers = true;
SET(coarse_pixel, coarse_pixel, CPSizeX);
SET(coarse_pixel, coarse_pixel, CPSizeY);
SET(coarse_pixel, coarse_pixel, CPSizeCombiner0Opcode);
SET(coarse_pixel, coarse_pixel, CPSizeCombiner1Opcode);
}
#elif GFX_VER >= 12
anv_gfx_pack(cps, GENX(3DSTATE_CPS_POINTERS), cps) {
SET(cps, cps, CoarsePixelShadingStateArrayPointer);
}
#elif GFX_VER == 11
anv_gfx_pack(cps, GENX(3DSTATE_CPS), cps) {
SET(cps, cps, CoarsePixelShadingMode);
SET(cps, cps, MinCPSizeX);
SET(cps, cps, MinCPSizeY);
}
#endif
}
if (IS_DIRTY(SF)) {
anv_gfx_pack(sf, GENX(3DSTATE_SF), sf) {
/* Fixed values */
sf.ViewportTransformEnable = true;
sf.StatisticsEnable = true;
sf.VertexSubPixelPrecisionSelect = _8Bit;
sf.AALineDistanceMode = true;
sf.PointWidth = 1.0;
#if GFX_VER >= 12
SET(sf, sf, DerefBlockSize);
#endif
SET(sf, sf, PointWidthSource);
SET(sf, sf, LineWidth);
SET(sf, sf, TriangleStripListProvokingVertexSelect);
SET(sf, sf, LineStripListProvokingVertexSelect);
SET(sf, sf, TriangleFanProvokingVertexSelect);
#if GFX_VERx10 >= 200
SET(sf, sf, TriangleStripOddProvokingVertexSelect);
#endif
SET(sf, sf, LegacyGlobalDepthBiasEnable);
}
}
if (BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_RASTER)) {
anv_gfx_pack(raster, GENX(3DSTATE_RASTER), raster) {
/* For details on 3DSTATE_RASTER multisample state, see the BSpec
* table "Multisample Modes State".
*
* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the SKL PMA fix
* computations. If we ever set this bit to a different value, they
* will need to be updated accordingly.
*/
raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
raster.ForceMultisampling = false;
raster.ScissorRectangleEnable = true;
SET(raster, raster, APIMode);
SET(raster, raster, DXMultisampleRasterizationEnable);
SET(raster, raster, AntialiasingEnable);
SET(raster, raster, CullMode);
SET(raster, raster, FrontWinding);
SET(raster, raster, GlobalDepthOffsetEnableSolid);
SET(raster, raster, GlobalDepthOffsetEnableWireframe);
SET(raster, raster, GlobalDepthOffsetEnablePoint);
SET(raster, raster, GlobalDepthOffsetConstant);
SET(raster, raster, GlobalDepthOffsetScale);
SET(raster, raster, GlobalDepthOffsetClamp);
SET(raster, raster, FrontFaceFillMode);
SET(raster, raster, BackFaceFillMode);
SET(raster, raster, ViewportZFarClipTestEnable);
SET(raster, raster, ViewportZNearClipTestEnable);
SET(raster, raster, ConservativeRasterizationEnable);
#if GFX_VER >= 20
SET(raster, raster, LegacyBaryAssignmentDisable);
#endif
}
}
if (IS_DIRTY(LINE_STIPPLE)) {
anv_gfx_pack(ls, GENX(3DSTATE_LINE_STIPPLE), ls) {
SET(ls, ls, LineStipplePattern);
SET(ls, ls, LineStippleInverseRepeatCount);
SET(ls, ls, LineStippleRepeatCount);
}
}
if (IS_DIRTY(MULTISAMPLE)) {
anv_gfx_pack(ms, GENX(3DSTATE_MULTISAMPLE), ms) {
ms.PixelLocation = CENTER;
/* The PRM says that this bit is valid only for DX9:
*
* SW can choose to set this bit only for DX9 API. DX10/OGL API's
* should not have any effect by setting or not setting this bit.
*/
ms.PixelPositionOffsetEnable = false;
SET(ms, ms, NumberofMultisamples);
}
}
if (IS_DIRTY(SAMPLE_MASK)) {
anv_gfx_pack(sm, GENX(3DSTATE_SAMPLE_MASK), sm) {
SET(sm, sm, SampleMask);
}
}
if (IS_DIRTY(TE)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
anv_gfx_pack_merge(te, GENX(3DSTATE_TE),
MESA_SHADER_TESS_EVAL, ds.te, te) {
SET(te, te, TEDomain);
#if GFX_VER >= 12
SET(te, te, PatchHeaderLayout);
#endif
SET(te, te, Partitioning);
SET(te, te, OutputTopology);
#if GFX_VERx10 >= 125
SET(te, te, TessellationDistributionMode);
#endif
}
} else {
anv_gfx_pack(te, GENX(3DSTATE_TE), te);
}
}
if (IS_DIRTY(WM_DEPTH_STENCIL)) {
anv_gfx_pack(wm_ds, GENX(3DSTATE_WM_DEPTH_STENCIL), wm_ds) {
SET(wm_ds, wm_ds, DoubleSidedStencilEnable);
SET(wm_ds, wm_ds, StencilTestMask);
SET(wm_ds, wm_ds, StencilWriteMask);
SET(wm_ds, wm_ds, BackfaceStencilTestMask);
SET(wm_ds, wm_ds, BackfaceStencilWriteMask);
SET(wm_ds, wm_ds, StencilReferenceValue);
SET(wm_ds, wm_ds, BackfaceStencilReferenceValue);
SET(wm_ds, wm_ds, DepthTestEnable);
SET(wm_ds, wm_ds, DepthBufferWriteEnable);
SET(wm_ds, wm_ds, DepthTestFunction);
SET(wm_ds, wm_ds, StencilTestEnable);
SET(wm_ds, wm_ds, StencilBufferWriteEnable);
SET(wm_ds, wm_ds, StencilFailOp);
SET(wm_ds, wm_ds, StencilPassDepthPassOp);
SET(wm_ds, wm_ds, StencilPassDepthFailOp);
SET(wm_ds, wm_ds, StencilTestFunction);
SET(wm_ds, wm_ds, BackfaceStencilFailOp);
SET(wm_ds, wm_ds, BackfaceStencilPassDepthPassOp);
SET(wm_ds, wm_ds, BackfaceStencilPassDepthFailOp);
SET(wm_ds, wm_ds, BackfaceStencilTestFunction);
}
}
#if GFX_VER >= 12
if (IS_DIRTY(DEPTH_BOUNDS)) {
anv_gfx_pack(db, GENX(3DSTATE_DEPTH_BOUNDS), db) {
SET(db, db, DepthBoundsTestEnable);
SET(db, db, DepthBoundsTestMinValue);
SET(db, db, DepthBoundsTestMaxValue);
}
}
#endif
#if GFX_VER >= 12
if (IS_DIRTY(PRIMITIVE_REPLICATION)) {
anv_gfx_pack(pr, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
SET(pr, pr, ReplicaMask);
SET(pr, pr, ReplicationCount);
SET_ARRAY(pr, pr, RTAIOffset);
}
}
#endif
if (IS_DIRTY(SBE)) {
anv_gfx_pack(sbe, GENX(3DSTATE_SBE), sbe) {
for (unsigned i = 0; i < 32; i++)
sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
sbe.ForceVertexURBEntryReadOffset = true;
sbe.ForceVertexURBEntryReadLength = true;
SET(sbe, sbe, AttributeSwizzleEnable);
SET(sbe, sbe, PointSpriteTextureCoordinateEnable);
SET(sbe, sbe, PointSpriteTextureCoordinateOrigin);
SET(sbe, sbe, NumberofSFOutputAttributes);
SET(sbe, sbe, ConstantInterpolationEnable);
SET(sbe, sbe, VertexURBEntryReadOffset);
SET(sbe, sbe, VertexURBEntryReadLength);
#if GFX_VER >= 20
SET(sbe, sbe, VertexAttributesBypass);
#endif
SET(sbe, sbe, PrimitiveIDOverrideAttributeSelect);
SET(sbe, sbe, PrimitiveIDOverrideComponentX);
SET(sbe, sbe, PrimitiveIDOverrideComponentY);
SET(sbe, sbe, PrimitiveIDOverrideComponentZ);
SET(sbe, sbe, PrimitiveIDOverrideComponentW);
}
}
#if GFX_VERx10 >= 125
if (IS_DIRTY(SBE_MESH)) {
anv_gfx_pack(sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe_mesh) {
SET(sbe_mesh, sbe_mesh, PerVertexURBEntryOutputReadOffset);
SET(sbe_mesh, sbe_mesh, PerVertexURBEntryOutputReadLength);
SET(sbe_mesh, sbe_mesh, PerPrimitiveURBEntryOutputReadOffset);
SET(sbe_mesh, sbe_mesh, PerPrimitiveURBEntryOutputReadLength);
}
}
#endif
if (IS_DIRTY(SBE_SWIZ)) {
anv_gfx_pack(sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe_swiz) {
for (unsigned i = 0; i < 16; i++)
SET(sbe_swiz, sbe_swiz, Attribute[i].SourceAttribute);
}
}
if (IS_DIRTY(WM)) {
anv_gfx_pack_merge(wm, GENX(3DSTATE_WM),
MESA_SHADER_FRAGMENT, ps.wm, wm) {
SET(wm, wm, LineStippleEnable);
SET(wm, wm, BarycentricInterpolationMode);
}
}
if (IS_DIRTY(PS_BLEND)) {
anv_gfx_pack(ps_blend, GENX(3DSTATE_PS_BLEND), blend) {
SET(blend, ps_blend, HasWriteableRT);
SET(blend, ps_blend, ColorBufferBlendEnable);
SET(blend, ps_blend, SourceAlphaBlendFactor);
SET(blend, ps_blend, DestinationAlphaBlendFactor);
SET(blend, ps_blend, SourceBlendFactor);
SET(blend, ps_blend, DestinationBlendFactor);
SET(blend, ps_blend, AlphaTestEnable);
SET(blend, ps_blend, IndependentAlphaBlendEnable);
SET(blend, ps_blend, AlphaToCoverageEnable);
}
}
if (IS_DIRTY(CC_STATE)) {
hw_state->cc.state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
GENX(COLOR_CALC_STATE_length) * 4,
64);
struct GENX(COLOR_CALC_STATE) cc = {
INIT(cc, BlendConstantColorRed),
INIT(cc, BlendConstantColorGreen),
INIT(cc, BlendConstantColorBlue),
INIT(cc, BlendConstantColorAlpha),
};
GENX(COLOR_CALC_STATE_pack)(NULL, hw_state->cc.state.map, &cc);
anv_gfx_pack(cc_state, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
ccp.ColorCalcStatePointer = hw_state->cc.state.offset;
ccp.ColorCalcStatePointerValid = true;
}
}
if (IS_DIRTY(BLEND_STATE)) {
const uint32_t num_dwords = GENX(BLEND_STATE_length) +
GENX(BLEND_STATE_ENTRY_length) * MAX_RTS;
hw_state->blend.state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
num_dwords * 4,
64);
uint32_t *dws = hw_state->blend.state.map;
struct GENX(BLEND_STATE) blend_state = {
INIT(blend, AlphaToCoverageEnable),
INIT(blend, AlphaToOneEnable),
INIT(blend, IndependentAlphaBlendEnable),
INIT(blend, ColorDitherEnable),
};
GENX(BLEND_STATE_pack)(NULL, dws, &blend_state);
/* Jump to blend entries. */
dws += GENX(BLEND_STATE_length);
for (uint32_t i = 0; i < MAX_RTS; i++) {
struct GENX(BLEND_STATE_ENTRY) entry = {
INIT(blend.rts[i], WriteDisableAlpha),
INIT(blend.rts[i], WriteDisableRed),
INIT(blend.rts[i], WriteDisableGreen),
INIT(blend.rts[i], WriteDisableBlue),
INIT(blend.rts[i], LogicOpFunction),
INIT(blend.rts[i], LogicOpEnable),
INIT(blend.rts[i], ColorBufferBlendEnable),
INIT(blend.rts[i], ColorClampRange),
#if GFX_VER >= 30
INIT(blend.rts[i], SimpleFloatBlendEnable),
#endif
INIT(blend.rts[i], PreBlendColorClampEnable),
INIT(blend.rts[i], PostBlendColorClampEnable),
INIT(blend.rts[i], SourceBlendFactor),
INIT(blend.rts[i], DestinationBlendFactor),
INIT(blend.rts[i], ColorBlendFunction),
INIT(blend.rts[i], SourceAlphaBlendFactor),
INIT(blend.rts[i], DestinationAlphaBlendFactor),
INIT(blend.rts[i], AlphaBlendFunction),
};
GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
dws += GENX(BLEND_STATE_ENTRY_length);
}
anv_gfx_pack(blend_state, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
bsp.BlendStatePointer = hw_state->blend.state.offset;
bsp.BlendStatePointerValid = true;
}
}
#if GFX_VERx10 >= 125
if (device->vk.enabled_extensions.EXT_mesh_shader) {
if (IS_DIRTY(MESH_CONTROL)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
anv_gfx_copy_protected(mesh_control,
GENX(3DSTATE_MESH_CONTROL),
MESA_SHADER_MESH, ms.control);
} else {
anv_gfx_pack(mesh_control, GENX(3DSTATE_MESH_CONTROL), mc);
}
}
if (IS_DIRTY(TASK_CONTROL)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_TASK)) {
anv_gfx_copy_protected(task_control,
GENX(3DSTATE_TASK_CONTROL),
MESA_SHADER_TASK, ts.control);
} else {
anv_gfx_pack(task_control, GENX(3DSTATE_TASK_CONTROL), tc);
}
}
if (IS_DIRTY(MESH_SHADER)) {
anv_gfx_copy(mesh_shader, GENX(3DSTATE_MESH_SHADER),
MESA_SHADER_MESH, ms.shader);
}
if (IS_DIRTY(MESH_DISTRIB)) {
anv_gfx_copy(mesh_distrib, GENX(3DSTATE_MESH_DISTRIB),
MESA_SHADER_MESH, ms.distrib);
}
if (IS_DIRTY(CLIP_MESH)) {
anv_gfx_copy(clip_mesh, GENX(3DSTATE_CLIP_MESH),
MESA_SHADER_MESH, ms.clip);
}
if (IS_DIRTY(TASK_SHADER)) {
anv_gfx_copy(task_shader, GENX(3DSTATE_TASK_SHADER),
MESA_SHADER_TASK, ts.shader);
}
if (IS_DIRTY(TASK_REDISTRIB)) {
anv_gfx_copy(task_redistrib, GENX(3DSTATE_TASK_REDISTRIB),
MESA_SHADER_TASK, ts.redistrib);
}
}
#endif /* GFX_VERx10 >= 125 */
if (IS_DIRTY(VS)) {
#if GFX_VERx10 == 90
anv_gfx_pack_merge_protected(vs, GENX(3DSTATE_VS),
MESA_SHADER_VERTEX, vs.vs, vs) {
SET(vs, vs, VertexCacheDisable);
}
#else
anv_gfx_copy_protected(vs, GENX(3DSTATE_VS), MESA_SHADER_VERTEX, vs.vs);
#endif
}
if (IS_DIRTY(HS))
anv_gfx_copy_protected(hs, GENX(3DSTATE_HS), MESA_SHADER_TESS_CTRL, hs.hs);
if (IS_DIRTY(DS)) {
anv_gfx_pack_merge_protected(ds, GENX(3DSTATE_DS),
MESA_SHADER_TESS_EVAL, ds.ds, ds) {
SET(ds, ds, ComputeWCoordinateEnable);
}
}
if (IS_DIRTY(GS)) {
anv_gfx_pack_merge_protected(gs, GENX(3DSTATE_GS),
MESA_SHADER_GEOMETRY, gs.gs, gs) {
SET(gs, gs, ReorderMode);
}
}
if (IS_DIRTY(PS)) {
anv_gfx_pack_merge_protected(ps, GENX(3DSTATE_PS),
MESA_SHADER_FRAGMENT, ps.ps, ps) {
SET(ps, ps, KernelStartPointer0);
SET(ps, ps, KernelStartPointer1);
SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
#if GFX_VER < 20
SET(ps, ps, KernelStartPointer2);
SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
SET(ps, ps, _8PixelDispatchEnable);
SET(ps, ps, _16PixelDispatchEnable);
SET(ps, ps, _32PixelDispatchEnable);
#else
SET(ps, ps, Kernel0Enable);
SET(ps, ps, Kernel1Enable);
SET(ps, ps, Kernel0SIMDWidth);
SET(ps, ps, Kernel1SIMDWidth);
SET(ps, ps, Kernel0PolyPackingPolicy);
SET(ps, ps, Kernel0MaximumPolysperThread);
#endif
SET(ps, ps, PositionXYOffsetSelect);
}
}
if (IS_DIRTY(PS_EXTRA)) {
if (anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT)) {
anv_gfx_pack_merge(ps_extra, GENX(3DSTATE_PS_EXTRA),
MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
SET(pse, ps_extra, PixelShaderHasUAV);
SET(pse, ps_extra, PixelShaderIsPerSample);
#if GFX_VER >= 11
SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
#endif
SET(pse, ps_extra, PixelShaderKillsPixel);
SET(pse, ps_extra, InputCoverageMaskState);
#if GFX_VERx10 >= 125
SET(pse, ps_extra, EnablePSDependencyOnCPsizeChange);
#endif
}
#if INTEL_WA_18038825448_GFX_VER
/* Add a dependency if either the shader needs it (because of runtime
* change through pre-rasterization shader) or if we notice a change.
*/
anv_gfx_pack_merge(ps_extra_dep, GENX(3DSTATE_PS_EXTRA),
MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
SET(pse, ps_extra, PixelShaderHasUAV);
SET(pse, ps_extra, PixelShaderIsPerSample);
#if GFX_VER >= 11
SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
#endif
SET(pse, ps_extra, PixelShaderKillsPixel);
SET(pse, ps_extra, InputCoverageMaskState);
#if GFX_VERx10 >= 125 && INTEL_WA_18038825448_GFX_VER
pse.EnablePSDependencyOnCPsizeChange = true;
#endif
}
#endif /* INTEL_WA_18038825448_GFX_VER */
} else {
anv_gfx_pack(ps_extra, GENX(3DSTATE_PS_EXTRA), ps_extra);
anv_gfx_pack(ps_extra_dep, GENX(3DSTATE_PS_EXTRA), ps_extra);
}
}
#if GFX_VERx10 >= 125
if (hw_state->use_tbimr && IS_DIRTY(TBIMR_TILE_PASS_INFO)) {
anv_gfx_pack(tbimr, GENX(3DSTATE_TBIMR_TILE_PASS_INFO), tbimr) {
SET(tbimr, tbimr, TileRectangleHeight);
SET(tbimr, tbimr, TileRectangleWidth);
SET(tbimr, tbimr, VerticalTileCount);
SET(tbimr, tbimr, HorizontalTileCount);
SET(tbimr, tbimr, TBIMRBatchSize);
SET(tbimr, tbimr, TileBoxCheck);
}
}
#endif
#undef IS_DIRTY
#undef GET
#undef SET
BITSET_OR(hw_state->emit_dirty, hw_state->emit_dirty, hw_state->pack_dirty);
BITSET_ZERO(hw_state->pack_dirty);
}
/**
* This function takes the vulkan runtime values & dirty states and updates
* the values in anv_gfx_dynamic_state, flagging HW instructions for
* reemission if the values are changing.
*
* Nothing is emitted in the batch buffer.
*/
void
genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
{
cmd_buffer_flush_gfx_runtime_state(
&cmd_buffer->state.gfx.dyn_state,
cmd_buffer->device,
&cmd_buffer->vk.dynamic_graphics_state,
&cmd_buffer->state.gfx,
cmd_buffer->vk.level);
vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
cmd_buffer_repack_gfx_state(&cmd_buffer->state.gfx.dyn_state,
cmd_buffer,
&cmd_buffer->state.gfx);
}
/**
 * Emit the dummy draw sequence for Wa_18020335297 into the command buffer's
 * batch.
 *
 * A minimal 3D pipeline is programmed (zeroed VS/GS/HS/TE/DS/STREAMOUT
 * packets, clipper in REJECT_ALL mode, two constant vertex elements, TRILIST
 * topology), followed by one 3-vertex / 1-instance 3DPRIMITIVE per slice
 * reported by the device info.
 *
 * The packet order below is deliberate (see the Wa_16012775297 note) and must
 * not be rearranged.
 *
 * @param cmd_buffer  Command buffer receiving the packets.
 */
static void
emit_wa_18020335297_dummy_draw(struct anv_cmd_buffer *cmd_buffer)
{
   /* For Wa_16012775297, ensure VF_STATISTICS is emitted before 3DSTATE_VF
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), zero);
#if GFX_VERx10 >= 125
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) {
      vfg.DistributionMode = RR_STRICT;
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
      vf.GeometryDistributionEnable =
         cmd_buffer->device->physical->instance->enable_vf_distribution;
   }
#endif

#if GFX_VER >= 12
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
      pr.ReplicaMask = 1;
   }
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), rr) {
      rr.CullMode = CULLMODE_NONE;
      rr.FrontFaceFillMode = FILL_MODE_SOLID;
      rr.BackFaceFillMode = FILL_MODE_SOLID;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), zero);

#if GFX_VER >= 11
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS_2), zero);
#endif

   /* Clipping enabled but rejecting everything. */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLIP), clip) {
      clip.ClipEnable = true;
      clip.ClipMode = CLIPMODE_REJECT_ALL;
   }

   /* All geometry shader stages and streamout left at their zeroed state. */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), zero);

   /* One dword of packet header followed by two VERTEX_ELEMENT_STATE entries
    * (presumably 2 dwords each, matching the 1 + 2 * 2 total).
    */
   uint32_t *vertex_elements = anv_batch_emitn(&cmd_buffer->batch, 1 + 2 * 2,
                                               GENX(3DSTATE_VERTEX_ELEMENTS));

   /* anv_batch_emitn() yields NULL when the batch could not provide the
    * requested dwords. Packing through &vertex_elements[1] would then write
    * through an invalid pointer, so only fill the elements when we actually
    * got space.
    */
   if (vertex_elements != NULL) {
      uint32_t *ve_pack_dest = &vertex_elements[1];

      for (int i = 0; i < 2; i++) {
         /* Element 0 stores (0, 0, 0, 0); element 1 stores (0, 0, 1.0, 1.0). */
         struct GENX(VERTEX_ELEMENT_STATE) element = {
            .Valid = true,
            .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
            .Component0Control = VFCOMP_STORE_0,
            .Component1Control = VFCOMP_STORE_0,
            .Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
            .Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
         };
         GENX(VERTEX_ELEMENT_STATE_pack)(NULL, ve_pack_dest, &element);
         ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
      }
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
   }

   /* Emit dummy draw per slice. */
   for (unsigned i = 0; i < cmd_buffer->device->info->num_slices; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.VertexCountPerInstance = 3;
         prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
         prim.InstanceCount = 1;
         prim.VertexAccessType = SEQUENTIAL;
      }
   }
}
#if INTEL_WA_14018283232_GFX_VER
/**
 * Emit the RESOURCE_BARRIER packet used for Wa_14018283232.
 *
 * The barrier is of the immediate type, signalling at the color stage and
 * waiting at the pixel stage.
 *
 * @param batch  Batch receiving the packet.
 */
void
genX(batch_emit_wa_14018283232)(struct anv_batch *batch)
{
   /* Build the whole body up front; every field not listed stays zero. */
   const struct GENX(RESOURCE_BARRIER_BODY) body = {
      .BarrierType = RESOURCE_BARRIER_TYPE_IMMEDIATE,
      .SignalStage = RESOURCE_BARRIER_STAGE_COLOR,
      .WaitStage = RESOURCE_BARRIER_STAGE_PIXEL,
   };

   anv_batch_emit(batch, GENX(RESOURCE_BARRIER), barrier) {
      barrier.ResourceBarrierBody = body;
   }
}
#endif
void
genX(emit_urb_setup)(struct anv_batch *batch,
const struct anv_device *device,
const struct intel_urb_config *urb_cfg)
{
for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
#if GFX_VER >= 12
anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
urb._3DCommandSubOpcode += i;
if (urb_cfg->size[i] > 0)
urb.VSURBEntryAllocationSize = urb_cfg->size[i] - 1;
urb.VSURBStartingAddressSlice0 = urb_cfg->start[i];
urb.VSURBStartingAddressSliceN = urb_cfg->start[i];
urb.VSNumberofURBEntriesSlice0 = urb_cfg->entries[i];
urb.VSNumberofURBEntriesSliceN = urb_cfg->entries[i];
}
#else
anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
urb._3DCommandSubOpcode += i;
if (urb_cfg->size[i] > 0)
urb.VSURBEntryAllocationSize = urb_cfg->size[i] - 1;
urb.VSURBStartingAddress = urb_cfg->start[i];
urb.VSNumberofURBEntries = urb_cfg->entries[i];
}
#endif
}
#if GFX_VERx10 >= 125
if (device->vk.enabled_features.meshShader) {
anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
if (urb_cfg->size[MESA_SHADER_TASK] > 0)
urb.TASKURBEntryAllocationSize = urb_cfg->size[MESA_SHADER_TASK] - 1;
urb.TASKNumberofURBEntriesSlice0 = urb_cfg->entries[MESA_SHADER_TASK];
urb.TASKNumberofURBEntriesSliceN = urb_cfg->entries[MESA_SHADER_TASK];
urb.TASKURBStartingAddressSlice0 = urb_cfg->start[MESA_SHADER_TASK];
urb.TASKURBStartingAddressSliceN = urb_cfg->start[MESA_SHADER_TASK];
}
anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
if (urb_cfg->size[MESA_SHADER_MESH] > 0)
urb.MESHURBEntryAllocationSize = urb_cfg->size[MESA_SHADER_MESH] - 1;
urb.MESHNumberofURBEntriesSlice0 = urb_cfg->entries[MESA_SHADER_MESH];
urb.MESHNumberofURBEntriesSliceN = urb_cfg->entries[MESA_SHADER_MESH];
urb.MESHURBStartingAddressSlice0 = urb_cfg->start[MESA_SHADER_MESH];
urb.MESHURBStartingAddressSliceN = urb_cfg->start[MESA_SHADER_MESH];
}
}
#endif
}
/**
* This function handles dirty state emission to the batch buffer.
*/
static void
cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_batch *batch = &cmd_buffer->batch;
struct anv_device *device = cmd_buffer->device;
struct anv_instance *instance = device->physical->instance;
struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
const struct vk_dynamic_graphics_state *dyn =
&cmd_buffer->vk.dynamic_graphics_state;
struct anv_push_constants *push_consts =
&cmd_buffer->state.gfx.base.push_constants;
struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
#define DEBUG_SHADER_HASH(stage) do { \
if (unlikely( \
(instance->debug & ANV_DEBUG_SHADER_HASH) && \
anv_gfx_has_stage(gfx, stage))) { \
mi_store(&b, \
mi_mem32(device->workaround_address), \
mi_imm(gfx->shaders[stage]->prog_data->source_hash)); \
} \
} while (0)
struct mi_builder b;
if (unlikely(instance->debug & ANV_DEBUG_SHADER_HASH)) {
mi_builder_init(&b, device->info, &cmd_buffer->batch);
mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
}
#if INTEL_WA_16011107343_GFX_VER
/* Will be emitted in front of every draw instead */
if (intel_needs_workaround(device->info, 16011107343) &&
anv_cmd_buffer_has_gfx_stage(cmd_buffer, MESA_SHADER_TESS_CTRL))
BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_HS);
#endif
#if INTEL_WA_22018402687_GFX_VER
/* Will be emitted in front of every draw instead */
if (intel_needs_workaround(device->info, 22018402687) &&
anv_cmd_buffer_has_gfx_stage(cmd_buffer, MESA_SHADER_TESS_EVAL))
BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_DS);
#endif
#define IS_DIRTY(name) BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_##name)
/*
* Values provided by push constants
*/
if (IS_DIRTY(TESS_CONFIG)) {
push_consts->gfx.tess_config = hw_state->tess_config;
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
gfx->base.push_constants_data_dirty = true;
}
#if INTEL_WA_14024997852_GFX_VER
if (IS_DIRTY(WA_14024997852) &&
intel_needs_workaround(device->info, 14024997852)) {
genX(setup_autostrip_state)(cmd_buffer, !hw_state->autostrip_disabled);
}
#endif
#if INTEL_WA_18019110168_GFX_VER
if (IS_DIRTY(MESH_PROVOKING_VERTEX))
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_MESH_BIT_EXT;
#endif
if (IS_DIRTY(FS_MSAA_FLAGS)) {
push_consts->gfx.fs_msaa_flags = hw_state->fs_msaa_flags;
const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);
if (mesh_prog_data) {
push_consts->gfx.fs_per_prim_remap_offset =
gfx->shaders[MESA_SHADER_MESH]->kernel.offset +
mesh_prog_data->wa_18019110168_mapping_offset;
}
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
gfx->base.push_constants_data_dirty = true;
}
#define anv_batch_emit_gfx(batch, cmd, name) ({ \
void *__dst = anv_batch_emit_dwords( \
batch, __anv_cmd_length(cmd)); \
if (__dst != NULL) { \
memcpy(__dst, hw_state->packed.name, \
4 * __anv_cmd_length(cmd)); \
VG(VALGRIND_CHECK_MEM_IS_DEFINED( \
__dst, __anv_cmd_length(cmd) * 4)); \
} \
__dst; \
})
#define anv_batch_emit_gfx_variable(batch, name) do { \
void *__dst = anv_batch_emit_dwords( \
batch, hw_state->packed.name##_len); \
if (__dst != NULL) { \
memcpy(__dst, hw_state->packed.name, \
4 * hw_state->packed.name##_len); \
VG(VALGRIND_CHECK_MEM_IS_DEFINED( \
__dst, 4 * hw_state->packed.name##_len)); \
} \
} while (0)
if (IS_DIRTY(URB)) {
#if INTEL_NEEDS_WA_16014912113
if (genX(need_wa_16014912113)(
&cmd_buffer->state.gfx.urb_cfg, &hw_state->urb_cfg))
genX(batch_emit_wa_16014912113)(batch, &cmd_buffer->state.gfx.urb_cfg);
/* Update urb config. */
memcpy(&cmd_buffer->state.gfx.urb_cfg, &hw_state->urb_cfg,
sizeof(hw_state->urb_cfg));
#endif
genX(emit_urb_setup)(batch, device, &hw_state->urb_cfg);
}
if (IS_DIRTY(VF_SGVS_INSTANCING))
anv_batch_emit_gfx_variable(batch, vf_sgvs_instancing);
if (IS_DIRTY(VF_SGVS))
anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_SGVS), vf_sgvs);
#if GFX_VER >= 11
if (IS_DIRTY(VF_SGVS_2))
anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_SGVS_2), vf_sgvs_2);
#endif
if (device->physical->instance->vf_component_packing &&
IS_DIRTY(VF_COMPONENT_PACKING)) {
anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_COMPONENT_PACKING),
vf_component_packing);
}
if (IS_DIRTY(VS)) {
DEBUG_SHADER_HASH(MESA_SHADER_VERTEX);
anv_batch_emit_gfx(batch, GENX(3DSTATE_VS), vs);
}
if (IS_DIRTY(HS)) {
DEBUG_SHADER_HASH(MESA_SHADER_TESS_CTRL);
anv_batch_emit_gfx(batch, GENX(3DSTATE_HS), hs);
}
if (IS_DIRTY(DS)) {
DEBUG_SHADER_HASH(MESA_SHADER_TESS_EVAL);
anv_batch_emit_gfx(batch, GENX(3DSTATE_DS), ds);
}
if (IS_DIRTY(VF_STATISTICS))
anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_STATISTICS), vfs);
if (IS_DIRTY(SO_DECL_LIST)) {
/* Wa_16011773973:
* If SOL is enabled and SO_DECL state has to be programmed,
* 1. Send 3D State SOL state with SOL disabled
* 2. Send SO_DECL NP state
* 3. Send 3D State SOL with SOL Enabled
*/
if (intel_needs_workaround(device->info, 16011773973) &&
gfx->shaders[gfx->streamout_stage]->xfb_info != NULL)
anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
anv_batch_emit_gfx_variable(batch, so_decl_list);
#if GFX_VER >= 11 && GFX_VER < 20
/* ICL PRMs, Volume 2a - Command Reference: Instructions,
* 3DSTATE_SO_DECL_LIST:
*
* "Workaround: This command must be followed by a PIPE_CONTROL with
* CS Stall bit set."
*
* On DG2+ also known as Wa_1509820217.
*/
genx_batch_emit_pipe_control(batch, device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_CS_STALL_BIT);
#endif
}
#if GFX_VERx10 >= 125
if (device->vk.enabled_extensions.EXT_mesh_shader) {
if (IS_DIRTY(MESH_CONTROL))
anv_batch_emit_gfx(batch, GENX(3DSTATE_MESH_CONTROL), mesh_control);
if (IS_DIRTY(MESH_SHADER)) {
DEBUG_SHADER_HASH(MESA_SHADER_MESH);
anv_batch_emit_gfx(batch, GENX(3DSTATE_MESH_SHADER), mesh_shader);
}
if (IS_DIRTY(MESH_DISTRIB))
anv_batch_emit_gfx(batch, GENX(3DSTATE_MESH_DISTRIB), mesh_distrib);
if (IS_DIRTY(TASK_CONTROL))
anv_batch_emit_gfx(batch, GENX(3DSTATE_TASK_CONTROL), task_control);
if (IS_DIRTY(TASK_SHADER)) {
DEBUG_SHADER_HASH(MESA_SHADER_TASK);
anv_batch_emit_gfx(batch, GENX(3DSTATE_TASK_SHADER), task_shader);
}
if (IS_DIRTY(TASK_REDISTRIB))
anv_batch_emit_gfx(batch, GENX(3DSTATE_TASK_REDISTRIB), task_redistrib);
if (IS_DIRTY(SBE_MESH))
anv_batch_emit_gfx(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh);
if (IS_DIRTY(CLIP_MESH))
anv_batch_emit_gfx(batch, GENX(3DSTATE_CLIP_MESH), clip_mesh);
}
#endif
if (IS_DIRTY(SBE))
anv_batch_emit_gfx(batch, GENX(3DSTATE_SBE), sbe);
if (IS_DIRTY(SBE_SWIZ))
anv_batch_emit_gfx(batch, GENX(3DSTATE_SBE_SWIZ), sbe_swiz);
if (IS_DIRTY(PS)) {
DEBUG_SHADER_HASH(MESA_SHADER_FRAGMENT);
anv_batch_emit_gfx(batch, GENX(3DSTATE_PS), ps);
}
#if INTEL_WA_18038825448_GFX_VER
if (IS_DIRTY(PS_EXTRA) || IS_DIRTY(WA_18038825448)) {
if (IS_DIRTY(WA_18038825448))
anv_batch_emit_gfx(batch, GENX(3DSTATE_PS_EXTRA), ps_extra_dep);
else
anv_batch_emit_gfx(batch, GENX(3DSTATE_PS_EXTRA), ps_extra);
}
#else
if (IS_DIRTY(PS_EXTRA))
anv_batch_emit_gfx(batch, GENX(3DSTATE_PS_EXTRA), ps_extra);
#endif
if (IS_DIRTY(CLIP))
anv_batch_emit_gfx(batch, GENX(3DSTATE_CLIP), clip);
if (IS_DIRTY(STREAMOUT)) {
genX(streamout_prologue)(cmd_buffer, gfx);
anv_batch_emit_gfx(batch, GENX(3DSTATE_STREAMOUT), so);
}
if (IS_DIRTY(VIEWPORT_SF_CLIP))
anv_batch_emit_gfx(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), sf_clip);
if (IS_DIRTY(VIEWPORT_CC)) {
anv_batch_emit_gfx(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc_viewport);
cmd_buffer->state.gfx.viewport_set = true;
}
if (IS_DIRTY(SCISSOR))
anv_batch_emit_gfx(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), scissor);
if (IS_DIRTY(VF_TOPOLOGY))
anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_TOPOLOGY), vft);
if (IS_DIRTY(VERTEX_INPUT)) {
genX(batch_emit_vertex_input)(batch, device,
gfx->shaders[MESA_SHADER_VERTEX], dyn->vi);
}
if (IS_DIRTY(TE))
anv_batch_emit_gfx(batch, GENX(3DSTATE_TE), te);
if (IS_DIRTY(GS)) {
DEBUG_SHADER_HASH(MESA_SHADER_GEOMETRY);
anv_batch_emit_gfx(batch, GENX(3DSTATE_GS), gs);
}
#if GFX_VER >= 11
if (IS_DIRTY(CPS)) {
#if GFX_VER >= 30
anv_batch_emit_gfx(batch, GENX(3DSTATE_COARSE_PIXEL), cps);
#elif GFX_VER >= 12
/* TODO: we can optimize this flush in the following cases:
*
* In the case where the last geometry shader emits a value that is
* not constant, we can avoid this stall because we can synchronize
* the pixel shader internally with
* 3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
*
* If we know that the previous pipeline and the current one are
* using the same fragment shading rate.
*/
anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VERx10 >= 125
pc.PSSStallSyncEnable = true;
#else
pc.PSDSyncEnable = true;
#endif
}
anv_batch_emit_gfx(batch, GENX(3DSTATE_CPS_POINTERS), cps);
#else
anv_batch_emit_gfx(batch, GENX(3DSTATE_CPS), cps);
#endif
}
#endif /* GFX_VER >= 11 */
if (IS_DIRTY(SF))
anv_batch_emit_gfx(batch, GENX(3DSTATE_SF), sf);
if (IS_DIRTY(RASTER))
anv_batch_emit_gfx(batch, GENX(3DSTATE_RASTER), raster);
if (IS_DIRTY(MULTISAMPLE))
anv_batch_emit_gfx(batch, GENX(3DSTATE_MULTISAMPLE), ms);
if (IS_DIRTY(CC_STATE))
anv_batch_emit_gfx(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc_state);
if (IS_DIRTY(SAMPLE_MASK))
anv_batch_emit_gfx(batch, GENX(3DSTATE_SAMPLE_MASK), sm);
if (IS_DIRTY(WM_DEPTH_STENCIL))
anv_batch_emit_gfx(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wm_ds);
#if GFX_VER >= 12
if (IS_DIRTY(DEPTH_BOUNDS))
anv_batch_emit_gfx(batch, GENX(3DSTATE_DEPTH_BOUNDS), db);
#endif
if (IS_DIRTY(LINE_STIPPLE)) {
anv_batch_emit_gfx(batch, GENX(3DSTATE_LINE_STIPPLE), ls);
#if GFX_VER >= 11
/* ICL PRMs, Volume 2a - Command Reference: Instructions,
* 3DSTATE_LINE_STIPPLE:
*
* "Workaround: This command must be followed by a PIPE_CONTROL with
* CS Stall bit set."
*/
genx_batch_emit_pipe_control(batch, device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_CS_STALL_BIT);
#endif
}
if (IS_DIRTY(VF))
anv_batch_emit_gfx(batch, GENX(3DSTATE_VF), vf);
#if GFX_VER >= 12
if (IS_DIRTY(PRIMITIVE_REPLICATION))
anv_batch_emit_gfx(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif
if (IS_DIRTY(INDEX_BUFFER))
anv_batch_emit_gfx(batch, GENX(3DSTATE_INDEX_BUFFER), ib);
#if GFX_VERx10 >= 125
if (IS_DIRTY(VFG))
anv_batch_emit_gfx(batch, GENX(3DSTATE_VFG), vfg);
#endif
if (IS_DIRTY(SAMPLE_PATTERN)) {
genX(emit_sample_pattern)(batch,
dyn->ms.sample_locations_enable ?
dyn->ms.sample_locations : NULL);
}
if (IS_DIRTY(WM))
anv_batch_emit_gfx(batch, GENX(3DSTATE_WM), wm);
if (IS_DIRTY(PS_BLEND))
anv_batch_emit_gfx(batch, GENX(3DSTATE_PS_BLEND), ps_blend);
if (IS_DIRTY(BLEND_STATE))
anv_batch_emit_gfx(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), blend_state);
#if INTEL_WA_18019816803_GFX_VER
if (IS_DIRTY(WA_18019816803)) {
genX(batch_emit_pipe_control)(batch, device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_PSS_STALL_SYNC_BIT,
"Wa_18019816803");
}
#endif
#if INTEL_WA_14018283232_GFX_VER
if (IS_DIRTY(WA_14018283232))
genX(batch_emit_wa_14018283232)(batch);
#endif
#if GFX_VER == 9
if (IS_DIRTY(PMA_FIX))
genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix);
#endif
#if GFX_VERx10 >= 125
if (hw_state->use_tbimr && IS_DIRTY(TBIMR_TILE_PASS_INFO))
anv_batch_emit_gfx(batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO), tbimr);
#endif
#undef anv_batch_emit_gfx
#undef anv_batch_emit_gfx_variable
#undef INIT
#undef SET
#undef SET_ARRAY
#undef IS_DIRTY
#undef DEBUG_SHADER_HASH
BITSET_ZERO(hw_state->emit_dirty);
}
/**
 * Applies the workarounds that depend on which instructions are currently
 * dirty, then emits the dirty instructions to the command buffer's batch.
 */
void
genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
   struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
   struct anv_device *device = cmd_buffer->device;

   /* Debug aid: additionally flag every state set in the device's
    * gfx_dirty_state mask.
    */
   if (INTEL_DEBUG(DEBUG_REEMIT)) {
      BITSET_OR(hw_state->emit_dirty,
                hw_state->emit_dirty,
                device->gfx_dirty_state);
   }

   /**
    * Workarounds that need an instruction to be reemitted because another
    * instruction is changing go below.
    */

   /* SF_CLIP & CC_STATE are reprogrammed together, reproducing the
    * programming done by the Windows drivers. This fixes flickering issues
    * with multiple workloads.
    *
    * Because blorp disables 3DSTATE_CLIP::ClipEnable and dirties CC_STATE,
    * this also covers Wa_14016820455, which requires SF_CLIP to be
    * reprogrammed whenever 3DSTATE_CLIP::ClipEnable is enabled.
    */
   const bool viewport_ptr_dirty =
      BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
      BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_CC);
   if (viewport_ptr_dirty) {
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_CC);
   }

#if INTEL_WA_16012775297_GFX_VER
   /* Wa_16012775297 - Emit dummy VF statistics before each 3DSTATE_VF. */
   if (intel_needs_workaround(device->info, 16012775297) &&
       BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_VF)) {
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_STATISTICS);
   }
#endif

   /* Wa_16011773973 disables 3DSTATE_STREAMOUT around the SO_DECL_LIST
    * programming, so 3DSTATE_STREAMOUT has to be emitted again afterwards.
    */
   if (intel_needs_workaround(device->info, 16011773973) &&
       gfx->shaders[gfx->streamout_stage]->xfb_info != NULL &&
       BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_STREAMOUT);
   }

#if INTEL_WA_18038825448_GFX_VER
   /* Report whether the bound fragment program runs at coarse rate. */
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   if (wm_prog_data != NULL) {
      genX(cmd_buffer_set_coarse_pixel_active)(
         cmd_buffer,
         brw_wm_prog_data_is_coarse(wm_prog_data, hw_state->fs_msaa_flags));
   }
#endif

#if GFX_VER == 11
   /* Gfx11 undocumented issue :
    *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/9781
    *
    * 3DSTATE_MULTISAMPLE is reemitted whenever BLEND_STATE is dirty.
    */
   if (BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_BLEND_STATE))
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_MULTISAMPLE);
#endif

#if GFX_VERx10 == 125
   /* On DG2 & MTL, dEQP-VK.shader_object.binding.mesh_swap_task fails on
    * both simulation & HW, and dEQP-VK.shader_object.binding.mesh_swap_mesh
    * fails on HW.
    *
    * Reemitting 3DSTATE_TASK_CONTROL makes the first test pass more often,
    * but for the other one nothing helps except a CS stall.
    *
    * What seems to happen is that the newly programmed shader offset isn't
    * applied and the HW reexecutes the previous shader instead.
    *
    * The stall is only emitted when intel_device_info_is_dg2() is true, a
    * mesh shader is bound and the task or mesh shader state is dirty.
    */
   if (intel_device_info_is_dg2(device->info) &&
       (BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_TASK_SHADER) ||
        BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_MESH_SHADER)) &&
       gfx->shaders[MESA_SHADER_MESH] != NULL) {
      genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
                                   _3D, ANV_PIPE_CS_STALL_BIT);
   }
#endif

   /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
   if (intel_needs_workaround(device->info, 18020335297) &&
       BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_CC) &&
       gfx->viewport_set) {
      if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
         /* With mesh, the WA is implemented with a CS stall. This is
          * simpler and takes care of possible interaction with
          * Wa_16014390852.
          */
         genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
                                      _3D, ANV_PIPE_CS_STALL_BIT);
      } else {
         /* Every instruction programmed as part of the WA. */
         static const uint32_t wa_states[] = {
            ANV_GFX_STATE_VFG,
            ANV_GFX_STATE_VF,
            ANV_GFX_STATE_PRIMITIVE_REPLICATION,
            ANV_GFX_STATE_RASTER,
            ANV_GFX_STATE_VF_STATISTICS,
            ANV_GFX_STATE_VF_SGVS,
            ANV_GFX_STATE_VF_SGVS_2,
            ANV_GFX_STATE_CLIP,
            ANV_GFX_STATE_STREAMOUT,
            ANV_GFX_STATE_VERTEX_INPUT,
            ANV_GFX_STATE_VF_TOPOLOGY,
            ANV_GFX_STATE_VS,
            ANV_GFX_STATE_GS,
            ANV_GFX_STATE_HS,
            ANV_GFX_STATE_TE,
            ANV_GFX_STATE_DS,
         };
         const uint32_t wa_state_count =
            sizeof(wa_states) / sizeof(wa_states[0]);

         /* Mask those instructions off so that the emission preceding the
          * dummy draw leaves them alone.
          */
         for (uint32_t i = 0; i < wa_state_count; i++)
            BITSET_CLEAR(hw_state->emit_dirty, wa_states[i]);

         cmd_buffer_gfx_state_emission(cmd_buffer);

         emit_wa_18020335297_dummy_draw(cmd_buffer);

         /* Dirty all the WA state again so that the final emission below
          * restores the current real state.
          */
         for (uint32_t i = 0; i < wa_state_count; i++)
            BITSET_SET(hw_state->emit_dirty, wa_states[i]);
      }
   }

   cmd_buffer_gfx_state_emission(cmd_buffer);
}
/**
 * Switches the PMA fix (CACHE_MODE_0::STCPMAOptimizationEnable) on or off.
 *
 * Returns without emitting anything when the command buffer is not on a
 * render queue or when the requested value is already the recorded one.
 * Otherwise the new value is recorded in the command buffer state and two
 * PIPE_CONTROLs are emitted; on GFX_VER == 9 an MI_LOAD_REGISTER_IMM
 * writing CACHE_MODE_0 is emitted between them.
 *
 * @param cmd_buffer  Command buffer whose batch receives the commands.
 * @param enable      New value for STCPMAOptimizationEnable.
 */
void
genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
{
   if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
      return;

   if (cmd_buffer->state.gfx.pma_fix_enabled == enable)
      return;

   cmd_buffer->state.gfx.pma_fix_enabled = enable;

   /* Cache flushes common to the PIPE_CONTROLs on both sides of the LRI.
    *
    * They are accumulated in a local so that the GFX_VER conditional stays
    * out of the genx_batch_emit_pipe_control() argument list: that helper
    * is a macro wrapper, and a preprocessing directive inside the arguments
    * of a function-like macro invocation is undefined behaviour
    * (C11 6.10.3p11).
    */
   uint32_t cache_flush_bits = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
                               ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
#if GFX_VER >= 12
   cache_flush_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
#endif

   /* According to the Broadwell PIPE_CONTROL documentation, software should
    * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
    * prior to the LRI. If stencil buffer writes are enabled, then a Render
    * Cache Flush is also necessary.
    *
    * The Skylake docs say to use a depth stall rather than a command
    * streamer stall. However, the hardware seems to violently disagree.
    * A full command streamer stall seems to be needed in both cases.
    */
   genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                cmd_buffer->device->info,
                                cmd_buffer->state.current_pipeline,
                                cache_flush_bits | ANV_PIPE_CS_STALL_BIT);

#if GFX_VER == 9
   /* Only the enable bit is unmasked, so the rest of CACHE_MODE_0 is left
    * untouched by the write.
    */
   uint32_t cache_mode;
   anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
                   .STCPMAOptimizationEnable = enable,
                   .STCPMAOptimizationEnableMask = true);
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset   = GENX(CACHE_MODE_0_num);
      lri.DataDWord        = cache_mode;
   }
#endif /* GFX_VER == 9 */

   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
    * Flush bits is often necessary. We do it regardless because it's easier.
    * The render cache flush is also necessary if stencil writes are enabled.
    *
    * Again, the Skylake docs give a different set of flushes but the BDW
    * flushes seem to work just as well.
    */
   genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                cmd_buffer->device->info,
                                cmd_buffer->state.current_pipeline,
                                cache_flush_bits | ANV_PIPE_DEPTH_STALL_BIT);
}