/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"
#include "common/intel_guardband.h"
#include "common/intel_tiled_render.h"
#include "compiler/intel_prim.h"

#include "genX_mi_builder.h"

/* Declare a `struct cmd` named `name`, initialized from
 * __anv_cmd_header(cmd), let the statement following the macro fill it in,
 * then pack it into hw_state->packed.field.
 *
 * This is a single-iteration for loop: _dst starts out pointing at the packed
 * storage, the caller's statement runs once as the loop body, and the
 * increment expression packs the command and sets _dst to NULL, which ends
 * the loop.  The assert checks that the packed storage is at least
 * 4 * __anv_cmd_length(cmd) bytes.  A `hw_state` variable must be in scope
 * where the macro is used.
 */
#define anv_gfx_pack(field, cmd, name)                               \
   for (struct cmd name = { __anv_cmd_header(cmd) },                 \
        *_dst = (struct cmd *)hw_state->packed.field;                \
        __builtin_expect(_dst != NULL, 1);                           \
        ({                                                           \
           assert(sizeof(hw_state->packed.field) >=                  \
                  4 * __anv_cmd_length(cmd));                        \
           __anv_cmd_pack(cmd)(NULL, _dst, &name);                   \
           _dst = NULL;                                              \
        }))

/* Indexed by VK_BLEND_FACTOR_*; yields the matching BLENDFACTOR_* value. */
static const uint32_t vk_to_intel_blend[] = {
   [VK_BLEND_FACTOR_ZERO]                    = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE]                     = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR]               = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]     = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR]               = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]     = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA]               = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]     = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA]               = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]     = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR]          = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA]          = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]      = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR]              = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]    = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA]              = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]    = BLENDFACTOR_INV_SRC1_ALPHA,
};

/* Indexed by VK_BLEND_OP_*; yields the matching BLENDFUNCTION_* value.  Only
 * the five operations listed here have an entry.
 */
static const uint32_t vk_to_intel_blend_op[] = {
   [VK_BLEND_OP_ADD]                         = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT]                    = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT]            = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN]                         = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX]                         = BLENDFUNCTION_MAX,
};

/* Indexed by VK_CULL_MODE_* (a bitmask value); yields CULLMODE_*. */
static const uint32_t vk_to_intel_cullmode[] = {
   [VK_CULL_MODE_NONE]                       = CULLMODE_NONE,
   [VK_CULL_MODE_FRONT_BIT]                  = CULLMODE_FRONT,
   [VK_CULL_MODE_BACK_BIT]                   = CULLMODE_BACK,
   [VK_CULL_MODE_FRONT_AND_BACK]             = CULLMODE_BOTH
};

/* Indexed by VK_POLYGON_MODE_*; yields the matching FILL_MODE_* value. */
static const uint32_t vk_to_intel_fillmode[] = {
   [VK_POLYGON_MODE_FILL]                    = FILL_MODE_SOLID,
   [VK_POLYGON_MODE_LINE]                    = FILL_MODE_WIREFRAME,
   [VK_POLYGON_MODE_POINT]                   = FILL_MODE_POINT,
};

/* Indexed by VK_FRONT_FACE_*: counter-clockwise maps to 1, clockwise to 0.
 * NOTE(review): presumably these are the raw values of a hardware front
 * winding field -- confirm against the field they are written to.
 */
static const uint32_t vk_to_intel_front_face[] = {
   [VK_FRONT_FACE_COUNTER_CLOCKWISE]         = 1,
   [VK_FRONT_FACE_CLOCKWISE]                 = 0
};

/* Indexed by VK_LOGIC_OP_*; yields the matching LOGICOP_* value. */
static const uint32_t vk_to_intel_logic_op[] = {
   [VK_LOGIC_OP_COPY]                        = LOGICOP_COPY,
   [VK_LOGIC_OP_CLEAR]                       = LOGICOP_CLEAR,
   [VK_LOGIC_OP_AND]                         = LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE]                 = LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_AND_INVERTED]                = LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NO_OP]                       = LOGICOP_NOOP,
   [VK_LOGIC_OP_XOR]                         = LOGICOP_XOR,
   [VK_LOGIC_OP_OR]                          = LOGICOP_OR,
   [VK_LOGIC_OP_NOR]                         = LOGICOP_NOR,
   [VK_LOGIC_OP_EQUIVALENT]                  = LOGICOP_EQUIV,
   [VK_LOGIC_OP_INVERT]                      = LOGICOP_INVERT,
   [VK_LOGIC_OP_OR_REVERSE]                  = LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_COPY_INVERTED]               = LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_OR_INVERTED]                 = LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NAND]                        = LOGICOP_NAND,
   [VK_LOGIC_OP_SET]                         = LOGICOP_SET,
};

/* Indexed by VK_COMPARE_OP_*; yields the matching PREFILTEROP_* value. */
static const uint32_t vk_to_intel_compare_op[] = {
   [VK_COMPARE_OP_NEVER]                        = PREFILTEROP_NEVER,
   [VK_COMPARE_OP_LESS]                         = PREFILTEROP_LESS,
   [VK_COMPARE_OP_EQUAL]                        = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_GREATER]                      = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROP_ALWAYS,
};

/* Indexed by VK_STENCIL_OP_*; yields the matching STENCILOP_* value.  The
 * "_AND_CLAMP" operations map to the saturating INCRSAT/DECRSAT values and
 * the "_AND_WRAP" operations to plain INCR/DECR.
 */
static const uint32_t vk_to_intel_stencil_op[] = {
   [VK_STENCIL_OP_KEEP]                         = STENCILOP_KEEP,
   [VK_STENCIL_OP_ZERO]                         = STENCILOP_ZERO,
   [VK_STENCIL_OP_REPLACE]                      = STENCILOP_REPLACE,
   [VK_STENCIL_OP_INCREMENT_AND_CLAMP]          = STENCILOP_INCRSAT,
   [VK_STENCIL_OP_DECREMENT_AND_CLAMP]          = STENCILOP_DECRSAT,
   [VK_STENCIL_OP_INVERT]                       = STENCILOP_INVERT,
   [VK_STENCIL_OP_INCREMENT_AND_WRAP]           = STENCILOP_INCR,
   [VK_STENCIL_OP_DECREMENT_AND_WRAP]           = STENCILOP_DECR,
};

/* Indexed by VK_PRIMITIVE_TOPOLOGY_*; yields the matching _3DPRIM_* value.
 * Only the topologies listed here have an entry.
 */
static const uint32_t vk_to_intel_primitive_type[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST]                    = _3DPRIM_POINTLIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST]                     = _3DPRIM_LINELIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP]                    = _3DPRIM_LINESTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST]                 = _3DPRIM_TRILIST,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP]                = _3DPRIM_TRISTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN]                  = _3DPRIM_TRIFAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY]      = _3DPRIM_LINELIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY]  = _3DPRIM_TRILIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};

/* Translate a VkIndexType into the INDEX_BYTE / INDEX_WORD / INDEX_DWORD
 * value for 8, 16 and 32-bit indices respectively.  Any other index type is
 * treated as unreachable.
 */
static uint32_t vk_to_intel_index_type(VkIndexType type)
{
   if (type == VK_INDEX_TYPE_UINT8_KHR)
      return INDEX_BYTE;

   if (type == VK_INDEX_TYPE_UINT16)
      return INDEX_WORD;

   if (type == VK_INDEX_TYPE_UINT32)
      return INDEX_DWORD;

   UNREACHABLE("invalid index type");
}

/* Emit the command sequence guarded by INTEL_NEEDS_WA_16014912113 into
 * `batch`, using the start/size values of `urb_cfg`.
 *
 * Nothing is emitted when the workaround is compiled out or when
 * urb_cfg->size[0] is zero.  Otherwise one URB allocation command is emitted
 * per stage index 0..MESA_SHADER_GEOMETRY, followed by a PIPE_CONTROL with
 * the HDC pipeline flush bit set.
 */
void
genX(batch_emit_wa_16014912113)(struct anv_batch *batch,
                                const struct intel_urb_config *urb_cfg)
{
#if INTEL_NEEDS_WA_16014912113
   /* No allocation recorded for stage 0: nothing to emit. */
   if (urb_cfg->size[0] == 0)
      return;

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
#if GFX_VER >= 12
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
         /* The VS command is reused for every stage by offsetting its
          * sub-opcode with the stage index.
          */
         urb._3DCommandSubOpcode             += i;
         urb.VSURBEntryAllocationSize        = urb_cfg->size[i] - 1;
         urb.VSURBStartingAddressSlice0      = urb_cfg->start[i];
         urb.VSURBStartingAddressSliceN      = urb_cfg->start[i];
         /* Only stage 0 gets entries (256); the other stages get none. */
         urb.VSNumberofURBEntriesSlice0      = i == 0 ? 256 : 0;
         urb.VSNumberofURBEntriesSliceN      = i == 0 ? 256 : 0;
      }
#else
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         /* Same scheme as above with the pre-Gfx12 command layout. */
         urb._3DCommandSubOpcode      += i;
         urb.VSURBStartingAddress      = urb_cfg->start[i];
         urb.VSURBEntryAllocationSize  = urb_cfg->size[i] - 1;
         urb.VSNumberofURBEntries      = i == 0 ? 256 : 0;
      }
#endif
   }
   anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.HDCPipelineFlushEnable = true;
   }
#endif
}

/* Adjust object preemption ahead of a draw for Wa_16013994831.
 *
 * Compiles to an empty function when INTEL_WA_16013994831_GFX_VER is not set,
 * and does nothing at runtime when the device does not need the workaround.
 */
static void
genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_cmd_graphics_state *gfx)
{
#if INTEL_WA_16013994831_GFX_VER
   if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
      return;

   /* Wa_16013994831 - preemption has to be off while the streamout stage
    * carries transform feedback info; turn it back on otherwise if it is
    * currently off.
    */
   const bool uses_xfb =
      gfx->shaders[gfx->streamout_stage]->xfb_info != NULL;

   if (uses_xfb) {
      genX(cmd_buffer_set_preemption)(cmd_buffer, false);
   } else if (!cmd_buffer->state.gfx.object_preemption) {
      genX(cmd_buffer_set_preemption)(cmd_buffer, true);
   }
#endif
}

#if GFX_VER >= 12 && GFX_VER < 30
/* Compute the offset of the CPS_STATE entry matching `fsr`, relative to the
 * same base as device->cps_states.offset.
 *
 * Slot 0 is the "disabled" entry and is skipped.  The remaining slots are
 * indexed by fragment height (fastest), fragment width and, on
 * GFX_VERx10 >= 125, the two combiner ops.  Each slot spans
 * MAX_VIEWPORTS * GENX(CPS_STATE_length) dwords.
 */
static uint32_t
get_cps_state_offset(const struct anv_device *device,
                     const struct vk_fragment_shading_rate_state *fsr)
{
   /* Fragment dimension (1, 2 or 4) -> index (0, 1 or 2). */
   static const uint32_t size_index[] = {
      [1] = 0,
      [2] = 1,
      [4] = 2,
   };

   const uint32_t width_idx = size_index[fsr->fragment_size.width];
   const uint32_t height_idx = size_index[fsr->fragment_size.height];

   uint32_t offset = 1 /* skip disabled */ + width_idx * 3 + height_idx;

#if GFX_VERx10 >= 125
   offset += fsr->combiner_ops[0] * 5 * 3 * 3;
   offset += fsr->combiner_ops[1] * 3 * 3;
#endif

   /* Slot index -> byte offset. */
   offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4;

   return device->cps_states.offset + offset;
}
#endif /* GFX_VER >= 12 && GFX_VER < 30 */

#if GFX_VER >= 30
/* Translate a fragment dimension of 1, 2 or 4 into the matching CPSIZE_*
 * value.  Any other size is treated as unreachable.
 */
static uint32_t
get_cps_size(uint32_t size)
{
   if (size == 1)
      return CPSIZE_1;

   if (size == 2)
      return CPSIZE_2;

   if (size == 4)
      return CPSIZE_4;

   UNREACHABLE("Invalid size");
}

/* Indexed by VK_FRAGMENT_SHADING_RATE_COMBINER_OP_*_KHR; yields the matching
 * CPS_COMB_OP_* value.  Note that MIN maps to HIGH_QUALITY and MAX to
 * LOW_QUALITY.
 */
static const uint32_t vk_to_intel_shading_rate_combiner_op[] = {
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR] = CPS_COMB_OP_PASSTHROUGH,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = CPS_COMB_OP_OVERRIDE,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR] = CPS_COMB_OP_HIGH_QUALITY,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR] = CPS_COMB_OP_LOW_QUALITY,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR] = CPS_COMB_OP_RELATIVE,
};
#endif

/* Report whether the shader described by `bind_map` may read the depth or
 * stencil attachment as an input attachment.
 *
 * Always false when the bind map uses no input attachment.  Otherwise true
 * when the dynamic state flags a depth/stencil feedback loop, or when the
 * input attachment slot mapped to depth or stencil is used by the shader.
 */
static bool
has_ds_feedback_loop(const struct anv_pipeline_bind_map *bind_map,
                     const struct vk_dynamic_graphics_state *dyn)
{
   if (BITSET_IS_EMPTY(bind_map->input_attachments))
      return false;

   if ((dyn->feedback_loops & (VK_IMAGE_ASPECT_DEPTH_BIT |
                               VK_IMAGE_ASPECT_STENCIL_BIT)) != 0)
      return true;

   if (dyn->ial.depth_att != MESA_VK_ATTACHMENT_UNUSED) {
      /* An attachment without an explicit index is tracked in the slot just
       * past the indexed ones.
       */
      const unsigned depth_slot =
         dyn->ial.depth_att == MESA_VK_ATTACHMENT_NO_INDEX ?
         MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS : dyn->ial.depth_att;

      if (BITSET_TEST(bind_map->input_attachments, depth_slot))
         return true;
   }

   if (dyn->ial.stencil_att != MESA_VK_ATTACHMENT_UNUSED) {
      const unsigned stencil_slot =
         dyn->ial.stencil_att == MESA_VK_ATTACHMENT_NO_INDEX ?
         MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS : dyn->ial.stencil_att;

      if (BITSET_TEST(bind_map->input_attachments, stencil_slot))
         return true;
   }

   return false;
}

/* True when pixels may be discarded after rasterization: the fragment shader
 * uses kill, the fragment shader writes an output mask, or alpha-to-coverage
 * is enabled in the dynamic state.
 */
static bool
kill_pixel(const struct brw_wm_prog_data *wm_prog_data,
           const struct vk_dynamic_graphics_state *dyn)
{
   if (wm_prog_data->uses_kill)
      return true;

   if (wm_prog_data->uses_omask)
      return true;

   return dyn->ms.alpha_to_coverage_enable;
}

/* Decide whether the stencil PMA fix should be enabled for the current
 * graphics state.  Always false on anything newer than Gfx9.
 */
UNUSED static bool
want_stencil_pma_fix(const struct vk_dynamic_graphics_state *dyn,
                     const struct anv_cmd_graphics_state *gfx,
                     const struct vk_depth_stencil_state *ds)
{
   if (GFX_VER > 9)
      return false;
   assert(GFX_VER == 9);

   /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
    *
    *    Clearing this bit will force the STC cache to wait for pending
    *    retirement of pixels at the HZ-read stage and do the STC-test for
    *    Non-promoted, R-computed and Computed depth modes instead of
    *    postponing the STC-test to RCPFE.
    *
    *    STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                  3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
    *
    *    STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                   (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *                    3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
    *
    *    COMP_STC_EN = STC_TEST_EN &&
    *                  3DSTATE_PS_EXTRA::PixelShaderComputesStencil
    *
    *    SW parses the pipeline states to generate the following logical
    *    signal indicating if PMA FIX can be enabled.
    *
    *    STC_PMA_OPT =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       !(3DSTATE_WM::EDSC_Mode == 2) &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *       (COMP_STC_EN || STC_WRITE_EN) &&
    *       ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *         3DSTATE_WM::ForceKillPix == ON ||
    *         3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *         3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *         3DSTATE_PS_BLEND::AlphaTestEnable ||
    *         3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    *        (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
    *
    * The terms are checked below in the order of the formula.  Two of them
    * need no check because they always hold:
    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
    */

   /* (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
    *
    * The fix is only enabled when HiZ is known to be on.  Leaving it
    * disabled when that is unknown is harmless.
    */
   if (!gfx->hiz_enabled)
      return false;

   /* HiZ cannot be known to be enabled without a depth attachment. */
   ASSERTED const struct anv_image_view *d_iview = gfx->depth_att.iview;
   assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);

   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
   struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
   if (fs == NULL)
      return false;

   /* !(3DSTATE_WM::EDSC_Mode == 2) */
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* anv_pipeline is never used for HiZ ops, so this holds trivially:
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    */

   /* COMP_STC_EN || STC_WRITE_EN, where
    *
    *    STC_TEST_EN  = ds->stencil.test_enable
    *    STC_WRITE_EN = ds->stencil.write_enable
    *    COMP_STC_EN  = STC_TEST_EN &&
    *                   3DSTATE_PS_EXTRA::PixelShaderComputesStencil
    */
   const bool comp_stc_en =
      ds->stencil.test_enable && wm_prog_data->computed_stencil;
   const bool stc_write_en = ds->stencil.write_enable;
   if (!comp_stc_en && !stc_write_en)
      return false;

   /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_WM::ForceKillPix == ON ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
    */
   if (kill_pixel(wm_prog_data, dyn))
      return true;

   if (has_ds_feedback_loop(&fs->bind_map, dyn))
      return true;

   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
}

/* True only when rasterizing as lines with the rectangular-smooth line
 * rasterization mode.
 */
static inline bool
anv_rasterization_aa_mode(VkPolygonMode raster_mode,
                          VkLineRasterizationModeKHR line_mode)
{
   return raster_mode == VK_POLYGON_MODE_LINE &&
          line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR;
}

/* Resolve the DEFAULT line rasterization mode to a concrete one: rectangular
 * when more than one rasterization sample is used, Bresenham otherwise.  Any
 * non-default mode is returned unchanged.
 */
static inline VkLineRasterizationModeKHR
anv_line_rasterization_mode(VkLineRasterizationModeKHR line_mode,
                            unsigned rasterization_samples)
{
   if (line_mode != VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR)
      return line_mode;

   return rasterization_samples > 1 ?
          VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR :
          VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
}

/** Returns the polygon mode that rasterization ends up using
 *
 * The primitive type is taken from the last stage able to change it, checked
 * in this order: mesh shader, geometry shader, tessellation, and finally the
 * input primitive topology.  Points and lines force the POINT / LINE polygon
 * mode; triangle-like primitives keep the requested polygon_mode.
 */
static inline VkPolygonMode
anv_raster_polygon_mode(const struct anv_cmd_graphics_state *gfx,
                        VkPolygonMode polygon_mode,
                        VkPrimitiveTopology primitive_topology)
{
   /* Mesh shader output primitive */
   if (gfx->shaders[MESA_SHADER_MESH] != NULL) {
      switch (get_gfx_mesh_prog_data(gfx)->primitive_type) {
      case MESA_PRIM_TRIANGLES:
         return polygon_mode;
      case MESA_PRIM_LINES:
         return VK_POLYGON_MODE_LINE;
      case MESA_PRIM_POINTS:
         return VK_POLYGON_MODE_POINT;
      default:
         UNREACHABLE("invalid primitive type for mesh");
      }
   }

   /* Geometry shader output topology */
   if (gfx->shaders[MESA_SHADER_GEOMETRY] != NULL) {
      switch (get_gfx_gs_prog_data(gfx)->output_topology) {
      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         return polygon_mode;

      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         return VK_POLYGON_MODE_LINE;

      case _3DPRIM_POINTLIST:
         return VK_POLYGON_MODE_POINT;
      }
      UNREACHABLE("Unsupported GS output topology");
   }

   /* Tessellation output topology, from the merged TCS + TES info */
   if (gfx->shaders[MESA_SHADER_TESS_EVAL] != NULL) {
      struct brw_tess_info merged_info =
         brw_merge_tess_info(get_gfx_tcs_prog_data(gfx)->tess_info,
                             get_gfx_tes_prog_data(gfx)->tess_info);

      switch (brw_tess_info_output_topology(merged_info)) {
      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return polygon_mode;

      case INTEL_TESS_OUTPUT_TOPOLOGY_LINE:
         return VK_POLYGON_MODE_LINE;

      case INTEL_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;

      default:
         UNREACHABLE("Unsupported TCS output topology");
      }
   }

   /* No stage changes the primitive type: use the input topology */
   switch (primitive_topology) {
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
      return polygon_mode;

   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
      return VK_POLYGON_MODE_LINE;

   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
      return VK_POLYGON_MODE_POINT;

   default:
      UNREACHABLE("Unsupported primitive topology");
   }
}

/* True when `factor` is one of the four SRC1 (second source color) blend
 * factors.
 */
static inline bool
anv_is_dual_src_blend_factor(VkBlendFactor factor)
{
   switch (factor) {
   case VK_BLEND_FACTOR_SRC1_COLOR:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
   case VK_BLEND_FACTOR_SRC1_ALPHA:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
      return true;
   default:
      return false;
   }
}

/* True when every one of the four blend factors of `cb` (source and
 * destination, color and alpha) is a dual-source factor.
 *
 * NOTE(review): all four factors have to be dual-source for this to return
 * true; confirm with the callers that "any factor" is not what is wanted.
 */
static inline bool
anv_is_dual_src_blend_equation(const struct vk_color_blend_attachment_state *cb)
{
   if (!anv_is_dual_src_blend_factor(cb->src_color_blend_factor))
      return false;

   if (!anv_is_dual_src_blend_factor(cb->dst_color_blend_factor))
      return false;

   if (!anv_is_dual_src_blend_factor(cb->src_alpha_blend_factor))
      return false;

   return anv_is_dual_src_blend_factor(cb->dst_alpha_blend_factor);
}

/* Select the API mode / MSAA rasterization enable pair to program for the
 * given raster mode, line rasterization mode and line width.
 *
 * Both outputs are always written for supported inputs.  Anything that is
 * not rasterized as lines uses DX101 with MSAA rasterization enabled.
 */
static void
anv_rasterization_mode(VkPolygonMode raster_mode,
                       VkLineRasterizationModeKHR line_mode,
                       float line_width,
                       uint32_t *api_mode,
                       bool *msaa_rasterization_enable)
{
   if (raster_mode != VK_POLYGON_MODE_LINE) {
      *api_mode = DX101;
      *msaa_rasterization_enable = true;
      return;
   }

   /* Line rasterization on gfx8 and later is awkward to configure.  Rather
    * than dedicated line mode bits like gfx7 had, the hardware derives the
    * line mode from a combination of API Mode and MSAA enable, following a
    * table designed around other APIs.  None of those match Vulkan, hence the
    * hand-picked combinations below.
    *
    * See the table titled "Multisample Rasterization Modes" in Vol 7 of the
    * Skylake PRM for more details.
    */
   switch (line_mode) {
   case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
   case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
      *api_mode = DX9OGL;
      *msaa_rasterization_enable = false;
      break;

   case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
      *api_mode = DX101;
#if GFX_VER <= 9
      /* Before ICL, the hardware's wide line algorithm does not quite match
       * CTS expectations for rectangular lines.  Disabling MSAA rasterization
       * for anything but thin lines makes it draw parallelograms instead,
       * which work well enough.
       */
      *msaa_rasterization_enable = line_width < 1.0078125;
#else
      *msaa_rasterization_enable = true;
#endif
      break;

   default:
      UNREACHABLE("Unsupported line rasterization mode");
   }
}

/* True when the hardware blend factor `factor` is one of the four SRC1
 * factors.
 */
static bool
is_src1_blend_factor(enum GENX(3D_Color_Buffer_Blend_Factor) factor)
{
   switch (factor) {
   case BLENDFACTOR_SRC1_COLOR:
   case BLENDFACTOR_SRC1_ALPHA:
   case BLENDFACTOR_INV_SRC1_COLOR:
   case BLENDFACTOR_INV_SRC1_ALPHA:
      return true;
   default:
      return false;
   }
}

#if GFX_VERx10 == 125
/**
 * Compute the size of the current rendering area: the far corner of the
 * render area rectangle (offset + extent), grown to cover the extent of
 * every bound color, depth and stencil attachment view.
 *
 * Returns true when both resulting dimensions are non-zero.
 */
UNUSED static bool
calculate_render_area(const struct anv_cmd_graphics_state *gfx,
                      unsigned *width, unsigned *height)
{
   *width = gfx->render_area.offset.x + gfx->render_area.extent.width;
   *height = gfx->render_area.offset.y + gfx->render_area.extent.height;

   /* Color attachments */
   for (unsigned a = 0; a < gfx->color_att_count; a++) {
      const struct anv_image_view *const c_view = gfx->color_att[a].iview;
      if (c_view == NULL)
         continue;

      *width = MAX2(*width, c_view->vk.extent.width);
      *height = MAX2(*height, c_view->vk.extent.height);
   }

   /* Depth first, then stencil */
   const struct anv_image_view *const ds_views[] = {
      gfx->depth_att.iview,
      gfx->stencil_att.iview,
   };
   for (unsigned v = 0; v < sizeof(ds_views) / sizeof(ds_views[0]); v++) {
      if (ds_views[v] == NULL)
         continue;

      *width = MAX2(*width, ds_views[v]->vk.extent.width);
      *height = MAX2(*height, ds_views[v]->vk.extent.height);
   }

   return *width && *height;
}

/* Calculate TBIMR tiling parameters adequate for the current pipeline
 * setup.  Return true if TBIMR should be enabled.
 */
UNUSED static bool
calculate_tile_dimensions(const struct anv_device *device,
                          const struct anv_cmd_graphics_state *gfx,
                          const struct intel_l3_config *l3_config,
                          unsigned fb_width, unsigned fb_height,
                          unsigned *tile_width, unsigned *tile_height)
{
   assert(GFX_VER == 12);
   const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;

   unsigned pixel_size = 0;

   /* Perform a rough calculation of the tile cache footprint of the
    * pixel pipeline, approximating it as the sum of the amount of
    * memory used per pixel by every render target, depth, stencil and
    * auxiliary surfaces bound to the pipeline.
    */
   for (uint32_t i = 0; i < gfx->color_att_count; i++) {
      const struct anv_attachment *att = &gfx->color_att[i];

      if (att->iview) {
         const struct anv_image *image = att->iview->image;
         const unsigned p = anv_image_aspect_to_plane(image,
                                                      VK_IMAGE_ASPECT_COLOR_BIT);
         const struct anv_image_plane *plane = &image->planes[p];

         pixel_size += intel_calculate_surface_pixel_size(
            &plane->primary_surface.isl);

         if (isl_aux_usage_has_mcs(att->aux_usage))
            pixel_size += intel_calculate_surface_pixel_size(
               &plane->aux_surface.isl);

         if (isl_aux_usage_has_ccs(att->aux_usage))
            pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                          &plane->primary_surface.isl),
                                       aux_scale);
      }
   }

   const struct anv_image_view *const z_view = gfx->depth_att.iview;
   if (z_view) {
      const struct anv_image *image = z_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_DEPTH_BIT);
      const struct anv_image_plane *plane = &image->planes[p];

      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);

      if (isl_aux_usage_has_hiz(image->planes[p].aux_usage))
         pixel_size += intel_calculate_surface_pixel_size(
            &plane->aux_surface.isl);

      if (isl_aux_usage_has_ccs(image->planes[p].aux_usage))
         pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                       &plane->primary_surface.isl),
                                    aux_scale);
   }

   const struct anv_image_view *const s_view = gfx->depth_att.iview;
   if (s_view && s_view != z_view) {
      const struct anv_image *image = s_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_STENCIL_BIT);
      const struct anv_image_plane *plane = &image->planes[p];

      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);
   }

   if (!pixel_size)
      return false;

   /* Compute a tile layout that allows reasonable utilization of the
    * tile cache based on the per-pixel cache footprint estimated
    * above.
    */
   intel_calculate_tile_dimensions(device->info, l3_config,
                                   32, 32, fb_width, fb_height,
                                   pixel_size, tile_width, tile_height);

   /* Perform TBIMR tile passes only if the framebuffer covers more
    * than a single tile.
    */
   return *tile_width < fb_width || *tile_height < fb_height;
}
#endif

/* Read a field of the tracked hardware state; needs `hw_state` in scope. */
#define GET(field) hw_state->field
/* Store `value` into hw_state->field.  When this changes the stored value,
 * ANV_GFX_STATE_<bit> is also set in hw_state->pack_dirty.  `value` is
 * evaluated exactly once and converted to the field's type before the
 * comparison.  Needs `hw_state` in scope.
 */
#define SET(bit, field, value)                               \
   do {                                                      \
      __typeof(hw_state->field) __v = value;                 \
      if (hw_state->field != __v) {                          \
         hw_state->field = __v;                              \
         BITSET_SET(hw_state->pack_dirty,                    \
                    ANV_GFX_STATE_##bit);                    \
      }                                                      \
   } while (0)
/* Store `value` into hw_state->field and set ANV_GFX_STATE_<bit> in
 * hw_state->pack_dirty when the stored value changes.  The exception is when
 * gfx->shaders[MESA_SHADER_<stage>] is NULL: the value is then stored
 * without ever marking the bit dirty.  Needs `hw_state` and `gfx` in scope.
 */
#define SET_STAGE(bit, field, value, stage)                  \
   do {                                                      \
      __typeof(hw_state->field) __v = value;                 \
      if (gfx->shaders[MESA_SHADER_##stage] == NULL) {       \
         hw_state->field = __v;                              \
         break;                                              \
      }                                                      \
      if (hw_state->field != __v) {                          \
         hw_state->field = __v;                              \
         BITSET_SET(hw_state->pack_dirty,                    \
                    ANV_GFX_STATE_##bit);                    \
      }                                                      \
   } while (0)
/* Program the three provoking vertex select fields of hw_state-><cmd> for
 * the provoking vertex mode `mode`.  Each field goes through SET(), so
 * ANV_GFX_STATE_<bit> is only dirtied when a value changes.
 *
 * Expands to a bare switch statement.  The last line of the definition ends
 * with a line continuation, so the line that follows it must stay blank.
 */
#define SETUP_PROVOKING_VERTEX(bit, cmd, mode)                         \
   switch (mode) {                                                     \
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:                     \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     0);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       1);         \
      break;                                                           \
   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:                      \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     1);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       2);         \
      break;                                                           \
   default:                                                            \
      UNREACHABLE("Invalid provoking vertex mode");                    \
   }                                                                   \

/* Variant of SETUP_PROVOKING_VERTEX that also programs
 * TriangleStripOddProvokingVertexSelect and uses a different set of values
 * for the other three selects. In this file it is expanded when the fragment
 * shader has vertex_attributes_bypass set (GFX_VERx10 >= 200 path of
 * update_provoking_vertex()).
 *
 * Any mode other than FIRST_VERTEX/LAST_VERTEX hits UNREACHABLE().
 *
 * Note: the last line of the definition ends with a line continuation, so
 * the blank line following it is still part of the macro.
 */
#define SETUP_PROVOKING_VERTEX_FSB(bit, cmd, mode)                  \
   switch (mode) {                                                  \
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:                  \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0);      \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     0);      \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       1);      \
      SET(bit, cmd.TriangleStripOddProvokingVertexSelect,  0);      \
      break;                                                        \
   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:                   \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0);      \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     0);      \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       0);      \
      SET(bit, cmd.TriangleStripOddProvokingVertexSelect,  1);      \
      break;                                                        \
   default:                                                         \
      UNREACHABLE("Invalid provoking vertex mode");                 \
   }                                                                \

ALWAYS_INLINE static void
update_urb_config(struct anv_gfx_dynamic_state *hw_state,
                  const struct anv_cmd_graphics_state *gfx,
                  const struct anv_device *device)
{
   struct intel_urb_config new_cfg = { 0 };

#if GFX_VERx10 >= 125
   if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
      const struct brw_task_prog_data *task_prog_data =
         get_gfx_task_prog_data(gfx);
      const struct brw_mesh_prog_data *mesh_prog_data =
         get_gfx_mesh_prog_data(gfx);
      intel_get_mesh_urb_config(device->info, device->l3_config,
                                task_prog_data ? task_prog_data->map.size_dw : 0,
                                mesh_prog_data->map.size / 4, &new_cfg);
   } else
#endif
   {
      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
         const struct brw_vue_prog_data *prog_data = anv_gfx_has_stage(gfx, i) ?
            (const struct brw_vue_prog_data *) gfx->shaders[i]->prog_data :
            NULL;

         new_cfg.size[i] = prog_data ? prog_data->urb_entry_size : 1;
      }

      UNUSED bool constrained;
      intel_get_urb_config(device->info, device->l3_config,
                           anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL),
                           anv_gfx_has_stage(gfx, MESA_SHADER_GEOMETRY),
                           &new_cfg, &constrained);
   }

#if GFX_VER >= 12
   SET(SF, sf.DerefBlockSize, new_cfg.deref_block_size);
#endif

   for (int s = 0; s <= MESA_SHADER_MESH; s++) {
      SET(URB, urb_cfg.size[s],    new_cfg.size[s]);
      SET(URB, urb_cfg.start[s],   new_cfg.start[s]);
      SET(URB, urb_cfg.entries[s], new_cfg.entries[s]);
   }
}

ALWAYS_INLINE static void
update_fs_msaa_flags(struct anv_gfx_dynamic_state *hw_state,
                     const struct vk_dynamic_graphics_state *dyn,
                     const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);

   if (!wm_prog_data)
      return;

   /* If we have any dynamic bits here, we might need to update the value
    * in the push constant for the shader.
    */
   if (!brw_wm_prog_data_is_dynamic(wm_prog_data))
      return;

   const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);

   enum intel_msaa_flags fs_msaa_flags =
      intel_fs_msaa_flags((struct intel_fs_params) {
            .shader_sample_shading     = wm_prog_data->sample_shading,
            .shader_min_sample_shading = wm_prog_data->min_sample_shading,
            .state_sample_shading      = wm_prog_data->api_sample_shading,
            .rasterization_samples     = dyn->ms.rasterization_samples,
            .coarse_pixel              = !vk_fragment_shading_rate_is_disabled(&dyn->fsr),
            .alpha_to_coverage         = dyn->ms.alpha_to_coverage_enable,
            .provoking_vertex_last     = dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT,
            .first_vue_slot            = hw_state->first_vue_slot,
            .primitive_id_index        = hw_state->primitive_id_index,
            .per_primitive_remapping   = mesh_prog_data &&
                                         mesh_prog_data->map.wa_18019110168_active,
         });

   SET(FS_MSAA_FLAGS, fs_msaa_flags, fs_msaa_flags);
}

static bool
sbe_primitive_id_override(const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   if (!wm_prog_data)
      return false;

   if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
      const struct brw_mesh_prog_data *mesh_prog_data =
         get_gfx_mesh_prog_data(gfx);
      const struct brw_mue_map *mue = &mesh_prog_data->map;
      return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
              mue->per_primitive_offsets[VARYING_SLOT_PRIMITIVE_ID] == -1;
   }

   const struct intel_vue_map *vue_map = get_gfx_last_vue_map(gfx);

   return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
          (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_ID) == 0;
}

/* Update the 3DSTATE_SBE / 3DSTATE_SBE_SWIZ (and, on Gfx12.5+,
 * 3DSTATE_SBE_MESH) tracked state from the bound fragment shader and the
 * stage feeding it. Does nothing when there is no fragment shader.
 */
ALWAYS_INLINE static void
update_sbe(struct anv_gfx_dynamic_state *hw_state,
           const struct anv_cmd_graphics_state *gfx,
           const struct anv_device *device)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   if (!wm_prog_data)
      return;

   const struct brw_mesh_prog_data *mesh = get_gfx_mesh_prog_data(gfx);
   const struct intel_vue_map *vue_map = get_gfx_last_vue_map(gfx);
   const bool is_mesh = mesh != NULL;

   uint32_t read_offset, read_length, num_varyings, flat_mask;
   brw_compute_sbe_per_vertex_urb_read(
      vue_map, is_mesh,
      is_mesh ? mesh->map.wa_18019110168_active : false,
      wm_prog_data,
      &read_offset, &read_length, &num_varyings,
      &hw_state->primitive_id_index, &flat_mask);

   hw_state->first_vue_slot = read_offset * 2;

   /* From testing, 3DSTATE_SBE & 3DSTATE_SBE_SWIZ appear to have no effect
    * with a Mesh pipeline. The instruction is still filled for now; its
    * emission might be skipped entirely in the future.
    */
   SET(SBE, sbe.AttributeSwizzleEnable, !is_mesh);
   SET(SBE, sbe.PointSpriteTextureCoordinateOrigin, UPPERLEFT);
   SET(SBE, sbe.NumberofSFOutputAttributes, num_varyings);
   SET(SBE, sbe.ConstantInterpolationEnable, flat_mask);
   SET(SBE, sbe.VertexAttributesBypass, wm_prog_data->vertex_attributes_bypass);

   if (!is_mesh) {
      for (uint8_t i = 0; i < wm_prog_data->urb_setup_attribs_count; i++) {
         const gl_varying_slot varying = wm_prog_data->urb_setup_attribs[i];
         const int fs_input = wm_prog_data->urb_setup[varying];

         assert(fs_input >= 0);

         /* Point sprite coordinates are enabled per fragment shader input
          * and have no source attribute to swizzle.
          */
         if (varying == VARYING_SLOT_PNTC) {
            SET(SBE, sbe.PointSpriteTextureCoordinateEnable, 1 << fs_input);
            continue;
         }

         const int vue_slot = vue_map->varying_to_slot[varying];
         if (vue_slot == -1)
            continue;

         /* Two slots are subtracted per unit of read offset to account for
          * the URB entry output read offset in the VS and GS stages.
          */
         const int src = vue_slot - 2 * read_offset;
         assert(src >= 0 && src < 32);

         /* Only the first 16 inputs can be overridden by the hardware. The
          * remaining ones (up to 16 more) must already be laid out so that
          * the input index equals the output index.
          */
         if (fs_input >= 16) {
            assert(src == fs_input);
            continue;
         }

         SET(SBE_SWIZ, sbe_swiz.Attribute[fs_input].SourceAttribute, src);
      }

      SET(SBE, sbe.VertexURBEntryReadOffset, read_offset);
      SET(SBE, sbe.VertexURBEntryReadLength, read_length);
   }

   /* Have the hardware supply PrimitiveID when the fragment shader reads it
    * but no previous stage wrote one.
    */
   const bool override_prim_id = sbe_primitive_id_override(gfx);
   SET(SBE, sbe.PrimitiveIDOverrideAttributeSelect,
       override_prim_id ? wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID] : 0);
   SET(SBE, sbe.PrimitiveIDOverrideComponentX, override_prim_id);
   SET(SBE, sbe.PrimitiveIDOverrideComponentY, override_prim_id);
   SET(SBE, sbe.PrimitiveIDOverrideComponentZ, override_prim_id);
   SET(SBE, sbe.PrimitiveIDOverrideComponentW, override_prim_id);

#if GFX_VERx10 >= 125
   if (is_mesh) {
      SET(SBE_MESH, sbe_mesh.PerVertexURBEntryOutputReadOffset, read_offset);
      SET(SBE_MESH, sbe_mesh.PerVertexURBEntryOutputReadLength, read_length);

      uint32_t prim_offset, prim_length;
      brw_compute_sbe_per_primitive_urb_read(wm_prog_data->per_primitive_inputs,
                                             wm_prog_data->num_per_primitive_inputs,
                                             &mesh->map,
                                             &prim_offset,
                                             &prim_length);

      SET(SBE_MESH, sbe_mesh.PerPrimitiveURBEntryOutputReadOffset, prim_offset);
      SET(SBE_MESH, sbe_mesh.PerPrimitiveURBEntryOutputReadLength, prim_length);
   }
#endif
}

/* Update the tracked 3DSTATE_PS fields (kernel pointers, GRF start registers,
 * dispatch enables and position offset mode) for the bound fragment shader.
 * Without a fragment shader, only the dispatch enables are cleared.
 */
ALWAYS_INLINE static void
update_ps(struct anv_gfx_dynamic_state *hw_state,
          const struct anv_device *device,
          const struct vk_dynamic_graphics_state *dyn,
          const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);

   if (wm_prog_data == NULL) {
      /* No fragment shader: disable every dispatch mode. */
#if GFX_VER >= 20
      SET(PS, ps.Kernel0Enable, false);
      SET(PS, ps.Kernel1Enable, false);
#else
      SET(PS, ps._8PixelDispatchEnable,  false);
      SET(PS, ps._16PixelDispatchEnable, false);
      SET(PS, ps._32PixelDispatchEnable, false);
#endif
      return;
   }

   const struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];

   /* Scratch packet filled by the common helper. Inside SET() the first
    * `ps.` names the hw_state member while the value expression reads this
    * local.
    */
   struct GENX(3DSTATE_PS) ps = {};
   intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
                               MAX2(dyn->ms.rasterization_samples, 1),
                               hw_state->fs_msaa_flags);

   /* Kernel 0 */
   SET(PS, ps.KernelStartPointer0,
       fs->kernel.offset +
       brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
   SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData0,
       brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0));

   /* Kernel 1 */
   SET(PS, ps.KernelStartPointer1,
       fs->kernel.offset +
       brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
   SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData1,
       brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1));

#if GFX_VER >= 20
   SET(PS, ps.Kernel0Enable,            ps.Kernel0Enable);
   SET(PS, ps.Kernel1Enable,            ps.Kernel1Enable);
   SET(PS, ps.Kernel0SIMDWidth,         ps.Kernel0SIMDWidth);
   SET(PS, ps.Kernel1SIMDWidth,         ps.Kernel1SIMDWidth);
   SET(PS, ps.Kernel0PolyPackingPolicy, ps.Kernel0PolyPackingPolicy);
   SET(PS, ps.Kernel0MaximumPolysperThread, ps.Kernel0MaximumPolysperThread);
#else
   /* Kernel 2 only exists before Gfx20. */
   SET(PS, ps.KernelStartPointer2,
       fs->kernel.offset +
       brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
   SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData2,
       brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2));

   SET(PS, ps._8PixelDispatchEnable,  ps._8PixelDispatchEnable);
   SET(PS, ps._16PixelDispatchEnable, ps._16PixelDispatchEnable);
   SET(PS, ps._32PixelDispatchEnable, ps._32PixelDispatchEnable);
#endif

   /* The per-sample query is only made when the shader uses the position
    * offset at all.
    */
   if (!wm_prog_data->uses_pos_offset) {
      SET(PS, ps.PositionXYOffsetSelect, POSOFFSET_NONE);
   } else if (brw_wm_prog_data_is_persample(wm_prog_data,
                                            hw_state->fs_msaa_flags)) {
      SET(PS, ps.PositionXYOffsetSelect, POSOFFSET_SAMPLE);
   } else {
      SET(PS, ps.PositionXYOffsetSelect, POSOFFSET_CENTROID);
   }
}

ALWAYS_INLINE static void
update_ps_extra_wm(struct anv_gfx_dynamic_state *hw_state,
                   const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);

   if (!wm_prog_data)
      return;

   UNUSED const bool uses_coarse_pixel =
      brw_wm_prog_data_is_coarse(wm_prog_data, hw_state->fs_msaa_flags);

   uint32_t InputCoverageMaskState = ICMS_NONE;
   assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
   if (!wm_prog_data->uses_sample_mask)
      InputCoverageMaskState = ICMS_NONE;
   else if (wm_prog_data->post_depth_coverage)
      InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
   else
      InputCoverageMaskState = ICMS_NORMAL;

   SET(PS_EXTRA, ps_extra.InputCoverageMaskState, InputCoverageMaskState);

   SET(PS_EXTRA, ps_extra.PixelShaderIsPerSample,
                 brw_wm_prog_data_is_persample(wm_prog_data,
                                               hw_state->fs_msaa_flags));
#if GFX_VER >= 11
   SET(PS_EXTRA, ps_extra.PixelShaderIsPerCoarsePixel, uses_coarse_pixel);
#endif
#if GFX_VERx10 >= 125
   /* TODO: We should only require this when the last geometry shader uses a
    *       fragment shading rate that is not constant.
    */
   SET(PS_EXTRA, ps_extra.EnablePSDependencyOnCPsizeChange, uses_coarse_pixel);
#endif

   SET(WM, wm.BarycentricInterpolationMode,
           wm_prog_data_barycentric_modes(wm_prog_data, hw_state->fs_msaa_flags));

#if INTEL_WA_18038825448_GFX_VER
   SET(WA_18038825448, coarse_state, uses_coarse_pixel ?
                                     ANV_COARSE_PIXEL_STATE_ENABLED :
                                     ANV_COARSE_PIXEL_STATE_DISABLED);
#endif
}

/* Update 3DSTATE_PS_EXTRA::PixelShaderHasUAV for the bound fragment shader. */
ALWAYS_INLINE static void
update_ps_extra_has_uav(struct anv_gfx_dynamic_state *hw_state,
                        const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);

   /* Force fragment shader execution while occlusion queries are active so
    * that PS_DEPTH_COUNT is correct. Otherwise a fragment shader with
    * discard and no render target setup could increment PS_DEPTH_COUNT if
    * the HW internally decides not to run the shader because it has already
    * established that the depth test is passing.
    */
   bool has_uav = false;
   if (wm_prog_data != NULL) {
      has_uav = wm_prog_data->has_side_effects ||
                gfx->n_occlusion_queries > 0;
   }

   SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV, has_uav, FRAGMENT);
}

/* Update 3DSTATE_PS_EXTRA::PixelShaderKillsPixel: set when a fragment shader
 * is bound and it either uses kill or a depth/stencil feedback loop is
 * detected through its bind map.
 */
ALWAYS_INLINE static void
update_ps_extra_kills_pixel(struct anv_gfx_dynamic_state *hw_state,
                            const struct vk_dynamic_graphics_state *dyn,
                            const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];

   /* The bind map is only looked at once program data is known to exist. */
   bool kills_pixel = false;
   if (wm_prog_data != NULL) {
      kills_pixel = has_ds_feedback_loop(&fs->bind_map, dyn) ||
                    wm_prog_data->uses_kill;
   }

   SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel, kills_pixel, FRAGMENT);
}

#if GFX_VERx10 >= 125
/* Return true if any bound TCS, TES or GS program includes PrimitiveID. */
ALWAYS_INLINE static bool
geom_or_tess_prim_id_used(const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_tcs_prog_data *tcs = get_gfx_tcs_prog_data(gfx);
   if (tcs != NULL && tcs->include_primitive_id)
      return true;

   const struct brw_tes_prog_data *tes = get_gfx_tes_prog_data(gfx);
   if (tes != NULL && tes->include_primitive_id)
      return true;

   const struct brw_gs_prog_data *gs = get_gfx_gs_prog_data(gfx);
   return gs != NULL && gs->include_primitive_id;
}

ALWAYS_INLINE static void
update_vfg_distribution_mode(struct anv_gfx_dynamic_state *hw_state,
                             const struct anv_device *device,
                             const struct anv_cmd_graphics_state *gfx)
{
   const bool needs_instance_granularity =
      intel_needs_workaround(device->info, 14019166699) &&
      (sbe_primitive_id_override(gfx) || geom_or_tess_prim_id_used(gfx));


   SET(VFG, vfg.DistributionMode, (GFX_VER < 20 &&
                                   !anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) ?
                                  RR_FREE : RR_STRICT);
   SET(VFG, vfg.DistributionGranularity, needs_instance_granularity ?
                                         InstanceLevelGranularity :
                                         BatchLevelGranularity);
#if INTEL_WA_14014851047_GFX_VER
   SET(VFG, vfg.GranularityThresholdDisable, intel_needs_workaround(device->info,
                                                                    14014851047));
#endif
}

/* Mirror the dynamic primitive restart enable into
 * 3DSTATE_VFG::ListCutIndexEnable.
 */
ALWAYS_INLINE static void
update_vfg_list_cut_index(struct anv_gfx_dynamic_state *hw_state,
                          const struct vk_dynamic_graphics_state *dyn)
{
   SET(VFG,
       vfg.ListCutIndexEnable,
       dyn->ia.primitive_restart_enable);
}
#endif

/* Update the dynamic 3DSTATE_STREAMOUT fields: rendering disable, render
 * stream selection and, where Wa_18022508906 applies, forced rendering.
 */
ALWAYS_INLINE static void
update_streamout(struct anv_gfx_dynamic_state *hw_state,
                 const struct vk_dynamic_graphics_state *dyn,
                 const struct anv_cmd_graphics_state *gfx)
{
   SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable);
   SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream);

#if INTEL_NEEDS_WA_18022508906
   /* Wa_18022508906 :
    *
    * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
    *
    * SOL_INT::Render_Enable =
    *   (3DSTATE_STREAMOUT::Force_Rendering == Force_On) ||
    *   (
    *     (3DSTATE_STREAMOUT::Force_Rendering != Force_Off) &&
    *     !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
    *     !3DSTATE_STREAMOUT::API_Render_Disable &&
    *     (
    *       3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
    *       3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
    *       3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
    *       3DSTATE_PS_EXTRA::PS_Valid ||
    *       3DSTATE_WM::Legacy Depth_Buffer_Clear ||
    *       3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
    *       3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
    *     )
    *   )
    *
    * When SOL_INT::Render_Enable is false, the SO stage does not forward
    * any topologies down the pipeline, which is not what we want for
    * occlusion queries.
    *
    * So rendering is forced whenever occlusion queries are active and
    * rendering has not been disabled just above.
    */
   const bool force_rendering =
      !GET(so.RenderingDisable) && gfx->n_occlusion_queries > 0;

   SET(STREAMOUT, so.ForceRendering, force_rendering ? Force_on : 0);
#endif
}

/* Program the provoking vertex selection in SF and CLIP, and the matching
 * reorder mode in STREAMOUT and (when a geometry shader is bound) GS.
 */
ALWAYS_INLINE static void
update_provoking_vertex(struct anv_gfx_dynamic_state *hw_state,
                        const struct vk_dynamic_graphics_state *dyn,
                        const struct anv_cmd_graphics_state *gfx)
{
#if GFX_VERx10 >= 200
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);

   /* To respect the table in Vulkan 1.4.312, 28.9. Barycentric
    * Interpolation, the provoking vertex state has to be programmed
    * differently depending on whether vertex_attributes_bypass is set.
    * Only full pipelines are handled at this point, so without a
    * wm_prog_data there is no fragment shader and none of this matters.
    */
   const bool attributes_bypass =
      wm_prog_data && wm_prog_data->vertex_attributes_bypass;

   if (attributes_bypass) {
      SETUP_PROVOKING_VERTEX_FSB(SF, sf, dyn->rs.provoking_vertex);
      SETUP_PROVOKING_VERTEX_FSB(CLIP, clip, dyn->rs.provoking_vertex);
   } else {
      /* Without vertex attributes bypass the macro of older generations
       * can be used. It lacks one bit, but that bit is only used in the
       * case above and ignored otherwise, so we can pretend it doesn't
       * exist here.
       */
      SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
      SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);
   }
#else
   SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
   SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);
#endif

   /* Translate the provoking vertex mode to a reorder mode once, then apply
    * it to both the stream output and geometry shader state.
    */
   uint32_t reorder_mode;
   switch (dyn->rs.provoking_vertex) {
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
      reorder_mode = LEADING;
      break;

   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
      reorder_mode = TRAILING;
      break;

   default:
      UNREACHABLE("Invalid provoking vertex mode");
   }

   SET(STREAMOUT, so.ReorderMode, reorder_mode);
   SET_STAGE(GS, gs.ReorderMode, reorder_mode, GEOMETRY);
}

/* Select the primitive topology for 3DSTATE_VF_TOPOLOGY: a patch list sized
 * by the patch control point count when a tessellation evaluation shader is
 * bound, the translated Vulkan topology otherwise.
 */
ALWAYS_INLINE static void
update_topology(struct anv_gfx_dynamic_state *hw_state,
                const struct vk_dynamic_graphics_state *dyn,
                const struct anv_cmd_graphics_state *gfx)
{
   const bool has_tes = gfx->shaders[MESA_SHADER_TESS_EVAL] != NULL;
   uint32_t prim_type;

   if (has_tes)
      prim_type = _3DPRIM_PATCHLIST(dyn->ts.patch_control_points);
   else
      prim_type = vk_to_intel_primitive_type[dyn->ia.primitive_topology];

   SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, prim_type);
}

#if GFX_VER >= 11
/* Update the coarse pixel shading state from the dynamic fragment shading
 * rate. The fields programmed depend on the hardware generation.
 */
ALWAYS_INLINE static void
update_cps(struct anv_gfx_dynamic_state *hw_state,
           const struct anv_device *device,
           const struct vk_dynamic_graphics_state *dyn)
{
#if GFX_VER == 11
   /* Gfx11: constant mode with the fragment size as minimum CP size. */
   STATIC_ASSERT(GFX_VER == 11);
   SET(CPS, cps.CoarsePixelShadingMode, CPS_MODE_CONSTANT);
   SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width);
   SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height);
#elif GFX_VER < 30
   /* Gfx12 up to (but excluding) Gfx30: point at a CPS state entry. */
   SET(CPS, cps.CoarsePixelShadingStateArrayPointer,
       get_cps_state_offset(device, &dyn->fsr));
#else
   /* Gfx30+: sizes and combiner opcodes are programmed directly. */
   SET(CPS, coarse_pixel.CPSizeX,
       get_cps_size(dyn->fsr.fragment_size.width));
   SET(CPS, coarse_pixel.CPSizeY,
       get_cps_size(dyn->fsr.fragment_size.height));
   SET(CPS, coarse_pixel.CPSizeCombiner0Opcode,
       vk_to_intel_shading_rate_combiner_op[dyn->fsr.combiner_ops[0]]);
   SET(CPS, coarse_pixel.CPSizeCombiner1Opcode,
       vk_to_intel_shading_rate_combiner_op[dyn->fsr.combiner_ops[1]]);
#endif
}
#endif

ALWAYS_INLINE static void
update_ds(struct anv_gfx_dynamic_state *hw_state,
          const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_tes_prog_data *tes_prog_data = get_gfx_tes_prog_data(gfx);

   if (tes_prog_data) {
      struct brw_tess_info tess_info =
         brw_merge_tess_info(get_gfx_tcs_prog_data(gfx)->tess_info,
                             tes_prog_data->tess_info);

      SET(DS, ds.ComputeWCoordinateEnable,
              brw_tess_info_domain(tess_info) == INTEL_TESS_DOMAIN_TRI);
   }
}

ALWAYS_INLINE static void
update_te(struct anv_gfx_dynamic_state *hw_state,
          const struct anv_device *device,
          const struct vk_dynamic_graphics_state *dyn,
          const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_tes_prog_data *tes_prog_data = get_gfx_tes_prog_data(gfx);

   if (tes_prog_data) {
      struct brw_tess_info tess_info =
         brw_merge_tess_info(get_gfx_tcs_prog_data(gfx)->tess_info,
                             tes_prog_data->tess_info);

      SET(TE, te.TEDomain, brw_tess_info_domain(tess_info));
#if GFX_VER >= 12
      SET(TE, te.PatchHeaderLayout,
          tess_info.primitive_mode == TESS_PRIMITIVE_TRIANGLES ?
          REVERSED_TRI_INSIDE_SEPARATE : REVERSED);
#endif
      SET(TE, te.Partitioning, brw_tess_info_partitioning(tess_info));
      if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
         SET(TE, te.OutputTopology, brw_tess_info_output_topology(tess_info));
      } else {
         /* When the origin is upper-left, we have to flip the winding order */
         enum intel_tess_output_topology output_topology =
            brw_tess_info_output_topology(tess_info);
         switch (output_topology) {
         case  OUTPUT_TRI_CCW:
            SET(TE, te.OutputTopology, OUTPUT_TRI_CW);
            break;
         case OUTPUT_TRI_CW:
            SET(TE, te.OutputTopology, OUTPUT_TRI_CCW);
            break;
         default:
            SET(TE, te.OutputTopology, output_topology);
            break;
         }
      }

#if GFX_VERx10 >= 125
      uint32_t distrib_mode =
         intel_needs_workaround(device->info, 22012699309) ?
         TEDMODE_RR_STRICT : TEDMODE_RR_FREE;

      /* Wa_14015055625:
       *
       * Disable Tessellation Distribution when primitive Id is enabled.
       */
      if (intel_needs_workaround(device->info, 14015055625) &&
          (sbe_primitive_id_override(gfx) || geom_or_tess_prim_id_used(gfx)))
         distrib_mode = TEDMODE_OFF;

      /* Debug feature for hang analysis */
      if (!device->physical->instance->enable_te_distribution)
         distrib_mode = TEDMODE_OFF;

      SET(TE, te.TessellationDistributionMode, distrib_mode);
#endif
   } else {
      SET(TE, te.OutputTopology, OUTPUT_POINT);
   }
}

/* Update the tracked 3DSTATE_PRIMITIVE_REPLICATION fields from the number of
 * position slots of the last VUE map and the view mask.
 *
 * The replica mask has one bit per position slot, the replication count is
 * stored as "count - 1" and, when there is at least one slot, RTAIOffset[]
 * gets one entry per bit set in gfx->view_mask.
 */
ALWAYS_INLINE static void
update_primitive_replication(struct anv_gfx_dynamic_state *hw_state,
                             const struct anv_cmd_graphics_state *gfx)
{
   const struct intel_vue_map *vue_map = get_gfx_last_vue_map(gfx);

   /* Without a VUE map there are no position slots to replicate. */
   const uint32_t count = vue_map ? vue_map->num_pos_slots : 0;

   SET(PRIMITIVE_REPLICATION, pr.ReplicaMask, (1u << count) - 1);

   /* `count` is unsigned, so a plain "count - 1" would wrap around to
    * UINT32_MAX when there is no position slot. Clamp at 0 in that case so
    * the stored value stays consistent with the empty replica mask.
    */
   SET(PRIMITIVE_REPLICATION, pr.ReplicationCount,
       count > 0 ? count - 1 : 0);

   if (count) {
      int i = 0;
      u_foreach_bit(view_index, gfx->view_mask) {
         SET(PRIMITIVE_REPLICATION, pr.RTAIOffset[i], view_index);
         i++;
      }
   }
}

/* Mirror the dynamic line width into 3DSTATE_SF::LineWidth. */
ALWAYS_INLINE static void
update_line_width(struct anv_gfx_dynamic_state *hw_state,
                  const struct vk_dynamic_graphics_state *dyn)
{
   SET(SF,
       sf.LineWidth,
       dyn->rs.line.width);
}

ALWAYS_INLINE static void
update_sf_point_width_source(struct anv_gfx_dynamic_state *hw_state,
                             const struct anv_cmd_graphics_state *gfx)
{
   SET(SF, sf.PointWidthSource,
       (get_gfx_last_vue_map(gfx)->slots_valid & VARYING_BIT_PSIZ) ?
       Vertex : State);
}

/* Update 3DSTATE_SF::LegacyGlobalDepthBiasEnable from the dynamic depth bias
 * representation.
 */
ALWAYS_INLINE static void
update_sf_global_depth_bias(struct anv_gfx_dynamic_state *hw_state,
                            const struct vk_dynamic_graphics_state *dyn)
{
   /**
    * From the Vulkan Spec:
    *
    *    "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth bias
    *     representation is a factor of constant r equal to 1."
    *
    * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset:
    *
    *    "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer):
    *
    *     Bias = GlobalDepthOffsetConstant * r + GlobalDepthOffsetScale * MaxDepthSlope
    *
    *     Where r is the minimum representable value > 0 in the depth buffer
    *     format, converted to float32 (note: If state bit Legacy Global Depth
    *     Bias Enable is set, the r term will be forced to 1.0)"
    *
    * So the legacy bit is what gives us r == 1: enable it exactly when the
    * float representation is requested.
    */
   const bool float_representation =
      dyn->rs.depth_bias.representation ==
      VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT;

   SET(SF, sf.LegacyGlobalDepthBiasEnable, float_representation);
}

/* Select the 3DSTATE_CLIP API mode: OGL when the depth clip range is
 * [-1, 1], D3D otherwise.
 */
ALWAYS_INLINE static void
update_clip_api_mode(struct anv_gfx_dynamic_state *hw_state,
                     const struct vk_dynamic_graphics_state *dyn)
{
   if (dyn->vp.depth_clip_negative_one_to_one)
      SET(CLIP, clip.APIMode, APIMODE_OGL);
   else
      SET(CLIP, clip.APIMode, APIMODE_D3D);
}

/* Update 3DSTATE_CLIP::MaximumVPIndex from the dynamic viewport count. */
ALWAYS_INLINE static void
update_clip_max_viewport(struct anv_gfx_dynamic_state *hw_state,
                         const struct vk_dynamic_graphics_state *dyn)
{
   /* From the Vulkan 1.0.45 spec:
    *
    *    "If the last active vertex processing stage shader entry point's
    *     interface does not include a variable decorated with ViewportIndex,
    *     then the first viewport is used."
    *
    * This could mean that MaximumVPIndex has to be derived from the
    * pipeline's last stage. However, if the last shader doesn't write the
    * viewport index and the VUE header is used, the compiler forces the
    * value to 0 (which is what the spec requires above). Otherwise it seems
    * like the HW should be pulling 0 if the VUE header is not present.
    *
    * Not checking the pipeline here seems to prevent additional emissions
    * of 3DSTATE_CLIP which appear to impact performance on Assassin's Creed
    * Valhalla.
    */
   uint32_t max_vp_index = 0;
   if (dyn->vp.viewport_count > 0)
      max_vp_index = dyn->vp.viewport_count - 1;

   SET(CLIP, clip.MaximumVPIndex, max_vp_index);
}

ALWAYS_INLINE static void
update_clip_raster(struct anv_gfx_dynamic_state *hw_state,
                   const struct vk_dynamic_graphics_state *dyn,
                   const struct anv_cmd_graphics_state *gfx)
{
   /* Take dynamic primitive topology in to account with
    *    3DSTATE_RASTER::APIMode
    *    3DSTATE_RASTER::DXMultisampleRasterizationEnable
    *    3DSTATE_RASTER::AntialiasingEnable
    */
   uint32_t api_mode = 0;
   bool msaa_raster_enable = false;

   const VkLineRasterizationModeKHR line_mode =
      anv_line_rasterization_mode(dyn->rs.line.mode,
                                  dyn->ms.rasterization_samples);

   const VkPolygonMode dynamic_raster_mode =
      anv_raster_polygon_mode(gfx,
                              dyn->rs.polygon_mode,
                              dyn->ia.primitive_topology);

   anv_rasterization_mode(dynamic_raster_mode,
                          line_mode, dyn->rs.line.width,
                          &api_mode, &msaa_raster_enable);

   /* From the Browadwell PRM, Volume 2, documentation for 3DSTATE_RASTER,
    * "Antialiasing Enable":
    *
    * "This field must be disabled if any of the render targets have integer
    * (UINT or SINT) surface format."
    *
    * Additionally internal documentation for Gfx12+ states:
    *
    * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
    *  FORCED_SAMPLE_COUNT > 1."
    */
   const bool aa_enable =
      anv_rasterization_aa_mode(dynamic_raster_mode, line_mode) &&
      !gfx->has_uint_rt &&
      !(GFX_VER >= 12 && gfx->samples > 1);

   const bool depth_clip_enable =
      vk_rasterization_state_depth_clip_enable(&dyn->rs);

   const bool xy_clip_test_enable =
      (dynamic_raster_mode == VK_POLYGON_MODE_FILL);

   SET(CLIP, clip.ViewportXYClipTestEnable, xy_clip_test_enable);

   SET(RASTER, raster.APIMode, api_mode);
   SET(RASTER, raster.DXMultisampleRasterizationEnable, msaa_raster_enable);
   SET(RASTER, raster.AntialiasingEnable, aa_enable);
   SET(RASTER, raster.CullMode, vk_to_intel_cullmode[dyn->rs.cull_mode]);
   SET(RASTER, raster.FrontWinding, vk_to_intel_front_face[dyn->rs.front_face]);
   SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable);
   SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable);
   SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable);
   SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant_factor);
   SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope_factor);
   SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp);
   SET(RASTER, raster.FrontFaceFillMode, vk_to_intel_fillmode[dyn->rs.polygon_mode]);
   SET(RASTER, raster.BackFaceFillMode, vk_to_intel_fillmode[dyn->rs.polygon_mode]);
   SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable);
   SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable);
   SET(RASTER, raster.ConservativeRasterizationEnable,
               dyn->rs.conservative_mode !=
               VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);

#if GFX_VERx10 >= 200
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   SET(RASTER, raster.LegacyBaryAssignmentDisable,
       wm_prog_data && wm_prog_data->vertex_attributes_bypass);
#endif
}

/**
 * Update the CLIP state that depends on the bound pre-rasterization shaders.
 *
 * ForceZeroRTAIndexEnable is set whenever the layer varying is not written:
 * with a mesh shader bound this is decided from the per-primitive offset of
 * VARYING_SLOT_LAYER in the mesh map, otherwise from the valid slots of the
 * VUE map returned by get_gfx_last_vue_map().
 */
ALWAYS_INLINE static void
update_clip_preraster_stages(struct anv_gfx_dynamic_state *hw_state,
                             const struct anv_cmd_graphics_state *gfx)
{
   bool writes_layer;

   if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
      /* A negative offset means the mesh shader has no layer output. */
      writes_layer =
         get_gfx_mesh_prog_data(gfx)->map.per_primitive_offsets[VARYING_SLOT_LAYER] >= 0;
   } else {
      writes_layer =
         (get_gfx_last_vue_map(gfx)->slots_valid & VARYING_BIT_LAYER) != 0;
   }

   SET(CLIP, clip.ForceZeroRTAIndexEnable, !writes_layer);
}

/**
 * Update CLIP's NonPerspectiveBarycentricEnable from the fragment shader
 * program data. Without fragment shader program data the field is cleared.
 */
ALWAYS_INLINE static void
update_clip_non_perspective_barycentrics(struct anv_gfx_dynamic_state *hw_state,
                                         const struct anv_cmd_graphics_state *gfx)
{
   const struct brw_wm_prog_data *fs_prog_data = get_gfx_wm_prog_data(gfx);

   if (!fs_prog_data) {
      SET(CLIP, clip.NonPerspectiveBarycentricEnable, 0);
      return;
   }

   SET(CLIP, clip.NonPerspectiveBarycentricEnable,
       fs_prog_data->uses_nonperspective_interp_modes);
}

/**
 * Update MULTISAMPLE's NumberofMultisamples from the dynamic rasterization
 * sample count. The field receives the bit index of the sample count
 * (its log2 for a power-of-two count); a count of 0 is treated as 1.
 */
ALWAYS_INLINE static void
update_multisample(struct anv_gfx_dynamic_state *hw_state,
                   const struct vk_dynamic_graphics_state *dyn)
{
   /* Never hand 0 to ffs(): it would yield 0 and thus -1 below. */
   const int sample_count = MAX2(dyn->ms.rasterization_samples, 1);
   const int sample_count_log2 = __builtin_ffs(sample_count) - 1;

   SET(MULTISAMPLE, ms.NumberofMultisamples, sample_count_log2);
}

/**
 * Update SAMPLE_MASK from the dynamic multisample state.
 */
ALWAYS_INLINE static void
update_sample_mask(struct anv_gfx_dynamic_state *hw_state,
                   const struct vk_dynamic_graphics_state *dyn)
{
   /* From the Vulkan 1.0 spec:
    *    If pSampleMask is NULL, it is treated as if the mask has all bits
    *    enabled, i.e. no coverage is removed from fragments.
    *
    * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits, so only the low 16 bits of
    * the API mask are kept.
    */
   const uint32_t hw_sample_mask = dyn->ms.sample_mask & 0xffff;

   SET(SAMPLE_MASK, sm.SampleMask, hw_sample_mask);
}

ALWAYS_INLINE static void
update_wm_depth_stencil(struct anv_gfx_dynamic_state *hw_state,
                        const struct vk_dynamic_graphics_state *dyn,
                        const struct anv_cmd_graphics_state *gfx,
                        const struct anv_device *device)
{
   VkImageAspectFlags ds_aspects = 0;
   if (gfx->depth_att.vk_format != VK_FORMAT_UNDEFINED)
      ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
   if (gfx->stencil_att.vk_format != VK_FORMAT_UNDEFINED)
      ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;

   struct vk_depth_stencil_state opt_ds = dyn->ds;
   vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true);

   SET(WM_DEPTH_STENCIL, wm_ds.DoubleSidedStencilEnable, true);

   SET(WM_DEPTH_STENCIL, wm_ds.StencilTestMask,
       opt_ds.stencil.front.compare_mask & 0xff);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilWriteMask,
       opt_ds.stencil.front.write_mask & 0xff);

   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff);

   SET(WM_DEPTH_STENCIL, wm_ds.StencilReferenceValue,
       opt_ds.stencil.front.reference & 0xff);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilReferenceValue,
       opt_ds.stencil.back.reference & 0xff);

   SET(WM_DEPTH_STENCIL, wm_ds.DepthTestEnable, opt_ds.depth.test_enable);
   SET(WM_DEPTH_STENCIL, wm_ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
   SET(WM_DEPTH_STENCIL, wm_ds.DepthTestFunction,
                         vk_to_intel_compare_op[opt_ds.depth.compare_op]);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilTestEnable, opt_ds.stencil.test_enable);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilBufferWriteEnable,
                         opt_ds.stencil.write_enable);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilFailOp,
                         vk_to_intel_stencil_op[opt_ds.stencil.front.op.fail]);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilPassDepthPassOp,
                         vk_to_intel_stencil_op[opt_ds.stencil.front.op.pass]);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilPassDepthFailOp,
                         vk_to_intel_stencil_op[
                            opt_ds.stencil.front.op.depth_fail]);
   SET(WM_DEPTH_STENCIL, wm_ds.StencilTestFunction,
                         vk_to_intel_compare_op[
                            opt_ds.stencil.front.op.compare]);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilFailOp,
                         vk_to_intel_stencil_op[
                            opt_ds.stencil.back.op.fail]);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilPassDepthPassOp,
                         vk_to_intel_stencil_op[
                            opt_ds.stencil.back.op.pass]);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilPassDepthFailOp,
                         vk_to_intel_stencil_op[
                            opt_ds.stencil.back.op.depth_fail]);
   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilTestFunction,
                         vk_to_intel_compare_op[
                            opt_ds.stencil.back.op.compare]);

#if GFX_VER == 9
   const bool pma = want_stencil_pma_fix(dyn, gfx, &opt_ds);
   SET(PMA_FIX, pma_fix, pma);
#endif

#if INTEL_WA_18019816803_GFX_VER
   if (intel_needs_workaround(device->info, 18019816803)) {
      bool ds_write_state = opt_ds.depth.write_enable || opt_ds.stencil.write_enable;
      SET(WA_18019816803, ds_write_state, ds_write_state);
   }
#endif
}

/**
 * Update the DEPTH_BOUNDS state from the dynamic depth bounds test state.
 *
 * The min/max values are only refreshed while the test is enabled, so
 * changing bounds with the test disabled does not dirty the HW state.
 */
ALWAYS_INLINE static void
update_depth_bounds(struct anv_gfx_dynamic_state *hw_state,
                    const struct vk_dynamic_graphics_state *dyn)
{
   SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, dyn->ds.depth.bounds_test.enable);

   /* Nothing more to program when the test is off. */
   if (!dyn->ds.depth.bounds_test.enable)
      return;

   SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min);
   SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max);
}

/**
 * Update the LINE_STIPPLE state (pattern, repeat count and its reciprocal)
 * and the WM line stipple enable from the dynamic rasterization state.
 */
ALWAYS_INLINE static void
update_line_stipple(struct anv_gfx_dynamic_state *hw_state,
                    const struct vk_dynamic_graphics_state *dyn)
{
   /* The factor is clamped to at least 1 so the reciprocal never divides by
    * zero; the repeat count itself is programmed unclamped.
    */
   const float inverse_repeat_count =
      1.0f / MAX2(1, dyn->rs.line.stipple.factor);

   SET(WM,           wm.LineStippleEnable, dyn->rs.line.stipple.enable);

   SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor);
   SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount, inverse_repeat_count);
   SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern);
}

/**
 * Update the VF primitive restart state: the cut index is derived from the
 * currently bound index type and the enable comes from the dynamic input
 * assembly state.
 */
ALWAYS_INLINE static void
update_vf_restart(struct anv_gfx_dynamic_state *hw_state,
                  const struct vk_dynamic_graphics_state *dyn,
                  const struct anv_cmd_graphics_state *gfx)
{
   const uint32_t restart_index = vk_index_to_restart(gfx->index_type);

   SET(VF, vf.CutIndex, restart_index);
   SET(VF, vf.IndexedDrawCutIndexEnable, dyn->ia.primitive_restart_enable);
}

/**
 * Update the BLEND_STATE (global fields plus one entry per render target)
 * and the PS_BLEND state from the dynamic color blend / multisample state.
 *
 * Render targets whose color output mapping points past the bound color
 * attachments only get blending disabled. For the others, write masks,
 * logic op, clamping, blend functions and blend factors are programmed from
 * the mapped attachment's dynamic state.
 *
 * Besides hw_state, this also stores into gfx->color_blend_zero and
 * gfx->alpha_blend_zero whether the Wa_14018912822 substitution (ZERO
 * destination factor replaced by a CONST factor) was applied to any render
 * target; update_blend_constants() reads those flags.
 *
 * has_fs_stage and has_fs_dual_src are supplied by the caller; when a
 * fragment stage is present without dual source output, blend equations and
 * factors referencing SRC1 are neutralized.
 */
ALWAYS_INLINE static void
update_blend_state(struct anv_gfx_dynamic_state *hw_state,
                   const struct vk_dynamic_graphics_state *dyn,
                   struct anv_cmd_graphics_state *gfx,
                   const struct anv_device *device,
                   bool has_fs_stage,
                   bool has_fs_dual_src)
{
   const struct anv_instance *instance = device->physical->instance;
   const uint8_t color_writes = dyn->cb.color_write_enables;
   bool has_writeable_rt =
      has_fs_stage &&
      !anv_gfx_all_color_write_masked(gfx, dyn);

   SET(BLEND_STATE, blend.AlphaToCoverageEnable,
                    dyn->ms.alpha_to_coverage_enable);
   SET(BLEND_STATE, blend.AlphaToOneEnable,
                    dyn->ms.alpha_to_one_enable);
   SET(BLEND_STATE, blend.ColorDitherEnable,
                    gfx->rendering_flags &
                    VK_RENDERING_ENABLE_LEGACY_DITHERING_BIT_EXT);

   bool independent_alpha_blend = false;
   /* Wa_14018912822, check if we set these during RT setup. */
   bool color_blend_zero = false;
   bool alpha_blend_zero = false;
   /* Index of the BLEND_STATE entry mirrored into PS_BLEND below. */
   uint32_t rt_0 = MESA_VK_ATTACHMENT_UNUSED;
   for (uint32_t rt = 0; rt < MAX_RTS; rt++) {
      if (gfx->color_output_mapping[rt] >= gfx->color_att_count) {
         /* The Dual Source Blending documentation says:
          *
          * "If SRC1 is included in a src/dst blend factor and a DualSource RT
          * Write message is not used, results are UNDEFINED."
          *
          * In practice, this results in hangs if we leave the Dual Source
          * Blending enabled for the unused render targets. The easiest way to
          * avoid it altogether is to completely disable the blending for them.
          */
         SET(BLEND_STATE, blend.rts[rt].ColorBufferBlendEnable, false);
         continue;
      }

      uint32_t att = gfx->color_output_mapping[rt];
      /* NOTE(review): att is 0 in this branch, so this always stores 0 and
       * rt_0 ends up 0 whether or not any RT maps to attachment 0 (see the
       * fallback after the loop). Confirm whether `rt` was intended here.
       */
      if (att == 0)
         rt_0 = att;

      /* Writes to this attachment are disabled when its bit is cleared in
       * the dynamic color write enables.
       */
      bool write_disabled = (color_writes & BITFIELD_BIT(att)) == 0;

      SET(BLEND_STATE, blend.rts[rt].WriteDisableAlpha,
                       write_disabled ||
                       (dyn->cb.attachments[att].write_mask &
                        VK_COLOR_COMPONENT_A_BIT) == 0);
      SET(BLEND_STATE, blend.rts[rt].WriteDisableRed,
                       write_disabled ||
                       (dyn->cb.attachments[att].write_mask &
                        VK_COLOR_COMPONENT_R_BIT) == 0);
      SET(BLEND_STATE, blend.rts[rt].WriteDisableGreen,
                       write_disabled ||
                       (dyn->cb.attachments[att].write_mask &
                        VK_COLOR_COMPONENT_G_BIT) == 0);
      SET(BLEND_STATE, blend.rts[rt].WriteDisableBlue,
                       write_disabled ||
                       (dyn->cb.attachments[att].write_mask &
                        VK_COLOR_COMPONENT_B_BIT) == 0);
      /* Vulkan specification 1.2.168, VkLogicOp:
       *
       *   "Logical operations are controlled by the logicOpEnable and logicOp
       *   members of VkPipelineColorBlendStateCreateInfo. If logicOpEnable is
       *   VK_TRUE, then a logical operation selected by logicOp is applied
       *   between each color attachment and the fragment’s corresponding
       *   output value, and blending of all attachments is treated as if it
       *   were disabled."
       *
       * From the Broadwell PRM Volume 2d: Command Reference: Structures:
       * BLEND_STATE_ENTRY:
       *
       *   "Enabling LogicOp and Color Buffer Blending at the same time is
       *   UNDEFINED"
       *
       * The Vulkan spec also says:
       *   "Logical operations are not applied to floating-point or sRGB format
       *   color attachments."
       * and
       *   "Any attachments using color formats for which logical operations
       *   are not supported simply pass through the color values unmodified."
       */
      bool ignores_logic_op =
         vk_format_is_float(gfx->color_att[att].vk_format) ||
         vk_format_is_srgb(gfx->color_att[att].vk_format);
      SET(BLEND_STATE, blend.rts[rt].LogicOpFunction,
                       vk_to_intel_logic_op[dyn->cb.logic_op]);
      SET(BLEND_STATE, blend.rts[rt].LogicOpEnable,
                       dyn->cb.logic_op_enable && !ignores_logic_op);

      SET(BLEND_STATE, blend.rts[rt].ColorClampRange, COLORCLAMP_RTFORMAT);
      SET(BLEND_STATE, blend.rts[rt].PreBlendColorClampEnable, true);
      SET(BLEND_STATE, blend.rts[rt].PostBlendColorClampEnable, true);

#if GFX_VER >= 30
      SET(BLEND_STATE, blend.rts[rt].SimpleFloatBlendEnable, true);
#endif

      /* Setup blend equation. */
      SET(BLEND_STATE, blend.rts[rt].ColorBlendFunction,
                       vk_to_intel_blend_op[
                          dyn->cb.attachments[att].color_blend_op]);
      SET(BLEND_STATE, blend.rts[rt].AlphaBlendFunction,
                       vk_to_intel_blend_op[
                          dyn->cb.attachments[att].alpha_blend_op]);

      /* Any attachment whose alpha equation differs from its color equation
       * turns on independent alpha blending for the whole BLEND_STATE.
       */
      if (dyn->cb.attachments[att].src_color_blend_factor !=
          dyn->cb.attachments[att].src_alpha_blend_factor ||
          dyn->cb.attachments[att].dst_color_blend_factor !=
          dyn->cb.attachments[att].dst_alpha_blend_factor ||
          dyn->cb.attachments[att].color_blend_op !=
          dyn->cb.attachments[att].alpha_blend_op)
         independent_alpha_blend = true;

      /* The Dual Source Blending documentation says:
       *
       * "If SRC1 is included in a src/dst blend factor and a DualSource RT
       * Write message is not used, results are UNDEFINED. (This reflects the
       * same restriction in DX APIs, where undefined results are produced if
       * “o1” is not written by a PS – there are no default values defined)."
       *
       * There is no way to gracefully fix this undefined situation so we just
       * disable the blending to prevent possible issues.
       */
      if (has_fs_stage && !has_fs_dual_src &&
          anv_is_dual_src_blend_equation(&dyn->cb.attachments[att])) {
         SET(BLEND_STATE, blend.rts[rt].ColorBufferBlendEnable, false);
      } else {
         SET(BLEND_STATE, blend.rts[rt].ColorBufferBlendEnable,
                          !dyn->cb.logic_op_enable &&
                          dyn->cb.attachments[att].blend_enable);
      }

      /* Our hardware applies the blend factor prior to the blend function
       * regardless of what function is used. Technically, this means the
       * hardware can do MORE than GL or Vulkan specify. However, it also
       * means that, for MIN and MAX, we have to stomp the blend factor to ONE
       * to make it a no-op.
       */
      uint32_t SourceBlendFactor;
      uint32_t DestinationBlendFactor;
      uint32_t SourceAlphaBlendFactor;
      uint32_t DestinationAlphaBlendFactor;
      if (dyn->cb.attachments[att].color_blend_op == VK_BLEND_OP_MIN ||
          dyn->cb.attachments[att].color_blend_op == VK_BLEND_OP_MAX) {
         SourceBlendFactor = BLENDFACTOR_ONE;
         DestinationBlendFactor = BLENDFACTOR_ONE;
      } else {
         SourceBlendFactor = vk_to_intel_blend[
            dyn->cb.attachments[att].src_color_blend_factor];
         DestinationBlendFactor = vk_to_intel_blend[
            dyn->cb.attachments[att].dst_color_blend_factor];
      }

      if (dyn->cb.attachments[att].alpha_blend_op == VK_BLEND_OP_MIN ||
          dyn->cb.attachments[att].alpha_blend_op == VK_BLEND_OP_MAX) {
         SourceAlphaBlendFactor = BLENDFACTOR_ONE;
         DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
      } else {
         SourceAlphaBlendFactor = vk_to_intel_blend[
            dyn->cb.attachments[att].src_alpha_blend_factor];
         DestinationAlphaBlendFactor = vk_to_intel_blend[
            dyn->cb.attachments[att].dst_alpha_blend_factor];
      }

      /* Replace any Src1 color factor by 1.0 if dual source blending is not
       * enabled. Only the color factors are rewritten here; the alpha
       * factors are left as computed above.
       */
      if (has_fs_stage && !has_fs_dual_src) {
         if (is_src1_blend_factor(SourceBlendFactor))
            SourceBlendFactor = BLENDFACTOR_ONE;
         if (is_src1_blend_factor(DestinationBlendFactor))
            DestinationBlendFactor = BLENDFACTOR_ONE;
      }

      /* Wa_14018912822: when multisampling, a ZERO destination factor is
       * replaced by the matching CONST factor. The flags recorded here make
       * update_blend_constants() program the constant as 0.0.
       */
      if (instance->intel_enable_wa_14018912822 &&
          intel_needs_workaround(device->info, 14018912822) &&
          dyn->ms.rasterization_samples > 1) {
         if (DestinationBlendFactor == BLENDFACTOR_ZERO) {
            DestinationBlendFactor = BLENDFACTOR_CONST_COLOR;
            color_blend_zero = true;
         }
         if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) {
            DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA;
            alpha_blend_zero = true;
         }
      }

      SET(BLEND_STATE, blend.rts[rt].SourceBlendFactor, SourceBlendFactor);
      SET(BLEND_STATE, blend.rts[rt].DestinationBlendFactor, DestinationBlendFactor);
      SET(BLEND_STATE, blend.rts[rt].SourceAlphaBlendFactor, SourceAlphaBlendFactor);
      SET(BLEND_STATE, blend.rts[rt].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor);
   }
   gfx->color_blend_zero = color_blend_zero;
   gfx->alpha_blend_zero = alpha_blend_zero;

   SET(BLEND_STATE, blend.IndependentAlphaBlendEnable, independent_alpha_blend);

   /* Fall back to entry 0 when no RT was selected in the loop above. */
   if (rt_0 == MESA_VK_ATTACHMENT_UNUSED)
      rt_0 = 0;

   /* 3DSTATE_PS_BLEND to be consistent with the rest of the
    * BLEND_STATE_ENTRY.
    */
   SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt);
   SET(PS_BLEND, ps_blend.ColorBufferBlendEnable,
                 GET(blend.rts[rt_0].ColorBufferBlendEnable));
   SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor,
                 GET(blend.rts[rt_0].SourceAlphaBlendFactor));
   SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor,
                 gfx->alpha_blend_zero ?
                 BLENDFACTOR_CONST_ALPHA :
                 GET(blend.rts[rt_0].DestinationAlphaBlendFactor));
   SET(PS_BLEND, ps_blend.SourceBlendFactor,
                 GET(blend.rts[rt_0].SourceBlendFactor));
   SET(PS_BLEND, ps_blend.DestinationBlendFactor,
                 gfx->color_blend_zero ?
                 BLENDFACTOR_CONST_COLOR :
                 GET(blend.rts[rt_0].DestinationBlendFactor));
   SET(PS_BLEND, ps_blend.AlphaTestEnable, false);
   SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable,
                 GET(blend.IndependentAlphaBlendEnable));
   SET(PS_BLEND, ps_blend.AlphaToCoverageEnable,
                 dyn->ms.alpha_to_coverage_enable);
}

/**
 * Update the CC_STATE blend constant color from the dynamic blend constants.
 *
 * When gfx->color_blend_zero is set the RGB constants are programmed as 0.0
 * instead of the API values, and likewise the alpha constant when
 * gfx->alpha_blend_zero is set.
 */
ALWAYS_INLINE static void
update_blend_constants(struct anv_gfx_dynamic_state *hw_state,
                       const struct vk_dynamic_graphics_state *dyn,
                       const struct anv_cmd_graphics_state *gfx)
{
   const bool zero_rgb = gfx->color_blend_zero;
   const bool zero_alpha = gfx->alpha_blend_zero;
   const float *api_constants = dyn->cb.blend_constants;

   const float red   = zero_rgb ? 0.0f : api_constants[0];
   const float green = zero_rgb ? 0.0f : api_constants[1];
   const float blue  = zero_rgb ? 0.0f : api_constants[2];
   const float alpha = zero_alpha ? 0.0f : api_constants[3];

   SET(CC_STATE, cc.BlendConstantColorRed, red);
   SET(CC_STATE, cc.BlendConstantColorGreen, green);
   SET(CC_STATE, cc.BlendConstantColorBlue, blue);
   SET(CC_STATE, cc.BlendConstantColorAlpha, alpha);
}

/**
 * Update the SF_CLIP viewport entries (transform matrix, guardband and
 * viewport extents) and the CC viewport entries (depth clamp range) for
 * every dynamic viewport.
 *
 * The guardband is computed from the intersection of the maximum
 * framebuffer size, the render area (when non-empty) and the scissor with
 * the same index (when there is one). The programmed viewport counts are
 * only grown, or refreshed when the state is already dirty.
 */
ALWAYS_INLINE static void
update_viewports(struct anv_gfx_dynamic_state *hw_state,
                 const struct vk_dynamic_graphics_state *dyn,
                 const struct anv_cmd_graphics_state *gfx,
                 const struct anv_device *device)
{
   const struct anv_instance *instance = device->physical->instance;
   const VkViewport *viewports = dyn->vp.viewports;

   /* With a [-1, 1] clip-space depth range the depth scale is halved. */
   const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f;

      for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
         const VkViewport *vp = &viewports[i];

         /* The gfx7 state struct has just the matrix and guardband fields, the
          * gfx8 struct adds the min/max viewport fields. */
         struct GENX(SF_CLIP_VIEWPORT) sfv = {
            .ViewportMatrixElementm00 = vp->width / 2,
            .ViewportMatrixElementm11 = vp->height / 2,
            .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
            .ViewportMatrixElementm30 = vp->x + vp->width / 2,
            .ViewportMatrixElementm31 = vp->y + vp->height / 2,
            .ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
               (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
            .XMinClipGuardband = -1.0f,
            .XMaxClipGuardband = 1.0f,
            .YMinClipGuardband = -1.0f,
            .YMaxClipGuardband = 1.0f,
            .XMinViewPort = vp->x,
            .XMaxViewPort = vp->x + vp->width - 1,
            /* MIN2/MAX2 handle viewports with a negative height. */
            .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
            .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
         };

         /* Fix depth test misrenderings by lowering translated depth range */
         if (instance->lower_depth_range_rate != 1.0f)
            sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;

         /* Start from the largest possible area and narrow it down below. */
         const uint32_t fb_size_max = 1 << 14;
         uint32_t x_min = 0, x_max = fb_size_max;
         uint32_t y_min = 0, y_max = fb_size_max;

         /* If we have a valid renderArea, include that */
         if (gfx->render_area.extent.width > 0 &&
             gfx->render_area.extent.height > 0) {
            x_min = MAX2(x_min, gfx->render_area.offset.x);
            x_max = MIN2(x_max, gfx->render_area.offset.x +
                                gfx->render_area.extent.width);
            y_min = MAX2(y_min, gfx->render_area.offset.y);
            y_max = MIN2(y_max, gfx->render_area.offset.y +
                                gfx->render_area.extent.height);
         }

         /* The client is required to have enough scissors for whatever it
          * sets as ViewportIndex but it's possible that they've got more
          * viewports set from a previous command. Also, from the Vulkan
          * 1.3.207 spec:
          *
          *    "The application must ensure (using scissor if necessary) that
          *    all rendering is contained within the render area."
          *
          * If the client doesn't set a scissor, that basically means it
          * guarantees everything is in-bounds already. If we end up using a
          * guardband of [-1, 1] in that case, there shouldn't be much loss.
          * It's theoretically possible that they could do all their clipping
          * with clip planes but that'd be a bit odd.
          */
         if (i < dyn->vp.scissor_count) {
            const VkRect2D *scissor = &dyn->vp.scissors[i];
            x_min = MAX2(x_min, scissor->offset.x);
            x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
            y_min = MAX2(y_min, scissor->offset.y);
            y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
         }

         /* Only bother calculating the guardband if our known render area is
          * less than the maximum size. Otherwise, it will calculate [-1, 1]
          * anyway but possibly with precision loss.
          */
         if (x_min > 0 || x_max < fb_size_max ||
             y_min > 0 || y_max < fb_size_max) {
            intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
                                           sfv.ViewportMatrixElementm00,
                                           sfv.ViewportMatrixElementm11,
                                           sfv.ViewportMatrixElementm30,
                                           sfv.ViewportMatrixElementm31,
                                           &sfv.XMinClipGuardband,
                                           &sfv.XMaxClipGuardband,
                                           &sfv.YMinClipGuardband,
                                           &sfv.YMaxClipGuardband);
         }

         /* Copy one field of sfv into hw_state, flagging the given state
          * bit dirty only when the stored value actually changes.
          */
#define SET_VP(bit, state, field)                                        \
         do {                                                           \
            if (hw_state->state.field != sfv.field) {                   \
               hw_state->state.field = sfv.field;                       \
               BITSET_SET(hw_state->pack_dirty,                         \
                          ANV_GFX_STATE_##bit);                         \
            }                                                           \
         } while (0)
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort);
#undef SET_VP

         const bool depth_range_unrestricted =
            device->vk.enabled_extensions.EXT_depth_range_unrestricted;

         /* Limits used when depth clamping is disabled. */
         float min_depth_limit = depth_range_unrestricted ? -FLT_MAX : 0.0f;
         float max_depth_limit = depth_range_unrestricted ? FLT_MAX : 1.0f;

         /* With depth clamping enabled, clamp to the viewport depth range
          * (ordered, since minDepth may be greater than maxDepth).
          */
         float min_depth = dyn->rs.depth_clamp_enable ?
                           MIN2(vp->minDepth, vp->maxDepth) : min_depth_limit;
         float max_depth = dyn->rs.depth_clamp_enable ?
                           MAX2(vp->minDepth, vp->maxDepth) : max_depth_limit;

         /* A user defined clamp range overrides the viewport depth range. */
         if (dyn->rs.depth_clamp_enable &&
            dyn->vp.depth_clamp_mode == VK_DEPTH_CLAMP_MODE_USER_DEFINED_RANGE_EXT) {
            min_depth = dyn->vp.depth_clamp_range.minDepthClamp;
            max_depth = dyn->vp.depth_clamp_range.maxDepthClamp;
         }

         SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth);
         SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth);
      }

      /* If the HW state is already considered dirty or the previous
       * programmed viewport count is smaller than what we need, update the
       * viewport count and ensure the HW state is dirty. Otherwise if the
       * number of viewport programmed previously was larger than what we need
       * now, no need to reemit we can just keep the old programmed values.
       */
      if (BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
          hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
         hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
      }
      if (BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
          hw_state->vp_cc.count < dyn->vp.viewport_count) {
         hw_state->vp_cc.count = dyn->vp.viewport_count;
         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_VIEWPORT_CC);
      }
}

/**
 * Update the SCISSOR rectangles from the dynamic scissors.
 *
 * Each rectangle is the intersection of the API scissor with the viewport
 * of the same index; for primary command buffers it is additionally clamped
 * against the render area. Scissors with a zero width or height are
 * programmed as a canonical empty rectangle. The programmed scissor count
 * is only grown, or refreshed when the state is already dirty.
 */
ALWAYS_INLINE static void
update_scissors(struct anv_gfx_dynamic_state *hw_state,
                const struct vk_dynamic_graphics_state *dyn,
                const struct anv_cmd_graphics_state *gfx,
                VkCommandBufferLevel cmd_buffer_level)
{
   const VkRect2D *scissors = dyn->vp.scissors;
   const VkViewport *viewports = dyn->vp.viewports;

   for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
      const VkRect2D *s = &scissors[i];
      /* NOTE(review): viewports[i] is read for every scissor index, even
       * when i is not below dyn->vp.viewport_count — confirm that entry is
       * always meaningful.
       */
      const VkViewport *vp = &viewports[i];

      const int max = 0xffff;

      /* Intersect the scissor with the viewport; MIN2/MAX2 on the Y axis
       * handle viewports with a negative height. The max values are
       * inclusive, hence the "- 1".
       */
      uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
      uint32_t x_min = MAX2(s->offset.x, vp->x);
      int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
                           MAX2(vp->y, vp->y + vp->height) - 1);
      int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
                           vp->x + vp->width - 1);

      y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
      x_max = CLAMP(x_max, 0, INT16_MAX >> 1);

      /* Do this math in 64 bits so overflow gets clamped correctly.
       * NOTE(review): the original comment said int64_t but the operands
       * are cast to uint64_t — confirm which is intended.
       */
      if (cmd_buffer_level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
         y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
         x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
         y_max = CLAMP((uint64_t) y_max, 0,
                       gfx->render_area.offset.y +
                       gfx->render_area.extent.height - 1);
         x_max = CLAMP((uint64_t) x_max, 0,
                       gfx->render_area.offset.x +
                       gfx->render_area.extent.width - 1);
      }

      if (s->extent.width <= 0 || s->extent.height <= 0) {
         /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
          * ymax < ymin for empty clips. In case clip x, y, width height are
          * all 0, the clamps above produce 0 for xmin, ymin, xmax, ymax,
          * which isn't what we want. Just special case empty clips and
          * produce a canonical empty clip.
          */
         SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
         SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
         SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
         SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
      } else {
         SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
         SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
         SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
         SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
      }
   }

   /* If the HW state is already considered dirty or the previously
    * programmed scissor count is smaller than what we need, update the
    * scissor count and ensure the HW state is dirty. Otherwise, if the
    * number of scissors programmed previously was larger than what we need
    * now, there is no need to reemit: we can just keep the old programmed
    * values.
    */
   if (BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_SCISSOR) ||
       hw_state->scissor.count < dyn->vp.scissor_count) {
      hw_state->scissor.count = dyn->vp.scissor_count;
      BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_SCISSOR);
   }
}

#if GFX_VERx10 == 125
/**
 * Decide whether TBIMR is used for the current render area and, if so,
 * program the TBIMR_TILE_PASS_INFO tile layout and batch size.
 *
 * TBIMR is used only when the instance enables it and both
 * calculate_render_area() and calculate_tile_dimensions() succeed; in any
 * other case hw_state->use_tbimr is cleared without flagging the state
 * dirty.
 */
ALWAYS_INLINE static void
update_tbimr_info(struct anv_gfx_dynamic_state *hw_state,
                  const struct anv_device *device,
                  const struct anv_cmd_graphics_state *gfx,
                  const struct intel_l3_config *l3_config)
{
   unsigned fb_width, fb_height, tile_width, tile_height;

   /* Short-circuit order matters: the render area must be known before the
    * tile dimensions can be computed from it.
    */
   const bool tbimr_usable =
      device->physical->instance->enable_tbimr &&
      calculate_render_area(gfx, &fb_width, &fb_height) &&
      calculate_tile_dimensions(device, gfx, l3_config,
                                fb_width, fb_height,
                                &tile_width, &tile_height);

   if (!tbimr_usable) {
      hw_state->use_tbimr = false;
      return;
   }

   /* Use a batch size of 128 polygons per slice as recommended by
    * BSpec 68436 "TBIMR Programming", computed here as 256 polygons per
    * pair of slices (rounded up).
    */
   const unsigned slice_pairs = DIV_ROUND_UP(device->info->num_slices, 2);
   const unsigned batch_size = slice_pairs * 256;

   const unsigned tile_count_x = DIV_ROUND_UP(fb_width, tile_width);
   const unsigned tile_count_y = DIV_ROUND_UP(fb_height, tile_height);

   SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleWidth, tile_width);
   SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleHeight, tile_height);
   SET(TBIMR_TILE_PASS_INFO, tbimr.HorizontalTileCount, tile_count_x);
   SET(TBIMR_TILE_PASS_INFO, tbimr.VerticalTileCount, tile_count_y);
   /* The field is programmed as log2(batch size) - 5. */
   SET(TBIMR_TILE_PASS_INFO, tbimr.TBIMRBatchSize,
       util_logbase2(batch_size) - 5);
   SET(TBIMR_TILE_PASS_INFO, tbimr.TileBoxCheck, true);
   SET(TBIMR_TILE_PASS_INFO, use_tbimr, true);
}
#endif

#if GFX_VERx10 == 90
/**
 * Update the VS vertex cache disable bit. Only devices whose GT level is 4
 * or above are affected; on anything smaller the state is left untouched.
 */
ALWAYS_INLINE static void
update_vs(struct anv_gfx_dynamic_state *hw_state,
          const struct anv_cmd_graphics_state *gfx,
          const struct anv_device *device)
{
   /* On Sky Lake GT4, we have experienced some hangs related to the VS cache
    * and tessellation. It is unknown exactly what is happening but the
    * Haswell docs for the "VS Reference Count Full Force Miss Enable" field
    * of the "Thread Mode" register refer to a HSW bug in which the VUE handle
    * reference count would overflow resulting in internal reference counting
    * bugs. My (Faith's) best guess is that this bug cropped back up on SKL
    * GT4 when we suddenly had more threads in play than any previous gfx9
    * hardware.
    *
    * What we do know for sure is that setting this bit when tessellation
    * shaders are in use fixes a GPU hang in Batman: Arkham City when playing
    * with DXVK (https://bugs.freedesktop.org/107280). Disabling the vertex
    * cache with tessellation shaders should only have a minor performance
    * impact as the tessellation shaders are likely generating and processing
    * far more geometry than the vertex stage.
    */
   if (device->info->gt >= 4) {
      SET(VS, vs.VertexCacheDisable,
          anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL));
   }
}
#endif

#if INTEL_WA_18019110168_GFX_VER
/* Return the index, within a single mesh shader output primitive, of the
 * provoking vertex selected by the dynamic rasterization state.
 *
 * In any mode other than VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT the result
 * is 0. In last-vertex mode it is the index of the primitive's final vertex:
 * 0 for points, 1 for lines, 2 for triangles and 3 for quads.
 */
static inline unsigned
compute_mesh_provoking_vertex(const struct brw_mesh_prog_data *mesh_prog_data,
                              const struct vk_dynamic_graphics_state *dyn)
{
   unsigned last_vertex_index;

   switch (mesh_prog_data->primitive_type) {
   case MESA_PRIM_POINTS:
      last_vertex_index = 0;
      break;
   case MESA_PRIM_LINES:
   case MESA_PRIM_LINE_LOOP:
   case MESA_PRIM_LINE_STRIP:
   case MESA_PRIM_LINES_ADJACENCY:
   case MESA_PRIM_LINE_STRIP_ADJACENCY:
      last_vertex_index = 1;
      break;
   case MESA_PRIM_TRIANGLES:
   case MESA_PRIM_TRIANGLE_STRIP:
   case MESA_PRIM_TRIANGLE_FAN:
   case MESA_PRIM_TRIANGLES_ADJACENCY:
   case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
      last_vertex_index = 2;
      break;
   case MESA_PRIM_QUADS:
   case MESA_PRIM_QUAD_STRIP:
      last_vertex_index = 3;
      break;
   default:
      UNREACHABLE("invalid mesh primitive type");
   }

   if (dyn->rs.provoking_vertex != VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
      return 0;

   return last_vertex_index;
}
#endif

/**
 * This function takes the vulkan runtime values & dirty states and updates
 * the values in anv_gfx_dynamic_state, flagging HW instructions for
 * reemission if the values are changing.
 *
 * Nothing is emitted in the batch buffer.
 *
 * Most of the updates below are guarded by the gfx->dirty and/or dyn->dirty
 * bits that feed them, so the corresponding helper is skipped when none of
 * its inputs has been flagged. The order of some of the calls matters (see
 * the comments in the body).
 *
 * Parameters:
 *  - hw_state: state tracking structure being updated. It is also read back
 *    by some of the checks below (pack_dirty, vft.PrimitiveTopologyType).
 *  - device: provides the device info, the enabled extensions and the L3
 *    configuration consulted by the checks and helpers.
 *  - dyn: vulkan runtime dynamic state, both the values and the dirty bits.
 *  - gfx: command buffer graphics state (dirty bits, bound shaders).
 *  - cmd_buffer_level: only forwarded to update_scissors().
 */
static void
cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
                                   const struct anv_device *device,
                                   const struct vk_dynamic_graphics_state *dyn,
                                   struct anv_cmd_graphics_state *gfx,
                                   VkCommandBufferLevel cmd_buffer_level)
{
   /* NOTE(review): not referenced anywhere in the body of this function;
    * candidate for removal.
    */
   UNUSED bool fs_msaa_changed = false;

   /* A shader must be bound for the streamout stage and the instance
    * multiplier must be non-zero by the time we get here.
    */
   assert(gfx->shaders[gfx->streamout_stage] != NULL);
   assert(gfx->instance_multiplier != 0);

   /* Do this before update_fs_msaa_flags() for primitive_id_index */
   if (gfx->dirty & ANV_CMD_DIRTY_ALL_SHADERS(device))
      update_sbe(hw_state, gfx, device);

   if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))
      update_fs_msaa_flags(hw_state, dyn, gfx);

   if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
      update_urb_config(hw_state, gfx, device);

#if GFX_VERx10 == 90
   if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
      update_vs(hw_state, gfx, device);
#endif

   /* The PS state is refreshed when the fragment shader changed or when
    * FS_MSAA_FLAGS has been flagged in pack_dirty (presumably by
    * update_fs_msaa_flags() above, hence the ordering).
    */
   if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
       BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_FS_MSAA_FLAGS)) {
      update_ps(hw_state, device, dyn, gfx);
      update_ps_extra_wm(hw_state, gfx);
   }

   /* Before Gfx12.5 the occlusion query state is an additional trigger for
    * this update.
    */
   if (gfx->dirty &
#if GFX_VERx10 >= 125
       ANV_CMD_DIRTY_PS
#else
       (ANV_CMD_DIRTY_PS | ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)
#endif
      )
      update_ps_extra_has_uav(hw_state, gfx);

   if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE))
      update_ps_extra_kills_pixel(hw_state, dyn, gfx);

   if ((gfx->dirty & ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
      update_streamout(hw_state, dyn, gfx);

   if (
#if GFX_VERx10 >= 200
      /* Xe2+ might need to update this if the FS changed */
      (gfx->dirty & ANV_CMD_DIRTY_PS) ||
#endif
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX))
      update_provoking_vertex(hw_state, dyn, gfx);

   if ((gfx->dirty & ANV_CMD_DIRTY_DS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY))
      update_topology(hw_state, dyn, gfx);

   /* Vertex input has no intermediate value updated here: the instruction
    * is only flagged in pack_dirty.
    */
   if ((gfx->dirty & ANV_CMD_DIRTY_VS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
      BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_VERTEX_INPUT);

#if GFX_VER >= 11
   /* Skipped entirely when KHR_fragment_shading_rate is not enabled on the
    * device.
    */
   if (device->vk.enabled_extensions.KHR_fragment_shading_rate &&
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))
      update_cps(hw_state, device, dyn);
#endif /* GFX_VER >= 11 */

   if (gfx->dirty & (ANV_CMD_DIRTY_HS | ANV_CMD_DIRTY_DS))
      update_ds(hw_state, gfx);

   /* On Gfx12.5+ any pre-rasterization shader change triggers the TE update;
    * older generations only look at the tessellation shaders.
    */
   if (
#if GFX_VERx10 >= 125
      (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS) ||
#else
      (gfx->dirty & (ANV_CMD_DIRTY_HS | ANV_CMD_DIRTY_DS)) ||
#endif
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN))
      update_te(hw_state, device, dyn, gfx);

#if GFX_VER >= 12
   if ((gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS) ||
       (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS))
      update_primitive_replication(hw_state, gfx);
#endif

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
      update_line_width(hw_state, dyn);

   if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
      update_sf_point_width_source(hw_state, gfx);

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS))
      update_sf_global_depth_bias(hw_state, dyn);

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE))
      update_clip_api_mode(hw_state, dyn);

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
      update_clip_max_viewport(hw_state, dyn);

   if ((gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS) ||
       (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE))
      update_clip_raster(hw_state, dyn, gfx);

   if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
      update_clip_preraster_stages(hw_state, gfx);

   if (gfx->dirty & ANV_CMD_DIRTY_PS)
      update_clip_non_perspective_barycentrics(hw_state, gfx);

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES))
      update_multisample(hw_state, dyn);

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK))
      update_sample_mask(hw_state, dyn);

   if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
#if GFX_VER == 9
       /* For the PMA fix */
       (gfx->dirty & ANV_CMD_DIRTY_PS) ||
#endif
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE))
      update_wm_depth_stencil(hw_state, dyn, gfx, device);

#if GFX_VER >= 12
   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS))
      update_depth_bounds(hw_state, dyn);
#endif

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
      update_line_stipple(hw_state, dyn);

   if ((gfx->dirty & ANV_CMD_DIRTY_INDEX_TYPE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
      update_vf_restart(hw_state, dyn, gfx);

   /* The index buffer is only flagged for repacking here; no intermediate
    * value is updated.
    */
   if ((gfx->dirty & ANV_CMD_DIRTY_INDEX_BUFFER) ||
       (gfx->dirty & ANV_CMD_DIRTY_INDEX_TYPE))
      BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_INDEX_BUFFER);

#if GFX_VERx10 >= 125
   if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
      update_vfg_distribution_mode(hw_state, device, gfx);

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
      update_vfg_list_cut_index(hw_state, dyn);
#endif

   /* The sample pattern is only flagged for repacking, and only when
    * EXT_sample_locations is enabled on the device.
    */
   if (device->vk.enabled_extensions.EXT_sample_locations &&
       (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)))
      BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_SAMPLE_PATTERN);

   if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
       (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
      /* wm_prog_data may be NULL, in which case both fragment shader derived
       * arguments below are passed as false.
       */
      const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
      update_blend_state(hw_state, dyn, gfx, device,
                         wm_prog_data != NULL,
                         wm_prog_data != NULL ?
                         wm_prog_data->dual_src_blend : false);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
      update_blend_constants(hw_state, dyn, gfx);

   if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLAMP_RANGE))
      update_viewports(hw_state, dyn, gfx, device);

   if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS))
      update_scissors(hw_state, dyn, gfx, cmd_buffer_level);

#if GFX_VERx10 == 125
   if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS))
      update_tbimr_info(hw_state, device, gfx, device->l3_config);
#endif

#if INTEL_WA_14018283232_GFX_VER
   /* The toggle is set when depth bounds testing is enabled and the bound
    * fragment shader (if any) uses kill.
    */
   if (intel_needs_workaround(device->info, 14018283232) &&
       ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE))) {
      const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
      SET(WA_14018283232, wa_14018283232_toggle,
          dyn->ds.depth.bounds_test.enable &&
          wm_prog_data &&
          wm_prog_data->uses_kill);
   }
#endif

#if INTEL_WA_14024997852_GFX_VER
   /* Wa_14024997852: When Draw Cut Index or primitive id is enabled
    * and topology is tri list, we need to disable autostrip.
    *
    * Note that we do not take primitive id into account because it
    * is mentioned only in the xe2 clone of this wa and autostrip has been
    * disabled globally on xe2 (+xe3 a0) by the kernel due to the
    * 14021490052 workaround.
    *
    * This is evaluated on every call (no dirty bit guard) and reads the
    * topology already stored in hw_state, so it has to stay after the
    * topology update above.
    */
   SET(WA_14024997852, autostrip_disabled,
       hw_state->vft.PrimitiveTopologyType == _3DPRIM_TRILIST &&
       dyn->ia.primitive_restart_enable);
#endif

   /* If the pipeline uses a dynamic value of patch_control_points or the
    * tessellation domain is dynamic, and either the pipeline changes or the
    * dynamic value changes, check the value and reemit if needed.
    */
   const struct brw_tcs_prog_data *tcs_prog_data = get_gfx_tcs_prog_data(gfx);
   const struct brw_tes_prog_data *tes_prog_data = get_gfx_tes_prog_data(gfx);
   /* A TCS compiled with input_vertices == 0 is treated as dynamic. */
   const bool tcs_dynamic =
      tcs_prog_data && tcs_prog_data->input_vertices == 0;
   /* A TES whose VUE map layout is not fixed is treated as dynamic. */
   const bool tes_dynamic =
      tes_prog_data && tes_prog_data->base.vue_map.layout != INTEL_VUE_LAYOUT_FIXED;
   if ((tcs_dynamic || tes_dynamic) &&
       ((gfx->dirty & (ANV_CMD_DIRTY_HS | ANV_CMD_DIRTY_DS)) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS))) {
      /* Both tessellation shaders are expected to be bound at this point. */
      assert(tcs_prog_data != NULL && tes_prog_data != NULL);
      struct brw_tess_info tess_info =
         brw_merge_tess_info(tcs_prog_data->tess_info,
                             tes_prog_data->tess_info);

      SET(TESS_CONFIG, tess_config,
          intel_tess_config(dyn->ts.patch_control_points,
                            tcs_prog_data->output_vertices,
                            brw_tess_info_domain(tess_info),
                            tcs_prog_data->base.vue_map.num_per_patch_slots,
                            tcs_prog_data->base.vue_map.num_per_vertex_slots,
                            tcs_prog_data->base.vue_map.builtins_slot_offset));
   }

#if INTEL_WA_18019110168_GFX_VER
   /* Only recompute the mesh provoking vertex when the workaround applies to
    * this device, a mesh shader writing clip distances is bound, and either
    * the mesh shader or the provoking vertex mode changed.
    */
   const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);
   const bool mesh_provoking_vertex_update =
      intel_needs_workaround(device->info, 18019110168) &&
      mesh_prog_data &&
      (mesh_prog_data->map.vue_map.slots_valid & (VARYING_BIT_CLIP_DIST0 |
                                                  VARYING_BIT_CLIP_DIST1)) &&
      ((gfx->dirty & ANV_CMD_DIRTY_MESH) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX));
   if (mesh_provoking_vertex_update) {
      SET(MESH_PROVOKING_VERTEX, mesh_provoking_vertex,
                                 compute_mesh_provoking_vertex(
                                    mesh_prog_data, dyn));
   }
#endif
}

#undef GET
#undef SET
#undef SET_STAGE
#undef SETUP_PROVOKING_VERTEX

#if INTEL_WA_14024997852_GFX_VER
void
genX(setup_autostrip_state)(struct anv_cmd_buffer *cmd_buffer, bool enable)
{
   /* Add CS stall before writing registers. */
   genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                cmd_buffer->device->info,
                                cmd_buffer->state.current_pipeline,
                                ANV_PIPE_CS_STALL_BIT);

   /* VF */
   anv_batch_write_reg(&cmd_buffer->batch, GENX(VFL_SCRATCH_PAD), vfl) {
      vfl.AutostripDisable = !enable;
      vfl.PartialAutostripDisable = !enable;
      vfl.AutostripDisableMask = true;
      vfl.PartialAutostripDisableMask = true;
   }
   /* TE and Mesh. */
   anv_batch_write_reg(&cmd_buffer->batch, GENX(FF_MODE), ff) {
      ff.TEAutostripDisable = !enable;
      ff.MeshShaderAutostripDisable = !enable;
      ff.MeshShaderPartialAutostripDisable = !enable;
   }
}
#endif /* INTEL_WA_14024997852_GFX_VER */

static void
cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
                            struct anv_cmd_buffer *cmd_buffer,
                            const struct anv_cmd_graphics_state *gfx)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_instance *instance = device->physical->instance;

#define INIT(category, name) \
   .name = hw_state->category.name
#define SET(s, category, name) \
   s.name = hw_state->category.name
#define SET_ARRAY(s, category, name)            \
   do {                                         \
      assert(sizeof(s.name) ==                  \
             sizeof(hw_state->category.name));  \
      memcpy(&s.name,                           \
             &hw_state->category.name,          \
             sizeof(s.name));                   \
   } while (0)
#define IS_DIRTY(name) BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##name)

#define anv_gfx_copy(field, cmd, stage, source) ({                      \
      if (gfx->shaders[stage] != NULL) {                                \
         assert(sizeof(hw_state->packed.field) >=                       \
                4 * __anv_cmd_length(cmd));                             \
         assert((gfx->shaders[stage]->source).len ==                    \
                __anv_cmd_length(cmd));                                 \
         memcpy(&hw_state->packed.field,                                \
                &gfx->shaders[stage]->cmd_data[                         \
                   (gfx->shaders[stage]->source).offset],               \
                4 * __anv_cmd_length(cmd));                             \
      } else {                                                          \
         anv_gfx_pack(field, cmd, __unused_name);                       \
      }                                                                 \
   })
#define anv_gfx_copy_variable(field, stage, source) ({                  \
      if (gfx->shaders[stage] != NULL) {                                \
         assert(sizeof(hw_state->packed.field) >=                       \
                4 * gfx->shaders[stage]->source.len);                   \
         memcpy(&hw_state->packed.field,                                \
                &gfx->shaders[stage]->cmd_data[                         \
                   (gfx->shaders[stage]->source).offset],               \
                4 * gfx->shaders[stage]->source.len);                   \
         hw_state->packed.field##_len =                                 \
            gfx->shaders[stage]->source.len;                            \
      }                                                                 \
   })
#define anv_gfx_copy_protected(field, cmd, stage, source) ({           \
      const bool __protected = (cmd_buffer->vk.pool->flags &           \
                                VK_COMMAND_POOL_CREATE_PROTECTED_BIT); \
      assert(sizeof(hw_state->packed.field) >=                         \
             4 * __anv_cmd_length(cmd));                               \
      if (gfx->shaders[stage] != NULL) {                               \
         assert((gfx->shaders[stage]->source).len ==                   \
                __anv_cmd_length(cmd));                                \
         memcpy(&hw_state->packed.field,                               \
                &gfx->shaders[stage]->cmd_data[                        \
                   __protected ?                                       \
                   gfx->shaders[stage]->source##_protected.offset :    \
                   gfx->shaders[stage]->source.offset],                \
                4 * __anv_cmd_length(cmd));                            \
      } else {                                                         \
         memcpy(&hw_state->packed.field,                               \
                device->physical->gfx_default.field,                   \
                4 * __anv_cmd_length(cmd));                            \
      }                                                                \
   })
#define anv_gfx_pack_merge(field, cmd, stage, source, name)             \
   for (struct cmd name = (struct cmd) { 0 },                           \
        *_dst = (struct cmd *)hw_state->packed.field;                   \
        __builtin_expect(_dst != NULL, 1);                              \
        ({                                                              \
           uint32_t _partial[__anv_cmd_length(cmd)];                    \
           assert(sizeof(hw_state->packed.field) >=                     \
                  4 * __anv_cmd_length(cmd));                           \
           __anv_cmd_pack(cmd)(NULL, _partial, &name);                  \
           if (gfx->shaders[stage] != NULL) {                           \
              const struct anv_gfx_state_ptr *_cmd_state =              \
                 &gfx->shaders[stage]->source;                          \
              assert(_cmd_state->len == __anv_cmd_length(cmd));         \
              for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {    \
                 assert((_partial[i] &                                  \
                         gfx->shaders[stage]->cmd_data[                 \
                            _cmd_state->offset + i]) == 0);             \
                 ((uint32_t *)_dst)[i] = _partial[i] |                  \
                    gfx->shaders[stage]->cmd_data[_cmd_state->offset + i]; \
              }                                                         \
           } else {                                                     \
              for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {    \
                 assert((_partial[i] &                                  \
                         device->physical->gfx_default.field[i]) == 0); \
                 ((uint32_t *)_dst)[i] = _partial[i] |                  \
                    device->physical->gfx_default.field[i];             \
              }                                                         \
           }                                                            \
           _dst = NULL;                                                 \
        }))
#define anv_gfx_pack_merge_protected(field, cmd, stage, source, name)   \
   for (struct cmd name = (struct cmd) { 0 },                           \
        *_dst = (struct cmd *)hw_state->packed.field;                   \
        __builtin_expect(_dst != NULL, 1);                              \
        ({                                                              \
           uint32_t _partial[__anv_cmd_length(cmd)];                    \
           assert(sizeof(hw_state->packed.field) >=                     \
                  4 * __anv_cmd_length(cmd));                           \
           __anv_cmd_pack(cmd)(NULL, _partial, &name);                  \
           const struct anv_gfx_state_ptr *_cmd_state =                 \
              gfx->shaders[stage] != NULL ?                             \
              ((cmd_buffer->vk.pool->flags &                            \
                VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?                 \
               &gfx->shaders[stage]->source##_protected :               \
               &gfx->shaders[stage]->source) :                          \
              NULL;                                                     \
           assert(_cmd_state == NULL ||                                 \
                  _cmd_state->len == __anv_cmd_length(cmd));            \
           const uint32_t *_inst_data =                                 \
              gfx->shaders[stage] != NULL ?                             \
              &gfx->shaders[stage]->cmd_data[_cmd_state->offset] :      \
              device->physical->gfx_default.field;                      \
           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {       \
              assert((_partial[i] & _inst_data[i]) == 0);               \
              ((uint32_t *)_dst)[i] = _partial[i] | _inst_data[i];      \
           }                                                            \
           _dst = NULL;                                                 \
         }))


   if (IS_DIRTY(VF)) {
      anv_gfx_pack(vf, GENX(3DSTATE_VF), vf) {
#if GFX_VERx10 >= 125
         vf.GeometryDistributionEnable = instance->enable_vf_distribution;
#endif
         vf.ComponentPackingEnable = instance->vf_component_packing;
         SET(vf, vf, IndexedDrawCutIndexEnable);
         SET(vf, vf, CutIndex);
      }
   }

   if (IS_DIRTY(VF_TOPOLOGY)) {
      anv_gfx_pack(vft, GENX(3DSTATE_VF_TOPOLOGY), vft) {
         SET(vft, vft, PrimitiveTopologyType);
      }
   }

   if (IS_DIRTY(VF_STATISTICS)) {
      anv_gfx_pack(vfs, GENX(3DSTATE_VF_STATISTICS), vfs) {
         vfs.StatisticsEnable = true;
      }
   }

#if GFX_VERx10 >= 125
   if (IS_DIRTY(VFG)) {
      anv_gfx_pack(vfg, GENX(3DSTATE_VFG), vfg) {
         /* 192 vertices for TRILIST_ADJ */
         vfg.ListNBatchSizeScale = 0;
         /* Batch size of 384 vertices */
         vfg.List3BatchSizeScale = 2;
         /* Batch size of 128 vertices */
         vfg.List2BatchSizeScale = 1;
         /* Batch size of 128 vertices */
         vfg.List1BatchSizeScale = 2;
         /* Batch size of 256 vertices for STRIP topologies */
         vfg.StripBatchSizeScale = 3;
         /* 192 control points for PATCHLIST_3 */
         vfg.PatchBatchSizeScale = 1;
         /* 192 control points for PATCHLIST_3 */
         vfg.PatchBatchSizeMultiplier = 31;

         SET(vfg, vfg, DistributionGranularity);
         SET(vfg, vfg, DistributionMode);
         SET(vfg, vfg, GranularityThresholdDisable);
         SET(vfg, vfg, ListCutIndexEnable);
      }
   }
#endif

   if (IS_DIRTY(VF_SGVS))
      anv_gfx_copy(vf_sgvs, GENX(3DSTATE_VF_SGVS), MESA_SHADER_VERTEX, vs.vf_sgvs);

#if GFX_VER >= 11
   if (IS_DIRTY(VF_SGVS_2))
      anv_gfx_copy(vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), MESA_SHADER_VERTEX, vs.vf_sgvs_2);
#endif

   if (IS_DIRTY(VF_SGVS_INSTANCING))
      anv_gfx_copy_variable(vf_sgvs_instancing, MESA_SHADER_VERTEX, vs.vf_sgvs_instancing);

   if (instance->vf_component_packing && IS_DIRTY(VF_COMPONENT_PACKING)) {
      anv_gfx_copy(vf_component_packing, GENX(3DSTATE_VF_COMPONENT_PACKING),
                   MESA_SHADER_VERTEX, vs.vf_component_packing);
   }

   if (IS_DIRTY(INDEX_BUFFER)) {
      anv_gfx_pack(ib, GENX(3DSTATE_INDEX_BUFFER), ib) {
         ib.IndexFormat           = vk_to_intel_index_type(gfx->index_type);
         ib.MOCS                  = gfx->index_addr == 0 ?
            anv_mocs(device, NULL, ISL_SURF_USAGE_INDEX_BUFFER_BIT) :
            gfx->index_mocs;
#if GFX_VER >= 12
         ib.L3BypassDisable       = true;
#endif
         ib.BufferStartingAddress = anv_address_from_u64(gfx->index_addr);
         ib.BufferSize            = gfx->index_size;
      }
   }

   if (IS_DIRTY(STREAMOUT)) {
      anv_gfx_pack_merge(so, GENX(3DSTATE_STREAMOUT),
                         gfx->streamout_stage, so, so) {
         SET(so, so, RenderingDisable);
         SET(so, so, RenderStreamSelect);
         SET(so, so, ReorderMode);
         SET(so, so, ForceRendering);
      }
   }

   if (IS_DIRTY(SO_DECL_LIST))
      anv_gfx_copy_variable(so_decl_list, gfx->streamout_stage, so_decl_list);

   if (IS_DIRTY(CLIP)) {
      anv_gfx_pack(clip, GENX(3DSTATE_CLIP), clip) {
         clip.ClipEnable               = true;
         clip.StatisticsEnable         = true;
         clip.EarlyCullEnable          = true;
         clip.GuardbandClipTestEnable  = true;

         clip.VertexSubPixelPrecisionSelect = _8Bit;
         clip.ClipMode = CLIPMODE_NORMAL;

         clip.MinimumPointWidth = 0.125;
         clip.MaximumPointWidth = 255.875;

         SET(clip, clip, APIMode);
         SET(clip, clip, ViewportXYClipTestEnable);
         SET(clip, clip, TriangleStripListProvokingVertexSelect);
         SET(clip, clip, LineStripListProvokingVertexSelect);
         SET(clip, clip, TriangleFanProvokingVertexSelect);
#if GFX_VERx10 >= 200
         SET(clip, clip, TriangleStripOddProvokingVertexSelect);
#endif
         SET(clip, clip, MaximumVPIndex);
         SET(clip, clip, ForceZeroRTAIndexEnable);
         SET(clip, clip, NonPerspectiveBarycentricEnable);
      }
   }

   if (IS_DIRTY(VIEWPORT_SF_CLIP)) {
      struct anv_state sf_clip_state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            hw_state->vp_sf_clip.count * 64, 64);

      for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) {
         struct GENX(SF_CLIP_VIEWPORT) sfv = {
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32),
            INIT(vp_sf_clip.elem[i], XMinClipGuardband),
            INIT(vp_sf_clip.elem[i], XMaxClipGuardband),
            INIT(vp_sf_clip.elem[i], YMinClipGuardband),
            INIT(vp_sf_clip.elem[i], YMaxClipGuardband),
            INIT(vp_sf_clip.elem[i], XMinViewPort),
            INIT(vp_sf_clip.elem[i], XMaxViewPort),
            INIT(vp_sf_clip.elem[i], YMinViewPort),
            INIT(vp_sf_clip.elem[i], YMaxViewPort),
         };
         GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
      }

      anv_gfx_pack(sf_clip, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
         clip.SFClipViewportPointer = sf_clip_state.offset;
      }
   }

   if (IS_DIRTY(VIEWPORT_CC)) {
      hw_state->vp_cc.state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            hw_state->vp_cc.count * 8, 32);

      for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) {
         struct GENX(CC_VIEWPORT) cc_viewport = {
            INIT(vp_cc.elem[i], MinimumDepth),
            INIT(vp_cc.elem[i], MaximumDepth),
         };
         GENX(CC_VIEWPORT_pack)(NULL, hw_state->vp_cc.state.map + i * 8,
                                &cc_viewport);
      }

      anv_gfx_pack(cc_viewport,
                   GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
         cc.CCViewportPointer = hw_state->vp_cc.state.offset;
      }
   }

   if (IS_DIRTY(SCISSOR)) {
      /* Wa_1409725701:
       *
       *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
       *    stored as an array of up to 16 elements. The location of first
       *    element of the array, as specified by Pointer to SCISSOR_RECT,
       *    should be aligned to a 64-byte boundary.
       */
      struct anv_state scissor_state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            hw_state->scissor.count * 8, 64);

      for (uint32_t i = 0; i < hw_state->scissor.count; i++) {
         struct GENX(SCISSOR_RECT) scissor = {
            INIT(scissor.elem[i], ScissorRectangleYMin),
            INIT(scissor.elem[i], ScissorRectangleXMin),
            INIT(scissor.elem[i], ScissorRectangleYMax),
            INIT(scissor.elem[i], ScissorRectangleXMax),
         };
         GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
      }

      anv_gfx_pack(scissor, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
         ssp.ScissorRectPointer = scissor_state.offset;
      }
   }

   if (IS_DIRTY(CPS)) {
#if GFX_VER >= 30
      anv_gfx_pack(cps, GENX(3DSTATE_COARSE_PIXEL), coarse_pixel) {
         coarse_pixel.DisableCPSPointers = true;
         SET(coarse_pixel, coarse_pixel, CPSizeX);
         SET(coarse_pixel, coarse_pixel, CPSizeY);
         SET(coarse_pixel, coarse_pixel, CPSizeCombiner0Opcode);
         SET(coarse_pixel, coarse_pixel, CPSizeCombiner1Opcode);
      }
#elif GFX_VER >= 12
      anv_gfx_pack(cps, GENX(3DSTATE_CPS_POINTERS), cps) {
         SET(cps, cps, CoarsePixelShadingStateArrayPointer);
      }
#elif GFX_VER == 11
      anv_gfx_pack(cps, GENX(3DSTATE_CPS), cps) {
         SET(cps, cps, CoarsePixelShadingMode);
         SET(cps, cps, MinCPSizeX);
         SET(cps, cps, MinCPSizeY);
      }
#endif
   }

   if (IS_DIRTY(SF)) {
      anv_gfx_pack(sf, GENX(3DSTATE_SF), sf) {
         /* Fixed values */
         sf.ViewportTransformEnable = true;
         sf.StatisticsEnable = true;
         sf.VertexSubPixelPrecisionSelect = _8Bit;
         sf.AALineDistanceMode = true;
         sf.PointWidth = 1.0;

#if GFX_VER >= 12
         SET(sf, sf, DerefBlockSize);
#endif
         SET(sf, sf, PointWidthSource);
         SET(sf, sf, LineWidth);
         SET(sf, sf, TriangleStripListProvokingVertexSelect);
         SET(sf, sf, LineStripListProvokingVertexSelect);
         SET(sf, sf, TriangleFanProvokingVertexSelect);
#if GFX_VERx10 >= 200
         SET(sf, sf, TriangleStripOddProvokingVertexSelect);
#endif
         SET(sf, sf, LegacyGlobalDepthBiasEnable);
      }
   }

   if (BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_RASTER)) {
      anv_gfx_pack(raster, GENX(3DSTATE_RASTER), raster) {
         /* For details on 3DSTATE_RASTER multisample state, see the BSpec
          * table "Multisample Modes State".
          *
          * NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the SKL PMA fix
          * computations. If we ever set this bit to a different value, they
          * will need to be updated accordingly.
          */
         raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
         raster.ForceMultisampling = false;
         raster.ScissorRectangleEnable = true;

         SET(raster, raster, APIMode);
         SET(raster, raster, DXMultisampleRasterizationEnable);
         SET(raster, raster, AntialiasingEnable);
         SET(raster, raster, CullMode);
         SET(raster, raster, FrontWinding);
         SET(raster, raster, GlobalDepthOffsetEnableSolid);
         SET(raster, raster, GlobalDepthOffsetEnableWireframe);
         SET(raster, raster, GlobalDepthOffsetEnablePoint);
         SET(raster, raster, GlobalDepthOffsetConstant);
         SET(raster, raster, GlobalDepthOffsetScale);
         SET(raster, raster, GlobalDepthOffsetClamp);
         SET(raster, raster, FrontFaceFillMode);
         SET(raster, raster, BackFaceFillMode);
         SET(raster, raster, ViewportZFarClipTestEnable);
         SET(raster, raster, ViewportZNearClipTestEnable);
         SET(raster, raster, ConservativeRasterizationEnable);
#if GFX_VER >= 20
         SET(raster, raster, LegacyBaryAssignmentDisable);
#endif
      }
   }

   if (IS_DIRTY(LINE_STIPPLE)) {
      anv_gfx_pack(ls, GENX(3DSTATE_LINE_STIPPLE), ls) {
         SET(ls, ls, LineStipplePattern);
         SET(ls, ls, LineStippleInverseRepeatCount);
         SET(ls, ls, LineStippleRepeatCount);
      }
   }

   if (IS_DIRTY(MULTISAMPLE)) {
      anv_gfx_pack(ms, GENX(3DSTATE_MULTISAMPLE), ms) {
         ms.PixelLocation              = CENTER;

         /* The PRM says that this bit is valid only for DX9:
          *
          *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
          *    should not have any effect by setting or not setting this bit.
          */
         ms.PixelPositionOffsetEnable  = false;

         SET(ms, ms, NumberofMultisamples);
      }
   }

   if (IS_DIRTY(SAMPLE_MASK)) {
      anv_gfx_pack(sm, GENX(3DSTATE_SAMPLE_MASK), sm) {
         SET(sm, sm, SampleMask);
      }
   }

   if (IS_DIRTY(TE)) {
      if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
         anv_gfx_pack_merge(te, GENX(3DSTATE_TE),
                            MESA_SHADER_TESS_EVAL, ds.te, te) {
            SET(te, te, TEDomain);
#if GFX_VER >= 12
            SET(te, te, PatchHeaderLayout);
#endif
            SET(te, te, Partitioning);
            SET(te, te, OutputTopology);
#if GFX_VERx10 >= 125
            SET(te, te, TessellationDistributionMode);
#endif
         }
      } else {
         anv_gfx_pack(te, GENX(3DSTATE_TE), te);
      }
   }

   if (IS_DIRTY(WM_DEPTH_STENCIL)) {
      anv_gfx_pack(wm_ds, GENX(3DSTATE_WM_DEPTH_STENCIL), wm_ds) {
         SET(wm_ds, wm_ds, DoubleSidedStencilEnable);
         SET(wm_ds, wm_ds, StencilTestMask);
         SET(wm_ds, wm_ds, StencilWriteMask);
         SET(wm_ds, wm_ds, BackfaceStencilTestMask);
         SET(wm_ds, wm_ds, BackfaceStencilWriteMask);
         SET(wm_ds, wm_ds, StencilReferenceValue);
         SET(wm_ds, wm_ds, BackfaceStencilReferenceValue);
         SET(wm_ds, wm_ds, DepthTestEnable);
         SET(wm_ds, wm_ds, DepthBufferWriteEnable);
         SET(wm_ds, wm_ds, DepthTestFunction);
         SET(wm_ds, wm_ds, StencilTestEnable);
         SET(wm_ds, wm_ds, StencilBufferWriteEnable);
         SET(wm_ds, wm_ds, StencilFailOp);
         SET(wm_ds, wm_ds, StencilPassDepthPassOp);
         SET(wm_ds, wm_ds, StencilPassDepthFailOp);
         SET(wm_ds, wm_ds, StencilTestFunction);
         SET(wm_ds, wm_ds, BackfaceStencilFailOp);
         SET(wm_ds, wm_ds, BackfaceStencilPassDepthPassOp);
         SET(wm_ds, wm_ds, BackfaceStencilPassDepthFailOp);
         SET(wm_ds, wm_ds, BackfaceStencilTestFunction);
      }
   }

#if GFX_VER >= 12
   if (IS_DIRTY(DEPTH_BOUNDS)) {
      anv_gfx_pack(db, GENX(3DSTATE_DEPTH_BOUNDS), db) {
         SET(db, db, DepthBoundsTestEnable);
         SET(db, db, DepthBoundsTestMinValue);
         SET(db, db, DepthBoundsTestMaxValue);
      }
   }
#endif

#if GFX_VER >= 12
   if (IS_DIRTY(PRIMITIVE_REPLICATION)) {
      anv_gfx_pack(pr, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
         SET(pr, pr, ReplicaMask);
         SET(pr, pr, ReplicationCount);
         SET_ARRAY(pr, pr, RTAIOffset);
      }
   }
#endif

   if (IS_DIRTY(SBE)) {
      anv_gfx_pack(sbe, GENX(3DSTATE_SBE), sbe) {
         for (unsigned i = 0; i < 32; i++)
            sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
         sbe.ForceVertexURBEntryReadOffset = true;
         sbe.ForceVertexURBEntryReadLength = true;

         SET(sbe, sbe, AttributeSwizzleEnable);
         SET(sbe, sbe, PointSpriteTextureCoordinateEnable);
         SET(sbe, sbe, PointSpriteTextureCoordinateOrigin);
         SET(sbe, sbe, NumberofSFOutputAttributes);
         SET(sbe, sbe, ConstantInterpolationEnable);
         SET(sbe, sbe, VertexURBEntryReadOffset);
         SET(sbe, sbe, VertexURBEntryReadLength);
#if GFX_VER >= 20
         SET(sbe, sbe, VertexAttributesBypass);
#endif
         SET(sbe, sbe, PrimitiveIDOverrideAttributeSelect);
         SET(sbe, sbe, PrimitiveIDOverrideComponentX);
         SET(sbe, sbe, PrimitiveIDOverrideComponentY);
         SET(sbe, sbe, PrimitiveIDOverrideComponentZ);
         SET(sbe, sbe, PrimitiveIDOverrideComponentW);
      }
   }

#if GFX_VERx10 >= 125
   if (IS_DIRTY(SBE_MESH)) {
      anv_gfx_pack(sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe_mesh) {
         SET(sbe_mesh, sbe_mesh, PerVertexURBEntryOutputReadOffset);
         SET(sbe_mesh, sbe_mesh, PerVertexURBEntryOutputReadLength);
         SET(sbe_mesh, sbe_mesh, PerPrimitiveURBEntryOutputReadOffset);
         SET(sbe_mesh, sbe_mesh, PerPrimitiveURBEntryOutputReadLength);
      }
   }
#endif

   if (IS_DIRTY(SBE_SWIZ)) {
      anv_gfx_pack(sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe_swiz) {
         for (unsigned i = 0; i < 16; i++)
            SET(sbe_swiz, sbe_swiz, Attribute[i].SourceAttribute);
      }
   }

   if (IS_DIRTY(WM)) {
      anv_gfx_pack_merge(wm, GENX(3DSTATE_WM),
                         MESA_SHADER_FRAGMENT, ps.wm, wm) {
         SET(wm, wm, LineStippleEnable);
         SET(wm, wm, BarycentricInterpolationMode);
      }
   }

   if (IS_DIRTY(PS_BLEND)) {
      anv_gfx_pack(ps_blend, GENX(3DSTATE_PS_BLEND), blend) {
         SET(blend, ps_blend, HasWriteableRT);
         SET(blend, ps_blend, ColorBufferBlendEnable);
         SET(blend, ps_blend, SourceAlphaBlendFactor);
         SET(blend, ps_blend, DestinationAlphaBlendFactor);
         SET(blend, ps_blend, SourceBlendFactor);
         SET(blend, ps_blend, DestinationBlendFactor);
         SET(blend, ps_blend, AlphaTestEnable);
         SET(blend, ps_blend, IndependentAlphaBlendEnable);
         SET(blend, ps_blend, AlphaToCoverageEnable);
      }
   }

   if (IS_DIRTY(CC_STATE)) {
      hw_state->cc.state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            GENX(COLOR_CALC_STATE_length) * 4,
                                            64);
      struct GENX(COLOR_CALC_STATE) cc = {
         INIT(cc, BlendConstantColorRed),
         INIT(cc, BlendConstantColorGreen),
         INIT(cc, BlendConstantColorBlue),
         INIT(cc, BlendConstantColorAlpha),
      };
      GENX(COLOR_CALC_STATE_pack)(NULL, hw_state->cc.state.map, &cc);

      anv_gfx_pack(cc_state, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
         ccp.ColorCalcStatePointer = hw_state->cc.state.offset;
         ccp.ColorCalcStatePointerValid = true;
      }
   }

   if (IS_DIRTY(BLEND_STATE)) {
      const uint32_t num_dwords = GENX(BLEND_STATE_length) +
         GENX(BLEND_STATE_ENTRY_length) * MAX_RTS;
      hw_state->blend.state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            num_dwords * 4,
                                            64);

      uint32_t *dws = hw_state->blend.state.map;

      struct GENX(BLEND_STATE) blend_state = {
         INIT(blend, AlphaToCoverageEnable),
         INIT(blend, AlphaToOneEnable),
         INIT(blend, IndependentAlphaBlendEnable),
         INIT(blend, ColorDitherEnable),
      };
      GENX(BLEND_STATE_pack)(NULL, dws, &blend_state);

      /* Jump to blend entries. */
      dws += GENX(BLEND_STATE_length);
      for (uint32_t i = 0; i < MAX_RTS; i++) {
         struct GENX(BLEND_STATE_ENTRY) entry = {
            INIT(blend.rts[i], WriteDisableAlpha),
            INIT(blend.rts[i], WriteDisableRed),
            INIT(blend.rts[i], WriteDisableGreen),
            INIT(blend.rts[i], WriteDisableBlue),
            INIT(blend.rts[i], LogicOpFunction),
            INIT(blend.rts[i], LogicOpEnable),
            INIT(blend.rts[i], ColorBufferBlendEnable),
            INIT(blend.rts[i], ColorClampRange),
#if GFX_VER >= 30
            INIT(blend.rts[i], SimpleFloatBlendEnable),
#endif
            INIT(blend.rts[i], PreBlendColorClampEnable),
            INIT(blend.rts[i], PostBlendColorClampEnable),
            INIT(blend.rts[i], SourceBlendFactor),
            INIT(blend.rts[i], DestinationBlendFactor),
            INIT(blend.rts[i], ColorBlendFunction),
            INIT(blend.rts[i], SourceAlphaBlendFactor),
            INIT(blend.rts[i], DestinationAlphaBlendFactor),
            INIT(blend.rts[i], AlphaBlendFunction),
         };

         GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
         dws += GENX(BLEND_STATE_ENTRY_length);
      }

      anv_gfx_pack(blend_state, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
         bsp.BlendStatePointer      = hw_state->blend.state.offset;
         bsp.BlendStatePointerValid = true;
      }
   }

#if GFX_VERx10 >= 125
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      if (IS_DIRTY(MESH_CONTROL)) {
         if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
            anv_gfx_copy_protected(mesh_control,
                                   GENX(3DSTATE_MESH_CONTROL),
                                   MESA_SHADER_MESH, ms.control);
         } else {
            anv_gfx_pack(mesh_control, GENX(3DSTATE_MESH_CONTROL), mc);
         }
      }

      if (IS_DIRTY(TASK_CONTROL)) {
         if (anv_gfx_has_stage(gfx, MESA_SHADER_TASK)) {
            anv_gfx_copy_protected(task_control,
                                   GENX(3DSTATE_TASK_CONTROL),
                                   MESA_SHADER_TASK, ts.control);
         } else {
            anv_gfx_pack(task_control, GENX(3DSTATE_TASK_CONTROL), tc);
         }
      }

      if (IS_DIRTY(MESH_SHADER)) {
         anv_gfx_copy(mesh_shader, GENX(3DSTATE_MESH_SHADER),
                      MESA_SHADER_MESH, ms.shader);
      }

      if (IS_DIRTY(MESH_DISTRIB)) {
         anv_gfx_copy(mesh_distrib, GENX(3DSTATE_MESH_DISTRIB),
                      MESA_SHADER_MESH, ms.distrib);
      }

      if (IS_DIRTY(CLIP_MESH)) {
         anv_gfx_copy(clip_mesh, GENX(3DSTATE_CLIP_MESH),
                      MESA_SHADER_MESH, ms.clip);
      }

      if (IS_DIRTY(TASK_SHADER)) {
         anv_gfx_copy(task_shader, GENX(3DSTATE_TASK_SHADER),
                      MESA_SHADER_TASK, ts.shader);
      }

      if (IS_DIRTY(TASK_REDISTRIB)) {
         anv_gfx_copy(task_redistrib, GENX(3DSTATE_TASK_REDISTRIB),
                      MESA_SHADER_TASK, ts.redistrib);
      }
   }
#endif /* GFX_VERx10 >= 125 */

   if (IS_DIRTY(VS)) {
#if GFX_VERx10 == 90
      anv_gfx_pack_merge_protected(vs, GENX(3DSTATE_VS),
                                   MESA_SHADER_VERTEX, vs.vs, vs) {
         SET(vs, vs, VertexCacheDisable);
      }
#else
      anv_gfx_copy_protected(vs, GENX(3DSTATE_VS), MESA_SHADER_VERTEX, vs.vs);
#endif
   }

   if (IS_DIRTY(HS))
      anv_gfx_copy_protected(hs, GENX(3DSTATE_HS), MESA_SHADER_TESS_CTRL, hs.hs);

   if (IS_DIRTY(DS)) {
      anv_gfx_pack_merge_protected(ds, GENX(3DSTATE_DS),
                                   MESA_SHADER_TESS_EVAL, ds.ds, ds) {
         SET(ds, ds, ComputeWCoordinateEnable);
      }
   }

   if (IS_DIRTY(GS)) {
      anv_gfx_pack_merge_protected(gs, GENX(3DSTATE_GS),
                                   MESA_SHADER_GEOMETRY, gs.gs, gs) {
         SET(gs, gs, ReorderMode);
      }
   }

   if (IS_DIRTY(PS)) {
      anv_gfx_pack_merge_protected(ps, GENX(3DSTATE_PS),
                                   MESA_SHADER_FRAGMENT, ps.ps, ps) {
         SET(ps, ps, KernelStartPointer0);
         SET(ps, ps, KernelStartPointer1);
         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);

#if GFX_VER < 20
         SET(ps, ps, KernelStartPointer2);
         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);

         SET(ps, ps, _8PixelDispatchEnable);
         SET(ps, ps, _16PixelDispatchEnable);
         SET(ps, ps, _32PixelDispatchEnable);
#else
         SET(ps, ps, Kernel0Enable);
         SET(ps, ps, Kernel1Enable);
         SET(ps, ps, Kernel0SIMDWidth);
         SET(ps, ps, Kernel1SIMDWidth);
         SET(ps, ps, Kernel0PolyPackingPolicy);
         SET(ps, ps, Kernel0MaximumPolysperThread);
#endif
         SET(ps, ps, PositionXYOffsetSelect);
      }
   }

   if (IS_DIRTY(PS_EXTRA)) {
      if (anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT)) {
         anv_gfx_pack_merge(ps_extra, GENX(3DSTATE_PS_EXTRA),
                            MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
            SET(pse, ps_extra, PixelShaderHasUAV);
            SET(pse, ps_extra, PixelShaderIsPerSample);
#if GFX_VER >= 11
            SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
#endif
            SET(pse, ps_extra, PixelShaderKillsPixel);
            SET(pse, ps_extra, InputCoverageMaskState);

#if GFX_VERx10 >= 125
            SET(pse, ps_extra, EnablePSDependencyOnCPsizeChange);
#endif
         }
#if INTEL_WA_18038825448_GFX_VER
         /* Add a dependency if either the shader needs it (because of a
          * runtime change through a pre-rasterization shader) or if we notice
          * a change.
          */
         anv_gfx_pack_merge(ps_extra_dep, GENX(3DSTATE_PS_EXTRA),
                            MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
            SET(pse, ps_extra, PixelShaderHasUAV);
            SET(pse, ps_extra, PixelShaderIsPerSample);
#if GFX_VER >= 11
            SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
#endif
            SET(pse, ps_extra, PixelShaderKillsPixel);
            SET(pse, ps_extra, InputCoverageMaskState);

#if GFX_VERx10 >= 125 && INTEL_WA_18038825448_GFX_VER
            pse.EnablePSDependencyOnCPsizeChange = true;
#endif
         }
#endif /* INTEL_WA_18038825448_GFX_VER */
      } else {
         anv_gfx_pack(ps_extra, GENX(3DSTATE_PS_EXTRA), ps_extra);
         anv_gfx_pack(ps_extra_dep, GENX(3DSTATE_PS_EXTRA), ps_extra);
      }
   }

#if GFX_VERx10 >= 125
   if (hw_state->use_tbimr && IS_DIRTY(TBIMR_TILE_PASS_INFO)) {
      anv_gfx_pack(tbimr, GENX(3DSTATE_TBIMR_TILE_PASS_INFO), tbimr) {
         SET(tbimr, tbimr, TileRectangleHeight);
         SET(tbimr, tbimr, TileRectangleWidth);
         SET(tbimr, tbimr, VerticalTileCount);
         SET(tbimr, tbimr, HorizontalTileCount);
         SET(tbimr, tbimr, TBIMRBatchSize);
         SET(tbimr, tbimr, TileBoxCheck);
      }
   }
#endif

#undef IS_DIRTY
#undef GET
#undef SET

   BITSET_OR(hw_state->emit_dirty, hw_state->emit_dirty, hw_state->pack_dirty);
   BITSET_ZERO(hw_state->pack_dirty);
}

/**
 * This function takes the vulkan runtime values & dirty states and updates
 * the values in anv_gfx_dynamic_state, flagging HW instructions for
 * reemission if the values are changing.
 *
 * Nothing is emitted in the batch buffer.
 */
void
genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
{
   cmd_buffer_flush_gfx_runtime_state(
      &cmd_buffer->state.gfx.dyn_state,
      cmd_buffer->device,
      &cmd_buffer->vk.dynamic_graphics_state,
      &cmd_buffer->state.gfx,
      cmd_buffer->vk.level);

   vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);

   cmd_buffer_repack_gfx_state(&cmd_buffer->state.gfx.dyn_state,
                               cmd_buffer,
                               &cmd_buffer->state.gfx);
}

/**
 * Emit the dummy draw sequence for Wa_18020335297.
 *
 * A minimal set of 3D state is written directly into the command buffer's
 * batch with anv_batch_emit() (zeroed VS/GS/HS/TE/DS/STREAMOUT, two constant
 * vertex elements, CLIPMODE_REJECT_ALL clipping, TRILIST topology), followed
 * by one 3-vertex, single-instance 3DPRIMITIVE per slice reported by the
 * device info.
 *
 * The order of the emitted commands is significant (see the Wa_16012775297
 * note below) and must be preserved.
 *
 * NOTE(review): this state is not routed through hw_state->packed, so the
 * caller presumably has to re-emit the real state afterwards - confirm at
 * the call site.
 *
 * \param cmd_buffer  command buffer whose batch receives the commands
 */
static void
emit_wa_18020335297_dummy_draw(struct anv_cmd_buffer *cmd_buffer)
{
   /* For Wa_16012775297, ensure VF_STATISTICS is emitted before 3DSTATE_VF
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), zero);
#if GFX_VERx10 >= 125
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) {
      vfg.DistributionMode = RR_STRICT;
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
      vf.GeometryDistributionEnable =
         cmd_buffer->device->physical->instance->enable_vf_distribution;
   }
#endif

#if GFX_VER >= 12
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
      pr.ReplicaMask = 1;
   }
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), rr) {
      rr.CullMode = CULLMODE_NONE;
      rr.FrontFaceFillMode = FILL_MODE_SOLID;
      rr.BackFaceFillMode = FILL_MODE_SOLID;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), zero);

#if GFX_VER >= 11
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS_2), zero);
#endif

   /* Every primitive of the dummy draw is rejected by the clipper. */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLIP), clip) {
      clip.ClipEnable = true;
      clip.ClipMode = CLIPMODE_REJECT_ALL;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), zero);

   /* 3DSTATE_VERTEX_ELEMENTS: one header dword followed by the packed
    * VERTEX_ELEMENT_STATE entries.
    */
   const uint32_t num_elements = 2;
   const uint32_t ve_dwords =
      1 + num_elements * GENX(VERTEX_ELEMENT_STATE_length);
   uint32_t *vertex_elements = anv_batch_emitn(&cmd_buffer->batch, ve_dwords,
                                               GENX(3DSTATE_VERTEX_ELEMENTS));

   /* The batch may fail to provide space, in which case there is nothing to
    * pack into. Skip the payload like anv_batch_emit() does for its body.
    */
   if (vertex_elements != NULL) {
      uint32_t *ve_pack_dest = &vertex_elements[1];

      for (uint32_t i = 0; i < num_elements; i++) {
         /* Element 0 stores (0, 0, 0, 0), element 1 stores (0, 0, 1, 1). */
         struct GENX(VERTEX_ELEMENT_STATE) element = {
            .Valid = true,
            .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
            .Component0Control = VFCOMP_STORE_0,
            .Component1Control = VFCOMP_STORE_0,
            .Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
            .Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
         };
         GENX(VERTEX_ELEMENT_STATE_pack)(NULL, ve_pack_dest, &element);
         ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
      }
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
   }

   /* Emit dummy draw per slice. */
   for (unsigned i = 0; i < cmd_buffer->device->info->num_slices; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.VertexCountPerInstance = 3;
         prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
         prim.InstanceCount = 1;
         prim.VertexAccessType = SEQUENTIAL;
      }
   }
}

#if INTEL_WA_14018283232_GFX_VER
/**
 * Emit the RESOURCE_BARRIER used for Wa_14018283232 into \p batch: an
 * immediate barrier signaled at the color stage and waited on at the pixel
 * stage.
 */
void
genX(batch_emit_wa_14018283232)(struct anv_batch *batch)
{
   const struct GENX(RESOURCE_BARRIER_BODY) body = {
      .BarrierType = RESOURCE_BARRIER_TYPE_IMMEDIATE,
      .SignalStage = RESOURCE_BARRIER_STAGE_COLOR,
      .WaitStage = RESOURCE_BARRIER_STAGE_PIXEL,
   };

   anv_batch_emit(batch, GENX(RESOURCE_BARRIER), barrier) {
      barrier.ResourceBarrierBody = body;
   }
}
#endif

/**
 * Emit the URB allocation commands described by \p urb_cfg into \p batch.
 *
 * One command is emitted for each stage from index 0 up to and including
 * MESA_SHADER_GEOMETRY. On Gfx12.5+ the task and mesh allocations are also
 * emitted when the device was created with VK_EXT_mesh_shader enabled, the
 * same condition the rest of this file uses to pack and emit task/mesh
 * state.
 *
 * The HW allocation size fields are programmed as "size - 1"; a stage with a
 * size of 0 leaves the field at its zero default instead of underflowing.
 *
 * \param batch    batch receiving the commands
 * \param device   device, only consulted for the enabled extensions
 * \param urb_cfg  per-stage URB size / start / entry count
 */
void
genX(emit_urb_setup)(struct anv_batch *batch,
                     const struct anv_device *device,
                     const struct intel_urb_config *urb_cfg)
{
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      /* The VS flavor of the command is reused for the following stages by
       * bumping its sub-opcode by the stage index.
       */
#if GFX_VER >= 12
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
         urb._3DCommandSubOpcode             += i;
         if (urb_cfg->size[i] > 0)
            urb.VSURBEntryAllocationSize     = urb_cfg->size[i] - 1;
         urb.VSURBStartingAddressSlice0      = urb_cfg->start[i];
         urb.VSURBStartingAddressSliceN      = urb_cfg->start[i];
         urb.VSNumberofURBEntriesSlice0      = urb_cfg->entries[i];
         urb.VSNumberofURBEntriesSliceN      = urb_cfg->entries[i];
      }
#else
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode         += i;
         if (urb_cfg->size[i] > 0)
            urb.VSURBEntryAllocationSize = urb_cfg->size[i] - 1;
         urb.VSURBStartingAddress        = urb_cfg->start[i];
         urb.VSNumberofURBEntries        = urb_cfg->entries[i];
      }
#endif
   }

#if GFX_VERx10 >= 125
   /* Gate on the extension, like every other task/mesh state in this file,
    * so the URB programming can never be skipped while the remaining
    * task/mesh instructions are still being emitted.
    */
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
         if (urb_cfg->size[MESA_SHADER_TASK] > 0)
            urb.TASKURBEntryAllocationSize = urb_cfg->size[MESA_SHADER_TASK] - 1;
         urb.TASKNumberofURBEntriesSlice0  = urb_cfg->entries[MESA_SHADER_TASK];
         urb.TASKNumberofURBEntriesSliceN  = urb_cfg->entries[MESA_SHADER_TASK];
         urb.TASKURBStartingAddressSlice0  = urb_cfg->start[MESA_SHADER_TASK];
         urb.TASKURBStartingAddressSliceN  = urb_cfg->start[MESA_SHADER_TASK];
      }
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
         if (urb_cfg->size[MESA_SHADER_MESH] > 0)
            urb.MESHURBEntryAllocationSize = urb_cfg->size[MESA_SHADER_MESH] - 1;
         urb.MESHNumberofURBEntriesSlice0  = urb_cfg->entries[MESA_SHADER_MESH];
         urb.MESHNumberofURBEntriesSliceN  = urb_cfg->entries[MESA_SHADER_MESH];
         urb.MESHURBStartingAddressSlice0  = urb_cfg->start[MESA_SHADER_MESH];
         urb.MESHURBStartingAddressSliceN  = urb_cfg->start[MESA_SHADER_MESH];
      }
   }
#endif
}

/**
 * This function handles dirty state emission to the batch buffer.
 */
static void
cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_batch *batch = &cmd_buffer->batch;
   struct anv_device *device = cmd_buffer->device;
   struct anv_instance *instance = device->physical->instance;
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
   const struct vk_dynamic_graphics_state *dyn =
      &cmd_buffer->vk.dynamic_graphics_state;
   struct anv_push_constants *push_consts =
      &cmd_buffer->state.gfx.base.push_constants;
   struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;

#define DEBUG_SHADER_HASH(stage) do {                                   \
      if (unlikely(                                                     \
             (instance->debug & ANV_DEBUG_SHADER_HASH) &&               \
             anv_gfx_has_stage(gfx, stage))) {                          \
         mi_store(&b,                                                   \
                  mi_mem32(device->workaround_address),                 \
                  mi_imm(gfx->shaders[stage]->prog_data->source_hash)); \
      }                                                                 \
   } while (0)

   struct mi_builder b;
   if (unlikely(instance->debug & ANV_DEBUG_SHADER_HASH)) {
      mi_builder_init(&b, device->info, &cmd_buffer->batch);
      mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
   }

#if INTEL_WA_16011107343_GFX_VER
   /* Will be emitted in front of every draw instead */
   if (intel_needs_workaround(device->info, 16011107343) &&
       anv_cmd_buffer_has_gfx_stage(cmd_buffer, MESA_SHADER_TESS_CTRL))
      BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_HS);
#endif

#if INTEL_WA_22018402687_GFX_VER
   /* Will be emitted in front of every draw instead */
   if (intel_needs_workaround(device->info, 22018402687) &&
       anv_cmd_buffer_has_gfx_stage(cmd_buffer, MESA_SHADER_TESS_EVAL))
      BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_DS);
#endif

#define IS_DIRTY(name) BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_##name)

   /*
    * Values provided by push constants
    */

   if (IS_DIRTY(TESS_CONFIG)) {
      push_consts->gfx.tess_config = hw_state->tess_config;
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
                                                VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
      gfx->base.push_constants_data_dirty = true;
   }

#if INTEL_WA_14024997852_GFX_VER
   if (IS_DIRTY(WA_14024997852) &&
       intel_needs_workaround(device->info, 14024997852)) {
      genX(setup_autostrip_state)(cmd_buffer, !hw_state->autostrip_disabled);
   }
#endif

#if INTEL_WA_18019110168_GFX_VER
   if (IS_DIRTY(MESH_PROVOKING_VERTEX))
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_MESH_BIT_EXT;
#endif

   if (IS_DIRTY(FS_MSAA_FLAGS)) {
      push_consts->gfx.fs_msaa_flags = hw_state->fs_msaa_flags;

      const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);
      if (mesh_prog_data) {
         push_consts->gfx.fs_per_prim_remap_offset =
            gfx->shaders[MESA_SHADER_MESH]->kernel.offset +
            mesh_prog_data->wa_18019110168_mapping_offset;
      }

      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
      gfx->base.push_constants_data_dirty = true;
   }

#define anv_batch_emit_gfx(batch, cmd, name) ({                         \
      void *__dst = anv_batch_emit_dwords(                              \
         batch, __anv_cmd_length(cmd));                                 \
      if (__dst != NULL) {                                              \
         memcpy(__dst, hw_state->packed.name,                           \
               4 * __anv_cmd_length(cmd));                              \
         VG(VALGRIND_CHECK_MEM_IS_DEFINED(                              \
               __dst, __anv_cmd_length(cmd) * 4));                      \
      }                                                                 \
      __dst;                                                            \
   })
#define anv_batch_emit_gfx_variable(batch, name) do {                   \
      void *__dst = anv_batch_emit_dwords(                              \
         batch, hw_state->packed.name##_len);                           \
      if (__dst != NULL) {                                              \
         memcpy(__dst, hw_state->packed.name,                           \
               4 * hw_state->packed.name##_len);                        \
         VG(VALGRIND_CHECK_MEM_IS_DEFINED(                              \
               __dst, 4 * hw_state->packed.name##_len));                \
      }                                                                 \
   } while (0)

   if (IS_DIRTY(URB)) {
#if INTEL_NEEDS_WA_16014912113
      if (genX(need_wa_16014912113)(
             &cmd_buffer->state.gfx.urb_cfg, &hw_state->urb_cfg))
         genX(batch_emit_wa_16014912113)(batch, &cmd_buffer->state.gfx.urb_cfg);

      /* Update urb config. */
      memcpy(&cmd_buffer->state.gfx.urb_cfg, &hw_state->urb_cfg,
             sizeof(hw_state->urb_cfg));
#endif

      genX(emit_urb_setup)(batch, device, &hw_state->urb_cfg);
   }

   if (IS_DIRTY(VF_SGVS_INSTANCING))
      anv_batch_emit_gfx_variable(batch, vf_sgvs_instancing);

   if (IS_DIRTY(VF_SGVS))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_SGVS), vf_sgvs);

#if GFX_VER >= 11
   if (IS_DIRTY(VF_SGVS_2))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_SGVS_2), vf_sgvs_2);
#endif

   if (device->physical->instance->vf_component_packing &&
       IS_DIRTY(VF_COMPONENT_PACKING)) {
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_COMPONENT_PACKING),
                         vf_component_packing);
   }

   if (IS_DIRTY(VS)) {
      DEBUG_SHADER_HASH(MESA_SHADER_VERTEX);
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VS), vs);
   }

   if (IS_DIRTY(HS)) {
      DEBUG_SHADER_HASH(MESA_SHADER_TESS_CTRL);
      anv_batch_emit_gfx(batch, GENX(3DSTATE_HS), hs);
   }

   if (IS_DIRTY(DS)) {
      DEBUG_SHADER_HASH(MESA_SHADER_TESS_EVAL);
      anv_batch_emit_gfx(batch, GENX(3DSTATE_DS), ds);
   }

   if (IS_DIRTY(VF_STATISTICS))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_STATISTICS), vfs);

   if (IS_DIRTY(SO_DECL_LIST)) {
      /* Wa_16011773973:
       * If SOL is enabled and SO_DECL state has to be programmed,
       *    1. Send 3D State SOL state with SOL disabled
       *    2. Send SO_DECL NP state
       *    3. Send 3D State SOL with SOL Enabled
       */
      if (intel_needs_workaround(device->info, 16011773973) &&
          gfx->shaders[gfx->streamout_stage]->xfb_info != NULL)
         anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);

      anv_batch_emit_gfx_variable(batch, so_decl_list);

#if GFX_VER >= 11 && GFX_VER < 20
      /* ICL PRMs, Volume 2a - Command Reference: Instructions,
       * 3DSTATE_SO_DECL_LIST:
       *
       *    "Workaround: This command must be followed by a PIPE_CONTROL with
       *     CS Stall bit set."
       *
       * On DG2+ also known as Wa_1509820217.
       */
      genx_batch_emit_pipe_control(batch, device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT);
#endif
   }

#if GFX_VERx10 >= 125
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      if (IS_DIRTY(MESH_CONTROL))
         anv_batch_emit_gfx(batch, GENX(3DSTATE_MESH_CONTROL), mesh_control);

      if (IS_DIRTY(MESH_SHADER)) {
         DEBUG_SHADER_HASH(MESA_SHADER_MESH);
         anv_batch_emit_gfx(batch, GENX(3DSTATE_MESH_SHADER), mesh_shader);
      }

      if (IS_DIRTY(MESH_DISTRIB))
         anv_batch_emit_gfx(batch, GENX(3DSTATE_MESH_DISTRIB), mesh_distrib);

      if (IS_DIRTY(TASK_CONTROL))
         anv_batch_emit_gfx(batch, GENX(3DSTATE_TASK_CONTROL), task_control);

      if (IS_DIRTY(TASK_SHADER)) {
         DEBUG_SHADER_HASH(MESA_SHADER_TASK);
         anv_batch_emit_gfx(batch, GENX(3DSTATE_TASK_SHADER), task_shader);
      }

      if (IS_DIRTY(TASK_REDISTRIB))
         anv_batch_emit_gfx(batch, GENX(3DSTATE_TASK_REDISTRIB), task_redistrib);

      if (IS_DIRTY(SBE_MESH))
         anv_batch_emit_gfx(batch, GENX(3DSTATE_SBE_MESH), sbe_mesh);

      if (IS_DIRTY(CLIP_MESH))
         anv_batch_emit_gfx(batch, GENX(3DSTATE_CLIP_MESH), clip_mesh);
   }
#endif

   if (IS_DIRTY(SBE))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_SBE), sbe);

   if (IS_DIRTY(SBE_SWIZ))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_SBE_SWIZ), sbe_swiz);

   if (IS_DIRTY(PS)) {
      DEBUG_SHADER_HASH(MESA_SHADER_FRAGMENT);
      anv_batch_emit_gfx(batch, GENX(3DSTATE_PS), ps);
   }

#if INTEL_WA_18038825448_GFX_VER
   if (IS_DIRTY(PS_EXTRA) || IS_DIRTY(WA_18038825448)) {
      if (IS_DIRTY(WA_18038825448))
         anv_batch_emit_gfx(batch, GENX(3DSTATE_PS_EXTRA), ps_extra_dep);
      else
         anv_batch_emit_gfx(batch, GENX(3DSTATE_PS_EXTRA), ps_extra);
   }
#else
   if (IS_DIRTY(PS_EXTRA))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_PS_EXTRA), ps_extra);
#endif

   if (IS_DIRTY(CLIP))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_CLIP), clip);

   if (IS_DIRTY(STREAMOUT)) {
      genX(streamout_prologue)(cmd_buffer, gfx);
      anv_batch_emit_gfx(batch, GENX(3DSTATE_STREAMOUT), so);
   }

   if (IS_DIRTY(VIEWPORT_SF_CLIP))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), sf_clip);

   if (IS_DIRTY(VIEWPORT_CC)) {
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc_viewport);
      cmd_buffer->state.gfx.viewport_set = true;
   }

   if (IS_DIRTY(SCISSOR))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), scissor);

   if (IS_DIRTY(VF_TOPOLOGY))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_TOPOLOGY), vft);

   if (IS_DIRTY(VERTEX_INPUT)) {
      genX(batch_emit_vertex_input)(batch, device,
                                    gfx->shaders[MESA_SHADER_VERTEX], dyn->vi);
   }

   if (IS_DIRTY(TE))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_TE), te);

   if (IS_DIRTY(GS)) {
      DEBUG_SHADER_HASH(MESA_SHADER_GEOMETRY);
      anv_batch_emit_gfx(batch, GENX(3DSTATE_GS), gs);
   }

#if GFX_VER >= 11
   if (IS_DIRTY(CPS)) {
#if GFX_VER >= 30
      anv_batch_emit_gfx(batch, GENX(3DSTATE_COARSE_PIXEL), cps);
#elif GFX_VER >= 12
      /* TODO: we can optimize this flush in the following cases:
       *
       *    In the case where the last geometry shader emits a value that is
       *    not constant, we can avoid this stall because we can synchronize
       *    the pixel shader internally with
       *    3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
       *
       *    If we know that the previous pipeline and the current one are
       *    using the same fragment shading rate.
       */
      anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VERx10 >= 125
         pc.PSSStallSyncEnable = true;
#else
         pc.PSDSyncEnable = true;
#endif
      }
      anv_batch_emit_gfx(batch, GENX(3DSTATE_CPS_POINTERS), cps);
#else
      anv_batch_emit_gfx(batch, GENX(3DSTATE_CPS), cps);
#endif
   }
#endif /* GFX_VER >= 11 */

   if (IS_DIRTY(SF))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_SF), sf);

   if (IS_DIRTY(RASTER))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_RASTER), raster);

   if (IS_DIRTY(MULTISAMPLE))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_MULTISAMPLE), ms);

   if (IS_DIRTY(CC_STATE))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc_state);

   if (IS_DIRTY(SAMPLE_MASK))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_SAMPLE_MASK), sm);

   if (IS_DIRTY(WM_DEPTH_STENCIL))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wm_ds);

#if GFX_VER >= 12
   if (IS_DIRTY(DEPTH_BOUNDS))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_DEPTH_BOUNDS), db);
#endif

   if (IS_DIRTY(LINE_STIPPLE)) {
      anv_batch_emit_gfx(batch, GENX(3DSTATE_LINE_STIPPLE), ls);
#if GFX_VER >= 11
      /* ICL PRMs, Volume 2a - Command Reference: Instructions,
       * 3DSTATE_LINE_STIPPLE:
       *
       *    "Workaround: This command must be followed by a PIPE_CONTROL with
       *     CS Stall bit set."
       */
      genx_batch_emit_pipe_control(batch, device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT);
#endif
   }

   if (IS_DIRTY(VF))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VF), vf);

#if GFX_VER >= 12
   if (IS_DIRTY(PRIMITIVE_REPLICATION))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif

   if (IS_DIRTY(INDEX_BUFFER))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_INDEX_BUFFER), ib);

#if GFX_VERx10 >= 125
   if (IS_DIRTY(VFG))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_VFG), vfg);
#endif

   if (IS_DIRTY(SAMPLE_PATTERN)) {
      genX(emit_sample_pattern)(batch,
                                dyn->ms.sample_locations_enable ?
                                dyn->ms.sample_locations : NULL);
   }

   if (IS_DIRTY(WM))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_WM), wm);

   if (IS_DIRTY(PS_BLEND))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_PS_BLEND), ps_blend);

   if (IS_DIRTY(BLEND_STATE))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), blend_state);

#if INTEL_WA_18019816803_GFX_VER
   if (IS_DIRTY(WA_18019816803)) {
      genX(batch_emit_pipe_control)(batch, device->info,
                                    cmd_buffer->state.current_pipeline,
                                    ANV_PIPE_PSS_STALL_SYNC_BIT,
                                    "Wa_18019816803");
   }
#endif

#if INTEL_WA_14018283232_GFX_VER
   if (IS_DIRTY(WA_14018283232))
      genX(batch_emit_wa_14018283232)(batch);
#endif

#if GFX_VER == 9
   if (IS_DIRTY(PMA_FIX))
      genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix);
#endif

#if GFX_VERx10 >= 125
   if (hw_state->use_tbimr && IS_DIRTY(TBIMR_TILE_PASS_INFO))
      anv_batch_emit_gfx(batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO), tbimr);
#endif

#undef anv_batch_emit_gfx
#undef anv_batch_emit_gfx_variable
#undef INIT
#undef SET
#undef SET_ARRAY
#undef IS_DIRTY
#undef DEBUG_SHADER_HASH

   BITSET_ZERO(hw_state->emit_dirty);
}

/**
 * This function handles possible state workarounds and emits the dirty
 * instructions to the batch buffer.
 */
void
genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
   struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;

   if (INTEL_DEBUG(DEBUG_REEMIT)) {
      BITSET_OR(gfx->dyn_state.emit_dirty,
                gfx->dyn_state.emit_dirty,
                device->gfx_dirty_state);
   }

   /**
    * Put potential workarounds here if you need to reemit an instruction
    * because of another one is changing.
    */

   /* Reprogram SF_CLIP & CC_STATE together. This reproduces the programming
    * done on Windows drivers. Fixes flickering issues with multiple
    * workloads.
    *
    * Since blorp disables 3DSTATE_CLIP::ClipEnable and dirties CC_STATE, this
    * also takes care of Wa_14016820455 which requires SF_CLIP to be
    * reprogrammed whenever 3DSTATE_CLIP::ClipEnable is enabled.
    */
   if (BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
       BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_CC)) {
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_CC);
   }

   /* Wa_16012775297 - Emit dummy VF statistics before each 3DSTATE_VF. */
#if INTEL_WA_16012775297_GFX_VER
   if (intel_needs_workaround(device->info, 16012775297) &&
       BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_VF))
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_STATISTICS);
#endif

   /* Since Wa_16011773973 will disable 3DSTATE_STREAMOUT, we need to reemit
    * it after.
    */
   if (intel_needs_workaround(device->info, 16011773973) &&
       gfx->shaders[gfx->streamout_stage]->xfb_info != NULL &&
       BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_STREAMOUT);
   }

#if INTEL_WA_18038825448_GFX_VER
   const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
   if (wm_prog_data) {
      genX(cmd_buffer_set_coarse_pixel_active)(
         cmd_buffer,
         brw_wm_prog_data_is_coarse(wm_prog_data, hw_state->fs_msaa_flags));
   }
#endif

   /* Gfx11 undocumented issue :
    * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9781
    */
#if GFX_VER == 11
   if (BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_BLEND_STATE))
      BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_MULTISAMPLE);
#endif

#if GFX_VERx10 == 125
   if (intel_device_info_is_dg2(device->info)) {
      /* On DG2 & MTL, dEQP-VK.shader_object.binding.mesh_swap_task fails on
       * both simulation & HW, dEQP-VK.shader_object.binding.mesh_swap_mesh
       * fails on HW.
       *
       * We can get the first test to pass more often by reemitting
       * 3DSTATE_TASK_CONTROL but the other nothing is helping but a CS stall.
       *
       * What seems to happen is that the new shader offset programmed isn't
       * applied and instead the HW reexecutes the previous shader.
       */
      if ((BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_TASK_SHADER) ||
           BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_MESH_SHADER)) &&
          gfx->shaders[MESA_SHADER_MESH] != NULL) {
         genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
                                      _3D, ANV_PIPE_CS_STALL_BIT);
      }
   }
#endif

   /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
   if (intel_needs_workaround(device->info, 18020335297) &&
       BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_CC) &&
       cmd_buffer->state.gfx.viewport_set) {
      /* For mesh, we implement the WA using CS stall. This is for
       * simplicity and takes care of possible interaction with Wa_16014390852.
       */
      if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
         genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
                                      _3D, ANV_PIPE_CS_STALL_BIT);
      } else {
         /* Mask off all instructions that we program. */
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_VFG);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_VF);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_RASTER);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_VF_STATISTICS);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_VF_SGVS);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_VF_SGVS_2);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_CLIP);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_STREAMOUT);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_VERTEX_INPUT);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_VF_TOPOLOGY);

         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_VS);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_GS);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_HS);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_TE);
         BITSET_CLEAR(hw_state->emit_dirty, ANV_GFX_STATE_DS);

         cmd_buffer_gfx_state_emission(cmd_buffer);

         emit_wa_18020335297_dummy_draw(cmd_buffer);

         /* Dirty all emitted WA state to make sure that current real
          * state is restored.
          */
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VFG);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_RASTER);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_STATISTICS);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_SGVS);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_SGVS_2);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_CLIP);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_STREAMOUT);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VERTEX_INPUT);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_TOPOLOGY);

         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VS);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_GS);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_HS);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_TE);
         BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_DS);
      }
   }

   cmd_buffer_gfx_state_emission(cmd_buffer);
}

/**
 * Switch the PMA fix (STCPMAOptimizationEnable in CACHE_MODE_0) on or off.
 *
 * Nothing is emitted when the command buffer is not on a render queue or
 * when the requested state matches the one already recorded in
 * cmd_buffer->state.gfx.pma_fix_enabled. Otherwise the new state is
 * recorded and two PIPE_CONTROLs are emitted; on GFX_VER == 9 the
 * CACHE_MODE_0 register write is emitted between them.
 */
void
genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
{
   if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
      return;

   if (cmd_buffer->state.gfx.pma_fix_enabled == enable)
      return;

   cmd_buffer->state.gfx.pma_fix_enabled = enable;

   /* Cache flushes shared by the PIPE_CONTROLs before and after the LRI.
    *
    * The mask is built up front so that no preprocessor conditional ends up
    * inside the argument list of genx_batch_emit_pipe_control(): a directive
    * inside a function-like macro invocation is undefined behavior in ISO C
    * and is only accepted as a compiler extension.
    */
   uint32_t cache_flush_bits =
      ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
      ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
#if GFX_VER >= 12
   cache_flush_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
#endif

   /* According to the Broadwell PIPE_CONTROL documentation, software should
    * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
    * prior to the LRI.  If stencil buffer writes are enabled, then a Render
    * Cache Flush is also necessary.
    *
    * The Skylake docs say to use a depth stall rather than a command
    * streamer stall.  However, the hardware seems to violently disagree.
    * A full command streamer stall seems to be needed in both cases.
    */
   genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                cmd_buffer->device->info,
                                cmd_buffer->state.current_pipeline,
                                cache_flush_bits | ANV_PIPE_CS_STALL_BIT);

#if GFX_VER == 9
   uint32_t cache_mode;
   anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
                   .STCPMAOptimizationEnable = enable,
                   .STCPMAOptimizationEnableMask = true);
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset   = GENX(CACHE_MODE_0_num);
      lri.DataDWord        = cache_mode;
   }

#endif /* GFX_VER == 9 */

   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
    * Flush bits is often necessary.  We do it regardless because it's easier.
    * The render cache flush is also necessary if stencil writes are enabled.
    *
    * Again, the Skylake docs give a different set of flushes but the BDW
    * flushes seem to work just as well.
    */
   genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                cmd_buffer->device->info,
                                cmd_buffer->state.current_pipeline,
                                cache_flush_bits | ANV_PIPE_DEPTH_STALL_BIT);
}