diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index c27a23afcb3..2646ebe29ad 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -52,6 +52,10 @@ anv_cmd_state_init(struct anv_cmd_buffer *cmd_buffer) state->gfx.restart_index = UINT32_MAX; state->gfx.object_preemption = true; state->gfx.dirty = 0; + + memcpy(state->gfx.dyn_state.dirty, + cmd_buffer->device->gfx_dirty_state, + sizeof(state->gfx.dyn_state.dirty)); } static void @@ -422,6 +426,125 @@ anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.push_constants_dirty |= stages; } +/** + * This function computes changes between two pipelines and flags the dirty HW + * state appropriately. + */ +static void +anv_cmd_buffer_flush_pipeline_state(struct anv_cmd_buffer *cmd_buffer, + struct anv_graphics_pipeline *old_pipeline, + struct anv_graphics_pipeline *new_pipeline) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state; + +#define diff_fix_state(bit, name) \ + do { \ + /* Fixed states should always have matching sizes */ \ + assert(old_pipeline == NULL || \ + old_pipeline->name.len == new_pipeline->name.len); \ + /* Don't bother memcmp if the state is already dirty */ \ + if (!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_##bit) && \ + (old_pipeline == NULL || \ + memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \ + &new_pipeline->batch_data[new_pipeline->name.offset], \ + 4 * new_pipeline->name.len) != 0)) \ + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \ + } while (0) +#define diff_var_state(bit, name) \ + do { \ + /* Don't bother memcmp if the state is already dirty */ \ + /* Also if the new state is empty, avoid marking dirty */ \ + if (!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_##bit) && \ + new_pipeline->name.len != 0 && \ + (old_pipeline == NULL || \ + old_pipeline->name.len != new_pipeline->name.len || \ + memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \ + &new_pipeline->batch_data[new_pipeline->name.offset], \ + 4 * new_pipeline->name.len) != 0)) \ + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \ + } while (0) +#define assert_identical(bit, name) \ + do { \ + /* Fixed states should always have matching sizes */ \ + assert(old_pipeline == NULL || \ + old_pipeline->name.len == new_pipeline->name.len); \ + assert(old_pipeline == NULL || \ + memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \ + &new_pipeline->batch_data[new_pipeline->name.offset], \ + 4 * new_pipeline->name.len) == 0); \ + } while (0) +#define assert_empty(name) assert(new_pipeline->name.len == 0) + + /* Compare all states, including partially packed ones; the dynamic part is + * left at 0 but the static part could still change. 
+ */ + diff_fix_state(URB, final.urb); + diff_fix_state(VF_SGVS, final.vf_sgvs); + if (cmd_buffer->device->info->ver >= 11) + diff_fix_state(VF_SGVS_2, final.vf_sgvs_2); + if (cmd_buffer->device->info->ver >= 12) + diff_fix_state(PRIMITIVE_REPLICATION, final.primitive_replication); + diff_fix_state(SBE, final.sbe); + diff_fix_state(SBE_SWIZ, final.sbe_swiz); + diff_fix_state(MULTISAMPLE, final.ms); + diff_fix_state(VS, final.vs); + diff_fix_state(HS, final.hs); + diff_fix_state(DS, final.ds); + diff_fix_state(PS, final.ps); + diff_fix_state(PS_EXTRA, final.ps_extra); + + diff_fix_state(CLIP, partial.clip); + diff_fix_state(SF, partial.sf); + diff_fix_state(RASTER, partial.raster); + diff_fix_state(WM, partial.wm); + diff_fix_state(STREAMOUT, partial.so); + diff_fix_state(GS, partial.gs); + diff_fix_state(TE, partial.te); + diff_fix_state(VFG, partial.vfg); + + if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) { + diff_fix_state(TASK_CONTROL, final.task_control); + diff_fix_state(TASK_SHADER, final.task_shader); + diff_fix_state(TASK_REDISTRIB, final.task_redistrib); + diff_fix_state(MESH_CONTROL, final.mesh_control); + diff_fix_state(MESH_SHADER, final.mesh_shader); + diff_fix_state(MESH_DISTRIB, final.mesh_distrib); + diff_fix_state(CLIP_MESH, final.clip_mesh); + diff_fix_state(SBE_MESH, final.sbe_mesh); + } else { + assert_empty(final.task_control); + assert_empty(final.task_shader); + assert_empty(final.task_redistrib); + assert_empty(final.mesh_control); + assert_empty(final.mesh_shader); + assert_empty(final.mesh_distrib); + assert_empty(final.clip_mesh); + assert_empty(final.sbe_mesh); + } + + /* States that should never vary between pipelines, but can be affected by + * blorp etc... + */ + assert_identical(VF_STATISTICS, final.vf_statistics); + + /* States that can vary in length */ + diff_var_state(VF_SGVS_INSTANCING, final.vf_sgvs_instancing); + diff_var_state(SO_DECL_LIST, final.so_decl_list); + +#undef diff_fix_state +#undef diff_var_state +#undef assert_identical +#undef assert_empty + + /* We're not diffing the following : + * - anv_graphics_pipeline::vertex_input_data + * - anv_graphics_pipeline::final::vf_instancing + * + * since they are tracked by the runtime. 
+ */ +} + void anv_CmdBindPipeline( VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, @@ -451,26 +574,28 @@ void anv_CmdBindPipeline( } case VK_PIPELINE_BIND_POINT_GRAPHICS: { - struct anv_graphics_pipeline *gfx_pipeline = + struct anv_graphics_pipeline *old_pipeline = + cmd_buffer->state.gfx.pipeline; + struct anv_graphics_pipeline *new_pipeline = anv_pipeline_to_graphics(pipeline); - if (cmd_buffer->state.gfx.pipeline == gfx_pipeline) + if (old_pipeline == new_pipeline) return; cmd_buffer->state.gfx.base.pipeline = pipeline; - cmd_buffer->state.gfx.pipeline = gfx_pipeline; + cmd_buffer->state.gfx.pipeline = new_pipeline; cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; - anv_foreach_stage(stage, gfx_pipeline->base.base.active_stages) { + anv_foreach_stage(stage, new_pipeline->base.base.active_stages) { set_dirty_for_bind_map(cmd_buffer, stage, - &gfx_pipeline->base.shaders[stage]->bind_map); + &new_pipeline->base.shaders[stage]->bind_map); } /* Apply the non dynamic state from the pipeline */ vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk, - &gfx_pipeline->dynamic_state); + &new_pipeline->dynamic_state); state = &cmd_buffer->state.gfx.base; - stages = gfx_pipeline->base.base.active_stages; + stages = new_pipeline->base.base.active_stages; /* When the pipeline is using independent states and dynamic buffers, @@ -479,7 +604,7 @@ void anv_CmdBindPipeline( */ struct anv_push_constants *push = &cmd_buffer->state.gfx.base.push_constants; - struct anv_pipeline_sets_layout *layout = &gfx_pipeline->base.base.layout; + struct anv_pipeline_sets_layout *layout = &new_pipeline->base.base.layout; if (layout->independent_sets && layout->num_dynamic_buffers > 0) { bool modified = false; for (uint32_t s = 0; s < layout->num_sets; s++) { @@ -499,15 +624,17 @@ void anv_CmdBindPipeline( cmd_buffer->state.push_constants_dirty |= stages; } - if ((gfx_pipeline->fs_msaa_flags & BRW_WM_MSAA_FLAG_ENABLE_DYNAMIC) && - push->gfx.fs_msaa_flags != gfx_pipeline->fs_msaa_flags) { - push->gfx.fs_msaa_flags = gfx_pipeline->fs_msaa_flags; + if ((new_pipeline->fs_msaa_flags & BRW_WM_MSAA_FLAG_ENABLE_DYNAMIC) && + push->gfx.fs_msaa_flags != new_pipeline->fs_msaa_flags) { + push->gfx.fs_msaa_flags = new_pipeline->fs_msaa_flags; cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; } - if (gfx_pipeline->dynamic_patch_control_points) { + if (new_pipeline->dynamic_patch_control_points) { cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; } + + anv_cmd_buffer_flush_pipeline_state(cmd_buffer, old_pipeline, new_pipeline); break; } diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 0ae707e6cec..5a282271f94 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -3482,6 +3482,34 @@ VkResult anv_CreateDevice( anv_device_utrace_init(device); + BITSET_ONES(device->gfx_dirty_state); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_INDEX_BUFFER); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SO_DECL_LIST); + if (device->info->ver < 11) + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_VF_SGVS_2); + if (device->info->ver < 12) { + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_PRIMITIVE_REPLICATION); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_DEPTH_BOUNDS); + } + if (!device->vk.enabled_extensions.EXT_sample_locations) + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SAMPLE_PATTERN); + if (!device->vk.enabled_extensions.KHR_fragment_shading_rate) + 
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_CPS); + if (!device->vk.enabled_extensions.EXT_mesh_shader) { + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SBE_MESH); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_CLIP_MESH); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_CONTROL); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_SHADER); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_DISTRIB); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_CONTROL); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_SHADER); + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_REDISTRIB); + } + if (!intel_needs_workaround(device->info, 18019816803)) + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_WA_18019816803); + if (device->info->ver > 9) + BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_PMA_FIX); + *pDevice = anv_device_to_handle(device); return VK_SUCCESS; diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index e7ba1205022..1f9971ca06a 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -128,6 +128,10 @@ void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer); +void genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer); + void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 777c3983c85..9747ec6675f 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1124,6 +1124,304 @@ anv_device_upload_nir(struct anv_device *device, void anv_load_fp64_shader(struct anv_device *device); +/** + * This enum tracks the various HW instructions that hold graphics state + * needing to be reprogrammed. Some instructions are grouped together as they + * pretty much need to be emitted together (like 3DSTATE_URB_*). + * + * Not all bits apply to all platforms. We build a dirty state based on + * enabled extensions & generation on anv_device. 
+ */ +enum anv_gfx_state_bits { + /* Pipeline states */ + ANV_GFX_STATE_URB, /* All legacy stages, including mesh */ + ANV_GFX_STATE_VF_STATISTICS, + ANV_GFX_STATE_VF_SGVS, + ANV_GFX_STATE_VF_SGVS_2, + ANV_GFX_STATE_VF_SGVS_VI, /* 3DSTATE_VERTEX_ELEMENTS for sgvs elements */ + ANV_GFX_STATE_VF_SGVS_INSTANCING, /* 3DSTATE_VF_INSTANCING for sgvs elements */ + ANV_GFX_STATE_PRIMITIVE_REPLICATION, + ANV_GFX_STATE_MULTISAMPLE, + ANV_GFX_STATE_SBE, + ANV_GFX_STATE_SBE_SWIZ, + ANV_GFX_STATE_SO_DECL_LIST, + ANV_GFX_STATE_VS, + ANV_GFX_STATE_HS, + ANV_GFX_STATE_DS, + ANV_GFX_STATE_GS, + ANV_GFX_STATE_PS, + ANV_GFX_STATE_PS_EXTRA, + ANV_GFX_STATE_SBE_MESH, + ANV_GFX_STATE_CLIP_MESH, + ANV_GFX_STATE_MESH_CONTROL, + ANV_GFX_STATE_MESH_SHADER, + ANV_GFX_STATE_MESH_DISTRIB, + ANV_GFX_STATE_TASK_CONTROL, + ANV_GFX_STATE_TASK_SHADER, + ANV_GFX_STATE_TASK_REDISTRIB, + /* Dynamic states */ + ANV_GFX_STATE_BLEND_STATE_POINTERS, + ANV_GFX_STATE_CLIP, + ANV_GFX_STATE_CC_STATE, + ANV_GFX_STATE_CPS, + ANV_GFX_STATE_DEPTH_BOUNDS, + ANV_GFX_STATE_INDEX_BUFFER, + ANV_GFX_STATE_LINE_STIPPLE, + ANV_GFX_STATE_PS_BLEND, + ANV_GFX_STATE_RASTER, + ANV_GFX_STATE_SAMPLE_MASK, + ANV_GFX_STATE_SAMPLE_PATTERN, + ANV_GFX_STATE_SCISSOR, + ANV_GFX_STATE_SF, + ANV_GFX_STATE_STREAMOUT, + ANV_GFX_STATE_TE, + ANV_GFX_STATE_VERTEX_INPUT, + ANV_GFX_STATE_VF, + ANV_GFX_STATE_VF_TOPOLOGY, + ANV_GFX_STATE_VFG, + ANV_GFX_STATE_VIEWPORT_CC, + ANV_GFX_STATE_VIEWPORT_SF_CLIP, + ANV_GFX_STATE_WM, + ANV_GFX_STATE_WM_DEPTH_STENCIL, + ANV_GFX_STATE_PMA_FIX, /* Fake state to implement workaround */ + ANV_GFX_STATE_WA_18019816803, /* Fake state to implement workaround */ + + ANV_GFX_STATE_MAX, +}; + +const char *anv_gfx_state_bit_to_str(enum anv_gfx_state_bits state); + +/* This structure tracks the values to program in HW instructions + * corresponding to dynamic states of the Vulkan API. Only fields that need to + * be reemitted outside of the VkPipeline object are tracked here. 
+ */ +struct anv_gfx_dynamic_state { + /* 3DSTATE_BLEND_STATE_POINTERS */ + struct { + bool AlphaToCoverageEnable; + bool AlphaToOneEnable; + bool IndependentAlphaBlendEnable; + struct { + bool WriteDisableAlpha; + bool WriteDisableRed; + bool WriteDisableGreen; + bool WriteDisableBlue; + + uint32_t LogicOpFunction; + bool LogicOpEnable; + + bool ColorBufferBlendEnable; + uint32_t ColorClampRange; + bool PreBlendColorClampEnable; + bool PostBlendColorClampEnable; + uint32_t SourceBlendFactor; + uint32_t DestinationBlendFactor; + uint32_t ColorBlendFunction; + uint32_t SourceAlphaBlendFactor; + uint32_t DestinationAlphaBlendFactor; + uint32_t AlphaBlendFunction; + } rts[MAX_RTS]; + } blend; + + /* 3DSTATE_CC_STATE_POINTERS */ + struct { + float BlendConstantColorRed; + float BlendConstantColorGreen; + float BlendConstantColorBlue; + float BlendConstantColorAlpha; + } cc; + + /* 3DSTATE_CLIP */ + struct { + uint32_t APIMode; + uint32_t ViewportXYClipTestEnable; + uint32_t MaximumVPIndex; + uint32_t TriangleStripListProvokingVertexSelect; + uint32_t LineStripListProvokingVertexSelect; + uint32_t TriangleFanProvokingVertexSelect; + } clip; + + /* 3DSTATE_CPS/3DSTATE_CPS_POINTERS */ + struct { + /* Gfx11 */ + uint32_t CoarsePixelShadingMode; + float MinCPSizeX; + float MinCPSizeY; + /* Gfx12+ */ + uint32_t CoarsePixelShadingStateArrayPointer; + } cps; + + /* 3DSTATE_DEPTH_BOUNDS */ + struct { + bool DepthBoundsTestEnable; + float DepthBoundsTestMinValue; + float DepthBoundsTestMaxValue; + } db; + + /* 3DSTATE_GS */ + struct { + uint32_t ReorderMode; + } gs; + + /* 3DSTATE_LINE_STIPPLE */ + struct { + uint32_t LineStipplePattern; + float LineStippleInverseRepeatCount; + uint32_t LineStippleRepeatCount; + } ls; + + /* 3DSTATE_PS_BLEND */ + struct { + bool HasWriteableRT; + bool ColorBufferBlendEnable; + uint32_t SourceAlphaBlendFactor; + uint32_t DestinationAlphaBlendFactor; + uint32_t SourceBlendFactor; + uint32_t DestinationBlendFactor; + bool AlphaTestEnable; + bool IndependentAlphaBlendEnable; + bool AlphaToCoverageEnable; + } ps_blend; + + /* 3DSTATE_RASTER */ + struct { + uint32_t APIMode; + bool DXMultisampleRasterizationEnable; + bool AntialiasingEnable; + uint32_t CullMode; + uint32_t FrontWinding; + bool GlobalDepthOffsetEnableSolid; + bool GlobalDepthOffsetEnableWireframe; + bool GlobalDepthOffsetEnablePoint; + float GlobalDepthOffsetConstant; + float GlobalDepthOffsetScale; + float GlobalDepthOffsetClamp; + uint32_t FrontFaceFillMode; + uint32_t BackFaceFillMode; + bool ViewportZFarClipTestEnable; + bool ViewportZNearClipTestEnable; + bool ConservativeRasterizationEnable; + } raster; + + /* 3DSTATE_SCISSOR_STATE_POINTERS */ + struct { + uint32_t count; + struct { + uint32_t ScissorRectangleYMin; + uint32_t ScissorRectangleXMin; + uint32_t ScissorRectangleYMax; + uint32_t ScissorRectangleXMax; + } elem[MAX_SCISSORS]; + } scissor; + + /* 3DSTATE_SF */ + struct { + float LineWidth; + uint32_t TriangleStripListProvokingVertexSelect; + uint32_t LineStripListProvokingVertexSelect; + uint32_t TriangleFanProvokingVertexSelect; + bool LegacyGlobalDepthBiasEnable; + } sf; + + /* 3DSTATE_STREAMOUT */ + struct { + bool RenderingDisable; + uint32_t RenderStreamSelect; + uint32_t ReorderMode; + } so; + + /* 3DSTATE_SAMPLE_MASK */ + struct { + uint32_t SampleMask; + } sm; + + /* 3DSTATE_TE */ + struct { + uint32_t OutputTopology; + } te; + + /* 3DSTATE_VF */ + struct { + bool IndexedDrawCutIndexEnable; + uint32_t CutIndex; + } vf; + + /* 3DSTATE_VFG */ + struct { + uint32_t DistributionMode; + bool 
ListCutIndexEnable; + } vfg; + + /* 3DSTATE_VF_TOPOLOGY */ + struct { + uint32_t PrimitiveTopologyType; + } vft; + + /* 3DSTATE_VIEWPORT_STATE_POINTERS_CC */ + struct { + uint32_t count; + struct { + float MinimumDepth; + float MaximumDepth; + } elem[MAX_VIEWPORTS]; + } vp_cc; + + /* 3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP */ + struct { + uint32_t count; + struct { + float ViewportMatrixElementm00; + float ViewportMatrixElementm11; + float ViewportMatrixElementm22; + float ViewportMatrixElementm30; + float ViewportMatrixElementm31; + float ViewportMatrixElementm32; + float XMinClipGuardband; + float XMaxClipGuardband; + float YMinClipGuardband; + float YMaxClipGuardband; + float XMinViewPort; + float XMaxViewPort; + float YMinViewPort; + float YMaxViewPort; + } elem[MAX_VIEWPORTS]; + } vp_sf_clip; + + /* 3DSTATE_WM */ + struct { + uint32_t ForceThreadDispatchEnable; + bool LineStippleEnable; + } wm; + + /* 3DSTATE_WM_DEPTH_STENCIL */ + struct { + bool DoubleSidedStencilEnable; + uint32_t StencilTestMask; + uint32_t StencilWriteMask; + uint32_t BackfaceStencilTestMask; + uint32_t BackfaceStencilWriteMask; + uint32_t StencilReferenceValue; + uint32_t BackfaceStencilReferenceValue; + bool DepthTestEnable; + bool DepthBufferWriteEnable; + uint32_t DepthTestFunction; + bool StencilTestEnable; + bool StencilBufferWriteEnable; + uint32_t StencilFailOp; + uint32_t StencilPassDepthPassOp; + uint32_t StencilPassDepthFailOp; + uint32_t StencilTestFunction; + uint32_t BackfaceStencilFailOp; + uint32_t BackfaceStencilPassDepthPassOp; + uint32_t BackfaceStencilPassDepthFailOp; + uint32_t BackfaceStencilTestFunction; + } ds; + + bool pma_fix; + + BITSET_DECLARE(dirty, ANV_GFX_STATE_MAX); +}; + enum anv_internal_kernel_name { ANV_INTERNAL_KERNEL_GENERATED_DRAWS, ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE, @@ -1304,6 +1602,16 @@ struct anv_device { #ifdef ANDROID struct u_gralloc *u_gralloc; #endif + + /** Precompute all dirty graphics bits + * + * Depending on the platform, some of the dirty bits don't apply (for example + * 3DSTATE_PRIMITIVE_REPLICATION is only Gfx12.0+). Disabling some + * extensions like Mesh shaders also allows us to avoid emitting any + * mesh/task-related instructions (we only initialize them once at device + * initialization). + */ + BITSET_DECLARE(gfx_dirty_state, ANV_GFX_STATE_MAX); }; static inline struct anv_state @@ -2860,6 +3168,8 @@ struct anv_cmd_graphics_state { bool ds_write_state; uint32_t n_occlusion_queries; + + struct anv_gfx_dynamic_state dyn_state; }; enum anv_depth_reg_mode { @@ -4568,23 +4878,6 @@ anv_line_rasterization_mode(VkLineRasterizationModeEXT line_mode, return line_mode; } -/* Fill provoking vertex mode to packet. 
*/ -#define ANV_SETUP_PROVOKING_VERTEX(cmd, mode) \ - switch (mode) { \ - case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: \ - cmd.TriangleStripListProvokingVertexSelect = 0; \ - cmd.LineStripListProvokingVertexSelect = 0; \ - cmd.TriangleFanProvokingVertexSelect = 1; \ - break; \ - case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: \ - cmd.TriangleStripListProvokingVertexSelect = 2; \ - cmd.LineStripListProvokingVertexSelect = 1; \ - cmd.TriangleFanProvokingVertexSelect = 2; \ - break; \ - default: \ - unreachable("Invalid provoking vertex mode"); \ - } \ - static inline bool anv_is_dual_src_blend_factor(VkBlendFactor factor) { diff --git a/src/intel/vulkan/anv_util.c b/src/intel/vulkan/anv_util.c index 3813ad2d713..de835aec937 100644 --- a/src/intel/vulkan/anv_util.c +++ b/src/intel/vulkan/anv_util.c @@ -93,3 +93,60 @@ anv_dump_pipe_bits(enum anv_pipe_bits bits, FILE *f) if (bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT) fputs("+ccs_flush ", f); } + +const char * +anv_gfx_state_bit_to_str(enum anv_gfx_state_bits state) +{ +#define NAME(name) case ANV_GFX_STATE_##name: return #name; + switch (state) { + NAME(URB); + NAME(VF_STATISTICS); + NAME(VF_SGVS); + NAME(VF_SGVS_2); + NAME(VF_SGVS_INSTANCING); + NAME(PRIMITIVE_REPLICATION); + NAME(MULTISAMPLE); + NAME(SBE); + NAME(SBE_SWIZ); + NAME(SO_DECL_LIST); + NAME(VS); + NAME(HS); + NAME(DS); + NAME(GS); + NAME(PS); + NAME(PS_EXTRA); + NAME(SBE_MESH); + NAME(CLIP_MESH); + NAME(MESH_CONTROL); + NAME(MESH_SHADER); + NAME(MESH_DISTRIB); + NAME(TASK_CONTROL); + NAME(TASK_SHADER); + NAME(TASK_REDISTRIB); + NAME(BLEND_STATE_POINTERS); + NAME(CLIP); + NAME(CC_STATE); + NAME(CPS); + NAME(DEPTH_BOUNDS); + NAME(INDEX_BUFFER); + NAME(LINE_STIPPLE); + NAME(PS_BLEND); + NAME(RASTER); + NAME(SAMPLE_MASK); + NAME(SAMPLE_PATTERN); + NAME(SCISSOR); + NAME(SF); + NAME(STREAMOUT); + NAME(TE); + NAME(VERTEX_INPUT); + NAME(VF); + NAME(VF_TOPOLOGY); + NAME(VFG); + NAME(VIEWPORT_CC); + NAME(VIEWPORT_SF_CLIP); + NAME(WM); + NAME(WM_DEPTH_STENCIL); + NAME(PMA_FIX); + default: unreachable("invalid state"); + } +} diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 00cdcfc89db..498069403fb 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -273,6 +273,9 @@ blorp_exec_on_render(struct blorp_batch *batch, struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT); + struct anv_gfx_dynamic_state *hw_state = + &cmd_buffer->state.gfx.dyn_state; + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0, params->y1 - params->y0, scale); @@ -294,14 +297,17 @@ blorp_exec_on_render(struct blorp_batch *batch, } #endif +#if INTEL_NEEDS_WA_18019816803 /* Check if blorp ds state matches ours. */ if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) { bool blorp_ds_state = params->depth.enabled || params->stencil.enabled; if (cmd_buffer->state.gfx.ds_write_state != blorp_ds_state) { batch->flags |= BLORP_BATCH_NEED_PSS_STALL_SYNC; cmd_buffer->state.gfx.ds_write_state = blorp_ds_state; + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WA_18019816803); } } +#endif if (params->depth.enabled && !(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL)) @@ -339,29 +345,53 @@ blorp_exec_on_render(struct blorp_batch *batch, } #endif - /* Calculate state that does not get touched by blorp. - * Flush everything else. 
- */ - anv_cmd_dirty_mask_t dirty = ~(ANV_CMD_DIRTY_INDEX_BUFFER | - ANV_CMD_DIRTY_XFB_ENABLE); - - BITSET_DECLARE(dyn_dirty, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX); - BITSET_ONES(dyn_dirty); - BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE); - BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT); - BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_VP_SCISSORS); - BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE); - BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_FSR); - BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS); - if (!params->wm_prog_data) { - BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); - BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP); + /* Flag all the instructions emitted by BLORP. */ + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS); +#if GFX_VER >= 11 + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2); +#endif +#if GFX_VER >= 12 + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION); +#endif + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_POINTERS); + if (batch->blorp->config.use_mesh_shading) { + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL); } + if (params->wm_prog_data) { + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_BLEND); + } + + anv_cmd_dirty_mask_t dirty = ~(ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_INDEX_BUFFER | + ANV_CMD_DIRTY_XFB_ENABLE); cmd_buffer->state.gfx.vb_dirty = ~0; cmd_buffer->state.gfx.dirty |= dirty; - BITSET_OR(cmd_buffer->vk.dynamic_graphics_state.dirty, - cmd_buffer->vk.dynamic_graphics_state.dirty, dyn_dirty); cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; } diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 86ab22a0392..8392e066363 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -3218,7 +3218,12 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer) } } - if (any_dynamic_state_dirty || cmd_buffer->state.gfx.dirty) + /* Flush the runtime state into the HW state tracking */ + if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty) + genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer); + + /* Flush the HW state into 
the commmand buffer */ + if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty)) genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer); /* If the pipeline changed, we may need to re-allocate push constant space @@ -3278,7 +3283,6 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer) } /* When we're done, there is no more dirty gfx state. */ - vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state); cmd_buffer->state.gfx.dirty = 0; } @@ -3693,7 +3697,9 @@ genX(CmdExecuteCommands)( primary->state.current_hash_scale = 0; primary->state.gfx.push_constant_stages = 0; primary->state.gfx.ds_write_state = false; - vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state); + memcpy(primary->state.gfx.dyn_state.dirty, + primary->device->gfx_dirty_state, + sizeof(primary->state.gfx.dyn_state.dirty)); /* Each of the secondary command buffers will use its own state base * address. We need to re-emit state base address for the primary after diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c index 8a3031686e2..3f6243b44db 100644 --- a/src/intel/vulkan/genX_gfx_state.c +++ b/src/intel/vulkan/genX_gfx_state.c @@ -34,61 +34,95 @@ #include "common/intel_guardband.h" #include "compiler/brw_prim.h" -void -genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable) +const uint32_t genX(vk_to_intel_blend)[] = { + [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO, + [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE, + [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR, + [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR, + [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA, + [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA, + [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR, + [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA, + [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE, + [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR, + [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA, +}; + +static const uint32_t genX(vk_to_intel_blend_op)[] = { + [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD, + [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT, + [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT, + [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN, + [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX, +}; + +static void +genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer) { - if (cmd_buffer->state.pma_fix_enabled == enable) +#if GFX_VERx10 >= 120 + /* Wa_16013994831 - Disable preemption during streamout, enable back + * again if XFB not used by the current pipeline. + * + * Although this workaround applies to Gfx12+, we already disable object + * level preemption for another reason in genX_state.c so we can skip this + * for Gfx12. 
+ */ + if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831)) return; - cmd_buffer->state.pma_fix_enabled = enable; - - /* According to the Broadwell PIPE_CONTROL documentation, software should - * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set - * prior to the LRI. If stencil buffer writes are enabled, then a Render - * Cache Flush is also necessary. - * - * The Skylake docs say to use a depth stall rather than a command - * streamer stall. However, the hardware seems to violently disagree. - * A full command streamer stall seems to be needed in both cases. - */ - genX(batch_emit_pipe_control) - (&cmd_buffer->batch, cmd_buffer->device->info, - ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | - ANV_PIPE_CS_STALL_BIT | -#if GFX_VER >= 12 - ANV_PIPE_TILE_CACHE_FLUSH_BIT | -#endif - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT); - -#if GFX_VER == 9 - uint32_t cache_mode; - anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0), - .STCPMAOptimizationEnable = enable, - .STCPMAOptimizationEnableMask = true); - anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { - lri.RegisterOffset = GENX(CACHE_MODE_0_num); - lri.DataDWord = cache_mode; + if (cmd_buffer->state.gfx.pipeline->uses_xfb) { + genX(cmd_buffer_set_preemption)(cmd_buffer, false); + return; } -#endif /* GFX_VER == 9 */ - - /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache - * Flush bits is often necessary. We do it regardless because it's easier. - * The render cache flush is also necessary if stencil writes are enabled. - * - * Again, the Skylake docs give a different set of flushes but the BDW - * flushes seem to work just as well. - */ - genX(batch_emit_pipe_control) - (&cmd_buffer->batch, cmd_buffer->device->info, - ANV_PIPE_DEPTH_STALL_BIT | - ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | -#if GFX_VER >= 12 - ANV_PIPE_TILE_CACHE_FLUSH_BIT | + if (!cmd_buffer->state.gfx.object_preemption) + genX(cmd_buffer_set_preemption)(cmd_buffer, true); #endif - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT); } +#if GFX_VER >= 12 +static uint32_t +get_cps_state_offset(struct anv_device *device, bool cps_enabled, + const struct vk_fragment_shading_rate_state *fsr) +{ + if (!cps_enabled) + return device->cps_states.offset; + + uint32_t offset; + static const uint32_t size_index[] = { + [1] = 0, + [2] = 1, + [4] = 2, + }; + +#if GFX_VERx10 >= 125 + offset = + 1 + /* skip disabled */ + fsr->combiner_ops[0] * 5 * 3 * 3 + + fsr->combiner_ops[1] * 3 * 3 + + size_index[fsr->fragment_size.width] * 3 + + size_index[fsr->fragment_size.height]; +#else + offset = + 1 + /* skip disabled */ + size_index[fsr->fragment_size.width] * 3 + + size_index[fsr->fragment_size.height]; +#endif + + offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4; + + return device->cps_states.offset + offset; +} +#endif /* GFX_VER >= 12 */ + UNUSED static bool want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer, const struct vk_depth_stencil_state *ds) @@ -205,6 +239,1506 @@ want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer, wm_prog_data->computed_depth_mode != PSCDEPTH_OFF; } +static void +genX(rasterization_mode)(VkPolygonMode raster_mode, + VkLineRasterizationModeEXT line_mode, + float line_width, + uint32_t *api_mode, + bool *msaa_rasterization_enable) +{ + if (raster_mode == VK_POLYGON_MODE_LINE) { + /* Unfortunately, configuring our line rasterization hardware on gfx8 + * and later is rather painful. 
Instead of giving us bits to tell the + * hardware what line mode to use like we had on gfx7, we now have an + * arcane combination of API Mode and MSAA enable bits which do things + * in a table which are expected to magically put the hardware into the + * right mode for your API. Sadly, Vulkan isn't any of the APIs the + * hardware people thought of so nothing works the way you want it to. + * + * Look at the table titled "Multisample Rasterization Modes" in Vol 7 + * of the Skylake PRM for more details. + */ + switch (line_mode) { + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT: + *api_mode = DX101; +#if GFX_VER <= 9 + /* Prior to ICL, the algorithm the HW uses to draw wide lines + * doesn't quite match what the CTS expects, at least for rectangular + * lines, so we set this to false here, making it draw parallelograms + * instead, which work well enough. + */ + *msaa_rasterization_enable = line_width < 1.0078125; +#else + *msaa_rasterization_enable = true; +#endif + break; + + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT: + case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT: + *api_mode = DX9OGL; + *msaa_rasterization_enable = false; + break; + + default: + unreachable("Unsupported line rasterization mode"); + } + } else { + *api_mode = DX101; + *msaa_rasterization_enable = true; + } +} + +/** + * This function takes the vulkan runtime values & dirty states and updates + * the values in anv_gfx_dynamic_state, flagging HW instructions for + * reemission if the values are changing. + * + * Nothing is emitted in the batch buffer. + */ +void +genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer) +{ + UNUSED struct anv_device *device = cmd_buffer->device; + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx->pipeline; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state; + +#define GET(field) hw_state->field +#define SET(bit, field, value) \ + do { \ + __typeof(hw_state->field) __v = value; \ + if (hw_state->field != __v) { \ + hw_state->field = __v; \ + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \ + } \ + } while (0) +#define SET_STAGE(bit, field, value, stage) \ + do { \ + __typeof(hw_state->field) __v = value; \ + if (!anv_pipeline_has_stage(pipeline, \ + MESA_SHADER_##stage)) { \ + hw_state->field = __v; \ + break; \ + } \ + if (hw_state->field != __v) { \ + hw_state->field = __v; \ + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \ + } \ + } while (0) + +#define SETUP_PROVOKING_VERTEX(bit, cmd, mode) \ + switch (mode) { \ + case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: \ + SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0); \ + SET(bit, cmd.LineStripListProvokingVertexSelect, 0); \ + SET(bit, cmd.TriangleFanProvokingVertexSelect, 1); \ + break; \ + case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: \ + SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2); \ + SET(bit, cmd.LineStripListProvokingVertexSelect, 1); \ + SET(bit, cmd.TriangleFanProvokingVertexSelect, 2); \ + break; \ + default: \ + unreachable("Invalid provoking vertex mode"); \ + } \ + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_XFB_ENABLE | + ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) 
{ + SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable); + SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream); + +#if INTEL_NEEDS_WA_14017076903 + /* Wa_14017076903 : + * + * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage: + * + * SOL_INT::Render_Enable = + * (3DSTATE_STREAMOUT::Force_Rending == Force_On) || + * ( + * (3DSTATE_STREAMOUT::Force_Rending != Force_Off) && + * !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) && + * !3DSTATE_STREAMOUT::API_Render_Disable && + * ( + * 3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable || + * 3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable || + * 3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable || + * 3DSTATE_PS_EXTRA::PS_Valid || + * 3DSTATE_WM::Legacy Depth_Buffer_Clear || + * 3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable || + * 3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable + * ) + * ) + * + * If SOL_INT::Render_Enable is false, the SO stage will not forward any + * topologies down the pipeline. Which is not what we want for occlusion + * queries. + * + * Here we force rendering to get SOL_INT::Render_Enable when occlusion + * queries are active. + */ + if (!GET(so.RenderingDisable) && cmd_buffer->state.gfx.n_occlusion_queries > 0) + SET(STREAMOUT, so.ForceRendering, Force_on); +#endif + + switch (dyn->rs.provoking_vertex) { + case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: + SET(STREAMOUT, so.ReorderMode, LEADING); + SET_STAGE(GS, gs.ReorderMode, LEADING, GEOMETRY); + break; + + case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: + SET(STREAMOUT, so.ReorderMode, TRAILING); + SET_STAGE(GS, gs.ReorderMode, TRAILING, GEOMETRY); + break; + + default: + unreachable("Invalid provoking vertex mode"); + } + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) { + uint32_t topology; + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) + topology = _3DPRIM_PATCHLIST(dyn->ts.patch_control_points); + else + topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology]; + + cmd_buffer->state.gfx.primitive_topology = topology; + + SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, topology); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT); + +#if GFX_VER >= 11 + if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate && + (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))) { + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + const bool cps_enable = wm_prog_data && + brw_wm_prog_data_is_coarse(wm_prog_data, pipeline->fs_msaa_flags); +#if GFX_VER == 11 + SET(CPS, cps.CoarsePixelShadingMode, + cps_enable ? 
CPS_MODE_CONSTANT : CPS_MODE_NONE); + SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width); + SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height); +#elif GFX_VER >= 12 + SET(CPS, cps.CoarsePixelShadingStateArrayPointer, + get_cps_state_offset(device, cps_enable, &dyn->fsr)); +#endif + } +#endif /* GFX_VER >= 11 */ + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) { + const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline); + + if (tes_prog_data && anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) { + if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) { + SET(TE, te.OutputTopology, tes_prog_data->output_topology); + } else { + /* When the origin is upper-left, we have to flip the winding order */ + if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) { + SET(TE, te.OutputTopology, OUTPUT_TRI_CW); + } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) { + SET(TE, te.OutputTopology, OUTPUT_TRI_CCW); + } else { + SET(TE, te.OutputTopology, tes_prog_data->output_topology); + } + } + } else { + SET(TE, te.OutputTopology, OUTPUT_POINT); + } + } + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) + SET(SF, sf.LineWidth, dyn->rs.line.width); + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) { + SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex); + SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex); + } + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) { + /** + * From the Vulkan Spec: + * + * "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth + * bias representation is a factor of constant r equal to 1." + * + * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset: + * + * "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer): + * + * Bias = GlobalDepthOffsetConstant * r + GlobalDepthOffsetScale * MaxDepthSlope + * + * Where r is the minimum representable value > 0 in the depth + * buffer format, converted to float32 (note: If state bit Legacy + * Global Depth Bias Enable is set, the r term will be forced to + * 1.0)" + * + * When VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT is set, enable + * LegacyGlobalDepthBiasEnable. + */ + SET(SF, sf.LegacyGlobalDepthBiasEnable, + dyn->rs.depth_bias.representation == + VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT); + } + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) + SET(CLIP, clip.APIMode, dyn->vp.depth_clip_negative_one_to_one ? 
APIMODE_OGL : APIMODE_D3D); + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE)) { + /* Take dynamic primitive topology in to account with + * 3DSTATE_RASTER::APIMode + * 3DSTATE_RASTER::DXMultisampleRasterizationEnable + * 3DSTATE_RASTER::AntialiasingEnable + */ + uint32_t api_mode = 0; + bool msaa_raster_enable = false; + + const VkLineRasterizationModeEXT line_mode = + anv_line_rasterization_mode(dyn->rs.line.mode, + pipeline->rasterization_samples); + + const VkPolygonMode dynamic_raster_mode = + genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, + dyn->rs.polygon_mode, + dyn->ia.primitive_topology); + + genX(rasterization_mode)(dynamic_raster_mode, + line_mode, dyn->rs.line.width, + &api_mode, &msaa_raster_enable); + + /* From the Browadwell PRM, Volume 2, documentation for + * 3DSTATE_RASTER, "Antialiasing Enable": + * + * "This field must be disabled if any of the render targets + * have integer (UINT or SINT) surface format." + * + * Additionally internal documentation for Gfx12+ states: + * + * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR + * FORCED_SAMPLE_COUNT > 1." + */ + const bool aa_enable = + anv_rasterization_aa_mode(dynamic_raster_mode, line_mode) && + !cmd_buffer->state.gfx.has_uint_rt && + !(GFX_VER >= 12 && cmd_buffer->state.gfx.samples > 1); + + const bool depth_clip_enable = + vk_rasterization_state_depth_clip_enable(&dyn->rs); + + const bool xy_clip_test_enable = + (dynamic_raster_mode == VK_POLYGON_MODE_FILL); + + SET(CLIP, clip.ViewportXYClipTestEnable, xy_clip_test_enable); + + SET(RASTER, raster.APIMode, api_mode); + SET(RASTER, raster.DXMultisampleRasterizationEnable, msaa_raster_enable); + SET(RASTER, raster.AntialiasingEnable, aa_enable); + SET(RASTER, raster.CullMode, genX(vk_to_intel_cullmode)[dyn->rs.cull_mode]); + SET(RASTER, raster.FrontWinding, genX(vk_to_intel_front_face)[dyn->rs.front_face]); + SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable); + SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable); + SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable); + SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant); + SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope); + SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp); + SET(RASTER, raster.FrontFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]); + SET(RASTER, raster.BackFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]); + SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable); + SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable); + SET(RASTER, raster.ConservativeRasterizationEnable, + dyn->rs.conservative_mode != + VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT); + } + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) { + /* From the Vulkan 1.0 
spec: + * If pSampleMask is NULL, it is treated as if the mask has all bits + * enabled, i.e. no coverage is removed from fragments. + * + * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits. + */ + SET(SAMPLE_MASK, sm.SampleMask, dyn->ms.sample_mask & 0xffff); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) || +#if GFX_VER == 9 + /* For the PMA fix */ + (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || +#endif + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) { + VkImageAspectFlags ds_aspects = 0; + if (cmd_buffer->state.gfx.depth_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (cmd_buffer->state.gfx.stencil_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + + struct vk_depth_stencil_state opt_ds = dyn->ds; + vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true); + + SET(WM_DEPTH_STENCIL, ds.DoubleSidedStencilEnable, true); + + SET(WM_DEPTH_STENCIL, ds.StencilTestMask, + opt_ds.stencil.front.compare_mask & 0xff); + SET(WM_DEPTH_STENCIL, ds.StencilWriteMask, + opt_ds.stencil.front.write_mask & 0xff); + + SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff); + SET(WM_DEPTH_STENCIL, ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff); + + SET(WM_DEPTH_STENCIL, ds.StencilReferenceValue, + opt_ds.stencil.front.reference & 0xff); + SET(WM_DEPTH_STENCIL, ds.BackfaceStencilReferenceValue, + opt_ds.stencil.back.reference & 0xff); + + SET(WM_DEPTH_STENCIL, ds.DepthTestEnable, opt_ds.depth.test_enable); + SET(WM_DEPTH_STENCIL, ds.DepthBufferWriteEnable, opt_ds.depth.write_enable); + SET(WM_DEPTH_STENCIL, ds.DepthTestFunction, + genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]); + SET(WM_DEPTH_STENCIL, ds.StencilTestEnable, opt_ds.stencil.test_enable); + SET(WM_DEPTH_STENCIL, ds.StencilBufferWriteEnable, opt_ds.stencil.write_enable); + SET(WM_DEPTH_STENCIL, ds.StencilFailOp, + genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]); + SET(WM_DEPTH_STENCIL, ds.StencilPassDepthPassOp, + genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]); + SET(WM_DEPTH_STENCIL, ds.StencilPassDepthFailOp, + genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]); + SET(WM_DEPTH_STENCIL, ds.StencilTestFunction, + genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]); + SET(WM_DEPTH_STENCIL, ds.BackfaceStencilFailOp, + genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]); + SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthPassOp, + genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]); + SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthFailOp, + genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]); + SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestFunction, + genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]); + +#if GFX_VER == 9 + const bool pma = want_stencil_pma_fix(cmd_buffer, &opt_ds); + SET(PMA_FIX, pma_fix, pma); +#endif + +#if INTEL_NEEDS_WA_18019816803 + if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) 
{ + bool ds_write_state = opt_ds.depth.write_enable || opt_ds.stencil.write_enable; + if (cmd_buffer->state.gfx.ds_write_state != ds_write_state) { + cmd_buffer->state.gfx.ds_write_state = ds_write_state; + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WA_18019816803); + } + } +#endif + } + +#if GFX_VER >= 12 + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) { + SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, dyn->ds.depth.bounds_test.enable); + /* Only look at updating the bounds if testing is enabled */ + if (dyn->ds.depth.bounds_test.enable) { + SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min); + SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max); + } + } +#endif + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE)) { + SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern); + SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount, + 1.0f / MAX2(1, dyn->rs.line.stipple.factor)); + SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor); + + SET(WM, wm.LineStippleEnable, dyn->rs.line.stipple.enable); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RESTART_INDEX) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) { + SET(VF, vf.IndexedDrawCutIndexEnable, dyn->ia.primitive_restart_enable); + SET(VF, vf.CutIndex, cmd_buffer->state.gfx.restart_index); + } + + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_INDEX_BUFFER) + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER); + +#if GFX_VERx10 >= 125 + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) + SET(VFG, vfg.ListCutIndexEnable, dyn->ia.primitive_restart_enable); +#endif + + if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations && + (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE))) + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN); + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + /* 3DSTATE_WM in the hope we can avoid spawning fragment shaders + * threads. + */ + bool force_thread_dispatch = + anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) && + (pipeline->force_fragment_thread_dispatch || + anv_cmd_buffer_all_color_write_masked(cmd_buffer)); + SET(WM, wm.ForceThreadDispatchEnable, force_thread_dispatch ? 
ForceON : 0); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) { + const uint8_t color_writes = dyn->cb.color_write_enables; + const struct anv_cmd_graphics_state *state = &cmd_buffer->state.gfx; + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + bool has_writeable_rt = + anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) && + (color_writes & ((1u << state->color_att_count) - 1)) != 0; + + SET(BLEND_STATE_POINTERS, blend.AlphaToCoverageEnable, + dyn->ms.alpha_to_coverage_enable); + SET(BLEND_STATE_POINTERS, blend.AlphaToOneEnable, + dyn->ms.alpha_to_one_enable); + + bool independent_alpha_blend = false; + /* Wa_14018912822, check if we set these during RT setup. */ + bool color_blend_zero = false; + bool alpha_blend_zero = false; + for (uint32_t i = 0; i < MAX_RTS; i++) { + /* Disable anything above the current number of color attachments. */ + bool write_disabled = i >= cmd_buffer->state.gfx.color_att_count || + (color_writes & BITFIELD_BIT(i)) == 0; + + SET(BLEND_STATE_POINTERS, blend.rts[i].WriteDisableAlpha, + write_disabled || + (dyn->cb.attachments[i].write_mask & + VK_COLOR_COMPONENT_A_BIT) == 0); + SET(BLEND_STATE_POINTERS, blend.rts[i].WriteDisableRed, + write_disabled || + (dyn->cb.attachments[i].write_mask & + VK_COLOR_COMPONENT_R_BIT) == 0); + SET(BLEND_STATE_POINTERS, blend.rts[i].WriteDisableGreen, + write_disabled || + (dyn->cb.attachments[i].write_mask & + VK_COLOR_COMPONENT_G_BIT) == 0); + SET(BLEND_STATE_POINTERS, blend.rts[i].WriteDisableBlue, + write_disabled || + (dyn->cb.attachments[i].write_mask & + VK_COLOR_COMPONENT_B_BIT) == 0); + /* Vulkan specification 1.2.168, VkLogicOp: + * + * "Logical operations are controlled by the logicOpEnable and + * logicOp members of VkPipelineColorBlendStateCreateInfo. If + * logicOpEnable is VK_TRUE, then a logical operation selected by + * logicOp is applied between each color attachment and the + * fragment’s corresponding output value, and blending of all + * attachments is treated as if it were disabled." + * + * From the Broadwell PRM Volume 2d: Command Reference: Structures: + * BLEND_STATE_ENTRY: + * + * "Enabling LogicOp and Color Buffer Blending at the same time is + * UNDEFINED" + */ + SET(BLEND_STATE_POINTERS, blend.rts[i].LogicOpFunction, + genX(vk_to_intel_logic_op)[dyn->cb.logic_op]); + SET(BLEND_STATE_POINTERS, blend.rts[i].LogicOpEnable, + dyn->cb.logic_op_enable); + + SET(BLEND_STATE_POINTERS, blend.rts[i].ColorClampRange, COLORCLAMP_RTFORMAT); + SET(BLEND_STATE_POINTERS, blend.rts[i].PreBlendColorClampEnable, true); + SET(BLEND_STATE_POINTERS, blend.rts[i].PostBlendColorClampEnable, true); + + /* Setup blend equation. 
*/ + SET(BLEND_STATE_POINTERS, blend.rts[i].ColorBlendFunction, + genX(vk_to_intel_blend_op)[ + dyn->cb.attachments[i].color_blend_op]); + SET(BLEND_STATE_POINTERS, blend.rts[i].AlphaBlendFunction, + genX(vk_to_intel_blend_op)[ + dyn->cb.attachments[i].alpha_blend_op]); + + if (dyn->cb.attachments[i].src_color_blend_factor != + dyn->cb.attachments[i].src_alpha_blend_factor || + dyn->cb.attachments[i].dst_color_blend_factor != + dyn->cb.attachments[i].dst_alpha_blend_factor || + dyn->cb.attachments[i].color_blend_op != + dyn->cb.attachments[i].alpha_blend_op) { + independent_alpha_blend = true; + } + + /* The Dual Source Blending documentation says: + * + * "If SRC1 is included in a src/dst blend factor and + * a DualSource RT Write message is not used, results + * are UNDEFINED. (This reflects the same restriction in DX APIs, + * where undefined results are produced if “o1” is not written + * by a PS – there are no default values defined)." + * + * There is no way to gracefully fix this undefined situation + * so we just disable the blending to prevent possible issues. + */ + if (wm_prog_data && !wm_prog_data->dual_src_blend && + anv_is_dual_src_blend_equation(&dyn->cb.attachments[i])) { + SET(BLEND_STATE_POINTERS, blend.rts[i].ColorBufferBlendEnable, false); + } else { + SET(BLEND_STATE_POINTERS, blend.rts[i].ColorBufferBlendEnable, + !dyn->cb.logic_op_enable && + dyn->cb.attachments[i].blend_enable); + } + + /* Our hardware applies the blend factor prior to the blend function + * regardless of what function is used. Technically, this means the + * hardware can do MORE than GL or Vulkan specify. However, it also + * means that, for MIN and MAX, we have to stomp the blend factor to + * ONE to make it a no-op. + */ + uint32_t SourceBlendFactor; + uint32_t DestinationBlendFactor; + uint32_t SourceAlphaBlendFactor; + uint32_t DestinationAlphaBlendFactor; + if (dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MIN || + dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MAX) { + SourceBlendFactor = BLENDFACTOR_ONE; + DestinationBlendFactor = BLENDFACTOR_ONE; + } else { + SourceBlendFactor = genX(vk_to_intel_blend)[ + dyn->cb.attachments[i].src_color_blend_factor]; + DestinationBlendFactor = genX(vk_to_intel_blend)[ + dyn->cb.attachments[i].dst_color_blend_factor]; + } + + if (dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MIN || + dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MAX) { + SourceAlphaBlendFactor = BLENDFACTOR_ONE; + DestinationAlphaBlendFactor = BLENDFACTOR_ONE; + } else { + SourceAlphaBlendFactor = genX(vk_to_intel_blend)[ + dyn->cb.attachments[i].src_alpha_blend_factor]; + DestinationAlphaBlendFactor = genX(vk_to_intel_blend)[ + dyn->cb.attachments[i].dst_alpha_blend_factor]; + } + + if (intel_needs_workaround(cmd_buffer->device->info, 14018912822) && + pipeline->rasterization_samples > 1) { + if (DestinationBlendFactor == BLENDFACTOR_ZERO) { + DestinationBlendFactor = BLENDFACTOR_CONST_COLOR; + color_blend_zero = true; + } + if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) { + DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA; + alpha_blend_zero = true; + } + } + + SET(BLEND_STATE_POINTERS, blend.rts[i].SourceBlendFactor, SourceBlendFactor); + SET(BLEND_STATE_POINTERS, blend.rts[i].DestinationBlendFactor, DestinationBlendFactor); + SET(BLEND_STATE_POINTERS, blend.rts[i].SourceAlphaBlendFactor, SourceAlphaBlendFactor); + SET(BLEND_STATE_POINTERS, blend.rts[i].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor); + } + 
cmd_buffer->state.gfx.color_blend_zero = color_blend_zero;
+      cmd_buffer->state.gfx.alpha_blend_zero = alpha_blend_zero;
+
+      SET(BLEND_STATE_POINTERS, blend.IndependentAlphaBlendEnable, independent_alpha_blend);
+
+      /* 3DSTATE_PS_BLEND to be consistent with the rest of the
+       * BLEND_STATE_ENTRY.
+       */
+      SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt);
+      SET(PS_BLEND, ps_blend.ColorBufferBlendEnable, GET(blend.rts[0].ColorBufferBlendEnable));
+      SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor, GET(blend.rts[0].SourceAlphaBlendFactor));
+      SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor, cmd_buffer->state.gfx.alpha_blend_zero ?
+                                                          BLENDFACTOR_CONST_ALPHA :
+                                                          GET(blend.rts[0].DestinationAlphaBlendFactor));
+      SET(PS_BLEND, ps_blend.SourceBlendFactor, GET(blend.rts[0].SourceBlendFactor));
+      SET(PS_BLEND, ps_blend.DestinationBlendFactor, cmd_buffer->state.gfx.color_blend_zero ?
+                                                     BLENDFACTOR_CONST_COLOR :
+                                                     GET(blend.rts[0].DestinationBlendFactor));
+      SET(PS_BLEND, ps_blend.AlphaTestEnable, false);
+      SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable, GET(blend.IndependentAlphaBlendEnable));
+      SET(PS_BLEND, ps_blend.AlphaToCoverageEnable, dyn->ms.alpha_to_coverage_enable);
+   }
+
+   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
+      SET(CC_STATE, cc.BlendConstantColorRed,
+          cmd_buffer->state.gfx.color_blend_zero ? 0.0f :
+          dyn->cb.blend_constants[0]);
+      SET(CC_STATE, cc.BlendConstantColorGreen,
+          cmd_buffer->state.gfx.color_blend_zero ? 0.0f :
+          dyn->cb.blend_constants[1]);
+      SET(CC_STATE, cc.BlendConstantColorBlue,
+          cmd_buffer->state.gfx.color_blend_zero ? 0.0f :
+          dyn->cb.blend_constants[2]);
+      SET(CC_STATE, cc.BlendConstantColorAlpha,
+          cmd_buffer->state.gfx.alpha_blend_zero ? 0.0f :
+          dyn->cb.blend_constants[3]);
+   }
+
+   if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
+       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
+       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
+       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
+      struct anv_instance *instance = cmd_buffer->device->physical->instance;
+      const VkViewport *viewports = dyn->vp.viewports;
+
+      const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f;
+
+      for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
+         const VkViewport *vp = &viewports[i];
+
+         /* The gfx7 state struct has just the matrix and guardband fields, the
+          * gfx8 struct adds the min/max viewport fields. */
+         struct GENX(SF_CLIP_VIEWPORT) sfv = {
+            .ViewportMatrixElementm00 = vp->width / 2,
+            .ViewportMatrixElementm11 = vp->height / 2,
+            .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
+            .ViewportMatrixElementm30 = vp->x + vp->width / 2,
+            .ViewportMatrixElementm31 = vp->y + vp->height / 2,
+            .ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
+ (vp->minDepth + vp->maxDepth) * scale : vp->minDepth, + .XMinClipGuardband = -1.0f, + .XMaxClipGuardband = 1.0f, + .YMinClipGuardband = -1.0f, + .YMaxClipGuardband = 1.0f, + .XMinViewPort = vp->x, + .XMaxViewPort = vp->x + vp->width - 1, + .YMinViewPort = MIN2(vp->y, vp->y + vp->height), + .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1, + }; + + /* Fix depth test misrenderings by lowering translated depth range */ + if (instance->lower_depth_range_rate != 1.0f) + sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate; + + const uint32_t fb_size_max = 1 << 14; + uint32_t x_min = 0, x_max = fb_size_max; + uint32_t y_min = 0, y_max = fb_size_max; + + /* If we have a valid renderArea, include that */ + if (gfx->render_area.extent.width > 0 && + gfx->render_area.extent.height > 0) { + x_min = MAX2(x_min, gfx->render_area.offset.x); + x_max = MIN2(x_max, gfx->render_area.offset.x + + gfx->render_area.extent.width); + y_min = MAX2(y_min, gfx->render_area.offset.y); + y_max = MIN2(y_max, gfx->render_area.offset.y + + gfx->render_area.extent.height); + } + + /* The client is required to have enough scissors for whatever it + * sets as ViewportIndex but it's possible that they've got more + * viewports set from a previous command. Also, from the Vulkan + * 1.3.207: + * + * "The application must ensure (using scissor if necessary) that + * all rendering is contained within the render area." + * + * If the client doesn't set a scissor, that basically means it + * guarantees everything is in-bounds already. If we end up using a + * guardband of [-1, 1] in that case, there shouldn't be much loss. + * It's theoretically possible that they could do all their clipping + * with clip planes but that'd be a bit odd. + */ + if (i < dyn->vp.scissor_count) { + const VkRect2D *scissor = &dyn->vp.scissors[i]; + x_min = MAX2(x_min, scissor->offset.x); + x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width); + y_min = MAX2(y_min, scissor->offset.y); + y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height); + } + + /* Only bother calculating the guardband if our known render area is + * less than the maximum size. Otherwise, it will calculate [-1, 1] + * anyway but possibly with precision loss. 
+ */ + if (x_min > 0 || x_max < fb_size_max || + y_min > 0 || y_max < fb_size_max) { + intel_calculate_guardband_size(x_min, x_max, y_min, y_max, + sfv.ViewportMatrixElementm00, + sfv.ViewportMatrixElementm11, + sfv.ViewportMatrixElementm30, + sfv.ViewportMatrixElementm31, + &sfv.XMinClipGuardband, + &sfv.XMaxClipGuardband, + &sfv.YMinClipGuardband, + &sfv.YMaxClipGuardband); + } + +#define SET_VP(bit, state, field) \ + do { \ + if (hw_state->state.field != sfv.field) { \ + hw_state->state.field = sfv.field; \ + BITSET_SET(hw_state->dirty, \ + ANV_GFX_STATE_##bit); \ + } \ + } while (0) + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort); + SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort); +#undef SET_VP + + float min_depth = dyn->rs.depth_clamp_enable ? + MIN2(vp->minDepth, vp->maxDepth) : + 0.0f; + float max_depth = dyn->rs.depth_clamp_enable ? + MAX2(vp->minDepth, vp->maxDepth) : + 1.0f; + + SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth); + SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth); + + SET(CLIP, clip.MaximumVPIndex, dyn->vp.viewport_count > 0 ? + dyn->vp.viewport_count - 1 : 0); + } + + /* If the HW state is already considered dirty or the previous + * programmed viewport count is smaller than what we need, update the + * viewport count and ensure the HW state is dirty. Otherwise if the + * number of viewport programmed previously was larger than what we need + * now, no need to reemit we can just keep the old programmed values. 
+       */
+      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
+          hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
+         hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
+         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
+      }
+      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
+          hw_state->vp_cc.count < dyn->vp.viewport_count) {
+         hw_state->vp_cc.count = dyn->vp.viewport_count;
+         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
+      }
+   }
+
+   if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
+       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
+      const VkRect2D *scissors = dyn->vp.scissors;
+      const VkViewport *viewports = dyn->vp.viewports;
+
+      for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
+         const VkRect2D *s = &scissors[i];
+         const VkViewport *vp = &viewports[i];
+
+         const int max = 0xffff;
+
+         uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
+         uint32_t x_min = MAX2(s->offset.x, vp->x);
+         int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
+                              MAX2(vp->y, vp->y + vp->height) - 1);
+         int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
+                              vp->x + vp->width - 1);
+
+         y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
+         x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
+
+         /* Do this math using int64_t so overflow gets clamped correctly. */
+         if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+            y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
+            x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
+            y_max = CLAMP((uint64_t) y_max, 0,
+                          gfx->render_area.offset.y +
+                          gfx->render_area.extent.height - 1);
+            x_max = CLAMP((uint64_t) x_max, 0,
+                          gfx->render_area.offset.x +
+                          gfx->render_area.extent.width - 1);
+         }
+
+         if (s->extent.width <= 0 || s->extent.height <= 0) {
+            /* Since xmax and ymax are inclusive, we have to have xmax < xmin
+             * or ymax < ymin for empty clips. In case clip x, y, width height
+             * are all 0, the clamps below produce 0 for xmin, ymin, xmax,
+             * ymax, which isn't what we want. Just special case empty clips
+             * and produce a canonical empty clip.
+             */
+            SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
+            SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
+            SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
+            SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
+         } else {
+            SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
+            SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
+            SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
+            SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
+         }
+      }
+
+      /* If the HW state is already considered dirty or the previous
+       * programmed viewport count is smaller than what we need, update the
+       * viewport count and ensure the HW state is dirty. Otherwise if the
+       * number of viewport programmed previously was larger than what we need
+       * now, no need to reemit we can just keep the old programmed values.
+       */
+      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR) ||
+          hw_state->scissor.count < dyn->vp.scissor_count) {
+         hw_state->scissor.count = dyn->vp.scissor_count;
+         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
+      }
+   }
+
+#undef GET
+#undef SET
+#undef SET_STAGE
+
+   vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
+}
+
+/**
+ * This function emits the dirty instructions in the batch buffer.
+ */ +void +genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_device *device = cmd_buffer->device; + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + struct anv_graphics_pipeline *pipeline = gfx->pipeline; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state; + + /* Since Wa_16011773973 will disable 3DSTATE_STREAMOUT, we need to reemit + * it after. + */ + if (intel_needs_workaround(device->info, 16011773973) && + pipeline->uses_xfb && + BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) { + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT); + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ms); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.primitive_replication); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_INSTANCING)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_instancing); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs); + +#if GFX_VER >= 11 + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_2); +#endif + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VS)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vs); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_HS)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DS)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_statistics); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_swiz); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) { + /* Wa_16011773973: + * If SOL is enabled and SO_DECL state has to be programmed, + * 1. Send 3D State SOL state with SOL disabled + * 2. Send SO_DECL NP state + * 3. Send 3D State SOL with SOL Enabled + */ + if (intel_needs_workaround(device->info, 16011773973) && + pipeline->uses_xfb) + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so); + + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, + final.so_decl_list); + +#if GFX_VERx10 == 125 + /* Wa_14015946265: Send PC with CS stall after SO_DECL. 
*/ + genX(batch_emit_pipe_control)(&cmd_buffer->batch, device->info, + ANV_PIPE_CS_STALL_BIT); +#endif + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ps); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ps_extra); + + if (device->vk.enabled_extensions.EXT_mesh_shader) { + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_control); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_shader); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_distrib); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_control); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_shader); + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB)) + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_redistrib); + } else { + assert(!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL) && + !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER) && + !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB) && + !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL) && + !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER) && + !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB)); + } + +#define INIT(category, name) \ + .name = hw_state->category.name +#define SET(s, category, name) \ + s.name = hw_state->category.name + + /* Now the potentially dynamic instructions */ + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP)) { + anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP), + pipeline, partial.clip, clip) { + SET(clip, clip, APIMode); + SET(clip, clip, ViewportXYClipTestEnable); + SET(clip, clip, TriangleStripListProvokingVertexSelect); + SET(clip, clip, LineStripListProvokingVertexSelect); + SET(clip, clip, TriangleFanProvokingVertexSelect); + SET(clip, clip, MaximumVPIndex); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_STREAMOUT)) { + genX(streamout_prologue)(cmd_buffer); + + anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), + pipeline, partial.so, so) { + SET(so, so, RenderingDisable); + SET(so, so, RenderStreamSelect); + SET(so, so, ReorderMode); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP)) { + struct anv_state sf_clip_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + hw_state->vp_sf_clip.count * 64, 64); + + for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) { + struct GENX(SF_CLIP_VIEWPORT) sfv = { + INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00), + INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11), + INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22), + INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30), + INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31), + INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32), + INIT(vp_sf_clip.elem[i], XMinClipGuardband), + INIT(vp_sf_clip.elem[i], XMaxClipGuardband), + INIT(vp_sf_clip.elem[i], YMinClipGuardband), + INIT(vp_sf_clip.elem[i], YMaxClipGuardband), + INIT(vp_sf_clip.elem[i], XMinViewPort), + INIT(vp_sf_clip.elem[i], 
XMaxViewPort), + INIT(vp_sf_clip.elem[i], YMinViewPort), + INIT(vp_sf_clip.elem[i], YMaxViewPort), + }; + GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv); + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) { + clip.SFClipViewportPointer = sf_clip_state.offset; + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC)) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + hw_state->vp_cc.count * 8, 32); + + for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) { + struct GENX(CC_VIEWPORT) cc_viewport = { + INIT(vp_cc.elem[i], MinimumDepth), + INIT(vp_cc.elem[i], MaximumDepth), + }; + GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport); + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) { + cc.CCViewportPointer = cc_state.offset; + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR)) { + /* Wa_1409725701: + * + * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is + * stored as an array of up to 16 elements. The location of first + * element of the array, as specified by Pointer to SCISSOR_RECT, + * should be aligned to a 64-byte boundary. + */ + struct anv_state scissor_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + hw_state->scissor.count * 8, 64); + + for (uint32_t i = 0; i < hw_state->scissor.count; i++) { + struct GENX(SCISSOR_RECT) scissor = { + INIT(scissor.elem[i], ScissorRectangleYMin), + INIT(scissor.elem[i], ScissorRectangleXMin), + INIT(scissor.elem[i], ScissorRectangleYMax), + INIT(scissor.elem[i], ScissorRectangleXMax), + }; + GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor); + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) { + ssp.ScissorRectPointer = scissor_state.offset; + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) { + SET(vft, vft, PrimitiveTopologyType); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT)) { + const uint32_t ve_count = + pipeline->vs_input_elements + pipeline->svgs_count; + const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count); + uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords, + GENX(3DSTATE_VERTEX_ELEMENTS)); + + if (p) { + if (ve_count == 0) { + memcpy(p + 1, cmd_buffer->device->empty_vs_input, + sizeof(cmd_buffer->device->empty_vs_input)); + } else if (ve_count == pipeline->vertex_input_elems) { + /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so + * everything is in pipeline->vertex_input_data and we can just + * memcpy + */ + memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count); + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, + final.vf_instancing); + } else { + assert(pipeline->final.vf_instancing.len == 0); + /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. 
*/ + genX(emit_vertex_input)(&cmd_buffer->batch, p + 1, + pipeline, dyn->vi, false /* emit_in_pipeline */); + /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */ + memcpy(p + 1 + 2 * pipeline->vs_input_elements, + pipeline->vertex_input_data, + 4 * 2 * pipeline->vertex_input_elems); + } + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TE)) { + anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE), + pipeline, partial.te, te) { + SET(te, te, OutputTopology); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_GS)) { + anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS), + pipeline, partial.gs, gs) { + SET(gs, gs, ReorderMode); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CPS)) { +#if GFX_VER == 11 + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS), cps) { + SET(cps, cps, CoarsePixelShadingMode); + SET(cps, cps, MinCPSizeX); + SET(cps, cps, MinCPSizeY); + } +#elif GFX_VER >= 12 + /* TODO: we can optimize this flush in the following cases: + * + * In the case where the last geometry shader emits a value that is + * not constant, we can avoid this stall because we can synchronize + * the pixel shader internally with + * 3DSTATE_PS::EnablePSDependencyOnCPsizeChange. + * + * If we know that the previous pipeline and the current one are + * using the same fragment shading rate. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { +#if GFX_VERx10 >= 125 + pc.PSSStallSyncEnable = true; +#else + pc.PSDSyncEnable = true; +#endif + } + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS_POINTERS), cps) { + SET(cps, cps, CoarsePixelShadingStateArrayPointer); + } +#endif + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SF)) { + anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF), + pipeline, partial.sf, sf) { + SET(sf, sf, LineWidth); + SET(sf, sf, TriangleStripListProvokingVertexSelect); + SET(sf, sf, LineStripListProvokingVertexSelect); + SET(sf, sf, TriangleFanProvokingVertexSelect); + SET(sf, sf, LegacyGlobalDepthBiasEnable); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_RASTER)) { + anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER), + pipeline, partial.raster, raster) { + SET(raster, raster, APIMode); + SET(raster, raster, DXMultisampleRasterizationEnable); + SET(raster, raster, AntialiasingEnable); + SET(raster, raster, CullMode); + SET(raster, raster, FrontWinding); + SET(raster, raster, GlobalDepthOffsetEnableSolid); + SET(raster, raster, GlobalDepthOffsetEnableWireframe); + SET(raster, raster, GlobalDepthOffsetEnablePoint); + SET(raster, raster, GlobalDepthOffsetConstant); + SET(raster, raster, GlobalDepthOffsetScale); + SET(raster, raster, GlobalDepthOffsetClamp); + SET(raster, raster, FrontFaceFillMode); + SET(raster, raster, BackFaceFillMode); + SET(raster, raster, ViewportZFarClipTestEnable); + SET(raster, raster, ViewportZNearClipTestEnable); + SET(raster, raster, ConservativeRasterizationEnable); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE)) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + GENX(COLOR_CALC_STATE_length) * 4, + 64); + struct GENX(COLOR_CALC_STATE) cc = { + INIT(cc, BlendConstantColorRed), + INIT(cc, BlendConstantColorGreen), + INIT(cc, BlendConstantColorBlue), + INIT(cc, BlendConstantColorAlpha), + }; + GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) { + ccp.ColorCalcStatePointer = cc_state.offset; + 
ccp.ColorCalcStatePointerValid = true; + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SAMPLE_MASK), sm) { + SET(sm, sm, SampleMask); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) { + SET(ds, ds, DoubleSidedStencilEnable); + SET(ds, ds, StencilTestMask); + SET(ds, ds, StencilWriteMask); + SET(ds, ds, BackfaceStencilTestMask); + SET(ds, ds, BackfaceStencilWriteMask); + SET(ds, ds, StencilReferenceValue); + SET(ds, ds, BackfaceStencilReferenceValue); + SET(ds, ds, DepthTestEnable); + SET(ds, ds, DepthBufferWriteEnable); + SET(ds, ds, DepthTestFunction); + SET(ds, ds, StencilTestEnable); + SET(ds, ds, StencilBufferWriteEnable); + SET(ds, ds, StencilFailOp); + SET(ds, ds, StencilPassDepthPassOp); + SET(ds, ds, StencilPassDepthFailOp); + SET(ds, ds, StencilTestFunction); + SET(ds, ds, BackfaceStencilFailOp); + SET(ds, ds, BackfaceStencilPassDepthPassOp); + SET(ds, ds, BackfaceStencilPassDepthFailOp); + SET(ds, ds, BackfaceStencilTestFunction); + } + } + +#if GFX_VER >= 12 + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) { + SET(db, db, DepthBoundsTestEnable); + SET(db, db, DepthBoundsTestMinValue); + SET(db, db, DepthBoundsTestMaxValue); + } + } +#endif + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_LINE_STIPPLE)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) { + SET(ls, ls, LineStipplePattern); + SET(ls, ls, LineStippleInverseRepeatCount); + SET(ls, ls, LineStippleRepeatCount); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) { +#if GFX_VERx10 >= 125 + vf.GeometryDistributionEnable = true; +#endif + SET(vf, vf, IndexedDrawCutIndexEnable); + SET(vf, vf, CutIndex); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER)) { + struct anv_buffer *buffer = cmd_buffer->state.gfx.index_buffer; + uint32_t offset = cmd_buffer->state.gfx.index_offset; + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) { + ib.IndexFormat = cmd_buffer->state.gfx.index_type; + ib.MOCS = anv_mocs(cmd_buffer->device, + buffer->address.bo, + ISL_SURF_USAGE_INDEX_BUFFER_BIT); +#if GFX_VER >= 12 + ib.L3BypassDisable = true; +#endif + ib.BufferStartingAddress = anv_address_add(buffer->address, offset); + ib.BufferSize = vk_buffer_range(&buffer->vk, offset, + VK_WHOLE_SIZE); + } + } + +#if GFX_VERx10 >= 125 + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VFG)) { + anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG), + pipeline, partial.vfg, vfg) { + SET(vfg, vfg, ListCutIndexEnable); + } + } +#endif + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN)) { + genX(emit_sample_pattern)(&cmd_buffer->batch, + dyn->ms.sample_locations_enable ? 
+ dyn->ms.sample_locations : NULL); + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM)) { + anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM), + pipeline, partial.wm, wm) { + SET(wm, wm, ForceThreadDispatchEnable); + SET(wm, wm, LineStippleEnable); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_BLEND)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS_BLEND), blend) { + SET(blend, ps_blend, HasWriteableRT); + SET(blend, ps_blend, ColorBufferBlendEnable); + SET(blend, ps_blend, SourceAlphaBlendFactor); + SET(blend, ps_blend, DestinationAlphaBlendFactor); + SET(blend, ps_blend, SourceBlendFactor); + SET(blend, ps_blend, DestinationBlendFactor); + SET(blend, ps_blend, AlphaTestEnable); + SET(blend, ps_blend, IndependentAlphaBlendEnable); + SET(blend, ps_blend, AlphaToCoverageEnable); + } + } + + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_POINTERS)) { + const uint32_t num_dwords = GENX(BLEND_STATE_length) + + GENX(BLEND_STATE_ENTRY_length) * MAX_RTS; + struct anv_state blend_states = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + num_dwords * 4, + 64); + + uint32_t *dws = blend_states.map; + + struct GENX(BLEND_STATE) blend_state = { + INIT(blend, AlphaToCoverageEnable), + INIT(blend, AlphaToOneEnable), + INIT(blend, IndependentAlphaBlendEnable), + }; + GENX(BLEND_STATE_pack)(NULL, blend_states.map, &blend_state); + + /* Jump to blend entries. */ + dws += GENX(BLEND_STATE_length); + for (uint32_t i = 0; i < MAX_RTS; i++) { + struct GENX(BLEND_STATE_ENTRY) entry = { + INIT(blend.rts[i], WriteDisableAlpha), + INIT(blend.rts[i], WriteDisableRed), + INIT(blend.rts[i], WriteDisableGreen), + INIT(blend.rts[i], WriteDisableBlue), + INIT(blend.rts[i], LogicOpFunction), + INIT(blend.rts[i], LogicOpEnable), + INIT(blend.rts[i], ColorBufferBlendEnable), + INIT(blend.rts[i], ColorClampRange), + INIT(blend.rts[i], PreBlendColorClampEnable), + INIT(blend.rts[i], PostBlendColorClampEnable), + INIT(blend.rts[i], SourceBlendFactor), + INIT(blend.rts[i], DestinationBlendFactor), + INIT(blend.rts[i], ColorBlendFunction), + INIT(blend.rts[i], SourceAlphaBlendFactor), + INIT(blend.rts[i], DestinationAlphaBlendFactor), + INIT(blend.rts[i], AlphaBlendFunction), + }; + + GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry); + dws += GENX(BLEND_STATE_ENTRY_length); + } + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) { + bsp.BlendStatePointer = blend_states.offset; + bsp.BlendStatePointerValid = true; + } + } + +#if INTEL_NEEDS_WA_18019816803 + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_18019816803)) { + genX(batch_emit_pipe_control)(&cmd_buffer->batch, cmd_buffer->device->info, + ANV_PIPE_PSS_STALL_SYNC_BIT); + } +#endif + +#if GFX_VER == 9 + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PMA_FIX)) + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix); +#endif + +#undef INIT +#undef SET + + BITSET_ZERO(hw_state->dirty); +} + +void +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable) +{ + if (cmd_buffer->state.pma_fix_enabled == enable) + return; + + cmd_buffer->state.pma_fix_enabled = enable; + + /* According to the Broadwell PIPE_CONTROL documentation, software should + * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set + * prior to the LRI. If stencil buffer writes are enabled, then a Render + * Cache Flush is also necessary. + * + * The Skylake docs say to use a depth stall rather than a command + * streamer stall. However, the hardware seems to violently disagree. 
+ * A full command streamer stall seems to be needed in both cases. + */ + genX(batch_emit_pipe_control) + (&cmd_buffer->batch, cmd_buffer->device->info, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_CS_STALL_BIT | +#if GFX_VER >= 12 + ANV_PIPE_TILE_CACHE_FLUSH_BIT | +#endif + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT); + +#if GFX_VER == 9 + uint32_t cache_mode; + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0), + .STCPMAOptimizationEnable = enable, + .STCPMAOptimizationEnableMask = true); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(CACHE_MODE_0_num); + lri.DataDWord = cache_mode; + } + +#endif /* GFX_VER == 9 */ + + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache + * Flush bits is often necessary. We do it regardless because it's easier. + * The render cache flush is also necessary if stencil writes are enabled. + * + * Again, the Skylake docs give a different set of flushes but the BDW + * flushes seem to work just as well. + */ + genX(batch_emit_pipe_control) + (&cmd_buffer->batch, cmd_buffer->device->info, + ANV_PIPE_DEPTH_STALL_BIT | + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | +#if GFX_VER >= 12 + ANV_PIPE_TILE_CACHE_FLUSH_BIT | +#endif + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT); +} + +/* Fill provoking vertex mode to packet. */ +#define ANV_SETUP_PROVOKING_VERTEX(cmd, mode) \ + switch (mode) { \ + case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: \ + cmd.TriangleStripListProvokingVertexSelect = 0; \ + cmd.LineStripListProvokingVertexSelect = 0; \ + cmd.TriangleFanProvokingVertexSelect = 1; \ + break; \ + case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: \ + cmd.TriangleStripListProvokingVertexSelect = 2; \ + cmd.LineStripListProvokingVertexSelect = 1; \ + cmd.TriangleFanProvokingVertexSelect = 2; \ + break; \ + default: \ + unreachable("Invalid provoking vertex mode"); \ + } \ + static void genX(cmd_emit_te)(struct anv_cmd_buffer *cmd_buffer) { @@ -283,41 +1817,6 @@ genX(cmd_emit_sample_mask)(struct anv_cmd_buffer *cmd_buffer) } } -#if GFX_VER >= 12 -static uint32_t -get_cps_state_offset(struct anv_device *device, bool cps_enabled, - const struct vk_fragment_shading_rate_state *fsr) -{ - if (!cps_enabled) - return device->cps_states.offset; - - uint32_t offset; - static const uint32_t size_index[] = { - [1] = 0, - [2] = 1, - [4] = 2, - }; - -#if GFX_VERx10 >= 125 - offset = - 1 + /* skip disabled */ - fsr->combiner_ops[0] * 5 * 3 * 3 + - fsr->combiner_ops[1] * 3 * 3 + - size_index[fsr->fragment_size.width] * 3 + - size_index[fsr->fragment_size.height]; -#else - offset = - 1 + /* skip disabled */ - size_index[fsr->fragment_size.width] * 3 + - size_index[fsr->fragment_size.height]; -#endif - - offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4; - - return device->cps_states.offset + offset; -} -#endif /* GFX_VER >= 12 */ - #if GFX_VER >= 11 static void genX(emit_shading_rate)(struct anv_batch *batch, @@ -365,85 +1864,6 @@ genX(emit_shading_rate)(struct anv_batch *batch, } #endif /* GFX_VER >= 11 */ -const uint32_t genX(vk_to_intel_blend)[] = { - [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO, - [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE, - [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR, - [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR, - [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR, - [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR, - [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA, - [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA, - 
[VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA, - [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA, - [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR, - [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR, - [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA, - [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA, - [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE, - [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR, - [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR, - [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA, - [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA, -}; - -static const uint32_t genX(vk_to_intel_blend_op)[] = { - [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD, - [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT, - [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT, - [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN, - [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX, -}; - -static void -genX(rasterization_mode)(VkPolygonMode raster_mode, - VkLineRasterizationModeEXT line_mode, - float line_width, - uint32_t *api_mode, - bool *msaa_rasterization_enable) -{ - if (raster_mode == VK_POLYGON_MODE_LINE) { - /* Unfortunately, configuring our line rasterization hardware on gfx8 - * and later is rather painful. Instead of giving us bits to tell the - * hardware what line mode to use like we had on gfx7, we now have an - * arcane combination of API Mode and MSAA enable bits which do things - * in a table which are expected to magically put the hardware into the - * right mode for your API. Sadly, Vulkan isn't any of the APIs the - * hardware people thought of so nothing works the way you want it to. - * - * Look at the table titled "Multisample Rasterization Modes" in Vol 7 - * of the Skylake PRM for more details. - */ - switch (line_mode) { - case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT: - *api_mode = DX101; -#if GFX_VER <= 9 - /* Prior to ICL, the algorithm the HW uses to draw wide lines - * doesn't quite match what the CTS expects, at least for rectangular - * lines, so we set this to false here, making it draw parallelograms - * instead, which work well enough. - */ - *msaa_rasterization_enable = line_width < 1.0078125; -#else - *msaa_rasterization_enable = true; -#endif - break; - - case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT: - case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT: - *api_mode = DX9OGL; - *msaa_rasterization_enable = false; - break; - - default: - unreachable("Unsupported line rasterization mode"); - } - } else { - *api_mode = DX101; - *msaa_rasterization_enable = true; - } -} - static void cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer) { @@ -495,30 +1915,6 @@ cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer) } } -static void -genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer) -{ -#if GFX_VERx10 >= 120 - /* Wa_16013994831 - Disable preemption during streamout, enable back - * again if XFB not used by the current pipeline. - * - * Although this workaround applies to Gfx12+, we already disable object - * level preemption for another reason in genX_state.c so we can skip this - * for Gfx12. 
- */ - if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831)) - return; - - if (cmd_buffer->state.gfx.pipeline->uses_xfb) { - genX(cmd_buffer_set_preemption)(cmd_buffer, false); - return; - } - - if (!cmd_buffer->state.gfx.object_preemption) - genX(cmd_buffer_set_preemption)(cmd_buffer, true); -#endif -} - static void cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer) { @@ -811,9 +2207,8 @@ cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer) 4 * (pipeline)->state.len); \ } while (0) - -void -genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer) +static void +genX(old_cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer) { struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; struct vk_dynamic_graphics_state *dyn = diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c index 4d5310bebca..f0a17df4baf 100644 --- a/src/intel/vulkan/genX_gpu_memcpy.c +++ b/src/intel/vulkan/genX_gpu_memcpy.c @@ -334,11 +334,39 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, 1ull << 32); #endif - /* Invalidate pipeline, xfb (for 3DSTATE_SO_BUFFER) & raster discard (for - * 3DSTATE_STREAMOUT). - */ - cmd_buffer->state.gfx.dirty |= (ANV_CMD_DIRTY_PIPELINE | - ANV_CMD_DIRTY_XFB_ENABLE); - BITSET_SET(cmd_buffer->vk.dynamic_graphics_state.dirty, - MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE); + /* Flag all the instructions emitted by the memcpy. */ + struct anv_gfx_dynamic_state *hw_state = + &cmd_buffer->state.gfx.dyn_state; + + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS); +#if GFX_VER >= 11 + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2); +#endif +#if GFX_VER >= 12 + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION); +#endif + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS); + if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) { + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL); + } + + cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_INDEX_BUFFER); } diff --git a/src/intel/vulkan/genX_simple_shader.h b/src/intel/vulkan/genX_simple_shader.h index 78cbb884728..f2568936ec0 100644 --- a/src/intel/vulkan/genX_simple_shader.h +++ b/src/intel/vulkan/genX_simple_shader.h @@ -293,12 +293,49 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state) state->cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; #endif + /* Flag all the instructions emitted by the memcpy. 
*/ + struct anv_gfx_dynamic_state *hw_state = + &state->cmd_buffer->state.gfx.dyn_state; + + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS); +#if GFX_VER >= 11 + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2); +#endif +#if GFX_VER >= 12 + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION); +#endif + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_BLEND); + if (device->vk.enabled_extensions.EXT_mesh_shader) { + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL); + BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL); + } + state->cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0); state->cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER | ANV_CMD_DIRTY_XFB_ENABLE); state->cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; state->cmd_buffer->state.gfx.push_constant_stages = VK_SHADER_STAGE_FRAGMENT_BIT; - vk_dynamic_graphics_state_dirty_all(&state->cmd_buffer->vk.dynamic_graphics_state); } static void
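The two hunks above (genX_gpu_memcpy.c and genX_simple_shader.h) show the other half of the scheme: any helper that emits 3D state outside of the dynamic-state tracker flags the ANV_GFX_STATE_* packets it clobbered, and the next genX(cmd_buffer_flush_gfx_hw_state)() call re-emits only those before clearing the dirty mask. A minimal sketch of that contract, reusing the structures introduced by this series (the helper name and the exact set of flagged bits are illustrative):

   /* Hypothetical utility that writes raw 3DSTATE_* packets behind the
    * tracker's back and then tells the tracker what it touched. */
   static void
   example_flag_clobbered_gfx_state(struct anv_cmd_buffer *cmd_buffer)
   {
      struct anv_gfx_dynamic_state *hw_state =
         &cmd_buffer->state.gfx.dyn_state;

      /* ... raw 3DSTATE_* emission for the utility operation goes here ... */

      /* Flag what was clobbered so the next flush restores only that. */
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
   }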