hasvk/pipeline: Rip out SKL+

v2: Fix incorrect DispatchMode removal (Lionel) Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19852>
2026-05-02 20:58:04 +02:00 · 2022-09-02 22:16:05 -05:00 · 2022-09-02 22:16:05 -05:00 · cc68b7cd94
commit cc68b7cd94
parent 91090e39af
1 changed files with 13 additions and 245 deletions
--- a/src/intel/vulkan_hasvk/genX_pipeline.c
+++ b/src/intel/vulkan_hasvk/genX_pipeline.c
@ -354,11 +354,6 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
   };

-#if GFX_VER >= 9
-   for (unsigned i = 0; i < 32; i++)
-      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
-#endif
-
 #if GFX_VER >= 8
   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
@ -584,16 +579,12 @@ genX(rasterization_mode)(VkPolygonMode raster_mode,
      switch (line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         *api_mode = DX100;
-#if GFX_VER <= 9
-         /* Prior to ICL, the algorithm the HW uses to draw wide lines
-          * doesn't quite match what the CTS expects, at least for rectangular
-          * lines, so we set this to false here, making it draw parallelograms
-          * instead, which work well enough.
+         /* The algorithm the HW uses to draw wide lines doesn't quite match
+          * what the CTS expects, at least for rectangular lines, so we set
+          * this to false here, making it draw parallelograms instead, which
+          * work well enough.
          */
         *msaa_rasterization_enable = line_width < 1.0078125;
-#else
-         *msaa_rasterization_enable = true;
-#endif
         break;

      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
@ -652,10 +643,6 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline,
   sf.LineStippleEnable = rs->line.stipple.enable;
 #endif

-#if GFX_VER >= 12
-   sf.DerefBlockSize = urb_deref_block_size;
-#endif
-
   bool point_from_shader;
   const struct brw_vue_prog_data *last_vue_prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);
@ -692,19 +679,10 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline,
   raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs->polygon_mode];
   raster.ScissorRectangleEnable = true;

-#if GFX_VER >= 9
-   /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */
-   raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable;
-   raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable;
-#elif GFX_VER >= 8
+#if GFX_VER >= 8
   raster.ViewportZClipTestEnable = pipeline->depth_clip_enable;
 #endif

-#if GFX_VER >= 9
-   raster.ConservativeRasterizationEnable =
-      rs->conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
-#endif
-
 #if GFX_VER == 7
   /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it
    * can get the depth offsets correct.
@ -1380,44 +1358,20 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
 #endif

      assert(!vs_prog_data->base.base.use_alt_mode);
-#if GFX_VER < 11
      vs.SingleVertexDispatch       = false;
-#endif
      vs.VectorMaskEnable           = false;
      /* Wa_1606682166:
       * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
       * Disable the Sampler state prefetch functionality in the SARB by
       * programming 0xB000[30] to '1'.
       */
-      vs.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin);
+      vs.SamplerCount               = get_sampler_count(vs_bin);
      vs.BindingTableEntryCount     = vs_bin->bind_map.surface_count;
      vs.FloatingPointMode          = IEEE754;
      vs.IllegalOpcodeExceptionEnable = false;
      vs.SoftwareExceptionEnable    = false;
      vs.MaximumNumberofThreads     = devinfo->max_vs_threads - 1;

-      if (GFX_VER == 9 && devinfo->gt == 4 &&
-          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-         /* On Sky Lake GT4, we have experienced some hangs related to the VS
-          * cache and tessellation.  It is unknown exactly what is happening
-          * but the Haswell docs for the "VS Reference Count Full Force Miss
-          * Enable" field of the "Thread Mode" register refer to a HSW bug in
-          * which the VUE handle reference count would overflow resulting in
-          * internal reference counting bugs.  My (Jason's) best guess is that
-          * this bug cropped back up on SKL GT4 when we suddenly had more
-          * threads in play than any previous gfx9 hardware.
-          *
-          * What we do know for sure is that setting this bit when
-          * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
-          * City when playing with DXVK (https://bugs.freedesktop.org/107280).
-          * Disabling the vertex cache with tessellation shaders should only
-          * have a minor performance impact as the tessellation shaders are
-          * likely generating and processing far more geometry than the vertex
-          * stage.
-          */
-         vs.VertexCacheDisable = true;
-      }
-
      vs.VertexURBEntryReadLength      = vs_prog_data->base.urb_read_length;
      vs.VertexURBEntryReadOffset      = 0;
      vs.DispatchGRFStartRegisterForURBData =
@ -1430,14 +1384,9 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
         vs_prog_data->base.cull_distance_mask;
 #endif

-#if GFX_VERx10 >= 125
-      vs.ScratchSpaceBuffer =
-         get_scratch_surf(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
-#else
      vs.PerThreadScratchSpace   = get_scratch_space(vs_bin);
      vs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
-#endif
   }
 }

@ -1466,19 +1415,9 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
      hs.StatisticsEnable = true;
      hs.KernelStartPointer = tcs_bin->kernel.offset;
      /* Wa_1606682166 */
-      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
+      hs.SamplerCount = get_sampler_count(tcs_bin);
      hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;

-#if GFX_VER >= 12
-      /* Wa_1604578095:
-       *
-       *    Hang occurs when the number of max threads is less than 2 times
-       *    the number of instance count. The number of max threads must be
-       *    more than 2 times the number of instance count.
-       */
-      assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
-#endif
-
      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
      hs.IncludeVertexHandles = true;
      hs.InstanceCount = tcs_prog_data->instances - 1;
@ -1487,31 +1426,10 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
      hs.VertexURBEntryReadOffset = 0;
      hs.DispatchGRFStartRegisterForURBData =
         tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
-#if GFX_VER >= 12
-      hs.DispatchGRFStartRegisterForURBData5 =
-         tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
-#endif

-#if GFX_VERx10 >= 125
-      hs.ScratchSpaceBuffer =
-         get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
-#else
      hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
      hs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
-#endif
-
-#if GFX_VER == 12
-      /*  Patch Count threshold specifies the maximum number of patches that
-       *  will be accumulated before a thread dispatch is forced.
-       */
-      hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
-#endif
-
-#if GFX_VER >= 9
-      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
-      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
-#endif
   }

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) {
@ -1534,16 +1452,6 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
      te.TEEnable = true;
      te.MaximumTessellationFactorOdd = 63.0;
      te.MaximumTessellationFactorNotOdd = 64.0;
-#if GFX_VERx10 >= 125
-      te.TessellationDistributionMode = TEDMODE_RR_FREE;
-      te.TessellationDistributionLevel = TEDLEVEL_PATCH;
-      /* 64_TRIANGLES */
-      te.SmallPatchThreshold = 3;
-      /* 1K_TRIANGLES */
-      te.TargetBlockSize = 8;
-      /* 1K_TRIANGLES */
-      te.LocalBOPAccumulatorThreshold = 1;
-#endif
   }

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) {
@ -1551,7 +1459,7 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
      ds.StatisticsEnable = true;
      ds.KernelStartPointer = tes_bin->kernel.offset;
      /* Wa_1606682166 */
-      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
+      ds.SamplerCount = get_sampler_count(tes_bin);
      ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;

@ -1564,15 +1472,10 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
         tes_prog_data->base.base.dispatch_grf_start_reg;

 #if GFX_VER >= 8
-#if GFX_VER < 11
      ds.DispatchMode =
         tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
-            DISPATCH_MODE_SIMD8_SINGLE_PATCH :
-            DISPATCH_MODE_SIMD4X2;
-#else
-      assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
-      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
-#endif
+         DISPATCH_MODE_SIMD8_SINGLE_PATCH :
+         DISPATCH_MODE_SIMD4X2;

      ds.UserClipDistanceClipTestEnableBitmask =
         tes_prog_data->base.clip_distance_mask;
@ -1580,17 +1483,9 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
         tes_prog_data->base.cull_distance_mask;
 #endif

-#if GFX_VER >= 12
-      ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
-#endif
-#if GFX_VERx10 >= 125
-      ds.ScratchSpaceBuffer =
-         get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
-#else
      ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
      ds.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
-#endif
   }
 }

@ -1617,7 +1512,7 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
      gs.SingleProgramFlow       = false;
      gs.VectorMaskEnable        = false;
      /* Wa_1606682166 */
-      gs.SamplerCount            = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin);
+      gs.SamplerCount            = get_sampler_count(gs_bin);
      gs.BindingTableEntryCount  = gs_bin->bind_map.surface_count;
      gs.IncludeVertexHandles    = gs_prog_data->base.include_vue_handles;
      gs.IncludePrimitiveID      = gs_prog_data->include_primitive_id;
@ -1655,14 +1550,9 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
         gs_prog_data->base.cull_distance_mask;
 #endif

-#if GFX_VERx10 >= 125
-      gs.ScratchSpaceBuffer =
-         get_scratch_surf(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
-#else
      gs.PerThreadScratchSpace   = get_scratch_space(gs_bin);
      gs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
-#endif
   }
 }

@ -1810,20 +1700,6 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
      ps._16PixelDispatchEnable     = wm_prog_data->dispatch_16;
      ps._32PixelDispatchEnable     = wm_prog_data->dispatch_32;

-      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
-       *
-       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
-       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
-       *
-       * Since 16x MSAA is first introduced on SKL, we don't need to apply
-       * the workaround on any older hardware.
-       */
-      if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch &&
-          ms != NULL && ms->rasterization_samples == 16) {
-         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
-         ps._32PixelDispatchEnable = false;
-      }
-
      ps.KernelStartPointer0 = fs_bin->kernel.offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
      ps.KernelStartPointer1 = fs_bin->kernel.offset +
@ -1835,7 +1711,7 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
      ps.VectorMaskEnable           = GFX_VER >= 8 &&
                                      wm_prog_data->uses_vmask;
      /* Wa_1606682166 */
-      ps.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
+      ps.SamplerCount               = get_sampler_count(fs_bin);
      ps.BindingTableEntryCount     = fs_bin->bind_map.surface_count;
      ps.PushConstantEnable         = wm_prog_data->base.nr_params > 0 ||
                                      wm_prog_data->base.ubo_ranges[0].length;
@ -1868,14 +1744,9 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
      ps.DispatchGRFStartRegisterForConstantSetupData2 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);

-#if GFX_VERx10 >= 125
-      ps.ScratchSpaceBuffer =
-         get_scratch_surf(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
-#else
      ps.PerThreadScratchSpace   = get_scratch_space(fs_bin);
      ps.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
-#endif
   }
 }

@ -1911,35 +1782,7 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
                                         rp->stencil_self_dependency ||
                                         wm_prog_data->uses_kill;

-#if GFX_VER >= 9
-      ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
-      ps.PixelShaderPullsBary    = wm_prog_data->pulls_bary;
-
-      ps.InputCoverageMaskState = ICMS_NONE;
-      assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
-      if (!wm_prog_data->uses_sample_mask)
-         ps.InputCoverageMaskState = ICMS_NONE;
-      else if (wm_prog_data->per_coarse_pixel_dispatch)
-         ps.InputCoverageMaskState  = ICMS_NORMAL;
-      else if (wm_prog_data->post_depth_coverage)
-         ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
-      else
-         ps.InputCoverageMaskState = ICMS_NORMAL;
-#else
      ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
-#endif
-
-#if GFX_VER >= 11
-      ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
-         wm_prog_data->uses_depth_w_coefficients;
-      ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch;
-#endif
-#if GFX_VERx10 >= 125
-      /* TODO: We should only require this when the last geometry shader uses
-       *       a fragment shading rate that is not constant.
-       */
-      ps.EnablePSDependencyOnCPsizeChange = wm_prog_data->per_coarse_pixel_dispatch;
-#endif
   }
 }
 #endif
@ -1986,37 +1829,6 @@ compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
      (ms && ms->alpha_to_coverage_enable);
 }

-#if GFX_VER == 12
-static void
-emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
-                                   const struct vk_render_pass_state *rp)
-{
-   const int replication_count =
-      anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots;
-
-   assert(replication_count >= 1);
-   if (replication_count == 1) {
-      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
-      return;
-   }
-
-   uint32_t view_mask = rp->view_mask;
-   assert(replication_count == util_bitcount(view_mask));
-   assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
-
-   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
-      pr.ReplicaMask = (1 << replication_count) - 1;
-      pr.ReplicationCount = replication_count - 1;
-
-      int i = 0;
-      u_foreach_bit(view_index, rp->view_mask) {
-         pr.RTAIOffset[i] = view_index;
-         i++;
-      }
-   }
-}
-#endif
-
 void
 genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
                             const struct vk_graphics_pipeline_state *state)
@ -2033,10 +1845,6 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,

   emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs);

-#if GFX_VER == 12
-   emit_3dstate_primitive_replication(pipeline, state->rp);
-#endif
-
 #if 0
   /* From gfx7_vs_state.c */

@ -2075,28 +1883,6 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
 #endif
 }

-#if GFX_VERx10 >= 125
-
-void
-genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
-{
-   struct anv_device *device = pipeline->base.device;
-   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
-   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
-
-   const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
-   const struct intel_device_info *devinfo = device->info;
-
-   anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
-      cfe.MaximumNumberofThreads =
-         devinfo->max_cs_threads * devinfo->subslice_total;
-      cfe.ScratchSpaceBuffer =
-         get_scratch_surf(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
-   }
-}
-
-#else /* #if GFX_VERx10 >= 125 */
-
 void
 genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
 {
@ -2123,12 +1909,8 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      vfe.NumberofURBEntries     = GFX_VER <= 7 ? 0 : 2;
-#if GFX_VER < 11
      vfe.ResetGatewayTimer      = true;
-#endif
-#if GFX_VER <= 8
      vfe.BypassGatewayControl   = true;
-#endif
      vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2;
      vfe.CURBEAllocationSize    = vfe_curbe_allocation;

@ -2163,7 +1945,7 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
         brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),

      /* Wa_1606682166 */
-      .SamplerCount           = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
+      .SamplerCount           = get_sampler_count(cs_bin),
      /* We add 1 because the CS indirect parameters buffer isn't accounted
       * for in bind_map.surface_count.
       */
@ -2180,18 +1962,6 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
 #endif
-#if GFX_VER >= 12
-      /* TODO: Check if we are missing workarounds and enable mid-thread
-       * preemption.
-       *
-       * We still have issues with mid-thread preemption (it was already
-       * disabled by the kernel on gfx11, due to missing workarounds). It's
-       * possible that we are just missing some workarounds, and could enable
-       * it later, but for now let's disable it to fix a GPU in compute in Car
-       * Chase (and possibly more).
-       */
-      .ThreadPreemptionDisable = true,
-#endif

      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
   };
@ -2199,5 +1969,3 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
                                        pipeline->interface_descriptor_data,
                                        &desc);
 }
-
-#endif /* #if GFX_VERx10 >= 125 */