intel/blorp: Redescribe gfx12.5 surfaces for CCS fast clears

According to HSD 1407682962 and the associated simulator code, fast-clear performance can be affected by: image alignment, tiling, dimensionality, and row pitch. Redescribe surfaces in order to avoid fast-clearing at a slower rate. Also, benchmarking the main patch in the performance CI (hw=A750) shows that some traces are helped significantly: * TotalWarWarhammer3 +5.58% (n=2) * Factorio +3.75% (n=1) * TerminatorResistance +3.3% (n=2) * Borderlands3 +3.23% (n=2) We could additionally increase the alignment requirements of surfaces in order to deterministically increase fast-clear performance. That's left out of this patch in order to avoid any functional pitfalls that can arise with increased memory consumption. As a result, performance will continue to be affected by how ISL/drivers/apps configure main surface memory alignments (directly or indirectly). Thanks to Lionel Landwerlin for pointing me to the relevant simulator code. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11168 Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11418 Reviewed-by: Rohan Garg <rohan.garg@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33776>
2026-02-17 02:00:28 +01:00 · 2025-01-31 05:38:06 -05:00 · 2025-01-31 05:38:06 -05:00 · 312952048b
commit 312952048b
parent 169e22f962
2 changed files with 126 additions and 11 deletions
--- a/src/intel/blorp/blorp.c
+++ b/src/intel/blorp/blorp.c
@ -164,7 +164,7 @@ blorp_surface_info_init(struct blorp_batch *batch,
   info->addr = surf->addr;

   info->aux_usage = surf->aux_usage;
-   if (info->aux_usage != ISL_AUX_USAGE_NONE) {
+   if (!blorp_address_is_null(surf->aux_addr)) {
      info->aux_surf = *surf->aux_surf;
      info->aux_addr = surf->aux_addr;
   }
--- a/src/intel/blorp/blorp_clear.c
+++ b/src/intel/blorp/blorp_clear.c
@ -446,22 +446,21 @@ convert_rt_from_3d_to_2d(const struct isl_device *isl_dev,
   info->surf.size_B = size_B;
 }

-void
-blorp_fast_clear(struct blorp_batch *batch,
-                 const struct blorp_surf *surf,
-                 enum isl_format format, struct isl_swizzle swizzle,
-                 uint32_t level, uint32_t start_layer, uint32_t num_layers,
-                 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1)
+static void
+fast_clear_surf(struct blorp_batch *batch,
+                const struct blorp_surf *surf,
+                enum isl_format format, struct isl_swizzle swizzle,
+                uint32_t level, uint32_t start_layer, uint32_t num_layers)
 {
   struct blorp_params params;
   blorp_params_init(&params);
   params.num_layers = num_layers;
   assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0);

-   params.x0 = x0;
-   params.y0 = y0;
-   params.x1 = x1;
-   params.y1 = y1;
+   params.x0 = 0;
+   params.y0 = 0;
+   params.x1 = u_minify(surf->surf->logical_level0_px.w, level);
+   params.y1 = u_minify(surf->surf->logical_level0_px.h, level);

   if (batch->blorp->isl_dev->info->ver >= 20) {
      union isl_color_value clear_color =
@ -522,6 +521,122 @@ blorp_fast_clear(struct blorp_batch *batch,
   batch->blorp->exec(batch, &params);
 }

+void
+blorp_fast_clear(struct blorp_batch *batch,
+                 const struct blorp_surf *surf,
+                 enum isl_format format, struct isl_swizzle swizzle,
+                 uint32_t level, uint32_t start_layer, uint32_t num_layers,
+                 uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1)
+{
+   assert(x0 == 0);
+   assert(y0 == 0);
+   assert(x1 == u_minify(surf->surf->logical_level0_px.w, level));
+   assert(y1 == u_minify(surf->surf->logical_level0_px.h, level));
+
+   /* We may want to perform a virtual address-based clear. Collect the memory
+    * range information to do that.
+    *
+    * The asserts above mean only whole-miplevel rectangles are accepted, so
+    * the rectangle parameters are not consulted again below.
+    *
+    * size_B stays 0 when no suitable range is found; in that case the
+    * original surface is cleared directly at the bottom of this function.
+    * Only single-sampled surfaces are considered. Two cases produce a range:
+    *
+    *    - isl_surf_image_has_unique_tiles() succeeds: the range runs between
+    *      the start/end tile offsets it reports, and addr is moved forward
+    *      to the start offset.
+    *    - Otherwise, for a clear of only level 0, layer 0: the range covers
+    *      the rows of level 0 (height padded to image_alignment_el.h) up to
+    *      the last multiple of 32 rows. The leftover rows are remembered in
+    *      unaligned_height and cleared separately further down.
+    *
+    * NOTE(review): 32 is presumably the tile height in rows for the tilings
+    * asserted below - confirm.
+    */
+   int64_t size_B = 0;
+   int unaligned_height = 0;
+   struct blorp_address addr = surf->addr;
+   if (surf->surf->samples == 1) {
+      uint64_t start_tile_B, end_tile_B;
+      if (isl_surf_image_has_unique_tiles(surf->surf, level,
+                                          start_layer, num_layers,
+                                          &start_tile_B, &end_tile_B)) {
+         size_B = end_tile_B - start_tile_B;
+         addr.offset += start_tile_B;
+      } else if (level == 0 && start_layer == 0 && num_layers == 1) {
+         assert(surf->surf->tiling == ISL_TILING_4 ||
+                surf->surf->tiling == ISL_TILING_Y0);
+         assert(surf->surf->levels > 1 ||
+                surf->surf->logical_level0_px.d > 1 ||
+                surf->surf->logical_level0_px.a > 1);
+         const int phys_height0 = ALIGN(surf->surf->logical_level0_px.h,
+                                        surf->surf->image_alignment_el.h);
+         unaligned_height = phys_height0 % 32;
+         size_B = surf->surf->row_pitch_B * (phys_height0 - unaligned_height);
+      }
+   }
+
+   if (ISL_GFX_VERX10(batch->blorp->isl_dev) == 125 && size_B > 0) {
+      /* According to HSD 1407682962 and its simulator implementation, CCS
+       * fast-clears will operate at a slower rate if any of the following are
+       * true:
+       *
+       *    1) The clear rectangle covers less than 16KB of main surface data
+       *       (i.e., less than 64B of CCS data).
+       *    2) The surface type is SURFTYPE_3D.
+       *    3) The surface tiling is Tile4 and either a) the base address is
+       *       not aligned to 64KB OR b) the pitch is not aligned to 16-tiles.
+       *
+       * This slow-down can also occur on subrectangles within a larger clear
+       * rectangle. Redescribe this memory range to reduce the chance of
+       * slow-downs.
+       *
+       * mem_surf points at the local isl_surf, so every isl_surf_from_mem()
+       * call below changes what mem_surf describes. It copies the caller's
+       * clear color address and aux usage but sets no aux surface/address.
+       *
+       * Each loop iteration describes the front of the remaining range as a
+       * fresh surface, fast-clears it, and then advances the offset and
+       * shrinks size_B by isl_surf.size_B:
+       *
+       *    - Offset 64KB-aligned and at most 16KB * 16 * 32 bytes left:
+       *      one Tile4 surface whose pitch is rounded up to 16 * 128 bytes.
+       *    - Offset 64KB-aligned and more bytes left: a Tile64 surface.
+       *    - Offset not 64KB-aligned: a Tile4 surface requested to end at
+       *      the next 64KB boundary, or to take the whole remainder when
+       *      fewer than 16KB would remain past that boundary (including
+       *      when the range ends before the boundary).
+       *
+       * NOTE(review): the loop only exits when size_B reaches exactly 0, so
+       * it relies on isl_surf_from_mem() never describing more bytes than
+       * remain - confirm.
+       */
+      const int _16k = 16 * 1024;
+      const int _64k = 64 * 1024;
+      struct isl_surf isl_surf;
+      struct blorp_surf mem_surf = {
+         .surf = &isl_surf,
+         .addr = addr,
+         .clear_color_addr = surf->clear_color_addr,
+         .aux_usage = surf->aux_usage,
+      };
+
+      do {
+         if (mem_surf.addr.offset % _64k == 0) {
+            if (size_B <= _16k * 16 * 32) {
+               /* The size fits within a single row of tiles. So, we can align
+                * the pitch as needed.
+                *
+                * The asserts check that isl_surf_from_mem() produced a
+                * 32-row, single-layer surface before the pitch is widened
+                * to a multiple of 16 * 128 bytes (presumably 16 Tile4 tile
+                * widths, per condition 3b above - confirm).
+                */
+               isl_surf_from_mem(batch->blorp->isl_dev, &isl_surf,
+                                 mem_surf.addr.offset, size_B, ISL_TILING_4);
+               assert(isl_surf.logical_level0_px.h == 32);
+               assert(isl_surf.logical_level0_px.a == 1);
+               isl_surf.row_pitch_B = ALIGN(isl_surf.row_pitch_B, 16 * 128);
+            } else {
+               isl_surf_from_mem(batch->blorp->isl_dev, &isl_surf,
+                                 mem_surf.addr.offset, size_B, ISL_TILING_64);
+            }
+         } else {
+            int size_to_64k_alignment =
+               align64(mem_surf.addr.offset, _64k) - mem_surf.addr.offset;
+            isl_surf_from_mem(batch->blorp->isl_dev, &isl_surf,
+                              mem_surf.addr.offset,
+                              size_B - size_to_64k_alignment < _16k ?
+                              size_B : size_to_64k_alignment, ISL_TILING_4);
+         }
+
+         assert(isl_surf.dim == ISL_SURF_DIM_2D);
+         fast_clear_surf(batch, &mem_surf, isl_surf.format, swizzle,
+                         0, 0, isl_surf.logical_level0_px.a);
+
+         size_B -= isl_surf.size_B;
+         mem_surf.addr.offset += isl_surf.size_B;
+      } while (size_B != 0);
+
+      /* Use coordinate-based clears to clear the area that is not aligned to
+       * a tile.
+       *
+       * mem_surf.addr.offset was advanced past everything the loop cleared,
+       * so this surface begins where the leftover rows start. It is built
+       * from a size of 32 rows at the original row pitch and then has its
+       * logical and physical heights shrunk to unaligned_height before the
+       * clear is issued.
+       *
+       * NOTE(review): the earlier assert on this path also admits
+       * ISL_TILING_Y0 while this one requires Tile4; presumably only Tile4
+       * occurs on gfx12.5 - confirm.
+       */
+      if (unaligned_height > 0) {
+         assert(level == 0 && start_layer == 0 && num_layers == 1);
+         assert(surf->surf->tiling == ISL_TILING_4);
+         isl_surf_from_mem(batch->blorp->isl_dev, &isl_surf,
+                           mem_surf.addr.offset, surf->surf->row_pitch_B * 32,
+                           ISL_TILING_4);
+         assert(isl_surf.logical_level0_px.h == 32);
+         isl_surf.logical_level0_px.h = unaligned_height;
+         isl_surf.phys_level0_sa.h = unaligned_height;
+         fast_clear_surf(batch, &mem_surf, isl_surf.format, swizzle,
+                         0, 0, isl_surf.logical_level0_px.a);
+      }
+   } else {
+      fast_clear_surf(batch, surf, format, swizzle,
+                      level, start_layer, num_layers);
+   }
+}
+
 bool
 blorp_clear_supports_blitter(struct blorp_context *blorp,
                             const struct blorp_surf *surf,