intel/blorp: Redescribe gfx12.5 surfaces for CCS fast clears

According to HSD 1407682962 and the associated simulator code,
fast-clear performance can be affected by: image alignment, tiling,
dimensionality, and row pitch. Redescribe surfaces in order to avoid
fast-clearing at a slower rate.

Also, benchmarking the main patch in the performance CI (hw=A750)
shows that some traces are helped significantly:
* TotalWarWarhammer3 +5.58% (n=2)
* Factorio +3.75% (n=1)
* TerminatorResistance +3.3% (n=2)
* Borderlands3 +3.23% (n=2)

We could additionally increase the alignment requirements of surfaces in
order to deterministically increase fast-clear performance. That's left
out of this patch in order to avoid any functional pitfalls that can
arise with increased memory consumption. As a result, performance will
continue to be affected by how ISL/drivers/apps configure main surface
memory alignments (directly or indirectly).

Thanks to Lionel Landwerlin for pointing me to the relevant simulator
code.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11168
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11418
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33776>
This commit is contained in:
Nanley Chery 2025-01-31 05:38:06 -05:00 committed by Marge Bot
parent 169e22f962
commit 312952048b
2 changed files with 126 additions and 11 deletions

View file

@ -164,7 +164,7 @@ blorp_surface_info_init(struct blorp_batch *batch,
info->addr = surf->addr;
info->aux_usage = surf->aux_usage;
if (info->aux_usage != ISL_AUX_USAGE_NONE) {
if (!blorp_address_is_null(surf->aux_addr)) {
info->aux_surf = *surf->aux_surf;
info->aux_addr = surf->aux_addr;
}

View file

@ -446,22 +446,21 @@ convert_rt_from_3d_to_2d(const struct isl_device *isl_dev,
info->surf.size_B = size_B;
}
void
blorp_fast_clear(struct blorp_batch *batch,
const struct blorp_surf *surf,
enum isl_format format, struct isl_swizzle swizzle,
uint32_t level, uint32_t start_layer, uint32_t num_layers,
uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1)
static void
fast_clear_surf(struct blorp_batch *batch,
const struct blorp_surf *surf,
enum isl_format format, struct isl_swizzle swizzle,
uint32_t level, uint32_t start_layer, uint32_t num_layers)
{
struct blorp_params params;
blorp_params_init(&params);
params.num_layers = num_layers;
assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0);
params.x0 = x0;
params.y0 = y0;
params.x1 = x1;
params.y1 = y1;
params.x0 = 0;
params.y0 = 0;
params.x1 = u_minify(surf->surf->logical_level0_px.w, level);
params.y1 = u_minify(surf->surf->logical_level0_px.h, level);
if (batch->blorp->isl_dev->info->ver >= 20) {
union isl_color_value clear_color =
@ -522,6 +521,122 @@ blorp_fast_clear(struct blorp_batch *batch,
batch->blorp->exec(batch, &params);
}
/**
 * Fast-clear a range of layers of one miplevel of a surface.
 *
 * The clear rectangle (x0, y0, x1, y1) must cover the whole miplevel; this
 * is only checked with assert(). When running on gfx12.5
 * (ISL_GFX_VERX10 == 125) and a byte range for the clear could be
 * determined, that range is re-described as one or more temporary surfaces
 * built with isl_surf_from_mem() and each of them is cleared through
 * fast_clear_surf(). In every other case the caller's surface is handed to
 * fast_clear_surf() unchanged.
 *
 * \param batch        batch the clear operations are emitted into
 * \param surf         surface to clear
 * \param format       format forwarded to fast_clear_surf() on the
 *                     non-redescribed path
 * \param swizzle      swizzle forwarded to fast_clear_surf()
 * \param level        miplevel to clear
 * \param start_layer  first layer to clear
 * \param num_layers   number of layers to clear
 * \param x0,y0,x1,y1  clear rectangle; must equal the full extent of \p level
 */
void
blorp_fast_clear(struct blorp_batch *batch,
const struct blorp_surf *surf,
enum isl_format format, struct isl_swizzle swizzle,
uint32_t level, uint32_t start_layer, uint32_t num_layers,
uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1)
{
/* Partial clears are not supported: the rectangle has to match the minified
 * dimensions of the requested level.
 */
assert(x0 == 0);
assert(y0 == 0);
assert(x1 == u_minify(surf->surf->logical_level0_px.w, level));
assert(y1 == u_minify(surf->surf->logical_level0_px.h, level));
/* We may want to perform a virtual address-based clear. Collect the memory
* range information to do that.
*/
/* size_B stays 0 when no byte range could be determined, which selects the
 * plain fast_clear_surf() path at the bottom of this function.
 */
int64_t size_B = 0;
/* Rows of level 0 left over after the 32-row-aligned part; these are cleared
 * separately with a coordinate-based clear.
 */
int unaligned_height = 0;
struct blorp_address addr = surf->addr;
/* Only single-sampled surfaces are considered for the address-based path. */
if (surf->surf->samples == 1) {
uint64_t start_tile_B, end_tile_B;
if (isl_surf_image_has_unique_tiles(surf->surf, level,
start_layer, num_layers,
&start_tile_B, &end_tile_B)) {
/* The helper reported start/end byte offsets for the requested slices
 * (presumably tiles not shared with other slices -- confirm against ISL).
 * Clear exactly that byte range.
 */
size_B = end_tile_B - start_tile_B;
addr.offset += start_tile_B;
} else if (level == 0 && start_layer == 0 && num_layers == 1) {
/* Fallback for the first level/layer only: clear the rows that are a
 * multiple of 32 by address and remember the remainder for later.
 */
assert(surf->surf->tiling == ISL_TILING_4 ||
surf->surf->tiling == ISL_TILING_Y0);
assert(surf->surf->levels > 1 ||
surf->surf->logical_level0_px.d > 1 ||
surf->surf->logical_level0_px.a > 1);
const int phys_height0 = ALIGN(surf->surf->logical_level0_px.h,
surf->surf->image_alignment_el.h);
unaligned_height = phys_height0 % 32;
/* NOTE(review): the type of row_pitch_B is not visible here. If it is a
 * 32-bit type, this product is evaluated in 32 bits before being widened
 * to int64_t -- confirm it cannot overflow for the largest surfaces.
 */
size_B = surf->surf->row_pitch_B * (phys_height0 - unaligned_height);
}
}
if (ISL_GFX_VERX10(batch->blorp->isl_dev) == 125 && size_B > 0) {
/* According to HSD 1407682962 and its simulator implementation, CCS
* fast-clears will operate at a slower rate if any of the following are
* true:
*
* 1) The clear rectangle covers less than 16KB of main surface data
* (i.e., less than 64B of CCS data).
* 2) The surface type is SURFTYPE_3D.
* 3) The surface tiling is Tile4 and either a) the base address is
* not aligned to 64KB OR b) the pitch is not aligned to 16-tiles.
*
* This slow-down can also occur on subrectangles within a larger clear
* rectangle. Redescribe this memory range to reduce the chance of
* slow-downs.
*/
const int _16k = 16 * 1024;
const int _64k = 64 * 1024;
/* Scratch surface description, rebuilt by isl_surf_from_mem() for every
 * chunk of the range; mem_surf points at it.
 */
struct isl_surf isl_surf;
struct blorp_surf mem_surf = {
.surf = &isl_surf,
.addr = addr,
.clear_color_addr = surf->clear_color_addr,
.aux_usage = surf->aux_usage,
};
/* Walk the byte range, describing and clearing one chunk per iteration.
 * NOTE(review): the loop only ends when size_B reaches exactly 0, so it
 * relies on isl_surf_from_mem() producing surfaces whose size_B values sum
 * to the original range -- confirm.
 */
do {
if (mem_surf.addr.offset % _64k == 0) {
/* The chunk starts on a 64KB boundary. */
if (size_B <= _16k * 16 * 32) {
/* The size fits within a single row of tiles. So, we can align
* the pitch as needed.
*/
isl_surf_from_mem(batch->blorp->isl_dev, &isl_surf,
mem_surf.addr.offset, size_B, ISL_TILING_4);
assert(isl_surf.logical_level0_px.h == 32);
assert(isl_surf.logical_level0_px.a == 1);
/* Round the pitch up to a multiple of 16 * 128 bytes (the "16-tiles"
 * pitch condition above, assuming 128B-wide Tile4 tiles -- TODO
 * confirm).
 */
isl_surf.row_pitch_B = ALIGN(isl_surf.row_pitch_B, 16 * 128);
} else {
/* Larger ranges are described as Tile64; condition 3 above is
 * specific to Tile4.
 */
isl_surf_from_mem(batch->blorp->isl_dev, &isl_surf,
mem_surf.addr.offset, size_B, ISL_TILING_64);
}
} else {
/* The chunk does not start on a 64KB boundary. Describe only the bytes
 * up to the next boundary, unless fewer than 16KB would remain after it,
 * in which case the whole remaining range is described at once.
 */
int size_to_64k_alignment =
align64(mem_surf.addr.offset, _64k) - mem_surf.addr.offset;
isl_surf_from_mem(batch->blorp->isl_dev, &isl_surf,
mem_surf.addr.offset,
size_B - size_to_64k_alignment < _16k ?
size_B : size_to_64k_alignment, ISL_TILING_4);
}
assert(isl_surf.dim == ISL_SURF_DIM_2D);
/* Clear level 0, starting at layer 0, for all array layers of the
 * re-described surface, using the format isl_surf_from_mem() picked.
 */
fast_clear_surf(batch, &mem_surf, isl_surf.format, swizzle,
0, 0, isl_surf.logical_level0_px.a);
/* Advance past the bytes that were just described and cleared. */
size_B -= isl_surf.size_B;
mem_surf.addr.offset += isl_surf.size_B;
} while (size_B != 0);
/* Use coordinate-based clears to clear the area that is not aligned to
* a tile.
*/
if (unaligned_height > 0) {
assert(level == 0 && start_layer == 0 && num_layers == 1);
assert(surf->surf->tiling == ISL_TILING_4);
/* Describe 32 rows' worth of bytes at the current offset, then shrink the
 * logical and physical heights to the leftover row count so that
 * fast_clear_surf() derives its clear rectangle from those rows only.
 */
isl_surf_from_mem(batch->blorp->isl_dev, &isl_surf,
mem_surf.addr.offset, surf->surf->row_pitch_B * 32,
ISL_TILING_4);
assert(isl_surf.logical_level0_px.h == 32);
isl_surf.logical_level0_px.h = unaligned_height;
isl_surf.phys_level0_sa.h = unaligned_height;
fast_clear_surf(batch, &mem_surf, isl_surf.format, swizzle,
0, 0, isl_surf.logical_level0_px.a);
}
} else {
/* No redescription: clear the caller's surface as given. */
fast_clear_surf(batch, surf, format, swizzle,
level, start_layer, num_layers);
}
}
bool
blorp_clear_supports_blitter(struct blorp_context *blorp,
const struct blorp_surf *surf,