mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-28 11:50:23 +01:00
radeonsi: use better workgroup sizes for compute blits to improve perf
It depends on the copy area and the tiling of the destination image. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28917>
This commit is contained in:
parent
269ab6cc62
commit
144fe156ef
2 changed files with 50 additions and 7 deletions
|
|
@ -1051,6 +1051,9 @@ static bool si_should_blit_clamp_xy(const struct pipe_blit_info *info)
|
|||
|
||||
bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, bool testing)
|
||||
{
|
||||
struct si_texture *sdst = (struct si_texture *)info->dst.resource;
|
||||
bool is_3d_tiling = sdst->surface.thick_tiling;
|
||||
|
||||
/* Compute blits require D16 right now (see the ISA).
|
||||
*
|
||||
* Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit.
|
||||
|
|
@ -1097,10 +1100,52 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
|
|||
image[1].u.tex.first_layer = 0;
|
||||
image[1].u.tex.last_layer = util_max_layer(info->dst.resource, info->dst.level);
|
||||
|
||||
unsigned width = info->dst.box.width;
|
||||
unsigned height = info->dst.box.height;
|
||||
unsigned depth = info->dst.box.depth;
|
||||
unsigned block_x, block_y, block_z;
|
||||
|
||||
/* Choose the block (i.e. wave) dimensions based on the copy area size and the image layout
|
||||
* of dst.
|
||||
*/
|
||||
if (is_3d_tiling) {
|
||||
/* Thick tiling. (microtiles are 3D boxes)
|
||||
* If the box height and depth is > 2, the block size will be 4x4x4.
|
||||
* If not, the threads will spill over to X.
|
||||
*/
|
||||
block_y = util_next_power_of_two(MIN2(height, 4));
|
||||
block_z = util_next_power_of_two(MIN2(depth, 4));
|
||||
block_x = 64 / (block_y * block_z);
|
||||
} else if (sdst->surface.is_linear) {
|
||||
/* If the box width is > 128B, the block size will be 64x1 for bpp <= 4, 32x2 for bpp == 8,
|
||||
* and 16x4 for bpp == 16.
|
||||
* If not, the threads will spill over to Y, then Z if they aren't small.
|
||||
*
|
||||
* This is derived from the fact that the linear image layout has 256B linear blocks, and
|
||||
* longer blocks don't benefit linear write performance, but they hurt tiled read performance.
|
||||
* We want to prioritize blocks that are 256Bx2 over 512Bx1 because the source can be tiled.
|
||||
*
|
||||
* Using the cache line size (128B) instead of hardcoding 256B makes linear blits slower.
|
||||
*/
|
||||
block_x = util_next_power_of_two(MIN3(width, 64, 256 / sdst->surface.bpe));
|
||||
block_y = util_next_power_of_two(MIN2(height, 64 / block_x));
|
||||
block_z = util_next_power_of_two(MIN2(depth, 64 / (block_x * block_y)));
|
||||
block_x = 64 / (block_y * block_z);
|
||||
} else {
|
||||
/* Thin tiling. (microtiles are 2D rectangles)
|
||||
* If the box width and height is > 4, the block size will be 8x8.
|
||||
* If Y is <= 4, the threads will spill over to X.
|
||||
* If X is <= 4, the threads will spill over to Y, then Z if they aren't small.
|
||||
*/
|
||||
block_y = util_next_power_of_two(MIN2(height, 8));
|
||||
block_x = util_next_power_of_two(MIN2(width, 64 / block_y));
|
||||
block_y = util_next_power_of_two(MIN2(height, 64 / block_x));
|
||||
block_z = util_next_power_of_two(MIN2(depth, 64 / (block_x * block_y)));
|
||||
block_x = 64 / (block_y * block_z);
|
||||
}
|
||||
|
||||
struct pipe_grid_info grid = {0};
|
||||
unsigned wg_dim =
|
||||
set_work_size(&grid, 8, 8, 1, info->dst.box.width, info->dst.box.height,
|
||||
info->dst.box.depth);
|
||||
unsigned wg_dim = set_work_size(&grid, block_x, block_y, block_z, width, height, depth);
|
||||
|
||||
/* Get the shader key. */
|
||||
const struct util_format_description *dst_desc = util_format_description(info->dst.format);
|
||||
|
|
|
|||
|
|
@ -410,10 +410,8 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
|
|||
BITSET_SET(b.shader->info.msaa_images, 0);
|
||||
if (options->dst_is_msaa)
|
||||
BITSET_SET(b.shader->info.msaa_images, 1);
|
||||
/* TODO: 1D blits are 8x slower because the workgroup size is 8x8 */
|
||||
b.shader->info.workgroup_size[0] = 8;
|
||||
b.shader->info.workgroup_size[1] = 8;
|
||||
b.shader->info.workgroup_size[2] = 1;
|
||||
/* The workgroup size varies depending on the tiling layout and blit dimensions. */
|
||||
b.shader->info.workgroup_size_variable = true;
|
||||
b.shader->info.cs.user_data_components_amd = 3;
|
||||
|
||||
const struct glsl_type *img_type[2] = {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue