radeonsi: align waves to 256B clear/copy area for the clear/copy_buffer shader

This is about 10% faster in certain unaligned cases.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30208>
This commit is contained in:
Marek Olšák 2024-07-16 02:15:34 -04:00 committed by Marge Bot
parent 2f9201e91b
commit e41fec7812
3 changed files with 43 additions and 5 deletions

View file

@@ -406,6 +406,16 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
unsigned num_threads = DIV_ROUND_UP(dst_align_offset + size, dwords_per_thread * 4);
key.dst_single_thread_unaligned = num_threads == 1 && dst_align_offset && key.dst_last_thread_bytes;
/* start_thread offsets threads to make sure all non-zero waves start clearing/copying from
 * the beginning of a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block
* partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256.
* Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n.
*/
unsigned start_thread =
dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ?
DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0;
key.has_start_thread = start_thread != 0;
void *shader = _mesa_hash_table_u64_search(sctx->cs_dma_shaders, key.key);
if (!shader) {
shader = si_create_dma_compute_shader(sctx, &key);
@@ -415,9 +425,11 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
/* Set the value of the last thread ID, so that the shader knows which thread is the last one. */
if (key.dst_last_thread_bytes)
sctx->cs_user_data[num_user_data_terms++] = num_threads - 1;
if (key.has_start_thread)
sctx->cs_user_data[num_user_data_terms++] = start_thread;
struct pipe_grid_info info = {};
set_work_size(&info, 64, 1, 1, num_threads, 1, 1);
set_work_size(&info, 64, 1, 1, start_thread + num_threads, 1, 1);
si_launch_grid_internal_ssbos(sctx, &info, shader, flags, coher, is_copy ? 2 : 1, sb,
is_copy ? 0x2 : 0x1);

View file

@@ -1655,6 +1655,8 @@ union si_cs_clear_copy_buffer_key {
unsigned dst_align_offset:4; /* the first thread shouldn't write this many bytes */
unsigned dst_last_thread_bytes:4; /* if non-zero, the last thread should write this many bytes */
bool dst_single_thread_unaligned:1; /* only 1 thread executes, both previous fields apply */
bool has_start_thread:1; /* whether the first few threads should be skipped, making later
waves start on a 256B boundary */
};
uint64_t key;
};

View file

@@ -284,14 +284,35 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
b.shader->info.num_ssbos = key->is_clear ? 1 : 2;
b.shader->info.cs.user_data_components_amd =
(key->is_clear ? (key->clear_value_size_is_12 ? 3 : key->dwords_per_thread) : 0);
b.shader->info.cs.user_data_components_amd = 0;
if (key->is_clear) {
b.shader->info.cs.user_data_components_amd +=
key->clear_value_size_is_12 ? 3 : key->dwords_per_thread;
}
/* Add the last thread ID value. */
unsigned last_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
if (key->dst_last_thread_bytes)
b.shader->info.cs.user_data_components_amd = key->is_clear ? 5 : 1;
b.shader->info.cs.user_data_components_amd++;
unsigned start_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
if (key->has_start_thread)
b.shader->info.cs.user_data_components_amd++;
nir_def *thread_id = ac_get_global_ids(&b, 1, 32);
/* If the clear/copy area is unaligned, we launched extra threads at the beginning to make it
* aligned. Skip those threads here.
*/
nir_if *if_positive = NULL;
if (key->has_start_thread) {
nir_def *start_thread =
nir_channel(&b, nir_load_user_data_amd(&b), start_thread_user_data_index);
thread_id = nir_isub(&b, thread_id, start_thread);
if_positive = nir_push_if(&b, nir_ige_imm(&b, thread_id, 0));
}
/* Convert the global thread ID into bytes. */
nir_def *offset = nir_imul_imm(&b, thread_id, 4 * key->dwords_per_thread);
nir_def *value;
@@ -446,7 +467,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
if (key->dst_last_thread_bytes) {
nir_def *last_thread_id =
nir_channel(&b, nir_load_user_data_amd(&b), key->is_clear ? 4 : 0);
nir_channel(&b, nir_load_user_data_amd(&b), last_thread_user_data_index);
if_last_thread = nir_push_if(&b, nir_ieq(&b, thread_id, last_thread_id));
{
@@ -517,6 +538,9 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
}
}
if (key->has_start_thread)
nir_pop_if(&b, if_positive);
return si_create_shader_state(sctx, b.shader);
}