mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 09:18:04 +02:00
radeonsi: align waves to 256B clear/copy area for the clear/copy_buffer shader
This is about 10% faster in certain unaligned cases. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30208>
This commit is contained in:
parent
2f9201e91b
commit
e41fec7812
3 changed files with 43 additions and 5 deletions
|
|
@ -406,6 +406,16 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
|
|||
unsigned num_threads = DIV_ROUND_UP(dst_align_offset + size, dwords_per_thread * 4);
|
||||
key.dst_single_thread_unaligned = num_threads == 1 && dst_align_offset && key.dst_last_thread_bytes;
|
||||
|
||||
/* start_thread offsets threads to make sure all non-zero waves start clearing/copying from
|
||||
* the beginning of a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block
|
||||
* partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256.
|
||||
* Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n.
|
||||
*/
|
||||
unsigned start_thread =
|
||||
dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ?
|
||||
DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0;
|
||||
key.has_start_thread = start_thread != 0;
|
||||
|
||||
void *shader = _mesa_hash_table_u64_search(sctx->cs_dma_shaders, key.key);
|
||||
if (!shader) {
|
||||
shader = si_create_dma_compute_shader(sctx, &key);
|
||||
|
|
@ -415,9 +425,11 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
|
|||
/* Set the value of the last thread ID, so that the shader knows which thread is the last one. */
|
||||
if (key.dst_last_thread_bytes)
|
||||
sctx->cs_user_data[num_user_data_terms++] = num_threads - 1;
|
||||
if (key.has_start_thread)
|
||||
sctx->cs_user_data[num_user_data_terms++] = start_thread;
|
||||
|
||||
struct pipe_grid_info info = {};
|
||||
set_work_size(&info, 64, 1, 1, num_threads, 1, 1);
|
||||
set_work_size(&info, 64, 1, 1, start_thread + num_threads, 1, 1);
|
||||
|
||||
si_launch_grid_internal_ssbos(sctx, &info, shader, flags, coher, is_copy ? 2 : 1, sb,
|
||||
is_copy ? 0x2 : 0x1);
|
||||
|
|
|
|||
|
|
@ -1655,6 +1655,8 @@ union si_cs_clear_copy_buffer_key {
|
|||
unsigned dst_align_offset:4; /* the first thread shouldn't write this many bytes */
|
||||
unsigned dst_last_thread_bytes:4; /* if non-zero, the last thread should write this many bytes */
|
||||
bool dst_single_thread_unaligned:1; /* only 1 thread executes, both previous fields apply */
|
||||
bool has_start_thread:1; /* whether the first few threads should be skipped, making later
|
||||
waves start on a 256B boundary */
|
||||
};
|
||||
uint64_t key;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -284,14 +284,35 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
|
|||
b.shader->info.workgroup_size[1] = 1;
|
||||
b.shader->info.workgroup_size[2] = 1;
|
||||
b.shader->info.num_ssbos = key->is_clear ? 1 : 2;
|
||||
b.shader->info.cs.user_data_components_amd =
|
||||
(key->is_clear ? (key->clear_value_size_is_12 ? 3 : key->dwords_per_thread) : 0);
|
||||
b.shader->info.cs.user_data_components_amd = 0;
|
||||
|
||||
if (key->is_clear) {
|
||||
b.shader->info.cs.user_data_components_amd +=
|
||||
key->clear_value_size_is_12 ? 3 : key->dwords_per_thread;
|
||||
}
|
||||
|
||||
/* Add the last thread ID value. */
|
||||
unsigned last_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
|
||||
if (key->dst_last_thread_bytes)
|
||||
b.shader->info.cs.user_data_components_amd = key->is_clear ? 5 : 1;
|
||||
b.shader->info.cs.user_data_components_amd++;
|
||||
|
||||
unsigned start_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
|
||||
if (key->has_start_thread)
|
||||
b.shader->info.cs.user_data_components_amd++;
|
||||
|
||||
nir_def *thread_id = ac_get_global_ids(&b, 1, 32);
|
||||
|
||||
/* If the clear/copy area is unaligned, we launched extra threads at the beginning to make it
|
||||
* aligned. Skip those threads here.
|
||||
*/
|
||||
nir_if *if_positive = NULL;
|
||||
if (key->has_start_thread) {
|
||||
nir_def *start_thread =
|
||||
nir_channel(&b, nir_load_user_data_amd(&b), start_thread_user_data_index);
|
||||
thread_id = nir_isub(&b, thread_id, start_thread);
|
||||
if_positive = nir_push_if(&b, nir_ige_imm(&b, thread_id, 0));
|
||||
}
|
||||
|
||||
/* Convert the global thread ID into bytes. */
|
||||
nir_def *offset = nir_imul_imm(&b, thread_id, 4 * key->dwords_per_thread);
|
||||
nir_def *value;
|
||||
|
|
@ -446,7 +467,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
|
|||
|
||||
if (key->dst_last_thread_bytes) {
|
||||
nir_def *last_thread_id =
|
||||
nir_channel(&b, nir_load_user_data_amd(&b), key->is_clear ? 4 : 0);
|
||||
nir_channel(&b, nir_load_user_data_amd(&b), last_thread_user_data_index);
|
||||
|
||||
if_last_thread = nir_push_if(&b, nir_ieq(&b, thread_id, last_thread_id));
|
||||
{
|
||||
|
|
@ -517,6 +538,9 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co
|
|||
}
|
||||
}
|
||||
|
||||
if (key->has_start_thread)
|
||||
nir_pop_if(&b, if_positive);
|
||||
|
||||
return si_create_shader_state(sctx, b.shader);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue