From e41fec78126e467a99938c122976e8a6b1a8f666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 16 Jul 2024 02:15:34 -0400 Subject: [PATCH] radeonsi: align waves to 256B clear/copy area for the clear/copy_buffer shader This is about 10% faster in certain unaligned cases. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_compute_blit.c | 14 +++++++- src/gallium/drivers/radeonsi/si_pipe.h | 2 ++ .../drivers/radeonsi/si_shaderlib_nir.c | 32 ++++++++++++++++--- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 5120581a39e..e1dd7ea427e 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -406,6 +406,16 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource unsigned num_threads = DIV_ROUND_UP(dst_align_offset + size, dwords_per_thread * 4); key.dst_single_thread_unaligned = num_threads == 1 && dst_align_offset && key.dst_last_thread_bytes; + /* start_thread offsets threads to make sure all non-zero waves start clearing/copying from + * the beginning of a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block + * partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256. + * Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n. + */ + unsigned start_thread = + dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ? 
+ DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0; + key.has_start_thread = start_thread != 0; + void *shader = _mesa_hash_table_u64_search(sctx->cs_dma_shaders, key.key); if (!shader) { shader = si_create_dma_compute_shader(sctx, &key); @@ -415,9 +425,11 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource /* Set the value of the last thread ID, so that the shader knows which thread is the last one. */ if (key.dst_last_thread_bytes) sctx->cs_user_data[num_user_data_terms++] = num_threads - 1; + if (key.has_start_thread) + sctx->cs_user_data[num_user_data_terms++] = start_thread; struct pipe_grid_info info = {}; - set_work_size(&info, 64, 1, 1, num_threads, 1, 1); + set_work_size(&info, 64, 1, 1, start_thread + num_threads, 1, 1); si_launch_grid_internal_ssbos(sctx, &info, shader, flags, coher, is_copy ? 2 : 1, sb, is_copy ? 0x2 : 0x1); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 6a05c0146a7..1898fb3c6db 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1655,6 +1655,8 @@ union si_cs_clear_copy_buffer_key { unsigned dst_align_offset:4; /* the first thread shouldn't write this many bytes */ unsigned dst_last_thread_bytes:4; /* if non-zero, the last thread should write this many bytes */ bool dst_single_thread_unaligned:1; /* only 1 thread executes, both previous fields apply */ + bool has_start_thread:1; /* whether the first few threads should be skipped, making later + waves start on a 256B boundary */ }; uint64_t key; }; diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index ad3a3428920..3296927039a 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -284,14 +284,35 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co b.shader->info.workgroup_size[1] = 
1; b.shader->info.workgroup_size[2] = 1; b.shader->info.num_ssbos = key->is_clear ? 1 : 2; - b.shader->info.cs.user_data_components_amd = - (key->is_clear ? (key->clear_value_size_is_12 ? 3 : key->dwords_per_thread) : 0); + b.shader->info.cs.user_data_components_amd = 0; + + if (key->is_clear) { + b.shader->info.cs.user_data_components_amd += + key->clear_value_size_is_12 ? 3 : key->dwords_per_thread; + } /* Add the last thread ID value. */ + unsigned last_thread_user_data_index = b.shader->info.cs.user_data_components_amd; if (key->dst_last_thread_bytes) - b.shader->info.cs.user_data_components_amd = key->is_clear ? 5 : 1; + b.shader->info.cs.user_data_components_amd++; + + unsigned start_thread_user_data_index = b.shader->info.cs.user_data_components_amd; + if (key->has_start_thread) + b.shader->info.cs.user_data_components_amd++; nir_def *thread_id = ac_get_global_ids(&b, 1, 32); + + /* If the clear/copy area is unaligned, we launched extra threads at the beginning to make it + * aligned. Skip those threads here. + */ + nir_if *if_positive = NULL; + if (key->has_start_thread) { + nir_def *start_thread = + nir_channel(&b, nir_load_user_data_amd(&b), start_thread_user_data_index); + thread_id = nir_isub(&b, thread_id, start_thread); + if_positive = nir_push_if(&b, nir_ige_imm(&b, thread_id, 0)); + } + /* Convert the global thread ID into bytes. */ nir_def *offset = nir_imul_imm(&b, thread_id, 4 * key->dwords_per_thread); nir_def *value; @@ -446,7 +467,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co if (key->dst_last_thread_bytes) { nir_def *last_thread_id = - nir_channel(&b, nir_load_user_data_amd(&b), key->is_clear ? 
4 : 0); + nir_channel(&b, nir_load_user_data_amd(&b), last_thread_user_data_index); if_last_thread = nir_push_if(&b, nir_ieq(&b, thread_id, last_thread_id)); { @@ -517,6 +538,9 @@ void *si_create_dma_compute_shader(struct si_context *sctx, union si_cs_clear_co } } + if (key->has_start_thread) + nir_pop_if(&b, if_positive); + return si_create_shader_state(sctx, b.shader); }