From 92497d1c8fe8ca4e09f99826326d010436ec1ccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 26 Apr 2024 19:44:11 -0400 Subject: [PATCH] radeonsi: minor simplifications of clear/copy_buffer shaders - always use L2_LRU (never use ACCESS_NON_TEMPORAL) - for better perf - never use ACCESS_COHERENT because the address might not be aligned to a cache line - assume the wave size is always 64 Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_compute_blit.c | 12 ++--- src/gallium/drivers/radeonsi/si_pipe.h | 4 +- .../drivers/radeonsi/si_shaderlib_nir.c | 47 ++++++------------- .../drivers/radeonsi/si_test_dma_perf.c | 3 +- 4 files changed, 21 insertions(+), 45 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 355cd33b8d0..a007035604a 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -192,7 +192,7 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf unsigned writeable_bitmask) { if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) { - sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + sctx->flags |= si_get_flush_flags(sctx, coher, L2_LRU); si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } @@ -351,16 +351,14 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res sb[0].buffer_offset = dst_offset; sb[0].buffer_size = size; - bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU; - if (src) { sb[1].buffer = src; sb[1].buffer_offset = src_offset; sb[1].buffer_size = size; if (!sctx->cs_copy_buffer) { - sctx->cs_copy_buffer = si_create_dma_compute_shader( - sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true); + sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD, + true); } si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher, @@ -373,8 +371,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)]; if (!sctx->cs_clear_buffer) { - sctx->cs_clear_buffer = si_create_dma_compute_shader( - sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false); + sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, + false); } si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index bbe7ae1005f..a12bcbc199f 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -49,8 +49,6 @@ struct ac_llvm_compiler; /* Tunables for compute-based clear_buffer and copy_buffer: */ #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4 #define SI_COMPUTE_COPY_DW_PER_THREAD 4 -/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */ -#define SI_COMPUTE_DST_CACHE_POLICY L2_LRU /* Pipeline & streamout query controls. */ #define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) @@ -1729,7 +1727,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers); void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, - bool dst_stream_cache_policy, bool is_copy); + bool is_copy); void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx); void *si_create_clear_buffer_rmw_cs(struct si_context *sctx); void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index da81df00278..44c739fb297 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -231,9 +231,7 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx) /* data |= clear_value_masked; */ data = nir_ior(&b, data, nir_channel(&b, user_sgprs, 0)); - nir_store_ssbo(&b, data, zero, address, - .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0, - .align_mul = 4); + nir_store_ssbo(&b, data, zero, address, .align_mul = 4); return create_shader_state(sctx, b.shader); } @@ -645,25 +643,16 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx) nir_def *offset = nir_imul_imm(&b, get_global_ids(&b, 1), 12); nir_def *value = nir_trim_vector(&b, nir_load_user_data_amd(&b), 3); - nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset, - .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0); + nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset); return create_shader_state(sctx, b.shader); } void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx) { - unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT; - - /* Don't cache loads, because there is no reuse. */ - unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL; - nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options, "ubyte_to_ushort"); - - unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL); - - b.shader->info.workgroup_size[0] = default_wave_size; + b.shader->info.workgroup_size[0] = 64; b.shader->info.workgroup_size[1] = 1; b.shader->info.workgroup_size[2] = 1; b.shader->info.num_ssbos = 2; @@ -672,32 +661,24 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx) nir_def *store_address = nir_imul_imm(&b, load_address, 2); nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1), - load_address, .access = load_qualifier); - nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0), - store_address, .access = store_qualifier); + load_address, .access = ACCESS_RESTRICT); + nir_store_ssbo(&b, nir_u2u16(&b, ubyte_value), nir_imm_int(&b, 0), + store_address, .access = ACCESS_RESTRICT); return create_shader_state(sctx, b.shader); } /* Create a compute shader implementing clear_buffer or copy_buffer. */ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, - bool dst_stream_cache_policy, bool is_copy) + bool is_copy) { assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); - unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT; - if (dst_stream_cache_policy) - store_qualifier |= ACCESS_NON_TEMPORAL; - - /* Don't cache loads, because there is no reuse. */ - unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL; - nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options, "create_dma_compute"); + unsigned wg_size = 64; - unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL); - - b.shader->info.workgroup_size[0] = default_wave_size; + b.shader->info.workgroup_size[0] = wg_size; b.shader->info.workgroup_size[1] = 1; b.shader->info.workgroup_size[2] = 1; b.shader->info.num_ssbos = 1; @@ -717,7 +698,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_ */ nir_def *store_address = nir_iadd(&b, nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b), 0), - default_wave_size * num_mem_ops), + wg_size * num_mem_ops), nir_channel(&b, nir_load_local_invocation_id(&b), 0)); /* Convert from a "store size unit" into bytes. */ @@ -741,19 +722,19 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_ if (is_copy && i < num_mem_ops) { if (i) { load_address = nir_iadd(&b, load_address, - nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size)); + nir_imm_int(&b, 4 * inst_dwords[i] * wg_size)); } values[i] = nir_load_ssbo(&b, inst_dwords[i], 32, nir_imm_int(&b, 1), load_address, - .access = load_qualifier); + .access = ACCESS_RESTRICT); } if (d >= 0) { if (d) { store_address = nir_iadd(&b, store_address, - nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size)); + nir_imm_int(&b, 4 * inst_dwords[d] * wg_size)); } nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address, - .access = store_qualifier); + .access = ACCESS_RESTRICT); } } diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index ac6846122ac..3d58b6704d3 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -120,8 +120,7 @@ void si_test_dma_perf(struct si_screen *sscreen) void *compute_shader = NULL; if (test_cs) { - compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread, - cache_policy == L2_STREAM, is_copy); + compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread, is_copy); } double score = 0;