From 92497d1c8fe8ca4e09f99826326d010436ec1ccd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 26 Apr 2024 19:44:11 -0400
Subject: [PATCH] radeonsi: minor simplifications of clear/copy_buffer shaders

- always use L2_LRU (never use ACCESS_NON_TEMPORAL) - for better perf
- never use ACCESS_COHERENT because the address might not be aligned to
  a cache line
- assume the wave size is always 64

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29053>
---
 .../drivers/radeonsi/si_compute_blit.c        | 12 ++---
 src/gallium/drivers/radeonsi/si_pipe.h        |  4 +-
 .../drivers/radeonsi/si_shaderlib_nir.c       | 47 ++++++-------------
 .../drivers/radeonsi/si_test_dma_perf.c       |  3 +-
 4 files changed, 21 insertions(+), 45 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 355cd33b8d0..a007035604a 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -192,7 +192,7 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
                                    unsigned writeable_bitmask)
 {
    if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+      sctx->flags |= si_get_flush_flags(sctx, coher, L2_LRU);
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }
 
@@ -351,16 +351,14 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
    sb[0].buffer_offset = dst_offset;
    sb[0].buffer_size = size;
 
-   bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
-
    if (src) {
       sb[1].buffer = src;
       sb[1].buffer_offset = src_offset;
       sb[1].buffer_size = size;
 
       if (!sctx->cs_copy_buffer) {
-         sctx->cs_copy_buffer = si_create_dma_compute_shader(
-            sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+         sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD,
+                                                             true);
       }
 
       si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
@@ -373,8 +371,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
          sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
 
       if (!sctx->cs_clear_buffer) {
-         sctx->cs_clear_buffer = si_create_dma_compute_shader(
-            sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+         sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD,
+                                                              false);
       }
 
       si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index bbe7ae1005f..a12bcbc199f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -49,8 +49,6 @@ struct ac_llvm_compiler;
 /* Tunables for compute-based clear_buffer and copy_buffer: */
 #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
 #define SI_COMPUTE_COPY_DW_PER_THREAD  4
-/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */
-#define SI_COMPUTE_DST_CACHE_POLICY    L2_LRU
 
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS  (1 << 0)
@@ -1729,7 +1727,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                         unsigned num_layers);
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
-                                   bool dst_stream_cache_policy, bool is_copy);
+                                   bool is_copy);
 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
 void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
index da81df00278..44c739fb297 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -231,9 +231,7 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx)
    /* data |= clear_value_masked; */
    data = nir_ior(&b, data, nir_channel(&b, user_sgprs, 0));
 
-   nir_store_ssbo(&b, data, zero, address,
-      .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0,
-      .align_mul = 4);
+   nir_store_ssbo(&b, data, zero, address, .align_mul = 4);
 
    return create_shader_state(sctx, b.shader);
 }
@@ -645,25 +643,16 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
    nir_def *offset = nir_imul_imm(&b, get_global_ids(&b, 1), 12);
    nir_def *value = nir_trim_vector(&b, nir_load_user_data_amd(&b), 3);
 
-   nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset,
-      .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0);
+   nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset);
 
    return create_shader_state(sctx, b.shader);
 }
 
 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
 {
-   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
-
-   /* Don't cache loads, because there is no reuse. */
-   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
-
    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
                                                   "ubyte_to_ushort");
-
-   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
-
-   b.shader->info.workgroup_size[0] = default_wave_size;
+   b.shader->info.workgroup_size[0] = 64;
    b.shader->info.workgroup_size[1] = 1;
    b.shader->info.workgroup_size[2] = 1;
    b.shader->info.num_ssbos = 2;
@@ -672,32 +661,24 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
    nir_def *store_address = nir_imul_imm(&b, load_address, 2);
 
    nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
-                                        load_address, .access = load_qualifier);
-   nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
-                  store_address, .access = store_qualifier);
+                                        load_address, .access = ACCESS_RESTRICT);
+   nir_store_ssbo(&b, nir_u2u16(&b, ubyte_value), nir_imm_int(&b, 0),
+                  store_address, .access = ACCESS_RESTRICT);
 
    return create_shader_state(sctx, b.shader);
 }
 
 /* Create a compute shader implementing clear_buffer or copy_buffer. */
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
-                                   bool dst_stream_cache_policy, bool is_copy)
+                                   bool is_copy)
 {
    assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
 
-   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
-   if (dst_stream_cache_policy)
-      store_qualifier |= ACCESS_NON_TEMPORAL;
-
-   /* Don't cache loads, because there is no reuse. */
-   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
-
    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
                                                   "create_dma_compute");
+   unsigned wg_size = 64;
 
-   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
-
-   b.shader->info.workgroup_size[0] = default_wave_size;
+   b.shader->info.workgroup_size[0] = wg_size;
    b.shader->info.workgroup_size[1] = 1;
    b.shader->info.workgroup_size[2] = 1;
    b.shader->info.num_ssbos = 1;
@@ -717,7 +698,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_
     */
    nir_def *store_address =
       nir_iadd(&b, nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b), 0),
-                                default_wave_size * num_mem_ops),
+                                wg_size * num_mem_ops),
                nir_channel(&b, nir_load_local_invocation_id(&b), 0));
 
    /* Convert from a "store size unit" into bytes. */
@@ -741,19 +722,19 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_
       if (is_copy && i < num_mem_ops) {
          if (i) {
             load_address = nir_iadd(&b, load_address,
-                                    nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
+                                    nir_imm_int(&b, 4 * inst_dwords[i] * wg_size));
          }
          values[i] = nir_load_ssbo(&b, inst_dwords[i], 32, nir_imm_int(&b, 1), load_address,
-                                   .access = load_qualifier);
+                                   .access = ACCESS_RESTRICT);
       }
 
       if (d >= 0) {
          if (d) {
             store_address = nir_iadd(&b, store_address,
-                                     nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
+                                     nir_imm_int(&b, 4 * inst_dwords[d] * wg_size));
          }
          nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
-                        .access = store_qualifier);
+                        .access = ACCESS_RESTRICT);
       }
    }
 
diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
index ac6846122ac..3d58b6704d3 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -120,8 +120,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
 
          void *compute_shader = NULL;
          if (test_cs) {
-            compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
-                                              cache_policy == L2_STREAM, is_copy);
+            compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread, is_copy);
          }
 
          double score = 0;