radeonsi: replace the clear_12bytes_buffer shader with the DMA compute shader

It can handle 12-byte clear values with these trivial changes. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29053>
2026-05-05 00:58:05 +02:00 · 2024-04-27 07:41:38 -04:00 · 2024-04-27 07:41:38 -04:00 · b771d13557
commit b771d13557
parent 995e7d927c
3 changed files with 5 additions and 53 deletions
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@ -285,32 +285,6 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
                                 1, &sb, 0x1);
 }

-static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
-                                            unsigned dst_offset, unsigned size,
-                                            const uint32_t *clear_value, unsigned flags,
-                                            enum si_coherency coher)
-{
-   assert(dst_offset % 4 == 0);
-   assert(size % 4 == 0);
-   unsigned size_12 = DIV_ROUND_UP(size, 12);
-
-   struct pipe_shader_buffer sb = {0};
-   sb.buffer = dst;
-   sb.buffer_offset = dst_offset;
-   sb.buffer_size = size;
-
-   memcpy(sctx->cs_user_data, clear_value, 12);
-
-   struct pipe_grid_info info = {0};
-   set_work_size(&info, 64, 1, 1, size_12, 1, 1);
-
-   if (!sctx->cs_clear_12bytes_buffer)
-      sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(sctx);
-
-   si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_12bytes_buffer, flags, coher,
-                                 1, &sb, 0x1);
-}
-
 static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
                                        unsigned dst_offset, struct pipe_resource *src,
                                        unsigned src_offset, unsigned size,
@ -325,7 +299,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
   assert(!src || src_offset + size <= src->width0);

   bool is_copy = src != NULL;
-   unsigned dwords_per_thread = 4;
+   unsigned dwords_per_thread = clear_value_size == 12 ? 3 : 4;
   unsigned num_threads = DIV_ROUND_UP(size, dwords_per_thread * 4);

   struct pipe_grid_info info = {};
@ -342,13 +316,14 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
      sb[0].buffer_size = size;
   } else {
      assert(clear_value_size >= 4 && clear_value_size <= 16 &&
-             util_is_power_of_two_or_zero(clear_value_size));
+             (clear_value_size == 12 || util_is_power_of_two_or_zero(clear_value_size)));

      for (unsigned i = 0; i < 4; i++)
         sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
   }

-   void **shader = is_copy ? &sctx->cs_copy_buffer : &sctx->cs_clear_buffer;
+   void **shader = is_copy ? &sctx->cs_copy_buffer :
+                   clear_value_size == 12 ? &sctx->cs_clear_12bytes_buffer : &sctx->cs_clear_buffer;
   if (!*shader)
      *shader = si_create_dma_compute_shader(sctx, dwords_per_thread, !is_copy);

@ -377,11 +352,6 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
   if (util_lower_clearsize_to_dword(clear_value, (int*)&clear_value_size, &clamped))
      clear_value = &clamped;

-   if (clear_value_size == 12) {
-      si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, flags, coher);
-      return;
-   }
-
   uint64_t aligned_size = size & ~3ull;
   if (aligned_size >= 4) {
      uint64_t compute_min_size;
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@ -1727,7 +1727,6 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_
 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
 void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
-void *si_clear_12bytes_buffer_shader(struct si_context *sctx);
 void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array);
 void *si_create_query_result_cs(struct si_context *sctx);
 void *gfx11_create_sh_query_result_cs(struct si_context *sctx);
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@ -631,23 +631,6 @@ void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, un
   return create_shader_state(sctx, b.shader);
 }

-void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
-{
-   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
-                                                  "clear_12bytes_buffer");
-   b.shader->info.workgroup_size[0] = 64;
-   b.shader->info.workgroup_size[1] = 1;
-   b.shader->info.workgroup_size[2] = 1;
-   b.shader->info.cs.user_data_components_amd = 3;
-
-   nir_def *offset = nir_imul_imm(&b, get_global_ids(&b, 1), 12);
-   nir_def *value = nir_trim_vector(&b, nir_load_user_data_amd(&b), 3);
-
-   nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset);
-
-   return create_shader_state(sctx, b.shader);
-}
-
 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
 {
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
@ -672,7 +655,7 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
                                   bool is_clear)
 {
-   assert(util_is_power_of_two_nonzero(num_dwords_per_thread) && num_dwords_per_thread <= 4);
+   assert(num_dwords_per_thread && num_dwords_per_thread <= 4);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
                                                  "create_dma_compute");