radeonsi: "create_dma_compute" shader in nir

Reviewed-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25782>
2026-06-17 19:38:21 +02:00 · 2023-10-05 06:49:58 -05:00 · 2023-10-05 06:49:58 -05:00 · 6584088cd5
commit 6584088cd5
parent d54c140eb6
5 changed files with 86 additions and 110 deletions
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@ -395,7 +395,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
      if (!sctx->cs_copy_buffer) {
         sctx->cs_copy_buffer = si_create_dma_compute_shader(
-            &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+            sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
      }
      si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
@ -409,7 +409,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
      if (!sctx->cs_clear_buffer) {
         sctx->cs_clear_buffer = si_create_dma_compute_shader(
-            &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+            sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
      }
      si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@ -1650,7 +1650,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
 /* si_shaderlib_tgsi.c */
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                        unsigned num_layers);
-void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
+void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
                                   bool dst_stream_cache_policy, bool is_copy);
 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
 void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@ -645,3 +645,85 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
   return create_shader_state(sctx, b.shader);
 }
 /* Create a compute shader implementing clear_buffer or copy_buffer. */
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
                                   bool dst_stream_cache_policy, bool is_copy)
 {
   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
   if (dst_stream_cache_policy)
      store_qualifier |= ACCESS_NON_TEMPORAL;
   /* Don't cache loads, because there is no reuse. */
   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_dma_compute");
   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
   b.shader->info.workgroup_size[0] = default_wave_size;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = 1;
   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
   for (unsigned i = 0; i < num_mem_ops; i++) {
      if (i * 4 < num_dwords_per_thread)
         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
   }
   /* If there are multiple stores,
    * the first store writes into 0 * wavesize + tid,
    * the 2nd store writes into 1 * wavesize + tid,
    * the 3rd store writes into 2 * wavesize + tid, etc.
    */
   nir_def *store_address = get_global_ids(&b, 1);
   /* Convert from a "store size unit" into bytes. */
   store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]);
   nir_def *load_address = store_address, *value, *values[num_mem_ops];
   value = nir_undef(&b, 1, 32);
   if (is_copy) {
      b.shader->info.num_ssbos++;
   } else {
      b.shader->info.cs.user_data_components_amd = inst_dwords[0];
      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]);
   }
   /* Distance between a load and a store for latency hiding. */
   unsigned load_store_distance = is_copy ? 8 : 0;
   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
      int d = i - load_store_distance;
      if (is_copy && i < num_mem_ops) {
         if (i) {
            load_address = nir_iadd(&b, load_address,
                                    nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
         }
         values[i] = nir_load_ssbo(&b, 4, 32, nir_imm_int(&b, 1),load_address,
                                   .access = load_qualifier);
      }
      if (d >= 0) {
         if (d) {
            store_address = nir_iadd(&b, store_address,
                                     nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
         }
         nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
                        .access = store_qualifier);
      }
   }
   return create_shader_state(sctx, b.shader);
 }
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@ -67,112 +67,6 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
   return *vs;
 }
 /* Create a compute shader implementing clear_buffer or copy_buffer. */
 void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
                                   bool dst_stream_cache_policy, bool is_copy)
 {
   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
   unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
   if (dst_stream_cache_policy)
      store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
   /* Don't cache loads, because there is no reuse. */
   unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
   for (unsigned i = 0; i < num_mem_ops; i++) {
      if (i * 4 < num_dwords_per_thread)
         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
   }
   struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
   if (!ureg)
      return NULL;
   unsigned default_wave_size = si_determine_wave_size(sscreen, NULL);
   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, default_wave_size);
   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
   struct ureg_src value;
   if (!is_copy) {
      ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);
      value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);
   }
   struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
   struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
   struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
   struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
   struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
   struct ureg_src srcbuf;
   struct ureg_src *values = NULL;
   if (is_copy) {
      srcbuf = ureg_DECL_buffer(ureg, 1, false);
      values = malloc(num_mem_ops * sizeof(struct ureg_src));
   }
   /* If there are multiple stores, the first store writes into 0*wavesize+tid,
    * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
    */
   ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, default_wave_size * num_mem_ops),
             tid);
   /* Convert from a "store size unit" into bytes. */
   ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0]));
   ureg_MOV(ureg, load_addr, ureg_src(store_addr));
   /* Distance between a load and a store for latency hiding. */
   unsigned load_store_distance = is_copy ? 8 : 0;
   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
      int d = i - load_store_distance;
      if (is_copy && i < num_mem_ops) {
         if (i) {
            ureg_UADD(ureg, load_addr, ureg_src(load_addr),
                      ureg_imm1u(ureg, 4 * inst_dwords[i] * default_wave_size));
         }
         values[i] = ureg_src(ureg_DECL_temporary(ureg));
         struct ureg_dst dst =
            ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i]));
         struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
         ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier,
                          TGSI_TEXTURE_BUFFER, 0);
      }
      if (d >= 0) {
         if (d) {
            ureg_UADD(ureg, store_addr, ureg_src(store_addr),
                      ureg_imm1u(ureg, 4 * inst_dwords[d] * default_wave_size));
         }
         struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
         struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value};
         ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier,
                          TGSI_TEXTURE_BUFFER, 0);
      }
   }
   ureg_END(ureg);
   struct pipe_compute_state state = {};
   state.ir_type = PIPE_SHADER_IR_TGSI;
   state.prog = ureg_get_tokens(ureg, NULL);
   void *cs = ctx->create_compute_state(ctx, &state);
   ureg_destroy(ureg);
   ureg_free_tokens(state.prog);
   free(values);
   return cs;
 }
 /* Create the compute shader that is used to collect the results.
 *
 * One compute grid with a single thread is launched for every query result
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@ -119,7 +119,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
         void *compute_shader = NULL;
         if (test_cs) {
-            compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+            compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
                                              cache_policy == L2_STREAM, is_copy);
         }