radeonsi: pack the variable block size in one SGPR, 10 bits per component

The side effect of this is that the compute copy image shader now has enough free user SGPRs that it passes the src image via user SGPRs, resulting in lower wave lifetime.

Previous copy shader:
    s_load_dwordx8
    image_load
    s_load_dwordx8
    s_waitcnt
    image_store

Current copy shader:
    image_load
    s_load_dwordx8
    s_waitcnt
    image_store

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9795>
2026-05-09 04:38:03 +02:00 · 2021-03-21 19:59:37 -04:00 · 2021-03-21 19:59:37 -04:00 · 95940459be
commit 95940459be
parent 034c1e4845
3 changed files with 17 additions and 9 deletions
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -132,8 +132,11 @@ static void si_create_compute_state_async(void *job, int thread_index)

   program->shader.is_monolithic = true;

+   /* Variable block sizes need 10 bits (1 + log2(SI_MAX_VARIABLE_THREADS_PER_BLOCK)) per dim.
+    * We pack them into a single user SGPR.
+    */
   unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) +
-                         (sel->info.uses_variable_block_size ? 3 : 0) +
+                         (sel->info.uses_variable_block_size ? 1 : 0) +
                         sel->info.base.cs.user_data_components_amd;

   /* Fast path for compute shaders - some descriptors passed via user SGPRs. */
@@ -707,7 +710,7 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
   unsigned block_size_reg = grid_size_reg +
                             /* 12 bytes = 3 dwords. */
                             12 * sel->info.uses_grid_size;
-   unsigned cs_user_data_reg = block_size_reg + 12 * program->sel.info.uses_variable_block_size;
+   unsigned cs_user_data_reg = block_size_reg + 4 * program->sel.info.uses_variable_block_size;

   radeon_begin(cs);

@@ -730,10 +733,8 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
   }

   if (sel->info.uses_variable_block_size) {
-      radeon_set_sh_reg_seq(cs, block_size_reg, 3);
-      radeon_emit(cs, info->block[0]);
-      radeon_emit(cs, info->block[1]);
-      radeon_emit(cs, info->block[2]);
+      radeon_set_sh_reg(cs, block_size_reg,
+                        info->block[0] | (info->block[1] << 10) | (info->block[2] << 20));
   }

   if (sel->info.base.cs.user_data_components_amd) {
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -702,7 +702,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
      if (shader->selector->info.uses_grid_size)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
      if (shader->selector->info.uses_variable_block_size)
-         ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
+         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size);

      unsigned cs_user_data_dwords =
         shader->selector->info.base.cs.user_data_components_amd;
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -412,8 +412,15 @@ static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
 {
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

-   assert(ctx->shader->selector->info.base.cs.local_size_variable);
-   return ac_get_arg(&ctx->ac, ctx->block_size);
+   assert(ctx->shader->selector->info.base.cs.local_size_variable &&
+          ctx->shader->selector->info.uses_variable_block_size);
+
+   LLVMValueRef chan[3] = {
+      si_unpack_param(ctx, ctx->block_size, 0, 10),
+      si_unpack_param(ctx, ctx->block_size, 10, 10),
+      si_unpack_param(ctx, ctx->block_size, 20, 10),
+   };
+   return ac_build_gather_values(&ctx->ac, chan, 3);
 }

 static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)