radeonsi: don't use threadID.yz/blockID.yz for compute_blit if they're always 0

This can improve performance because fewer VGPRs and SGPRs need to be initialized. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
2025-12-26 17:10:11 +01:00 · 2023-08-06 22:15:42 -04:00 · 2023-08-06 22:15:42 -04:00 · f3398683f2
commit f3398683f2
parent 3952b89ebb
3 changed files with 9 additions and 4 deletions
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@ -1121,6 +1121,11 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
   image[1].u.tex.first_layer = 0;
   image[1].u.tex.last_layer = util_max_layer(info->dst.resource, info->dst.level);

+   struct pipe_grid_info grid = {0};
+   unsigned wg_dim =
+      set_work_size(&grid, 8, 8, 1, info->dst.box.width, info->dst.box.height,
+                    info->dst.box.depth);
+
   /* Get the shader key. */
   const struct util_format_description *dst_desc = util_format_description(info->dst.format);
   unsigned i = util_format_get_first_non_void_channel(info->dst.format);
@ -1128,6 +1133,7 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
   options.key = 0;

   options.always_true = true;
+   options.wg_dim = wg_dim;
   options.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D ||
                       info->src.resource->target == PIPE_TEXTURE_1D_ARRAY;
   options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D ||
@ -1177,9 +1183,6 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info,
   sctx->cs_user_data[1] = (info->src.box.y & 0xffff) | ((info->dst.box.y & 0xffff) << 16);
   sctx->cs_user_data[2] = (info->src.box.z & 0xffff) | ((info->dst.box.z & 0xffff) << 16);

-   struct pipe_grid_info grid = {0};
-   set_work_size(&grid, 8, 8, 1, info->dst.box.width, info->dst.box.height, info->dst.box.depth);
-
   si_launch_grid_internal_images(sctx, image, 2, &grid, shader,
                                  SI_OP_SYNC_BEFORE_AFTER |
                                  (info->render_condition_enable ? SI_OP_CS_RENDER_COND_ENABLE : 0));
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@ -1593,6 +1593,7 @@ union si_compute_blit_shader_key {
      /* The key saved in _mesa_hash_table_create_u32_keys() can't be 0. */
      bool always_true:1;
      /* Declaration modifiers. */
+      uint8_t wg_dim:2; /* 1, 2, or 3 */
      bool src_is_1d:1;
      bool dst_is_1d:1;
      bool src_is_msaa:1;
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@ -438,6 +438,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
      BITSET_SET(b.shader->info.msaa_images, 0);
   if (options->dst_is_msaa)
      BITSET_SET(b.shader->info.msaa_images, 1);
+   /* TODO: 1D blits are 8x slower because the workgroup size is 8x8 */
   b.shader->info.workgroup_size[0] = 8;
   b.shader->info.workgroup_size[1] = 8;
   b.shader->info.workgroup_size[2] = 1;
@ -462,7 +463,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha

   /* Instructions. */
   /* Let's work with 0-based src and dst coordinates (thread IDs) first. */
-   nir_def *dst_xyz = get_global_ids(&b, 3);
+   nir_def *dst_xyz = nir_pad_vector_imm_int(&b, get_global_ids(&b, options->wg_dim), 0, 3);
   nir_def *src_xyz = dst_xyz;

   /* Flip src coordinates. */