radv: precompute compute/task shader register values

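Compute the compute/task shader register values once, when the shader binary config is post-processed, to make emission faster. Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29014>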
2025-12-24 17:30:12 +01:00 · 2024-05-02 09:11:38 +02:00 · 2024-05-02 09:11:38 +02:00 · 3b41fbd4b8
commit 3b41fbd4b8
parent 0549649bcf
5 changed files with 50 additions and 10 deletions
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@ -1916,12 +1916,11 @@ radv_emit_compute_shader(const struct radv_physical_device *pdev, struct radeon_
      radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
   }

-   radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, radv_get_compute_resource_limits(pdev, shader));
-
+   radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, shader->info.regs.cs.compute_resource_limits);
   radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
-   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
-   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
+   radeon_emit(cs, shader->info.regs.cs.compute_num_thread_x);
+   radeon_emit(cs, shader->info.regs.cs.compute_num_thread_y);
+   radeon_emit(cs, shader->info.regs.cs.compute_num_thread_z);
 }

 static void
--- a/src/amd/vulkan/radv_pipeline_compute.c
+++ b/src/amd/vulkan/radv_pipeline_compute.c
@ -38,7 +38,7 @@
 #include "vk_format.h"

 uint32_t
-radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader *cs)
+radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader_info *info)
 {
   unsigned threads_per_threadgroup;
   unsigned threadgroups_per_cu = 1;
@ -46,8 +46,8 @@ radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const
   unsigned max_waves_per_sh = 0;

   /* Calculate best compute resource limits. */
-   threads_per_threadgroup = cs->info.cs.block_size[0] * cs->info.cs.block_size[1] * cs->info.cs.block_size[2];
-   waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, cs->info.wave_size);
+   threads_per_threadgroup = info->cs.block_size[0] * info->cs.block_size[1] * info->cs.block_size[2];
+   waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, info->wave_size);

   if (pdev->info.gfx_level >= GFX10 && waves_per_threadgroup == 1)
      threadgroups_per_cu = 2;
@ -69,7 +69,7 @@ radv_get_compute_pipeline_metadata(const struct radv_device *device, const struc
   metadata->rsrc1 = cs->config.rsrc1;
   metadata->rsrc2 = cs->config.rsrc2;
   metadata->rsrc3 = cs->config.rsrc3;
-   metadata->compute_resource_limits = radv_get_compute_resource_limits(pdev, cs);
+   metadata->compute_resource_limits = radv_get_compute_resource_limits(pdev, &cs->info);
   metadata->block_size_x = cs->info.cs.block_size[0];
   metadata->block_size_y = cs->info.cs.block_size[1];
   metadata->block_size_z = cs->info.cs.block_size[2];
--- a/src/amd/vulkan/radv_pipeline_compute.h
+++ b/src/amd/vulkan/radv_pipeline_compute.h
@ -15,6 +15,7 @@

 struct radv_physical_device;
 struct radv_shader_binary;
+struct radv_shader_info;

 struct radv_compute_pipeline {
   struct radv_pipeline base;
@ -42,7 +43,7 @@ struct radv_compute_pipeline_metadata {
   uint64_t inline_push_const_mask;
 };

-uint32_t radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader *cs);
+uint32_t radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader_info *info);

 void radv_get_compute_pipeline_metadata(const struct radv_device *device, const struct radv_compute_pipeline *pipeline,
                                        struct radv_compute_pipeline_metadata *metadata);
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -1461,6 +1461,33 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader_binar
 }
 #endif

+static void
+radv_precompute_registers_hw_cs(struct radv_device *device, struct radv_shader_binary *binary)
+{
+   const struct radv_physical_device *pdev = radv_device_physical(device);
+   struct radv_shader_info *info = &binary->info;
+   /* Store the resource-limits value and the three NUM_THREAD_FULL-encoded block sizes in info->regs.cs. */
+   info->regs.cs.compute_resource_limits = radv_get_compute_resource_limits(pdev, info);
+   info->regs.cs.compute_num_thread_x = S_00B81C_NUM_THREAD_FULL(info->cs.block_size[0]);
+   info->regs.cs.compute_num_thread_y = S_00B81C_NUM_THREAD_FULL(info->cs.block_size[1]);
+   info->regs.cs.compute_num_thread_z = S_00B81C_NUM_THREAD_FULL(info->cs.block_size[2]);
+}
+
+static void
+radv_precompute_registers(struct radv_device *device, struct radv_shader_binary *binary)
+{
+   const struct radv_shader_info *info = &binary->info;
+   /* Dispatch on the shader stage: compute and task share the hw_cs helper, other stages are left untouched. */
+   switch (info->stage) {
+   case MESA_SHADER_COMPUTE:
+   case MESA_SHADER_TASK:
+      radv_precompute_registers_hw_cs(device, binary);
+      break;
+   default:
+      break;
+   }
+}
+
 static bool
 radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_binary *binary,
                               const struct radv_shader_args *args)
@ -1767,6 +1794,9 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
      config->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt);
   }

+   /* Precompute register values for faster emission. */
+   radv_precompute_registers(device, binary);
+
   return true;
 }

--- a/src/amd/vulkan/radv_shader_info.h
+++ b/src/amd/vulkan/radv_shader_info.h
@ -248,6 +248,16 @@ struct radv_shader_info {

   struct radv_legacy_gs_info gs_ring_info;
   struct gfx10_ngg_info ngg_info;
+
+   /* Precomputed register values. */
+   struct {
+      struct {
+         uint32_t compute_num_thread_x;
+         uint32_t compute_num_thread_y;
+         uint32_t compute_num_thread_z;
+         uint32_t compute_resource_limits;
+      } cs;
+   } regs;
 };

 void radv_nir_shader_info_init(gl_shader_stage stage, gl_shader_stage next_stage, struct radv_shader_info *info);