radv: precompute compute/task shader register values

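This makes emission faster: the register values are computed once when the shader binary is post-processed, instead of being recomputed every time the compute shader is emitted.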

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29014>
This commit is contained in:
Samuel Pitoiset 2024-05-02 09:11:38 +02:00 committed by Marge Bot
parent 0549649bcf
commit 3b41fbd4b8
5 changed files with 50 additions and 10 deletions

View file

@ -1916,12 +1916,11 @@ radv_emit_compute_shader(const struct radv_physical_device *pdev, struct radeon_
radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
}
radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, radv_get_compute_resource_limits(pdev, shader));
radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, shader->info.regs.cs.compute_resource_limits);
radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
radeon_emit(cs, shader->info.regs.cs.compute_num_thread_x);
radeon_emit(cs, shader->info.regs.cs.compute_num_thread_y);
radeon_emit(cs, shader->info.regs.cs.compute_num_thread_z);
}
static void

View file

@ -38,7 +38,7 @@
#include "vk_format.h"
uint32_t
radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader *cs)
radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader_info *info)
{
unsigned threads_per_threadgroup;
unsigned threadgroups_per_cu = 1;
@ -46,8 +46,8 @@ radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const
unsigned max_waves_per_sh = 0;
/* Calculate best compute resource limits. */
threads_per_threadgroup = cs->info.cs.block_size[0] * cs->info.cs.block_size[1] * cs->info.cs.block_size[2];
waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, cs->info.wave_size);
threads_per_threadgroup = info->cs.block_size[0] * info->cs.block_size[1] * info->cs.block_size[2];
waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, info->wave_size);
if (pdev->info.gfx_level >= GFX10 && waves_per_threadgroup == 1)
threadgroups_per_cu = 2;
@ -69,7 +69,7 @@ radv_get_compute_pipeline_metadata(const struct radv_device *device, const struc
metadata->rsrc1 = cs->config.rsrc1;
metadata->rsrc2 = cs->config.rsrc2;
metadata->rsrc3 = cs->config.rsrc3;
metadata->compute_resource_limits = radv_get_compute_resource_limits(pdev, cs);
metadata->compute_resource_limits = radv_get_compute_resource_limits(pdev, &cs->info);
metadata->block_size_x = cs->info.cs.block_size[0];
metadata->block_size_y = cs->info.cs.block_size[1];
metadata->block_size_z = cs->info.cs.block_size[2];

View file

@ -15,6 +15,7 @@
struct radv_physical_device;
struct radv_shader_binary;
struct radv_shader_info;
struct radv_compute_pipeline {
struct radv_pipeline base;
@ -42,7 +43,7 @@ struct radv_compute_pipeline_metadata {
uint64_t inline_push_const_mask;
};
uint32_t radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader *cs);
uint32_t radv_get_compute_resource_limits(const struct radv_physical_device *pdev, const struct radv_shader_info *info);
void radv_get_compute_pipeline_metadata(const struct radv_device *device, const struct radv_compute_pipeline *pipeline,
struct radv_compute_pipeline_metadata *metadata);

View file

@ -1461,6 +1461,33 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader_binar
}
#endif
/* Compute the compute-shader register values once and cache them in
 * binary->info.regs.cs, so that emission can use the stored values directly.
 */
static void
radv_precompute_registers_hw_cs(struct radv_device *device, struct radv_shader_binary *binary)
{
   struct radv_shader_info *shader_info = &binary->info;
   const struct radv_physical_device *physical_device = radv_device_physical(device);

   /* Resource limits are derived from the physical device and the shader info. */
   shader_info->regs.cs.compute_resource_limits =
      radv_get_compute_resource_limits(physical_device, shader_info);

   /* One thread-count value per workgroup dimension (X, Y, Z). */
   shader_info->regs.cs.compute_num_thread_x =
      S_00B81C_NUM_THREAD_FULL(shader_info->cs.block_size[0]);
   shader_info->regs.cs.compute_num_thread_y =
      S_00B81C_NUM_THREAD_FULL(shader_info->cs.block_size[1]);
   shader_info->regs.cs.compute_num_thread_z =
      S_00B81C_NUM_THREAD_FULL(shader_info->cs.block_size[2]);
}
/* Precompute register values for the shader stage recorded in binary->info.
 * Only compute and task shaders are handled; every other stage is a no-op.
 */
static void
radv_precompute_registers(struct radv_device *device, struct radv_shader_binary *binary)
{
   /* Nothing to precompute for stages other than compute and task. */
   if (binary->info.stage != MESA_SHADER_COMPUTE && binary->info.stage != MESA_SHADER_TASK)
      return;

   /* Compute and task shaders share the same precompute helper. */
   radv_precompute_registers_hw_cs(device, binary);
}
static bool
radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_binary *binary,
const struct radv_shader_args *args)
@ -1767,6 +1794,9 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
config->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt);
}
/* Precompute register values for faster emission. */
radv_precompute_registers(device, binary);
return true;
}

View file

@ -248,6 +248,16 @@ struct radv_shader_info {
struct radv_legacy_gs_info gs_ring_info;
struct gfx10_ngg_info ngg_info;
/* Precomputed register values. */
struct {
struct {
uint32_t compute_num_thread_x;
uint32_t compute_num_thread_y;
uint32_t compute_num_thread_z;
uint32_t compute_resource_limits;
} cs;
} regs;
};
void radv_nir_shader_info_init(gl_shader_stage stage, gl_shader_stage next_stage, struct radv_shader_info *info);