From 95940459be65b0a106a0b705e31607c71a3893bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 21 Mar 2021 19:59:37 -0400 Subject: [PATCH] radeonsi: pack the variable block size in one SGPR, 10 bits per component The side effect of this is that the compute copy image shader now has enough free user SGPRs that it passes the src image via user SGPRs, resulting in lower wave lifetime. Previous copy shader: s_load_dwordx8 image_load s_load_dwordx8 s_waitcnt image_store Current copy shader: image_load s_load_dwordx8 s_waitcnt image_store Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_compute.c | 13 +++++++------ src/gallium/drivers/radeonsi/si_shader.c | 2 +- src/gallium/drivers/radeonsi/si_shader_llvm.c | 11 +++++++++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 00e6a98ff76..b2e552c3a31 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -132,8 +132,11 @@ static void si_create_compute_state_async(void *job, int thread_index) program->shader.is_monolithic = true; + /* Variable block sizes need 10 bits (1 + log2(SI_MAX_VARIABLE_THREADS_PER_BLOCK)) per dim. + * We pack them into a single user SGPR. + */ unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) + - (sel->info.uses_variable_block_size ? 3 : 0) + + (sel->info.uses_variable_block_size ? 1 : 0) + sel->info.base.cs.user_data_components_amd; /* Fast path for compute shaders - some descriptors passed via user SGPRs. */ @@ -707,7 +710,7 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr unsigned block_size_reg = grid_size_reg + /* 12 bytes = 3 dwords. */ 12 * sel->info.uses_grid_size; - unsigned cs_user_data_reg = block_size_reg + 12 * program->sel.info.uses_variable_block_size; + unsigned cs_user_data_reg = block_size_reg + 4 * program->sel.info.uses_variable_block_size; radeon_begin(cs); @@ -730,10 +733,8 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr } if (sel->info.uses_variable_block_size) { - radeon_set_sh_reg_seq(cs, block_size_reg, 3); - radeon_emit(cs, info->block[0]); - radeon_emit(cs, info->block[1]); - radeon_emit(cs, info->block[2]); + radeon_set_sh_reg(cs, block_size_reg, + info->block[0] | (info->block[1] << 10) | (info->block[2] << 20)); } if (sel->info.base.cs.user_data_components_amd) { diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index b1691cf6a4d..65c574a60fb 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -702,7 +702,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) if (shader->selector->info.uses_grid_size) ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups); if (shader->selector->info.uses_variable_block_size) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size); unsigned cs_user_data_dwords = shader->selector->info.base.cs.user_data_components_amd; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 0729d95a67e..74ae2889d2b 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -412,8 +412,15 @@ static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); - assert(ctx->shader->selector->info.base.cs.local_size_variable); - return ac_get_arg(&ctx->ac, ctx->block_size); + assert(ctx->shader->selector->info.base.cs.local_size_variable && + ctx->shader->selector->info.uses_variable_block_size); + + LLVMValueRef chan[3] = { + si_unpack_param(ctx, ctx->block_size, 0, 10), + si_unpack_param(ctx, ctx->block_size, 10, 10), + si_unpack_param(ctx, ctx->block_size, 20, 10), + }; + return ac_build_gather_values(&ctx->ac, chan, 3); } static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)