radeonsi: pack the variable block size in one SGPR, 10 bits per component

A side effect of this is that the compute copy image shader now has
enough free user SGPRs to receive the src image descriptor via user SGPRs,
which removes one s_load_dwordx8 and results in lower wave lifetime.

Previous copy shader:
    s_load_dwordx8
    image_load
    s_load_dwordx8
    s_waitcnt
    image_store

Current copy shader:
    image_load
    s_load_dwordx8
    s_waitcnt
    image_store

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9795>
This commit is contained in:
Marek Olšák 2021-03-21 19:59:37 -04:00 committed by Marge Bot
parent 034c1e4845
commit 95940459be
3 changed files with 17 additions and 9 deletions

View file

@ -132,8 +132,11 @@ static void si_create_compute_state_async(void *job, int thread_index)
program->shader.is_monolithic = true;
/* Variable block sizes need 10 bits (1 + log2(SI_MAX_VARIABLE_THREADS_PER_BLOCK)) per dim.
* We pack them into a single user SGPR.
*/
unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) +
(sel->info.uses_variable_block_size ? 3 : 0) +
(sel->info.uses_variable_block_size ? 1 : 0) +
sel->info.base.cs.user_data_components_amd;
/* Fast path for compute shaders - some descriptors passed via user SGPRs. */
@ -707,7 +710,7 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
unsigned block_size_reg = grid_size_reg +
/* 12 bytes = 3 dwords. */
12 * sel->info.uses_grid_size;
unsigned cs_user_data_reg = block_size_reg + 12 * program->sel.info.uses_variable_block_size;
unsigned cs_user_data_reg = block_size_reg + 4 * program->sel.info.uses_variable_block_size;
radeon_begin(cs);
@ -730,10 +733,8 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
}
if (sel->info.uses_variable_block_size) {
radeon_set_sh_reg_seq(cs, block_size_reg, 3);
radeon_emit(cs, info->block[0]);
radeon_emit(cs, info->block[1]);
radeon_emit(cs, info->block[2]);
radeon_set_sh_reg(cs, block_size_reg,
info->block[0] | (info->block[1] << 10) | (info->block[2] << 20));
}
if (sel->info.base.cs.user_data_components_amd) {

View file

@ -702,7 +702,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
if (shader->selector->info.uses_grid_size)
ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
if (shader->selector->info.uses_variable_block_size)
ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size);
unsigned cs_user_data_dwords =
shader->selector->info.base.cs.user_data_components_amd;

View file

@ -412,8 +412,15 @@ static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
{
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
assert(ctx->shader->selector->info.base.cs.local_size_variable);
return ac_get_arg(&ctx->ac, ctx->block_size);
assert(ctx->shader->selector->info.base.cs.local_size_variable &&
ctx->shader->selector->info.uses_variable_block_size);
LLVMValueRef chan[3] = {
si_unpack_param(ctx, ctx->block_size, 0, 10),
si_unpack_param(ctx, ctx->block_size, 10, 10),
si_unpack_param(ctx, ctx->block_size, 20, 10),
};
return ac_build_gather_values(&ctx->ac, chan, 3);
}
static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)