ac: lower load_num_workgroups in NIR

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39638>
This commit is contained in:
Marek Olšák 2026-01-31 15:55:40 -05:00 committed by Marge Bot
parent 1e11e83d1c
commit d1e6a5c1c8
17 changed files with 12 additions and 45 deletions

View file

@ -96,6 +96,7 @@ typedef struct {
unsigned wave_size;
unsigned workgroup_size;
bool use_llvm;
bool load_grid_size_from_user_sgpr;
} ac_nir_lower_intrinsics_to_args_options;
bool ac_nir_lower_intrinsics_to_args(nir_shader *shader, const struct ac_shader_args *ac_args,

View file

@ -184,6 +184,14 @@ lower_intrinsic_to_arg(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
UNREACHABLE("unexpected shader stage");
}
break;
case nir_intrinsic_load_num_workgroups:
if (s->options->load_grid_size_from_user_sgpr) {
replacement = ac_nir_load_arg(b, s->args, s->args->num_work_groups);
} else {
nir_def *addr = nir_pack_64_2x32(b, ac_nir_load_arg(b, s->args, s->args->num_work_groups));
replacement = ac_nir_load_smem(b, 3, addr, nir_imm_int(b, 0), 4, ACCESS_CAN_SPECULATE);
}
break;
case nir_intrinsic_load_pixel_coord:
replacement = nir_unpack_32_2x16(b, ac_nir_load_arg(b, s->args, s->args->pos_fixed_pt));
break;

View file

@ -148,7 +148,6 @@ struct aco_compiler_options {
bool record_ir;
bool record_stats;
bool has_ls_vgpr_init_bug;
bool load_grid_size_from_user_sgpr;
bool optimisations_disabled;
uint8_t enable_mrt_output_nan_fixup;
bool wgp_mode;

View file

@ -553,7 +553,6 @@ init_context(isel_context* ctx, nir_shader* shader)
RegType type = RegType::sgpr;
switch (intrinsic->intrinsic) {
case nir_intrinsic_load_push_constant:
case nir_intrinsic_load_num_workgroups:
case nir_intrinsic_load_num_subgroups:
case nir_intrinsic_vote_all:
case nir_intrinsic_vote_any:

View file

@ -4012,20 +4012,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
case nir_intrinsic_load_num_workgroups: {
Temp dst = get_ssa_temp(ctx, &instr->def);
if (ctx->options->load_grid_size_from_user_sgpr) {
bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
} else {
Temp addr = get_arg(ctx, ctx->args->num_work_groups);
assert(addr.regClass() == s2);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
}
emit_split_vector(ctx, dst, 3);
break;
}
case nir_intrinsic_ddx:
case nir_intrinsic_ddy:
case nir_intrinsic_ddx_fine:

View file

@ -838,12 +838,6 @@ static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMTypeRe
return result;
}
/* Thin wrapper over ac_build_load_custom that loads ptr[index] with a fixed
 * flag combination (false, true, false).
 * NOTE(review): the middle flag presumably marks the load "invariant" (per
 * this function's name), letting LLVM hoist/CSE it; the exact meaning of all
 * three booleans must be confirmed against ac_build_load_custom's parameter
 * list, which is not visible in this hunk. */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
LLVMValueRef index)
{
return ac_build_load_custom(ctx, ptr.t, ptr.v, index, false, true, false);
}
/* This assumes that there is no unsigned integer wraparound during the address
* computation, excluding all GEPs within base_ptr. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,

View file

@ -222,8 +222,6 @@ LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, struct ac_llvm_pointer p
void ac_build_indexed_store(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index,
LLVMValueRef value);
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
LLVMValueRef index);
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
LLVMValueRef index);

View file

@ -2669,17 +2669,6 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
case nir_intrinsic_is_helper_invocation:
result = ac_build_load_helper_invocation(&ctx->ac);
break;
case nir_intrinsic_load_num_workgroups:
if (ctx->abi->load_grid_size_from_user_sgpr) {
result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
} else {
struct ac_llvm_pointer ptr;
ptr.pointee_type = ctx->ac.v3i32;
ptr.value = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
result = ac_build_load_invariant(&ctx->ac, ptr, ctx->ac.i32_0);
}
break;
case nir_intrinsic_load_subgroup_id:
assert(mesa_shader_stage_is_compute(ctx->stage) && ctx->ac.gfx_level >= GFX12);
result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.wave.id", ctx->ac.i32, NULL, 0, 0);

View file

@ -63,9 +63,6 @@ struct ac_shader_abi {
/* Clamp div by 0 (so it won't produce NaN) */
bool clamp_div_by_zero;
/* Whether to inline the compute dispatch size in user sgprs. */
bool load_grid_size_from_user_sgpr;
/* Whether to disable anisotropic filtering. */
bool disable_aniso_single_level;
};

View file

@ -119,7 +119,6 @@ radv_aco_convert_opts(struct aco_compiler_options *aco_info, const struct radv_n
ASSIGN_FIELD(debug.private_data);
aco_info->cu_info = &radv->info->cu_info;
aco_info->is_opengl = false;
aco_info->load_grid_size_from_user_sgpr = radv_args->load_grid_size_from_user_sgpr;
aco_info->optimisations_disabled = stage_key->optimisations_disabled;
aco_info->gfx_level = radv->info->gfx_level;
aco_info->family = radv->info->family;

View file

@ -240,7 +240,6 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, const struct radv_nir
ctx.abi.load_sampler_desc = radv_get_sampler_desc;
ctx.abi.clamp_shadow_reference = false;
ctx.abi.robust_buffer_access = options->robust_buffer_access_llvm;
ctx.abi.load_grid_size_from_user_sgpr = args->load_grid_size_from_user_sgpr;
bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && info->is_ngg;
if (shader_count >= 2 || is_ngg)

View file

@ -497,6 +497,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
.wave_size = stage->info.wave_size,
.workgroup_size = stage->info.workgroup_size,
.use_llvm = radv_use_llvm_for_stage(pdev, stage->stage),
.load_grid_size_from_user_sgpr = device->load_grid_size_from_user_sgpr,
});
NIR_PASS(_, stage->nir, radv_nir_lower_abi, gfx_level, stage, gfx_state, pdev->info.address32_hi);

View file

@ -312,7 +312,6 @@ radv_init_shader_args(const struct radv_device *device, mesa_shader_stage stage,
args->explicit_scratch_args = !radv_use_llvm_for_stage(pdev, stage);
args->remap_spi_ps_input = !radv_use_llvm_for_stage(pdev, stage);
args->load_grid_size_from_user_sgpr = device->load_grid_size_from_user_sgpr;
for (int i = 0; i < MAX_SETS; i++)
args->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
@ -543,7 +542,7 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics
declare_global_input_sgprs(gfx_level, info, user_sgpr_info, args);
if (info->cs.uses_grid_size) {
if (args->load_grid_size_from_user_sgpr)
if (device->load_grid_size_from_user_sgpr)
add_ud_arg(args, 3, AC_ARG_VALUE, &args->ac.num_work_groups, AC_UD_CS_GRID_SIZE);
else
add_ud_arg(args, 2, AC_ARG_CONST_ADDR, &args->ac.num_work_groups, AC_UD_CS_GRID_SIZE);

View file

@ -121,7 +121,6 @@ struct radv_shader_args {
bool explicit_scratch_args;
bool remap_spi_ps_input;
bool load_grid_size_from_user_sgpr;
};
static inline struct radv_shader_args *

View file

@ -1144,6 +1144,7 @@ static void si_postprocess_nir(struct si_nir_shader_ctx *ctx)
.wave_size = shader->wave_size,
.workgroup_size = si_get_max_workgroup_size(shader),
.use_llvm = !nir->info.use_aco_amd,
.load_grid_size_from_user_sgpr = true,
});
/* LLVM keep non-uniform sampler as index, so can't do this in NIR.

View file

@ -53,7 +53,6 @@ si_fill_aco_options(struct si_screen *screen, mesa_shader_stage stage,
options->is_opengl = true;
options->has_ls_vgpr_init_bug = screen->info.has_ls_vgpr_init_bug;
options->load_grid_size_from_user_sgpr = true;
options->family = screen->info.family;
options->gfx_level = screen->info.gfx_level;
options->address32_hi = screen->info.address32_hi;

View file

@ -544,7 +544,6 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
ctx->abi.clamp_shadow_reference = true;
ctx->abi.robust_buffer_access = true;
ctx->abi.load_grid_size_from_user_sgpr = true;
ctx->abi.clamp_div_by_zero = ctx->screen->options.clamp_div_by_zero ||
info->options & SI_PROFILE_CLAMP_DIV_BY_ZERO;
ctx->abi.disable_aniso_single_level = true;