ac/llvm: use new s_wait instructions and split the existing ones for gfx12

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29007>
This commit is contained in:
Marek Olšák 2023-04-27 03:34:01 -04:00
parent 12bca6123a
commit a6c46509cc
5 changed files with 66 additions and 46 deletions

View file

@@ -2066,44 +2066,60 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
if (!wait_flags)
return;
unsigned expcnt = 7;
unsigned lgkmcnt = 63;
unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
unsigned vscnt = 63;
if (ctx->gfx_level >= GFX12) {
if (wait_flags & AC_WAIT_DS)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.dscnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_KM)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.kmcnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_EXP)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.expcnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_LOAD)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.loadcnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_STORE)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.storecnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_SAMPLE)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.samplecnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_BVH)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.bvhcnt", ctx->voidt, &ctx->i16_0, 1, 0);
} else {
unsigned expcnt = 7;
unsigned lgkmcnt = 63;
unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
unsigned vscnt = 63;
if (wait_flags & AC_WAIT_EXP)
expcnt = 0;
if (wait_flags & AC_WAIT_LGKM)
lgkmcnt = 0;
if (wait_flags & AC_WAIT_VLOAD)
vmcnt = 0;
if (wait_flags & AC_WAIT_VSTORE) {
if (ctx->gfx_level >= GFX10)
vscnt = 0;
else
if (wait_flags & AC_WAIT_EXP)
expcnt = 0;
if (wait_flags & (AC_WAIT_DS | AC_WAIT_KM))
lgkmcnt = 0;
if (wait_flags & (AC_WAIT_LOAD | AC_WAIT_SAMPLE | AC_WAIT_BVH))
vmcnt = 0;
if (wait_flags & AC_WAIT_STORE) {
if (ctx->gfx_level >= GFX10)
vscnt = 0;
else
vmcnt = 0;
}
/* There is no intrinsic for vscnt(0), so use a fence. It waits for everything except expcnt. */
if (vscnt == 0) {
assert(!(wait_flags & AC_WAIT_EXP));
LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
return;
}
unsigned simm16;
if (ctx->gfx_level >= GFX11)
simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
else
simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
LLVMValueRef args[1] = {
LLVMConstInt(ctx->i32, simm16, false),
};
ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
/* There is no intrinsic for vscnt(0), so use a fence. */
if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
vscnt == 0) {
assert(!(wait_flags & AC_WAIT_EXP));
LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
return;
}
unsigned simm16;
if (ctx->gfx_level >= GFX11)
simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
else
simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
LLVMValueRef args[1] = {
LLVMConstInt(ctx->i32, simm16, false),
};
ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,

View file

@@ -30,10 +30,14 @@ enum
AC_ADDR_SPACE_CONST_32BIT = 6, /* same as CONST, but the pointer type has 32 bits */
};
#define AC_WAIT_LGKM (1 << 0) /* LDS, GDS, constant, message */
#define AC_WAIT_VLOAD (1 << 1) /* VMEM load/sample instructions */
#define AC_WAIT_VSTORE (1 << 2) /* VMEM store instructions */
#define AC_WAIT_EXP (1 << 3) /* EXP instructions */
/* Fine-grained wait flags for GFX12, older chips merge them. */
#define AC_WAIT_DS (1 << 0) /* s_wait_dscnt (GFX12) or lgkmcnt (GFX6-11): LDS (all gens), GDS (GFX6-11) */
#define AC_WAIT_KM (1 << 1) /* s_wait_kmcnt (GFX12) or lgkmcnt (GFX6-11): SMEM, message */
#define AC_WAIT_EXP (1 << 2) /* s_wait_expcnt: Exports */
#define AC_WAIT_LOAD (1 << 3) /* s_wait_loadcnt (GFX12) or vmcnt (GFX6-11): VMEM loads */
#define AC_WAIT_STORE (1 << 4) /* s_wait_storecnt (GFX12) or vscnt (GFX10-11) or vmcnt (GFX6-9): VMEM stores */
#define AC_WAIT_SAMPLE (1 << 5) /* s_wait_samplecnt (GFX12) or vmcnt (GFX6-11): VMEM sample/gather */
#define AC_WAIT_BVH (1 << 6) /* s_wait_bvhcnt (GFX12) or vmcnt (GFX6-11): BVH */
struct ac_llvm_flow;
struct ac_llvm_compiler;

View file

@@ -3208,9 +3208,9 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
unsigned wait_flags = 0;
if (modes & (nir_var_mem_global | nir_var_mem_ssbo | nir_var_image))
wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE;
wait_flags |= AC_WAIT_LOAD | AC_WAIT_STORE;
if (modes & nir_var_mem_shared)
wait_flags |= AC_WAIT_LGKM;
wait_flags |= AC_WAIT_DS;
if (wait_flags)
ac_build_waitcnt(&ctx->ac, wait_flags);
@@ -3624,7 +3624,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
/* Set release=0 to start a GDS mutex. Set done=0 because it's not the last one. */
ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
args, ARRAY_SIZE(args), 0);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
LLVMValueRef global_count[4];
LLVMValueRef count_vec = get_src(ctx, instr->src[1]);
@@ -3644,7 +3644,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
}
}
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
/* Set release=1 to end a GDS mutex. Set done=1 because it's the last one. */
args[6] = args[7] = ctx->ac.i1true;

View file

@@ -351,7 +351,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, const struct radv_nir
* and contains a barrier, it will wait there and then
* reach s_endpgm.
*/
ac_build_waitcnt(&ctx.ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx.ac, AC_WAIT_DS);
ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage);
}

View file

@@ -704,7 +704,7 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
if (!shader->key.ge.opt.same_patch_vertices ||
shader->selector->info.base.inputs_read &
~shader->selector->info.tcs_vgpr_only_inputs) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
/* If both input and output patches are wholly in one wave, we don't need a barrier.
* That's true when both VS and TCS have the same number of patch vertices and
@@ -715,7 +715,7 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
} else if (ctx->stage == MESA_SHADER_GEOMETRY) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
}