mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 16:08:04 +02:00
radeonsi/gfx11: rework GDS streamout code to single-lane and enable streamout
GDS is basically scalar in gfx11. This is not exactly how it's supposed to be done (we should be using the GDS_STRMOUT registers), but it works.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16990>
This commit is contained in:
parent
44e4d42c23
commit
e24354c1b2
2 changed files with 93 additions and 18 deletions
|
|
@ -369,18 +369,79 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout
|
|||
tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
|
||||
tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
|
||||
|
||||
LLVMValueRef args[] = {
|
||||
LLVMValueRef args[8] = {
|
||||
LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
|
||||
tmp,
|
||||
ctx->ac.i32_0, // ordering
|
||||
ctx->ac.i32_0, // scope
|
||||
ctx->ac.i1false, // isVolatile
|
||||
LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
|
||||
ctx->ac.i1true, // wave release
|
||||
ctx->ac.i1true, // wave done
|
||||
ctx->ac.i32_0, /* value to add */
|
||||
ctx->ac.i32_0, /* ordering */
|
||||
ctx->ac.i32_0, /* scope */
|
||||
ctx->ac.i1false, /* isVolatile */
|
||||
LLVMConstInt(ctx->ac.i32, 1 << 24, false), /* OA index, bits 24+: lane count */
|
||||
ctx->ac.i1true, /* wave release */
|
||||
ctx->ac.i1true, /* wave done */
|
||||
};
|
||||
tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
|
||||
ARRAY_SIZE(args), 0);
|
||||
|
||||
if (ctx->screen->info.gfx_level >= GFX11) {
|
||||
/* Gfx11 GDS instructions only operate on the first active lane. All other lanes are
|
||||
* ignored. So are their EXEC bits. This uses the mutex feature of ds_ordered_count
|
||||
* to emulate a multi-dword atomic.
|
||||
*
|
||||
* This is the expected code:
|
||||
* ds_ordered_count release=0 done=0 // lock mutex
|
||||
* ds_add_rtn_u32 dwords_written0
|
||||
* ds_add_rtn_u32 dwords_written1
|
||||
* ds_add_rtn_u32 dwords_written2
|
||||
* ds_add_rtn_u32 dwords_written3
|
||||
* ds_ordered_count release=1 done=1 // unlock mutex
|
||||
*
|
||||
* TODO: Increment GDS_STRMOUT registers instead of GDS memory.
|
||||
*/
|
||||
LLVMValueRef dwords_written[4] = {tmp, tmp, tmp, tmp};
|
||||
|
||||
/* Move all 4 VGPRs from other lanes to lane 0. */
|
||||
for (unsigned i = 1; i < 4; i++) {
|
||||
if (ctx->shader->selector->info.base.xfb_stride[i])
|
||||
dwords_written[i] = ac_build_quad_swizzle(&ctx->ac, tmp, i, i, i, i);
|
||||
}
|
||||
|
||||
/* Set release=0 to start a GDS mutex. Set done=0 because it's not the last one. */
|
||||
args[6] = args[7] = ctx->ac.i1false;
|
||||
ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
|
||||
args, ARRAY_SIZE(args), 0);
|
||||
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
|
||||
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
if (ctx->shader->selector->info.base.xfb_stride[i]) {
|
||||
LLVMValueRef gds_ptr =
|
||||
ac_build_gep_ptr(&ctx->ac, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
|
||||
|
||||
dwords_written[i] = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
|
||||
gds_ptr, dwords_written[i],
|
||||
LLVMAtomicOrderingMonotonic, false);
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: This might not be needed if GDS executes instructions in order. */
|
||||
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
|
||||
|
||||
/* Set release=1 to end a GDS mutex. Set done=1 because it's the last one. */
|
||||
args[6] = args[7] = ctx->ac.i1true;
|
||||
ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
|
||||
args, ARRAY_SIZE(args), 0);
|
||||
|
||||
tmp = dwords_written[0];
|
||||
for (unsigned i = 1; i < 4; i++) {
|
||||
if (ctx->shader->selector->info.base.xfb_stride[i]) {
|
||||
dwords_written[i] = ac_build_readlane(&ctx->ac, dwords_written[i], ctx->ac.i32_0);
|
||||
tmp = ac_build_writelane(&ctx->ac, tmp, dwords_written[i], LLVMConstInt(ctx->ac.i32, i, 0));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
args[1] = tmp; /* value to add */
|
||||
args[5] = LLVMConstInt(ctx->ac.i32, 4 << 24, false), /* bits 24+: lane count */
|
||||
|
||||
tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
|
||||
args, ARRAY_SIZE(args), 0);
|
||||
}
|
||||
|
||||
/* Keep offsets in a VGPR for quick retrieval via readlane by
|
||||
* the first wave for bounds checking, and also store in LDS
|
||||
|
|
@ -451,9 +512,28 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout
|
|||
{
|
||||
tmp = LLVMBuildSub(builder, generated, emit, "");
|
||||
tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
|
||||
tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
|
||||
LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
|
||||
LLVMAtomicOrderingMonotonic, false);
|
||||
|
||||
if (ctx->screen->info.gfx_level >= GFX11) {
|
||||
/* Gfx11 GDS instructions only operate on the first active lane.
|
||||
* This is an unrolled waterfall loop. We only get here when we overflow,
|
||||
* so it doesn't have to be fast.
|
||||
*/
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
if (bufmask_for_stream[stream] & BITFIELD_BIT(i)) {
|
||||
LLVMValueRef index = LLVMConstInt(ctx->ac.i32, i, 0);
|
||||
|
||||
ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, index, ""), 0);
|
||||
LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub,
|
||||
LLVMBuildGEP(builder, gdsbase, &index, 1, ""),
|
||||
tmp, LLVMAtomicOrderingMonotonic, false);
|
||||
ac_build_endif(&ctx->ac, 0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub,
|
||||
LLVMBuildGEP(builder, gdsbase, &tid, 1, ""),
|
||||
tmp, LLVMAtomicOrderingMonotonic, false);
|
||||
}
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 5222);
|
||||
ac_build_endif(&ctx->ac, 5221);
|
||||
|
|
|
|||
|
|
@ -3158,11 +3158,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
|
|||
if (!sel)
|
||||
return NULL;
|
||||
|
||||
if (sscreen->info.gfx_level == GFX11 && state->stream_output.num_outputs) {
|
||||
fprintf(stderr, "radeonsi: streamout unimplemented\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
sel->screen = sscreen;
|
||||
sel->compiler_ctx_state.debug = sctx->debug;
|
||||
sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue