diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index 00a3346928f..d336f3a506b 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -3138,6 +3138,22 @@ void ac_build_else(struct ac_llvm_context *ctx, int label_id)
    current_branch->next_block = endif_block;
 }
 
+/* Invoked after a branch is exited. */
+static void ac_branch_exited(struct ac_llvm_context *ctx)
+{
+   if (ctx->flow->depth == 0 && ctx->conditional_demote_seen) {
+      /* The previous conditional branch contained demote. Kill threads
+       * after all conditional blocks because amdgcn.wqm.vote doesn't
+       * return usable values inside the blocks.
+       *
+       * This is an optional optimization that only kills whole inactive quads.
+       */
+      LLVMValueRef cond = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
+      ac_build_kill_if_false(ctx, ac_build_wqm_vote(ctx, cond));
+      ctx->conditional_demote_seen = false;
+   }
+}
+
 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
 {
    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
@@ -3149,6 +3165,7 @@ void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
    set_basicblock_name(current_branch->next_block, "endif", label_id);
 
    ctx->flow->depth--;
+   ac_branch_exited(ctx);
 }
 
 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
@@ -3162,6 +3179,7 @@ void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
    LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
    set_basicblock_name(current_loop->next_block, "endloop", label_id);
    ctx->flow->depth--;
+   ac_branch_exited(ctx);
 }
 
 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h
index 72b349f7717..32da9ec9733 100644
--- a/src/amd/llvm/ac_llvm_build.h
+++ b/src/amd/llvm/ac_llvm_build.h
@@ -118,6 +118,7 @@ struct ac_llvm_context {
     * False = demoted lanes
     */
    LLVMValueRef postponed_kill;
+   bool conditional_demote_seen;
 
    /* Since ac_nir_translate makes a local copy of ac_llvm_context, there
     * are two ac_llvm_contexts. Declare a pointer here, so that the control
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index fcab2570414..d9329cde2b8 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -2813,13 +2813,37 @@ static void emit_demote(struct ac_nir_context *ctx, const nir_intrinsic_instr *i
       cond = ctx->ac.i1false;
    }
 
-   /* Kill immediately while maintaining WQM. */
-   ac_build_kill_if_false(&ctx->ac, ac_build_wqm_vote(&ctx->ac, cond));
-
    LLVMValueRef mask = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
    mask = LLVMBuildAnd(ctx->ac.builder, mask, cond, "");
    LLVMBuildStore(ctx->ac.builder, mask, ctx->ac.postponed_kill);
-   return;
+
+   if (!ctx->info->fs.needs_all_helper_invocations) {
+      /* This is an optional optimization that only kills whole inactive quads.
+       * It's not used when subgroup operations can possibly use all helper
+       * invocations.
+       */
+      if (ctx->ac.flow->depth == 0) {
+         ac_build_kill_if_false(&ctx->ac, ac_build_wqm_vote(&ctx->ac, cond));
+      } else {
+         /* amdgcn.wqm.vote doesn't work inside conditional blocks. Here's why.
+          *
+          * The problem is that kill(wqm.vote(0)) kills all active threads within
+          * the block, which breaks the whole quad mode outside the block if
+          * the conditional block has partially active quads (2x2 pixel blocks).
+          * E.g. threads 0-3 are active outside the block, but only thread 0 is
+          * active inside the block. Thread 0 shouldn't be killed by demote,
+          * because threads 1-3 are still active outside the block.
+          *
+          * The fix for amdgcn.wqm.vote would be to return S_WQM((live & ~exec) | cond)
+          * instead of S_WQM(cond).
+          *
+          * The less efficient workaround we do here is to save the kill condition
+          * to a temporary (postponed_kill) and do kill(wqm.vote(cond)) after we
+          * exit the conditional block.
+          */
+         ctx->ac.conditional_demote_seen = true;
+      }
+   }
 }
 
 static LLVMValueRef visit_load_local_invocation_index(struct ac_nir_context *ctx)