diff --git a/.pick_status.json b/.pick_status.json index bff822a9837..3a09a08b6f2 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -211,7 +211,7 @@ "description": "ac: add a bug workaround for the 100% NGG culling case", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "master_sha": null, "because_sha": "8db00a51f85109e958631ef74a458b0614f37097" }, diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 2531c5740e0..a131c2a10cd 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -4784,6 +4784,21 @@ void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wav { LLVMBuilderRef builder = ctx->builder; LLVMValueRef tmp; + bool export_dummy_prim = false; + + /* HW workaround for a GPU hang with 100% culling. + * We always have to export at least 1 primitive. + * Export a degenerate triangle using vertex 0 for all 3 vertices. + */ + if (prim_cnt == ctx->i32_0 && + (ctx->family == CHIP_NAVI10 || + ctx->family == CHIP_NAVI12 || + ctx->family == CHIP_NAVI14)) { + assert(vtx_cnt == ctx->i32_0); + prim_cnt = ctx->i32_1; + vtx_cnt = ctx->i32_1; + export_dummy_prim = true; + } ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020); @@ -4791,6 +4806,24 @@ void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wav tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp); + if (export_dummy_prim) { + struct ac_ngg_prim prim = {}; + /* The vertex indices are 0,0,0. */ + prim.passthrough = ctx->i32_0; + + struct ac_export_args pos = {}; + pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0; + pos.target = V_008DFC_SQ_EXP_POS; + pos.enabled_channels = 0xf; + pos.done = true; + + ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), + ctx->i32_0, ""), 5021); + ac_build_export_prim(ctx, &prim); + ac_build_export(ctx, &pos); + ac_build_endif(ctx, 5021); + } + ac_build_endif(ctx, 5020); }