From e4882d6b7e2c2d76792b5fa8b2a78f74acc3f1d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 2 May 2022 21:38:07 -0400 Subject: [PATCH] ac/llvm: add gl_shader_stage parameter into ac_build_s_barrier this will be used later Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/llvm/ac_llvm_build.c | 6 ++-- src/amd/llvm/ac_llvm_build.h | 3 +- src/amd/llvm/ac_nir_to_llvm.c | 2 +- src/amd/vulkan/radv_nir_to_llvm.c | 13 ++++---- .../drivers/radeonsi/gfx10_shader_ngg.c | 30 ++++++++++--------- src/gallium/drivers/radeonsi/si_shader_llvm.c | 6 ++-- .../drivers/radeonsi/si_shader_llvm_tess.c | 2 +- 7 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 13c75d2307b..90b372b89bd 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -388,7 +388,7 @@ LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigne return phi; } -void ac_build_s_barrier(struct ac_llvm_context *ctx) +void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage) { ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT); } @@ -4019,7 +4019,7 @@ void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) { ac_build_wg_wavescan_top(ctx, ws); - ac_build_s_barrier(ctx); + ac_build_s_barrier(ctx, ws->stage); ac_build_wg_wavescan_bottom(ctx, ws); } @@ -4081,7 +4081,7 @@ void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) { ac_build_wg_scan_top(ctx, ws); - ac_build_s_barrier(ctx); + ac_build_s_barrier(ctx, ws->stage); ac_build_wg_scan_bottom(ctx, ws); } diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h index 80cf4b5fcfd..94f4e75b30c 100644 --- a/src/amd/llvm/ac_llvm_build.h +++ b/src/amd/llvm/ac_llvm_build.h @@ -175,7 +175,7 @@ void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize); LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming, LLVMValueRef *values, LLVMBasicBlockRef *blocks); -void ac_build_s_barrier(struct ac_llvm_context *ctx); +void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage); void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr); LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope); @@ -502,6 +502,7 @@ LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_ * values across an entire workgroup, while respecting the order of waves. */ struct ac_wg_scan { + gl_shader_stage stage; bool enable_reduce; bool enable_exclusive; bool enable_inclusive; diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 22884f6d3ce..940a27605f9 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -2959,7 +2959,7 @@ void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage) if (ac->chip_class == GFX6 && stage == MESA_SHADER_TESS_CTRL) return; - ac_build_s_barrier(ac); + ac_build_s_barrier(ac, stage); } static void emit_discard(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 671a9a7ef8f..027ee6ed0da 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -1338,7 +1338,7 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) if (ctx->stage == MESA_SHADER_VERTEX) { /* Wait for GS stores to finish. */ - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx)); values[0] = LLVMBuildLoad(builder, tmp, ""); @@ -1384,7 +1384,7 @@ gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx) LLVMBuildBr(ctx->ac.builder, merge_block); LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); } static void @@ -1459,7 +1459,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp, tmp2; - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); const LLVMValueRef tid = get_thread_id_in_tg(ctx); LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); @@ -1522,6 +1522,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) /* Inclusive scan addition across the current wave. */ LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, ""); struct ac_wg_scan vertlive_scan = {0}; + vertlive_scan.stage = ctx->stage; vertlive_scan.op = nir_op_iadd; vertlive_scan.enable_reduce = true; vertlive_scan.enable_exclusive = true; @@ -1564,7 +1565,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) } ac_build_endif(&ctx->ac, 5130); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); /* Export primitive data */ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); @@ -2076,7 +2077,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, /* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */ if (ctx.ac.chip_class == GFX10 && shader_count == 1) - ac_build_s_barrier(&ctx.ac); + ac_build_s_barrier(&ctx.ac, shaders[0]->info.stage); } for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) { @@ -2149,7 +2150,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, * and contains a barrier, it will wait there and then * reach s_endpgm. */ - ac_emit_barrier(&ctx.ac, ctx.stage); + ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage); } nir_foreach_shader_out_variable(variable, shaders[shader_idx]) scan_shader_output_decl( diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index c7ae46fa402..f66e2593574 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -481,6 +481,7 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout if (!info->num_stream_output_components[stream]) continue; + primemit_scan[stream].stage = ctx->stage; primemit_scan[stream].enable_exclusive = true; primemit_scan[stream].op = nir_op_iadd; primemit_scan[stream].src = nggso->prim_enable[stream]; @@ -499,7 +500,7 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout } } - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */ LLVMValueRef wgoffset_dw[4] = {}; @@ -1022,7 +1023,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) builder, packed_data, ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0))); ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); LLVMValueRef tid = ac_get_thread_id(&ctx->ac); @@ -1141,7 +1142,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) cull_primitive(ctx, pos, clipdist_accepted, gs_accepted, gs_vtxptr); } ac_build_endif(&ctx->ac, 16002); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); gs_accepted = LLVMBuildLoad(builder, gs_accepted, ""); @@ -1171,7 +1172,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) } ac_build_endif(&ctx->ac, 16008); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); /* Load the vertex masks and compute the new ES thread count. */ LLVMValueRef new_num_es_threads, prefix_sum, kill_wave; @@ -1262,7 +1263,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) ac_build_s_endpgm(&ctx->ac); } ac_build_endif(&ctx->ac, 19202); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); /* Send the final vertex and primitive counts. */ ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads, @@ -1408,7 +1409,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) /* These two also use LDS. */ if (gfx10_ngg_writes_user_edgeflags(shader) || (ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)) - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); ctx->return_value = ret; } @@ -1512,7 +1513,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi) /* Streamout already inserted the barrier, so don't insert it again. */ if (!ctx->so.num_outputs) - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); /* Load edge flags from ES threads and store them into VGPRs in GS threads. */ @@ -1536,7 +1537,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi) /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */ if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader)) - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); /* Extract the PROVOKING_VTX_INDEX field. */ @@ -1630,7 +1631,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi) if (ctx->stage == MESA_SHADER_VERTEX) { /* Wait for GS stores to finish. */ - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); tmp = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx)); tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); @@ -1861,7 +1862,7 @@ void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx) } ac_build_endif(&ctx->ac, 15090); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); } void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) @@ -1925,7 +1926,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); const LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx); LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); @@ -2003,7 +2004,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) /* Wait for streamout to finish before we kill primitives. */ if (ctx->so.num_outputs) - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); ac_build_ifcc(&ctx->ac, prim_enable, 0); { @@ -2061,7 +2062,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) ac_build_endif(&ctx->ac, 0); } ac_build_endif(&ctx->ac, 0); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); } /* Determine vertex liveness. */ @@ -2096,6 +2097,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) /* Inclusive scan addition across the current wave. */ LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, ""); struct ac_wg_scan vertlive_scan = {}; + vertlive_scan.stage = ctx->stage; vertlive_scan.op = nir_op_iadd; vertlive_scan.enable_reduce = true; vertlive_scan.enable_exclusive = true; @@ -2129,7 +2131,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) } ac_build_endif(&ctx->ac, 5130); - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); /* Export primitive data */ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 3e36a7ddbd9..883a7372d3c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -957,7 +957,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling) { /* GFX10 requires a barrier before gs_alloc_req due to a hw bug. */ if (ctx->screen->info.chip_class == GFX10) - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); gfx10_ngg_build_sendmsg_gs_alloc_req(ctx); @@ -1016,10 +1016,10 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad if (!shader->key.ge.opt.same_patch_vertices || shader->selector->info.base.inputs_read & ~shader->selector->info.tcs_vgpr_only_inputs) - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */ - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); } } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index d60710f74c6..ac831098a62 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -690,7 +690,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re * a single wave due to a bug workaround disallowing multi-wave HS workgroups. */ if (ctx->screen->info.chip_class != GFX6) - ac_build_s_barrier(&ctx->ac); + ac_build_s_barrier(&ctx->ac, ctx->stage); } /* Do this only for invocation 0, because the tess levels are per-patch,