From 9d7ac70ffb3567dfafaa524aae0891d07c92aba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 16 Nov 2021 21:56:05 -0500 Subject: [PATCH] radeonsi: implement shader culling in GS It already does compaction, so we just need to load vertex positions and cull. This was easier than expected. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/gfx10_shader_ngg.c | 78 ++++++++++++++++++- src/gallium/drivers/radeonsi/si_shader.c | 15 ++-- src/gallium/drivers/radeonsi/si_shader_llvm.c | 2 +- .../drivers/radeonsi/si_shader_llvm_gs.c | 3 +- .../drivers/radeonsi/si_state_draw.cpp | 8 +- .../drivers/radeonsi/si_state_shaders.cpp | 10 ++- 6 files changed, 98 insertions(+), 18 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index dee70769bfc..92f52c2bb16 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -78,7 +78,10 @@ static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, uns { const struct si_shader_info *info = &ctx->shader->selector->info; - if (ctx->stage == MESA_SHADER_VERTEX) { + if (ctx->stage == MESA_SHADER_GEOMETRY) { + *num_vertices = u_vertices_per_prim(info->base.gs.output_primitive); + return LLVMConstInt(ctx->ac.i32, *num_vertices, false); + } else if (ctx->stage == MESA_SHADER_VERTEX) { if (info->base.vs.blit_sgprs_amd) { /* Blits always use axis-aligned rectangles with 3 vertices. */ *num_vertices = 3; @@ -1954,6 +1957,79 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) ac_build_endif(&ctx->ac, 5109); } + /* Cull primitives. 
*/ + if (ctx->shader->key.ge.opt.ngg_culling) { + assert(info->num_stream_output_components[0]); + + LLVMValueRef gs_vtxptr = ngg_gs_vertex_ptr(ctx, tid); + LLVMValueRef live = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, gs_vtxptr, 0), ""); + live = LLVMBuildTrunc(builder, live, ctx->ac.i1, ""); + LLVMValueRef is_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + LLVMValueRef prim_enable = LLVMBuildAnd(builder, live, is_emit, ""); + + /* Wait for streamout to finish before we kill primitives. */ + if (sel->so.num_outputs) + ac_build_s_barrier(&ctx->ac); + + ac_build_ifcc(&ctx->ac, prim_enable, 0); + { + LLVMValueRef vtxptr[3] = {}; + LLVMValueRef pos[3][4] = {}; + + for (unsigned i = 0; i < verts_per_prim; i++) { + tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); + vtxptr[i] = ac_build_gep0(&ctx->ac, ngg_gs_vertex_ptr(ctx, tmp), ctx->ac.i32_0); + } + + for (unsigned i = 0; i < info->num_outputs; i++) { + /* If the stream index is non-zero for all channels, skip the output. */ + if (info->output_streams[i] & 0x3 && + (info->output_streams[i] >> 2) & 0x3 && + (info->output_streams[i] >> 4) & 0x3 && + (info->output_streams[i] >> 6) & 0x3) + continue; + + switch (info->output_semantic[i]) { + case VARYING_SLOT_POS: + /* Load the positions from LDS. */ + for (unsigned vert = 0; vert < verts_per_prim; vert++) { + for (unsigned comp = 0; comp < 4; comp++) { + /* Z is not needed. */ + if (comp == 2) + continue; + + tmp = ac_build_gep0(&ctx->ac, vtxptr[vert], + LLVMConstInt(ctx->ac.i32, 4 * i + comp, false)); + pos[vert][comp] = LLVMBuildLoad(builder, tmp, ""); + pos[vert][comp] = ac_to_float(&ctx->ac, pos[vert][comp]); + } + } + + /* Divide XY by W. 
*/ + for (unsigned vert = 0; vert < verts_per_prim; vert++) { + for (unsigned comp = 0; comp < 2; comp++) + pos[vert][comp] = ac_build_fdiv(&ctx->ac, pos[vert][comp], pos[vert][3]); + } + break; + } + } + + LLVMValueRef clipdist_accepted = ctx->ac.i1true; /* TODO */ + LLVMValueRef accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + + cull_primitive(ctx, pos, clipdist_accepted, accepted, NULL); + + accepted = LLVMBuildLoad(builder, accepted, ""); + LLVMValueRef rejected = LLVMBuildNot(builder, LLVMBuildTrunc(builder, accepted, ctx->ac.i1, ""), ""); + + ac_build_ifcc(&ctx->ac, rejected, 0); + LLVMBuildStore(builder, ctx->ac.i8_0, ngg_gs_get_emit_primflag_ptr(ctx, gs_vtxptr, 0)); + ac_build_endif(&ctx->ac, 0); + } + ac_build_endif(&ctx->ac, 0); + ac_build_s_barrier(&ctx->ac); + } + /* Determine vertex liveness. */ LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive"); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 4c53477e92b..df108dca8cc 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -546,10 +546,14 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + } else { + /* GS */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ } - if (ctx->stage != MESA_SHADER_GEOMETRY) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->small_prim_cull_info); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->small_prim_cull_info); if (ctx->stage == MESA_SHADER_VERTEX) 
declare_vb_descriptor_input_sgprs(ctx); @@ -583,10 +587,8 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; } - } else if (ctx->stage == MESA_SHADER_TESS_EVAL && ngg_cull_shader) { - num_user_sgprs = GFX9_GS_NUM_USER_SGPR; } else { - num_user_sgprs = SI_NUM_VS_STATE_RESOURCE_SGPRS; + num_user_sgprs = GFX9_GS_NUM_USER_SGPR; } /* The NGG cull shader has to return all 9 VGPRs. @@ -1264,8 +1266,7 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs); fprintf(f, " opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize); fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances); - if (stage != MESA_SHADER_GEOMETRY) - fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling); + fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling); } if (stage <= MESA_SHADER_GEOMETRY) { diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 80659a352ad..4e7a8a49431 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -1093,7 +1093,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); LLVMValueRef ngg_cull_main_fn = NULL; - if (sel->info.stage <= MESA_SHADER_GEOMETRY && shader->key.ge.opt.ngg_culling) { + if (sel->info.stage <= MESA_SHADER_TESS_EVAL && shader->key.ge.opt.ngg_culling) { if (!si_llvm_translate_nir(&ctx, shader, nir, false, true)) { si_llvm_dispose(&ctx); return false; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 0a9f503ddb4..3ca42259116 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ 
b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -116,9 +116,10 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); if (ctx->screen->use_ngg) { ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); + ret = si_insert_input_ptr(ctx, ret, ctx->small_prim_cull_info, 8 + GFX9_SGPR_SMALL_PRIM_CULL_INFO); } - unsigned vgpr = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS; + unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR; ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[0], vgpr++); ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[1], vgpr++); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 2add72f5623..2a8a9f1b1f0 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -2267,10 +2267,10 @@ static void si_draw(struct pipe_context *ctx, if (GFX_VERSION >= GFX10) { struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso; - if (NGG && !HAS_GS && - /* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type - * is not points, so this check is only needed without tessellation. */ - (HAS_TESS || util_rast_prim_is_lines_or_triangles(sctx->current_rast_prim)) && + if (NGG && + /* Tessellation and GS leave ngg_cull_vert_threshold at UINT_MAX if the prim type + * is points, so this check is only needed for VS. */ + (HAS_TESS || HAS_GS || util_rast_prim_is_lines_or_triangles(sctx->current_rast_prim)) && /* Only the first draw for a shader starts with culling disabled and it's disabled * until we pass the total_direct_count check and then it stays enabled until * the shader is changed. This eliminates most culling on/off state changes.
*/ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index d005a44a6e7..81be1e07bd6 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -3017,11 +3017,12 @@ static void *si_create_shader_selector(struct pipe_context *ctx, bool ngg_culling_allowed = sscreen->info.chip_class >= GFX10 && sscreen->use_ngg_culling && - (sel->info.stage == MESA_SHADER_VERTEX || - sel->info.stage == MESA_SHADER_TESS_EVAL) && sel->info.writes_position && !sel->info.writes_viewport_index && /* cull only against viewport 0 */ - !sel->info.base.writes_memory && !sel->so.num_outputs && + !sel->info.base.writes_memory && + /* NGG GS supports culling with streamout because it culls after streamout. */ + (sel->info.stage == MESA_SHADER_GEOMETRY || !sel->so.num_outputs) && + (sel->info.stage != MESA_SHADER_GEOMETRY || sel->info.num_stream_output_components[0]) && (sel->info.stage != MESA_SHADER_VERTEX || (!sel->info.base.vs.blit_sgprs_amd && !sel->info.base.vs.window_space_position)); @@ -3034,7 +3035,8 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->ngg_cull_vert_threshold = 0; /* always enabled */ else sel->ngg_cull_vert_threshold = 128; - } else if (sel->info.stage == MESA_SHADER_TESS_EVAL) { + } else if (sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) { if (sel->rast_prim != PIPE_PRIM_POINTS) sel->ngg_cull_vert_threshold = 0; /* always enabled */ }