diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 8ecfd74e24f..c2dc9d4fd6f 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -119,6 +119,37 @@ static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, uns
    }
 }
 
+unsigned gfx10_ngg_get_vertices_per_prim(struct si_shader *shader)
+{
+   const struct si_shader_info *info = &shader->selector->info;
+
+   if (shader->selector->stage == MESA_SHADER_GEOMETRY)
+      return u_vertices_per_prim(info->base.gs.output_primitive);
+   else if (shader->selector->stage == MESA_SHADER_VERTEX) {
+      if (info->base.vs.blit_sgprs_amd) {
+         /* Blits always use axis-aligned rectangles with 3 vertices. */
+         return 3;
+      } else if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES)
+         return 2;
+      else {
+         /* We always build up all three indices for the prim export
+          * independent of the primitive type. The additional garbage
+          * data shouldn't hurt. This is used by exports and streamout.
+          */
+         return 3;
+      }
+   } else {
+      assert(shader->selector->stage == MESA_SHADER_TESS_EVAL);
+
+      if (info->base.tess.point_mode)
+         return 1;
+      else if (info->base.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
+         return 2;
+      else
+         return 3;
+   }
+}
+
 bool gfx10_ngg_export_prim_early(struct si_shader *shader)
 {
    struct si_shader_selector *sel = shader->selector;
@@ -2398,11 +2429,17 @@ static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts
 unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
 {
    const struct si_shader_selector *sel = shader->selector;
+   bool uses_streamout = si_shader_uses_streamout(shader);
 
-   if (sel->stage == MESA_SHADER_GEOMETRY && si_shader_uses_streamout(shader))
-      return 44;
-
-   return 8;
+   if (sel->stage == MESA_SHADER_GEOMETRY) {
+      return uses_streamout ? 44 : 8;
+   } else {
+      return ac_ngg_get_scratch_lds_size(sel->stage,
+                                         si_get_max_workgroup_size(shader),
+                                         shader->wave_size,
+                                         uses_streamout,
+                                         shader->key.ge.opt.ngg_culling) / 4;
+   }
 }
 
 /**
@@ -2469,8 +2506,25 @@ retry_select_mode:
       }
    } else {
       /* VS and TES. */
-      /* LDS size for passing data from ES to GS. */
-      esvert_lds_size = ngg_nogs_vertex_size(shader);
+
+      bool uses_instance_id = gs_sel->info.uses_instanceid;
+      bool uses_primitive_id = gs_sel->info.uses_primid;
+      if (gs_stage == MESA_SHADER_VERTEX) {
+         uses_instance_id |=
+            shader->key.ge.part.vs.prolog.instance_divisor_is_one ||
+            shader->key.ge.part.vs.prolog.instance_divisor_is_fetched;
+      } else {
+         uses_primitive_id |= shader->key.ge.mono.u.vs_export_prim_id;
+      }
+
+      esvert_lds_size = ac_ngg_nogs_get_pervertex_lds_size(
+         gs_stage, gs_sel->info.num_outputs,
+         si_shader_uses_streamout(shader),
+         shader->key.ge.mono.u.vs_export_prim_id,
+         gfx10_ngg_writes_user_edgeflags(shader),
+         shader->key.ge.opt.ngg_culling,
+         uses_instance_id,
+         uses_primitive_id) / 4;
    }
 
    unsigned max_gsprims = max_gsprims_base;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 73dd8d4ee0d..ef864365511 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -227,7 +227,7 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader)
    switch (shader->selector->stage) {
    case MESA_SHADER_VERTEX:
    case MESA_SHADER_TESS_EVAL:
-      return shader->key.ge.as_ngg ? 128 : 0;
+      return shader->key.ge.as_ngg ? shader->selector->screen->ngg_subgroup_size : 0;
 
    case MESA_SHADER_TESS_CTRL:
       /* Return this so that LLVM doesn't remove s_barrier
@@ -397,7 +397,7 @@ void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, u
    ac_add_arg(args, file, registers, type, arg);
 }
 
-void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
+void si_init_shader_args(struct si_shader_context *ctx)
 {
    struct si_shader *shader = ctx->shader;
    unsigned i, num_returns, num_return_sgprs;
@@ -613,36 +613,12 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
            declare_tes_input_vgprs(ctx);
       }
 
-      if ((ctx->shader->key.ge.as_es || ngg_cull_shader) &&
+      if (ctx->shader->key.ge.as_es &&
          (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
-         unsigned num_user_sgprs, num_vgprs;
-
-         if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) {
-            /* For the NGG cull shader, add 1 SGPR to hold
-             * the vertex buffer pointer.
-             */
-            num_user_sgprs = GFX9_GS_NUM_USER_SGPR + 1;
-
-            if (shader->selector->info.num_vbos_in_user_sgprs) {
-               assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
-               num_user_sgprs =
-                  SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->info.num_vbos_in_user_sgprs * 4;
-            }
-         } else {
-            num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
-         }
-
-         /* The NGG cull shader has to return all 9 VGPRs.
-          *
-          * The normal merged ESGS shader only has to return the 5 VGPRs
-          * for the GS stage.
-          */
-         num_vgprs = ngg_cull_shader ? 9 : 5;
-
         /* ES return values are inputs to GS. */
-         for (i = 0; i < 8 + num_user_sgprs; i++)
+         for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
            ac_add_return(&ctx->args, AC_ARG_SGPR);
-         for (i = 0; i < num_vgprs; i++)
+         for (i = 0; i < 5; i++)
            ac_add_return(&ctx->args, AC_ARG_VGPR);
      }
      break;
@@ -1403,17 +1379,13 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
 }
 
 bool si_vs_needs_prolog(const struct si_shader_selector *sel,
-                        const struct si_vs_prolog_bits *prolog_key,
-                        const union si_shader_key *key, bool ngg_cull_shader,
-                        bool is_gs)
+                        const struct si_vs_prolog_bits *prolog_key)
 {
    assert(sel->stage == MESA_SHADER_VERTEX);
 
    /* VGPR initialization fixup for Vega10 and Raven is always done in the
    * VS prolog. */
-   return sel->info.vs_needs_prolog || prolog_key->ls_vgpr_fix ||
-          /* The 2nd VS prolog loads input VGPRs from LDS */
-          (key->ge.opt.ngg_culling && !ngg_cull_shader && !is_gs);
+   return sel->info.vs_needs_prolog || prolog_key->ls_vgpr_fix;
 }
 
 /**
@@ -1422,13 +1394,12 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel,
  *
  * \param info Shader info of the vertex shader.
 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
- * \param has_old_ Whether the preceding shader part is the NGG cull shader.
 * \param prolog_key Key of the VS prolog
 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
 * \param key Output shader part key.
 */
 void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
-                          bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
+                          const struct si_vs_prolog_bits *prolog_key,
                          struct si_shader *shader_out, union si_shader_part_key *key)
 {
    memset(key, 0, sizeof(*key));
@@ -1440,10 +1411,6 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
    key->vs_prolog.as_es = shader_out->key.ge.as_es;
    key->vs_prolog.as_ngg = shader_out->key.ge.as_ngg;
 
-   if (shader_out->selector->stage != MESA_SHADER_GEOMETRY &&
-       !ngg_cull_shader && shader_out->key.ge.opt.ngg_culling)
-      key->vs_prolog.load_vgprs_after_culling = 1;
-
    if (shader_out->selector->stage == MESA_SHADER_TESS_CTRL) {
       key->vs_prolog.as_ls = 1;
       key->vs_prolog.num_merged_next_stage_vgprs = 2;
@@ -1647,6 +1614,68 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
    return false;
 }
 
+static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
+{
+   struct si_shader_selector *sel = shader->selector;
+   const union si_shader_key *key = &shader->key;
+   assert(key->ge.as_ngg);
+
+   ac_nir_lower_ngg_options options = {
+      .family = sel->screen->info.family,
+      .gfx_level = sel->screen->info.gfx_level,
+      .max_workgroup_size = si_get_max_workgroup_size(shader),
+      .wave_size = shader->wave_size,
+      .can_cull = !!key->ge.opt.ngg_culling,
+      .disable_streamout = key->ge.opt.remove_streamout,
+      .vs_output_param_offset = shader->info.vs_output_param_offset,
+   };
+
+   if (nir->info.stage == MESA_SHADER_VERTEX ||
+       nir->info.stage == MESA_SHADER_TESS_EVAL) {
+      /* Per instance inputs, used to remove instance load after culling. */
+      unsigned instance_rate_inputs = 0;
+
+      if (nir->info.stage == MESA_SHADER_VERTEX) {
+         instance_rate_inputs =
+            key->ge.part.vs.prolog.instance_divisor_is_one |
+            key->ge.part.vs.prolog.instance_divisor_is_fetched;
+
+         /* Manually mark the instance ID used, so the shader can repack it. */
+         if (instance_rate_inputs)
+            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
+      } else {
+         /* Manually mark the primitive ID used, so the shader can repack it. */
+         if (key->ge.mono.u.vs_export_prim_id)
+            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
+      }
+
+      unsigned clip_plane_enable =
+         SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
+      unsigned clipdist_mask =
+         (sel->info.clipdist_mask & clip_plane_enable) | sel->info.culldist_mask;
+
+      options.num_vertices_per_primitive = gfx10_ngg_get_vertices_per_prim(shader);
+      options.early_prim_export = gfx10_ngg_export_prim_early(shader);
+      options.passthrough = gfx10_is_ngg_passthrough(shader);
+      options.use_edgeflags = gfx10_edgeflags_have_effect(shader);
+      options.has_gen_prim_query = options.has_xfb_prim_query =
+         sel->screen->use_ngg_streamout && !sel->info.base.vs.blit_sgprs_amd;
+      options.primitive_id_location =
+         key->ge.mono.u.vs_export_prim_id ? sel->info.num_outputs : -1;
+      options.instance_rate_inputs = instance_rate_inputs;
+      options.clipdist_enable_mask = clipdist_mask;
+      options.user_clip_plane_enable_mask = clip_plane_enable;
+
+      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
+   }
+
+   /* may generate some subgroup op like ballot */
+   NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);
+
+   /* may generate some vector output store */
+   NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
+}
+
 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
 {
    struct pipe_screen *screen = &sel->screen->b;
@@ -1878,6 +1907,12 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, bool *free_nir,
    if (is_last_vgt_stage)
       si_assign_param_offsets(nir, shader);
 
+   /* Only lower last VGT NGG shader stage. */
+   if (sel->stage < MESA_SHADER_GEOMETRY && key->ge.as_ngg && !key->ge.as_es) {
+      si_lower_ngg(shader, nir);
+      opt_offsets = true;
+   }
+
    if (progress2 || opt_offsets)
       si_nir_opts(sel->screen, nir, false);
 
@@ -2176,13 +2211,12 @@ static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler
 {
    struct si_shader_selector *vs = main_part->selector;
 
-   if (!si_vs_needs_prolog(vs, key, &shader->key, false,
-                           shader->selector->stage == MESA_SHADER_GEOMETRY))
+   if (!si_vs_needs_prolog(vs, key))
      return true;
 
    /* Get the prolog. */
    union si_shader_part_key prolog_key;
-   si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
+   si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, key, shader,
                        &prolog_key);
 
    shader->prolog =
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index b1ff7fe654d..83cad1a8e21 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -613,7 +613,6 @@ union si_shader_part_key {
      unsigned as_ls : 1;
      unsigned as_es : 1;
      unsigned as_ngg : 1;
-      unsigned load_vgprs_after_culling : 1;
      /* Prologs for monolithic shaders shouldn't set EXEC. */
      unsigned is_monolithic : 1;
    } vs_prolog;
@@ -1002,6 +1001,8 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
                                             struct util_debug_callback *debug);
 
 /* si_shader_nir.c */
+extern const nir_lower_subgroups_options si_nir_subgroups_options;
+
 void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
 void si_nir_late_opts(nir_shader *nir);
 char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 405deed969b..164c20a55be 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -161,13 +161,12 @@ bool si_is_multi_part_shader(struct si_shader *shader);
 bool si_is_merged_shader(struct si_shader *shader);
 void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx);
-void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader);
+void si_init_shader_args(struct si_shader_context *ctx);
 unsigned si_get_max_workgroup_size(const struct si_shader *shader);
 bool si_vs_needs_prolog(const struct si_shader_selector *sel,
-                        const struct si_vs_prolog_bits *prolog_key,
-                        const union si_shader_key *key, bool ngg_cull_shader, bool is_gs);
+                        const struct si_vs_prolog_bits *prolog_key);
 void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
-                          bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
+                          const struct si_vs_prolog_bits *prolog_key,
                          struct si_shader *shader_out, union si_shader_part_key *key);
 struct nir_shader *si_get_nir_shader(struct si_shader *shader, bool *free_nir,
                                     uint64_t tcs_vgpr_only_inputs);
@@ -180,6 +179,7 @@ void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
 
 /* gfx10_shader_ngg.c */
 LLVMValueRef gfx10_get_thread_id_in_tg(struct si_shader_context *ctx);
+unsigned gfx10_ngg_get_vertices_per_prim(struct si_shader *shader);
 bool gfx10_ngg_export_prim_early(struct si_shader *shader);
 void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
 void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
@@ -205,7 +205,7 @@ void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscre
                          struct ac_llvm_compiler *compiler, unsigned wave_size);
 void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
                         unsigned num_return_elems, unsigned max_workgroup_size);
-void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader);
+void si_llvm_create_main_func(struct si_shader_context *ctx);
 void si_llvm_optimize_module(struct si_shader_context *ctx);
 void si_llvm_dispose(struct si_shader_context *ctx);
 LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
@@ -228,7 +228,7 @@ void si_build_wrapper_function(struct si_shader_context *ctx, struct ac_llvm_poi
                               enum ac_arg_type *main_arg_types, bool same_thread_count);
 
 bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
-                           struct nir_shader *nir, bool free_nir, bool ngg_cull_shader);
+                           struct nir_shader *nir, bool free_nir);
 bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                            struct si_shader *shader, const struct pipe_stream_output_info *so,
                            struct util_debug_callback *debug, struct nir_shader *nir,
@@ -278,6 +278,6 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_ex
                              struct si_shader_output_values *outputs, unsigned noutput);
 void si_llvm_vs_build_end(struct si_shader_context *ctx);
 void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
-void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
+void si_llvm_init_vs_callbacks(struct si_shader_context *ctx);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index 94abe102a11..07d47691e16 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -197,21 +197,21 @@ void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTy
    ac_llvm_set_target_features(ctx->main_fn.value, &ctx->ac);
 }
 
-void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader)
+void si_llvm_create_main_func(struct si_shader_context *ctx)
 {
    struct si_shader *shader = ctx->shader;
    LLVMTypeRef returns[AC_MAX_ARGS];
    unsigned i;
 
-   si_init_shader_args(ctx, ngg_cull_shader);
+   si_init_shader_args(ctx);
 
    for (i = 0; i < ctx->args.num_sgprs_returned; i++)
      returns[i] = ctx->ac.i32; /* SGPR */
    for (; i < ctx->args.return_count; i++)
      returns[i] = ctx->ac.f32; /* VGPR */
 
-   si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns,
-                       ctx->args.return_count, si_get_max_workgroup_size(shader));
+   si_llvm_create_func(ctx, "main", returns, ctx->args.return_count,
+                       si_get_max_workgroup_size(shader));
 
    /* Reserve register locations for VGPR inputs the PS prolog may need. */
    if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
@@ -954,7 +954,7 @@ static LLVMValueRef si_llvm_load_streamout_buffer(struct ac_shader_abi *abi, uns
 }
 
 bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
-                           struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
+                           struct nir_shader *nir, bool free_nir)
 {
    struct si_shader_selector *sel = shader->selector;
    const struct si_shader_info *info = &sel->info;
@@ -975,7 +975,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
    ctx->abi.atomic_add_prim_count = gfx10_ngg_atomic_add_prim_count;
 
    si_llvm_init_resource_callbacks(ctx);
-   si_llvm_create_main_func(ctx, ngg_cull_shader);
+   si_llvm_create_main_func(ctx);
 
    if (ctx->stage <= MESA_SHADER_GEOMETRY &&
       (ctx->shader->key.ge.as_es || ctx->stage == MESA_SHADER_GEOMETRY))
@@ -983,7 +983,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
 
    switch (ctx->stage) {
    case MESA_SHADER_VERTEX:
-      si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
+      si_llvm_init_vs_callbacks(ctx);
 
      /* preload instance_divisor_constbuf to be used for input load after culling */
      if (ctx->shader->key.ge.opt.ngg_culling &&
@@ -1100,57 +1100,50 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
       * determined during linking / PM4 creation.
       */
      si_llvm_declare_esgs_ring(ctx);
+      ctx->ac.lds.value = ctx->esgs_ring;
+      ctx->ac.lds.pointee_type = ctx->ac.i32;
 
      /* This is really only needed when streamout and / or vertex
       * compaction is enabled.
       */
-      if (!ctx->gs_ngg_scratch.value && (ctx->so.num_outputs || shader->key.ge.opt.ngg_culling)) {
+      if (si_shader_uses_streamout(shader) || shader->key.ge.opt.ngg_culling) {
         LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
         ctx->gs_ngg_scratch = (struct ac_llvm_pointer) {
            .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS),
            .pointee_type = asi32
         };
         LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(asi32));
-         LLVMSetAlignment(ctx->gs_ngg_scratch.value, 4);
+         LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8);
      }
   }
 
   /* For merged shaders (VS-TCS, VS-GS, TES-GS): */
   if (ctx->screen->info.gfx_level >= GFX9 && si_is_merged_shader(shader)) {
-      /* TES is special because it has only 1 shader part if NGG shader culling is disabled,
-       * and therefore it doesn't use the wrapper function.
+      /* Set EXEC = ~0 before the first shader. For monolithic shaders, the wrapper
+       * function does this.
       */
-      bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.ge.as_es &&
-                             !shader->key.ge.opt.ngg_culling;
-
-      /* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there
-       * instead. For monolithic shaders, the wrapper function does this.
-       */
-      if ((!shader->is_monolithic || no_wrapper_func) &&
-          (ctx->stage == MESA_SHADER_TESS_EVAL ||
-           (ctx->stage == MESA_SHADER_VERTEX &&
-            !si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, ngg_cull_shader,
-                                false))))
-         ac_init_exec_full_mask(&ctx->ac);
-
-      /* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease
-       * register usage.
-       */
-      if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
-          shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling) {
-         /* GFX10 requires a barrier before gs_alloc_req due to a hw bug. */
-         if (ctx->screen->info.gfx_level == GFX10)
-            ac_build_s_barrier(&ctx->ac, ctx->stage);
-
-         gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
-
-         /* Build the primitive export at the beginning
-          * of the shader if possible.
-          */
-         if (gfx10_ngg_export_prim_early(shader))
-            gfx10_ngg_build_export_prim(ctx, NULL, NULL);
+      if (ctx->stage == MESA_SHADER_TESS_EVAL) {
+         /* TES has only 1 shader part, therefore it doesn't use the wrapper function. */
+         if (!shader->is_monolithic || !shader->key.ge.as_es)
+            ac_init_exec_full_mask(&ctx->ac);
+      } else if (ctx->stage == MESA_SHADER_VERTEX) {
+         /* If the prolog is present, EXEC is set there instead. */
+         if (!si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog)) {
+            /* When no prolog, only mono VS with TCS/GS present has wrapper function. */
+            if (!(shader->is_monolithic && (shader->key.ge.as_ls || shader->key.ge.as_es)))
+               ac_init_exec_full_mask(&ctx->ac);
+         }
      }
 
+      /* NGG VS and NGG TES: nir ngg lowering sends gs_alloc_req at the beginning when culling
+       * is disabled, but GFX10 may hang if not all waves are launched before gs_alloc_req.
+       * We work around this HW bug by inserting a barrier before gs_alloc_req.
+       */
+      if (ctx->screen->info.gfx_level == GFX10 &&
+          (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
+          shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling)
+         ac_build_s_barrier(&ctx->ac, ctx->stage);
+
      /* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */
      if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
         gfx10_ngg_gs_emit_begin(ctx);
@@ -1164,10 +1157,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
       * not here.
       */
      thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */
-   } else if (((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) ||
-              (shader->key.ge.as_ngg && !shader->key.ge.as_es)) {
-      /* This is NGG VS or NGG TES or VS before GS or TES before GS or VS before TCS.
-       * For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
+   } else if ((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) {
+      /* For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
       * the if statement is inserted by the wrapper function.
       */
      thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */
@@ -1253,11 +1244,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
         si_llvm_ls_build_end(ctx);
      else if (shader->key.ge.as_es)
         si_llvm_es_build_end(ctx);
-      else if (ngg_cull_shader)
-         gfx10_ngg_culling_build_end(ctx);
-      else if (shader->key.ge.as_ngg)
-         gfx10_ngg_build_end(ctx);
-      else
+      else if (!shader->key.ge.as_ngg)
        si_llvm_vs_build_end(ctx);
 
      break;
@@ -1268,11 +1255,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
   case MESA_SHADER_TESS_EVAL:
      if (ctx->shader->key.ge.as_es)
        si_llvm_es_build_end(ctx);
-      else if (ngg_cull_shader)
-         gfx10_ngg_culling_build_end(ctx);
-      else if (ctx->shader->key.ge.as_ngg)
-         gfx10_ngg_build_end(ctx);
-      else
+      else if (!ctx->shader->key.ge.as_ngg)
        si_llvm_vs_build_end(ctx);
 
      break;
@@ -1323,84 +1306,30 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
 
   si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
   ctx.so = *so;
-   struct ac_llvm_pointer ngg_cull_main_fn = {};
-   if (sel->stage <= MESA_SHADER_TESS_EVAL && shader->key.ge.opt.ngg_culling) {
-      if (!si_llvm_translate_nir(&ctx, shader, nir, false, true)) {
-         si_llvm_dispose(&ctx);
-         return false;
-      }
-      ngg_cull_main_fn = ctx.main_fn;
-      ctx.main_fn.value = NULL;
-   }
 
-   if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir, false)) {
+   if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir)) {
      si_llvm_dispose(&ctx);
      return false;
   }
 
-   if (shader->is_monolithic && sel->stage == MESA_SHADER_VERTEX) {
-      struct ac_llvm_pointer parts[4];
-      unsigned num_parts = 0;
-      bool first_is_prolog = false;
-      struct ac_llvm_pointer main_fn = ctx.main_fn;
+   if (shader->is_monolithic && sel->stage == MESA_SHADER_VERTEX &&
+       si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog)) {
+      struct ac_llvm_pointer parts[2];
+      parts[1] = ctx.main_fn;
 
-      /* Preserve main arguments. */
      enum ac_arg_type main_arg_types[AC_MAX_ARGS];
      for (int i = 0; i < ctx.args.arg_count; i++)
         main_arg_types[i] = ctx.args.args[i].type;
      main_arg_types[MIN2(AC_MAX_ARGS - 1, ctx.args.arg_count)] = AC_ARG_INVALID;
 
-      if (ngg_cull_main_fn.value) {
-         if (si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, true, false)) {
-            union si_shader_part_key prolog_key;
-            si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
-                                 &shader->key.ge.part.vs.prolog, shader, &prolog_key);
-            prolog_key.vs_prolog.is_monolithic = true;
-            si_llvm_build_vs_prolog(&ctx, &prolog_key);
-            parts[num_parts++] = ctx.main_fn;
-            first_is_prolog = true;
-         }
-         parts[num_parts++] = ngg_cull_main_fn;
-      }
-
-      if (si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, false, false)) {
-         union si_shader_part_key prolog_key;
-         si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
-                              &shader->key.ge.part.vs.prolog, shader, &prolog_key);
-         prolog_key.vs_prolog.is_monolithic = true;
-         si_llvm_build_vs_prolog(&ctx, &prolog_key);
-         parts[num_parts++] = ctx.main_fn;
-         if (num_parts == 1)
-            first_is_prolog = true;
-      }
-      parts[num_parts++] = main_fn;
-
-      si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, main_arg_types, false);
-   } else if (shader->is_monolithic && sel->stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn.value) {
-      struct ac_llvm_pointer parts[3], prolog, main_fn = ctx.main_fn;
-
-      /* Preserve main arguments. */
-      enum ac_arg_type main_arg_types[AC_MAX_ARGS];
-      for (int i = 0; i < ctx.args.arg_count; i++)
-         main_arg_types[i] = ctx.args.args[i].type;
-      main_arg_types[MIN2(AC_MAX_ARGS - 1, ctx.args.arg_count)] = AC_ARG_INVALID;
-
-      /* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. */
      union si_shader_part_key prolog_key;
-      memset(&prolog_key, 0, sizeof(prolog_key));
-      prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
-      prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5;
-      prolog_key.vs_prolog.as_ngg = 1;
-      prolog_key.vs_prolog.load_vgprs_after_culling = 1;
+      si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs,
+                           &shader->key.ge.part.vs.prolog, shader, &prolog_key);
      prolog_key.vs_prolog.is_monolithic = true;
      si_llvm_build_vs_prolog(&ctx, &prolog_key);
-      prolog = ctx.main_fn;
+      parts[0] = ctx.main_fn;
 
-      parts[0] = ngg_cull_main_fn;
-      parts[1] = prolog;
-      parts[2] = main_fn;
-
-      si_build_wrapper_function(&ctx, parts, 3, 0, 0, main_arg_types, false);
+      si_build_wrapper_function(&ctx, parts, 2, 1, 0, main_arg_types, false);
   } else if (shader->is_monolithic && sel->stage == MESA_SHADER_TESS_CTRL) {
      /* Preserve main arguments. */
      enum ac_arg_type main_arg_types[AC_MAX_ARGS];
@@ -1409,7 +1338,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
      struct si_shader_selector *ls = shader->key.ge.part.tcs.ls;
      struct ac_llvm_pointer parts[4];
      bool vs_needs_prolog =
-         si_vs_needs_prolog(ls, &shader->key.ge.part.tcs.ls_prolog, &shader->key, false, false);
+         si_vs_needs_prolog(ls, &shader->key.ge.part.tcs.ls_prolog);
 
      /* TCS main part */
      parts[2] = ctx.main_fn;
@@ -1432,7 +1361,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
      nir = si_get_nir_shader(&shader_ls, &free_nir, sel->info.tcs_vgpr_only_inputs);
      si_update_shader_binary_info(shader, nir);
 
-      if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir, false)) {
+      if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir)) {
        si_llvm_dispose(&ctx);
        return false;
      }
@@ -1446,7 +1375,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
      /* LS prolog */
      if (vs_needs_prolog) {
        union si_shader_part_key vs_prolog_key;
-         si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
+         si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs,
                              &shader->key.ge.part.tcs.ls_prolog, shader, &vs_prolog_key);
        vs_prolog_key.vs_prolog.is_monolithic = true;
        si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
@@ -1503,7 +1432,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
      nir = si_get_nir_shader(&shader_es, &free_nir, 0);
      si_update_shader_binary_info(shader, nir);
 
-      if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir, false)) {
+      if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir)) {
        si_llvm_dispose(&ctx);
        return false;
      }
@@ -1517,9 +1446,9 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
 
      /* ES prolog */
      if (es->stage == MESA_SHADER_VERTEX &&
-          si_vs_needs_prolog(es, &shader->key.ge.part.gs.vs_prolog, &shader->key, false, true)) {
+          si_vs_needs_prolog(es, &shader->key.ge.part.gs.vs_prolog)) {
        union si_shader_part_key vs_prolog_key;
-         si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
+         si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs,
                              &shader->key.ge.part.gs.vs_prolog, shader, &vs_prolog_key);
        vs_prolog_key.vs_prolog.is_monolithic = true;
        si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index c5b1c123231..7deecd389d0 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -462,7 +462,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
   builder = ctx.ac.builder;
 
   /* Build the main function. */
-   si_llvm_create_main_func(&ctx, false);
+   si_llvm_create_main_func(&ctx);
 
   ctx.gsvs_ring[0] =
      ac_build_load_to_sgpr(&ctx.ac,
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
index 31ba30d5f32..12d45448974 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -698,7 +698,9 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_ex
      ac_build_export(&ctx->ac, &pos_args[i]);
   }
 
-   if (!shader->info.nr_param_exports)
+   if (!shader->info.nr_param_exports ||
+       /* GFX11 VS/TES param export is handled in nir */
+       (ctx->screen->info.gfx_level >= GFX11 && ctx->stage != MESA_SHADER_GEOMETRY))
      return;
 
   /* Build parameter exports. Use 2 loops to export params in ascending order.
@@ -895,18 +897,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
      }
   }
 
-   /* The culling code stored the LDS addresses of the VGPRs into those VGPRs. Load them. */
-   if (key->vs_prolog.load_vgprs_after_culling) {
-      for (i = 5; i <= 8; i++) {
-         bool is_tes_rel_patch_id = i == 7;
-         LLVMTypeRef t = is_tes_rel_patch_id ? ctx->ac.i8 : ctx->ac.i32;
-         input_vgprs[i] = LLVMBuildIntToPtr(ctx->ac.builder, input_vgprs[i], LLVMPointerType(t, AC_ADDR_SPACE_LDS), "");
-         input_vgprs[i] = LLVMBuildLoad2(ctx->ac.builder, t, input_vgprs[i], "");
-         if (is_tes_rel_patch_id)
-            input_vgprs[i] = LLVMBuildZExt(ctx->ac.builder, input_vgprs[i], ctx->ac.i32, "");
-      }
-   }
-
   unsigned vertex_id_vgpr = first_vs_vgpr;
   unsigned instance_id_vgpr = ctx->screen->info.gfx_level >= GFX10
                                  ? first_vs_vgpr + 3
@@ -960,7 +950,7 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
   si_llvm_build_ret(ctx, ret);
 }
 
-void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
+void si_llvm_init_vs_callbacks(struct si_shader_context *ctx)
 {
   ctx->abi.load_inputs = si_load_vs_input;
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index b0c2e4ff5b5..413dce3cfa4 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -240,6 +240,16 @@ static bool si_lower_intrinsics(nir_shader *nir)
                                       NULL);
 }
 
+const nir_lower_subgroups_options si_nir_subgroups_options = {
+   .subgroup_size = 64,
+   .ballot_bit_size = 64,
+   .ballot_components = 1,
+   .lower_to_scalar = true,
+   .lower_subgroup_masks = true,
+   .lower_vote_trivial = false,
+   .lower_vote_eq = true,
+};
+
 /**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
@@ -269,16 +279,7 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
 
   NIR_PASS_V(nir, si_lower_intrinsics);
 
-   const nir_lower_subgroups_options subgroups_options = {
-      .subgroup_size = 64,
-      .ballot_bit_size = 64,
-      .ballot_components = 1,
-      .lower_to_scalar = true,
-      .lower_subgroup_masks = true,
-      .lower_vote_trivial = false,
-      .lower_vote_eq = true,
-   };
-   NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);
+   NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);
 
   NIR_PASS_V(nir, nir_lower_discard_or_demote,
              (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) ||