radeonsi: replace llvm ngg vs/tes with nir lowering

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17109>
This commit is contained in:
Qiang Yu 2022-06-12 20:36:39 +08:00 committed by Marge Bot
parent 3542d5ce6b
commit 028d0590f8
8 changed files with 216 additions and 207 deletions

View file

@ -119,6 +119,37 @@ static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, uns
}
}
/* Return the number of vertices per exported primitive for an NGG shader
 * (GS, VS, or TES).
 */
unsigned gfx10_ngg_get_vertices_per_prim(struct si_shader *shader)
{
   const struct si_shader_info *info = &shader->selector->info;

   switch (shader->selector->stage) {
   case MESA_SHADER_GEOMETRY:
      return u_vertices_per_prim(info->base.gs.output_primitive);

   case MESA_SHADER_VERTEX:
      /* Blits always use axis-aligned rectangles with 3 vertices. */
      if (info->base.vs.blit_sgprs_amd)
         return 3;

      if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES)
         return 2;

      /* We always build up all three indices for the prim export
       * independent of the primitive type. The additional garbage
       * data shouldn't hurt. This is used by exports and streamout.
       */
      return 3;

   default:
      assert(shader->selector->stage == MESA_SHADER_TESS_EVAL);
      if (info->base.tess.point_mode)
         return 1;
      return info->base.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES ? 2 : 3;
   }
}
bool gfx10_ngg_export_prim_early(struct si_shader *shader)
{
struct si_shader_selector *sel = shader->selector;
@ -2398,11 +2429,17 @@ static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts
/* Return the size in dwords of the NGG scratch LDS area for this shader.
 *
 * GS uses a fixed-size area: 44 dwords with streamout, 8 without.
 * VS/TES sizes come from the common ac_ngg helper, which returns bytes
 * (hence the division by 4).
 *
 * Fix: stale pre-refactor lines ("return 44; ... return 8;") were left
 * interleaved ahead of the new implementation, making the VS/TES
 * ac_ngg_get_scratch_lds_size() path unreachable.  Only the new logic
 * is kept.
 */
unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
{
   const struct si_shader_selector *sel = shader->selector;
   bool uses_streamout = si_shader_uses_streamout(shader);

   if (sel->stage == MESA_SHADER_GEOMETRY) {
      return uses_streamout ? 44 : 8;
   } else {
      /* VS/TES: depends on workgroup size, wave size, streamout and culling. */
      return ac_ngg_get_scratch_lds_size(sel->stage,
                                         si_get_max_workgroup_size(shader),
                                         shader->wave_size,
                                         uses_streamout,
                                         shader->key.ge.opt.ngg_culling) / 4;
   }
}
/**
@ -2469,8 +2506,25 @@ retry_select_mode:
}
} else {
/* VS and TES. */
/* LDS size for passing data from ES to GS. */
esvert_lds_size = ngg_nogs_vertex_size(shader);
bool uses_instance_id = gs_sel->info.uses_instanceid;
bool uses_primitive_id = gs_sel->info.uses_primid;
if (gs_stage == MESA_SHADER_VERTEX) {
uses_instance_id |=
shader->key.ge.part.vs.prolog.instance_divisor_is_one ||
shader->key.ge.part.vs.prolog.instance_divisor_is_fetched;
} else {
uses_primitive_id |= shader->key.ge.mono.u.vs_export_prim_id;
}
esvert_lds_size = ac_ngg_nogs_get_pervertex_lds_size(
gs_stage, gs_sel->info.num_outputs,
si_shader_uses_streamout(shader),
shader->key.ge.mono.u.vs_export_prim_id,
gfx10_ngg_writes_user_edgeflags(shader),
shader->key.ge.opt.ngg_culling,
uses_instance_id,
uses_primitive_id) / 4;
}
unsigned max_gsprims = max_gsprims_base;

View file

@ -227,7 +227,7 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader)
switch (shader->selector->stage) {
case MESA_SHADER_VERTEX:
case MESA_SHADER_TESS_EVAL:
return shader->key.ge.as_ngg ? 128 : 0;
return shader->key.ge.as_ngg ? shader->selector->screen->ngg_subgroup_size : 0;
case MESA_SHADER_TESS_CTRL:
/* Return this so that LLVM doesn't remove s_barrier
@ -397,7 +397,7 @@ void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, u
ac_add_arg(args, file, registers, type, arg);
}
void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
void si_init_shader_args(struct si_shader_context *ctx)
{
struct si_shader *shader = ctx->shader;
unsigned i, num_returns, num_return_sgprs;
@ -613,36 +613,12 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
declare_tes_input_vgprs(ctx);
}
if ((ctx->shader->key.ge.as_es || ngg_cull_shader) &&
if (ctx->shader->key.ge.as_es &&
(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
unsigned num_user_sgprs, num_vgprs;
if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) {
/* For the NGG cull shader, add 1 SGPR to hold
* the vertex buffer pointer.
*/
num_user_sgprs = GFX9_GS_NUM_USER_SGPR + 1;
if (shader->selector->info.num_vbos_in_user_sgprs) {
assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
num_user_sgprs =
SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->info.num_vbos_in_user_sgprs * 4;
}
} else {
num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
}
/* The NGG cull shader has to return all 9 VGPRs.
*
* The normal merged ESGS shader only has to return the 5 VGPRs
* for the GS stage.
*/
num_vgprs = ngg_cull_shader ? 9 : 5;
/* ES return values are inputs to GS. */
for (i = 0; i < 8 + num_user_sgprs; i++)
for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
ac_add_return(&ctx->args, AC_ARG_SGPR);
for (i = 0; i < num_vgprs; i++)
for (i = 0; i < 5; i++)
ac_add_return(&ctx->args, AC_ARG_VGPR);
}
break;
@ -1403,17 +1379,13 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
}
bool si_vs_needs_prolog(const struct si_shader_selector *sel,
const struct si_vs_prolog_bits *prolog_key,
const union si_shader_key *key, bool ngg_cull_shader,
bool is_gs)
const struct si_vs_prolog_bits *prolog_key)
{
assert(sel->stage == MESA_SHADER_VERTEX);
/* VGPR initialization fixup for Vega10 and Raven is always done in the
* VS prolog. */
return sel->info.vs_needs_prolog || prolog_key->ls_vgpr_fix ||
/* The 2nd VS prolog loads input VGPRs from LDS */
(key->ge.opt.ngg_culling && !ngg_cull_shader && !is_gs);
return sel->info.vs_needs_prolog || prolog_key->ls_vgpr_fix;
}
/**
@ -1422,13 +1394,12 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel,
*
* \param info Shader info of the vertex shader.
* \param num_input_sgprs Number of input SGPRs for the vertex shader.
* \param ngg_cull_shader  Whether the preceding shader part is the NGG cull shader.
* \param prolog_key Key of the VS prolog
* \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
* \param key Output shader part key.
*/
void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
const struct si_vs_prolog_bits *prolog_key,
struct si_shader *shader_out, union si_shader_part_key *key)
{
memset(key, 0, sizeof(*key));
@ -1440,10 +1411,6 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
key->vs_prolog.as_es = shader_out->key.ge.as_es;
key->vs_prolog.as_ngg = shader_out->key.ge.as_ngg;
if (shader_out->selector->stage != MESA_SHADER_GEOMETRY &&
!ngg_cull_shader && shader_out->key.ge.opt.ngg_culling)
key->vs_prolog.load_vgprs_after_culling = 1;
if (shader_out->selector->stage == MESA_SHADER_TESS_CTRL) {
key->vs_prolog.as_ls = 1;
key->vs_prolog.num_merged_next_stage_vgprs = 2;
@ -1647,6 +1614,68 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
return false;
}
/* Lower a last-VGT-stage NGG shader (VS or TES, not ES) in NIR via
 * ac_nir_lower_ngg_nogs, replacing the former LLVM-based NGG lowering.
 * The pass order matters: system-value bits must be set on nir->info
 * BEFORE the lowering pass runs so it can repack those values.
 */
static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
{
struct si_shader_selector *sel = shader->selector;
const union si_shader_key *key = &shader->key;
/* This path is only valid for NGG shaders. */
assert(key->ge.as_ngg);
/* Options common to the lowered stages. */
ac_nir_lower_ngg_options options = {
.family = sel->screen->info.family,
.gfx_level = sel->screen->info.gfx_level,
.max_workgroup_size = si_get_max_workgroup_size(shader),
.wave_size = shader->wave_size,
.can_cull = !!key->ge.opt.ngg_culling,
.disable_streamout = key->ge.opt.remove_streamout,
.vs_output_param_offset = shader->info.vs_output_param_offset,
};
if (nir->info.stage == MESA_SHADER_VERTEX ||
nir->info.stage == MESA_SHADER_TESS_EVAL) {
/* Per instance inputs, used to remove instance load after culling. */
unsigned instance_rate_inputs = 0;
if (nir->info.stage == MESA_SHADER_VERTEX) {
instance_rate_inputs =
key->ge.part.vs.prolog.instance_divisor_is_one |
key->ge.part.vs.prolog.instance_divisor_is_fetched;
/* Manually mark the instance ID used, so the shader can repack it. */
if (instance_rate_inputs)
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
} else {
/* Manually mark the primitive ID used, so the shader can repack it. */
if (key->ge.mono.u.vs_export_prim_id)
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
}
/* Only user clip planes enabled by the culling key participate in
 * clip-distance culling; cull distances are always included.
 */
unsigned clip_plane_enable =
SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
unsigned clipdist_mask =
(sel->info.clipdist_mask & clip_plane_enable) | sel->info.culldist_mask;
options.num_vertices_per_primitive = gfx10_ngg_get_vertices_per_prim(shader);
options.early_prim_export = gfx10_ngg_export_prim_early(shader);
options.passthrough = gfx10_is_ngg_passthrough(shader);
options.use_edgeflags = gfx10_edgeflags_have_effect(shader);
options.has_gen_prim_query = options.has_xfb_prim_query =
sel->screen->use_ngg_streamout && !sel->info.base.vs.blit_sgprs_amd;
/* When requested, the primitive ID is appended as one extra output
 * slot after the shader's existing outputs; -1 disables it.
 */
options.primitive_id_location =
key->ge.mono.u.vs_export_prim_id ? sel->info.num_outputs : -1;
options.instance_rate_inputs = instance_rate_inputs;
options.clipdist_enable_mask = clipdist_mask;
options.user_clip_plane_enable_mask = clip_plane_enable;
NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
}
/* may generate some subgroup op like ballot */
NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);
/* may generate some vector output store */
NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
}
struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
{
struct pipe_screen *screen = &sel->screen->b;
@ -1878,6 +1907,12 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, bool *free_nir,
if (is_last_vgt_stage)
si_assign_param_offsets(nir, shader);
/* Only lower last VGT NGG shader stage. */
if (sel->stage < MESA_SHADER_GEOMETRY && key->ge.as_ngg && !key->ge.as_es) {
si_lower_ngg(shader, nir);
opt_offsets = true;
}
if (progress2 || opt_offsets)
si_nir_opts(sel->screen, nir, false);
@ -2176,13 +2211,12 @@ static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler
{
struct si_shader_selector *vs = main_part->selector;
if (!si_vs_needs_prolog(vs, key, &shader->key, false,
shader->selector->stage == MESA_SHADER_GEOMETRY))
if (!si_vs_needs_prolog(vs, key))
return true;
/* Get the prolog. */
union si_shader_part_key prolog_key;
si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, key, shader,
&prolog_key);
shader->prolog =

View file

@ -613,7 +613,6 @@ union si_shader_part_key {
unsigned as_ls : 1;
unsigned as_es : 1;
unsigned as_ngg : 1;
unsigned load_vgprs_after_culling : 1;
/* Prologs for monolithic shaders shouldn't set EXEC. */
unsigned is_monolithic : 1;
} vs_prolog;
@ -1002,6 +1001,8 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
struct util_debug_callback *debug);
/* si_shader_nir.c */
extern const nir_lower_subgroups_options si_nir_subgroups_options;
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
void si_nir_late_opts(nir_shader *nir);
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);

View file

@ -161,13 +161,12 @@ bool si_is_multi_part_shader(struct si_shader *shader);
bool si_is_merged_shader(struct si_shader *shader);
void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
enum ac_arg_type type, struct ac_arg *arg, unsigned idx);
void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader);
void si_init_shader_args(struct si_shader_context *ctx);
unsigned si_get_max_workgroup_size(const struct si_shader *shader);
bool si_vs_needs_prolog(const struct si_shader_selector *sel,
const struct si_vs_prolog_bits *prolog_key,
const union si_shader_key *key, bool ngg_cull_shader, bool is_gs);
const struct si_vs_prolog_bits *prolog_key);
void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
const struct si_vs_prolog_bits *prolog_key,
struct si_shader *shader_out, union si_shader_part_key *key);
struct nir_shader *si_get_nir_shader(struct si_shader *shader, bool *free_nir,
uint64_t tcs_vgpr_only_inputs);
@ -180,6 +179,7 @@ void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
/* gfx10_shader_ngg.c */
LLVMValueRef gfx10_get_thread_id_in_tg(struct si_shader_context *ctx);
unsigned gfx10_ngg_get_vertices_per_prim(struct si_shader *shader);
bool gfx10_ngg_export_prim_early(struct si_shader *shader);
void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
@ -205,7 +205,7 @@ void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscre
struct ac_llvm_compiler *compiler, unsigned wave_size);
void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
unsigned num_return_elems, unsigned max_workgroup_size);
void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader);
void si_llvm_create_main_func(struct si_shader_context *ctx);
void si_llvm_optimize_module(struct si_shader_context *ctx);
void si_llvm_dispose(struct si_shader_context *ctx);
LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
@ -228,7 +228,7 @@ void si_build_wrapper_function(struct si_shader_context *ctx, struct ac_llvm_poi
enum ac_arg_type *main_arg_types,
bool same_thread_count);
bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
struct nir_shader *nir, bool free_nir, bool ngg_cull_shader);
struct nir_shader *nir, bool free_nir);
bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
struct si_shader *shader, const struct pipe_stream_output_info *so,
struct util_debug_callback *debug, struct nir_shader *nir,
@ -278,6 +278,6 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_ex
struct si_shader_output_values *outputs, unsigned noutput);
void si_llvm_vs_build_end(struct si_shader_context *ctx);
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx);
#endif

View file

@ -197,21 +197,21 @@ void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTy
ac_llvm_set_target_features(ctx->main_fn.value, &ctx->ac);
}
void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader)
void si_llvm_create_main_func(struct si_shader_context *ctx)
{
struct si_shader *shader = ctx->shader;
LLVMTypeRef returns[AC_MAX_ARGS];
unsigned i;
si_init_shader_args(ctx, ngg_cull_shader);
si_init_shader_args(ctx);
for (i = 0; i < ctx->args.num_sgprs_returned; i++)
returns[i] = ctx->ac.i32; /* SGPR */
for (; i < ctx->args.return_count; i++)
returns[i] = ctx->ac.f32; /* VGPR */
si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns,
ctx->args.return_count, si_get_max_workgroup_size(shader));
si_llvm_create_func(ctx, "main", returns, ctx->args.return_count,
si_get_max_workgroup_size(shader));
/* Reserve register locations for VGPR inputs the PS prolog may need. */
if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
@ -954,7 +954,7 @@ static LLVMValueRef si_llvm_load_streamout_buffer(struct ac_shader_abi *abi, uns
}
bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
struct nir_shader *nir, bool free_nir)
{
struct si_shader_selector *sel = shader->selector;
const struct si_shader_info *info = &sel->info;
@ -975,7 +975,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
ctx->abi.atomic_add_prim_count = gfx10_ngg_atomic_add_prim_count;
si_llvm_init_resource_callbacks(ctx);
si_llvm_create_main_func(ctx, ngg_cull_shader);
si_llvm_create_main_func(ctx);
if (ctx->stage <= MESA_SHADER_GEOMETRY &&
(ctx->shader->key.ge.as_es || ctx->stage == MESA_SHADER_GEOMETRY))
@ -983,7 +983,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
switch (ctx->stage) {
case MESA_SHADER_VERTEX:
si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
si_llvm_init_vs_callbacks(ctx);
/* preload instance_divisor_constbuf to be used for input load after culling */
if (ctx->shader->key.ge.opt.ngg_culling &&
@ -1100,57 +1100,50 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
* determined during linking / PM4 creation.
*/
si_llvm_declare_esgs_ring(ctx);
ctx->ac.lds.value = ctx->esgs_ring;
ctx->ac.lds.pointee_type = ctx->ac.i32;
/* This is really only needed when streamout and / or vertex
* compaction is enabled.
*/
if (!ctx->gs_ngg_scratch.value && (ctx->so.num_outputs || shader->key.ge.opt.ngg_culling)) {
if (si_shader_uses_streamout(shader) || shader->key.ge.opt.ngg_culling) {
LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
ctx->gs_ngg_scratch = (struct ac_llvm_pointer) {
.value = LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS),
.pointee_type = asi32
};
LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(asi32));
LLVMSetAlignment(ctx->gs_ngg_scratch.value, 4);
LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8);
}
}
/* For merged shaders (VS-TCS, VS-GS, TES-GS): */
if (ctx->screen->info.gfx_level >= GFX9 && si_is_merged_shader(shader)) {
/* TES is special because it has only 1 shader part if NGG shader culling is disabled,
* and therefore it doesn't use the wrapper function.
/* Set EXEC = ~0 before the first shader. For monolithic shaders, the wrapper
* function does this.
*/
bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.ge.as_es &&
!shader->key.ge.opt.ngg_culling;
/* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there
* instead. For monolithic shaders, the wrapper function does this.
*/
if ((!shader->is_monolithic || no_wrapper_func) &&
(ctx->stage == MESA_SHADER_TESS_EVAL ||
(ctx->stage == MESA_SHADER_VERTEX &&
!si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, ngg_cull_shader,
false))))
ac_init_exec_full_mask(&ctx->ac);
/* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease
* register usage.
*/
if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling) {
/* GFX10 requires a barrier before gs_alloc_req due to a hw bug. */
if (ctx->screen->info.gfx_level == GFX10)
ac_build_s_barrier(&ctx->ac, ctx->stage);
gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
/* Build the primitive export at the beginning
* of the shader if possible.
*/
if (gfx10_ngg_export_prim_early(shader))
gfx10_ngg_build_export_prim(ctx, NULL, NULL);
if (ctx->stage == MESA_SHADER_TESS_EVAL) {
/* TES has only 1 shader part, therefore it doesn't use the wrapper function. */
if (!shader->is_monolithic || !shader->key.ge.as_es)
ac_init_exec_full_mask(&ctx->ac);
} else if (ctx->stage == MESA_SHADER_VERTEX) {
/* If the prolog is present, EXEC is set there instead. */
if (!si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog)) {
/* When no prolog, only mono VS with TCS/GS present has wrapper function. */
if (!(shader->is_monolithic && (shader->key.ge.as_ls || shader->key.ge.as_es)))
ac_init_exec_full_mask(&ctx->ac);
}
}
/* NGG VS and NGG TES: nir ngg lowering send gs_alloc_req at the beginning when culling
* is disabled, but GFX10 may hang if not all waves are launched before gs_alloc_req.
* We work around this HW bug by inserting a barrier before gs_alloc_req.
*/
if (ctx->screen->info.gfx_level == GFX10 &&
(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling)
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */
if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
gfx10_ngg_gs_emit_begin(ctx);
@ -1164,10 +1157,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
* not here.
*/
thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */
} else if (((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) ||
(shader->key.ge.as_ngg && !shader->key.ge.as_es)) {
/* This is NGG VS or NGG TES or VS before GS or TES before GS or VS before TCS.
* For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
} else if ((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) {
/* For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
* the if statement is inserted by the wrapper function.
*/
thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */
@ -1253,11 +1244,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
si_llvm_ls_build_end(ctx);
else if (shader->key.ge.as_es)
si_llvm_es_build_end(ctx);
else if (ngg_cull_shader)
gfx10_ngg_culling_build_end(ctx);
else if (shader->key.ge.as_ngg)
gfx10_ngg_build_end(ctx);
else
else if (!shader->key.ge.as_ngg)
si_llvm_vs_build_end(ctx);
break;
@ -1268,11 +1255,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
case MESA_SHADER_TESS_EVAL:
if (ctx->shader->key.ge.as_es)
si_llvm_es_build_end(ctx);
else if (ngg_cull_shader)
gfx10_ngg_culling_build_end(ctx);
else if (ctx->shader->key.ge.as_ngg)
gfx10_ngg_build_end(ctx);
else
else if (!ctx->shader->key.ge.as_ngg)
si_llvm_vs_build_end(ctx);
break;
@ -1323,84 +1306,30 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
ctx.so = *so;
struct ac_llvm_pointer ngg_cull_main_fn = {};
if (sel->stage <= MESA_SHADER_TESS_EVAL && shader->key.ge.opt.ngg_culling) {
if (!si_llvm_translate_nir(&ctx, shader, nir, false, true)) {
si_llvm_dispose(&ctx);
return false;
}
ngg_cull_main_fn = ctx.main_fn;
ctx.main_fn.value = NULL;
}
if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir, false)) {
if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir)) {
si_llvm_dispose(&ctx);
return false;
}
if (shader->is_monolithic && sel->stage == MESA_SHADER_VERTEX) {
struct ac_llvm_pointer parts[4];
unsigned num_parts = 0;
bool first_is_prolog = false;
struct ac_llvm_pointer main_fn = ctx.main_fn;
if (shader->is_monolithic && sel->stage == MESA_SHADER_VERTEX &&
si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog)) {
struct ac_llvm_pointer parts[2];
parts[1] = ctx.main_fn;
/* Preserve main arguments. */
/* Preserve main arguments. */
enum ac_arg_type main_arg_types[AC_MAX_ARGS];
for (int i = 0; i < ctx.args.arg_count; i++)
main_arg_types[i] = ctx.args.args[i].type;
main_arg_types[MIN2(AC_MAX_ARGS - 1, ctx.args.arg_count)] = AC_ARG_INVALID;
if (ngg_cull_main_fn.value) {
if (si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, true, false)) {
union si_shader_part_key prolog_key;
si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
&shader->key.ge.part.vs.prolog, shader, &prolog_key);
prolog_key.vs_prolog.is_monolithic = true;
si_llvm_build_vs_prolog(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
first_is_prolog = true;
}
parts[num_parts++] = ngg_cull_main_fn;
}
if (si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, false, false)) {
union si_shader_part_key prolog_key;
si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
&shader->key.ge.part.vs.prolog, shader, &prolog_key);
prolog_key.vs_prolog.is_monolithic = true;
si_llvm_build_vs_prolog(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
if (num_parts == 1)
first_is_prolog = true;
}
parts[num_parts++] = main_fn;
si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, main_arg_types, false);
} else if (shader->is_monolithic && sel->stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn.value) {
struct ac_llvm_pointer parts[3], prolog, main_fn = ctx.main_fn;
/* Preserve main arguments. */
enum ac_arg_type main_arg_types[AC_MAX_ARGS];
for (int i = 0; i < ctx.args.arg_count; i++)
main_arg_types[i] = ctx.args.args[i].type;
main_arg_types[MIN2(AC_MAX_ARGS - 1, ctx.args.arg_count)] = AC_ARG_INVALID;
/* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. */
union si_shader_part_key prolog_key;
memset(&prolog_key, 0, sizeof(prolog_key));
prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5;
prolog_key.vs_prolog.as_ngg = 1;
prolog_key.vs_prolog.load_vgprs_after_culling = 1;
si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs,
&shader->key.ge.part.vs.prolog, shader, &prolog_key);
prolog_key.vs_prolog.is_monolithic = true;
si_llvm_build_vs_prolog(&ctx, &prolog_key);
prolog = ctx.main_fn;
parts[0] = ctx.main_fn;
parts[0] = ngg_cull_main_fn;
parts[1] = prolog;
parts[2] = main_fn;
si_build_wrapper_function(&ctx, parts, 3, 0, 0, main_arg_types, false);
si_build_wrapper_function(&ctx, parts, 2, 1, 0, main_arg_types, false);
} else if (shader->is_monolithic && sel->stage == MESA_SHADER_TESS_CTRL) {
/* Preserve main arguments. */
enum ac_arg_type main_arg_types[AC_MAX_ARGS];
@ -1409,7 +1338,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
struct si_shader_selector *ls = shader->key.ge.part.tcs.ls;
struct ac_llvm_pointer parts[4];
bool vs_needs_prolog =
si_vs_needs_prolog(ls, &shader->key.ge.part.tcs.ls_prolog, &shader->key, false, false);
si_vs_needs_prolog(ls, &shader->key.ge.part.tcs.ls_prolog);
/* TCS main part */
parts[2] = ctx.main_fn;
@ -1432,7 +1361,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
nir = si_get_nir_shader(&shader_ls, &free_nir, sel->info.tcs_vgpr_only_inputs);
si_update_shader_binary_info(shader, nir);
if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir, false)) {
if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir)) {
si_llvm_dispose(&ctx);
return false;
}
@ -1446,7 +1375,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
/* LS prolog */
if (vs_needs_prolog) {
union si_shader_part_key vs_prolog_key;
si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs,
&shader->key.ge.part.tcs.ls_prolog, shader, &vs_prolog_key);
vs_prolog_key.vs_prolog.is_monolithic = true;
si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
@ -1503,7 +1432,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
nir = si_get_nir_shader(&shader_es, &free_nir, 0);
si_update_shader_binary_info(shader, nir);
if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir, false)) {
if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir)) {
si_llvm_dispose(&ctx);
return false;
}
@ -1517,9 +1446,9 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
/* ES prolog */
if (es->stage == MESA_SHADER_VERTEX &&
si_vs_needs_prolog(es, &shader->key.ge.part.gs.vs_prolog, &shader->key, false, true)) {
si_vs_needs_prolog(es, &shader->key.ge.part.gs.vs_prolog)) {
union si_shader_part_key vs_prolog_key;
si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs,
&shader->key.ge.part.gs.vs_prolog, shader, &vs_prolog_key);
vs_prolog_key.vs_prolog.is_monolithic = true;
si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);

View file

@ -462,7 +462,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
builder = ctx.ac.builder;
/* Build the main function. */
si_llvm_create_main_func(&ctx, false);
si_llvm_create_main_func(&ctx);
ctx.gsvs_ring[0] =
ac_build_load_to_sgpr(&ctx.ac,

View file

@ -698,7 +698,9 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_ex
ac_build_export(&ctx->ac, &pos_args[i]);
}
if (!shader->info.nr_param_exports)
if (!shader->info.nr_param_exports ||
/* GFX11 VS/TES param export is handled in nir */
(ctx->screen->info.gfx_level >= GFX11 && ctx->stage != MESA_SHADER_GEOMETRY))
return;
/* Build parameter exports. Use 2 loops to export params in ascending order.
@ -895,18 +897,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
}
}
/* The culling code stored the LDS addresses of the VGPRs into those VGPRs. Load them. */
if (key->vs_prolog.load_vgprs_after_culling) {
for (i = 5; i <= 8; i++) {
bool is_tes_rel_patch_id = i == 7;
LLVMTypeRef t = is_tes_rel_patch_id ? ctx->ac.i8 : ctx->ac.i32;
input_vgprs[i] = LLVMBuildIntToPtr(ctx->ac.builder, input_vgprs[i], LLVMPointerType(t, AC_ADDR_SPACE_LDS), "");
input_vgprs[i] = LLVMBuildLoad2(ctx->ac.builder, t, input_vgprs[i], "");
if (is_tes_rel_patch_id)
input_vgprs[i] = LLVMBuildZExt(ctx->ac.builder, input_vgprs[i], ctx->ac.i32, "");
}
}
unsigned vertex_id_vgpr = first_vs_vgpr;
unsigned instance_id_vgpr = ctx->screen->info.gfx_level >= GFX10
? first_vs_vgpr + 3
@ -960,7 +950,7 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
si_llvm_build_ret(ctx, ret);
}
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx)
{
ctx->abi.load_inputs = si_load_vs_input;
}

View file

@ -240,6 +240,16 @@ static bool si_lower_intrinsics(nir_shader *nir)
NULL);
}
/* Shared nir_lower_subgroups configuration: 64-invocation subgroups with
 * scalarized 64-bit single-component ballots.  Exposed globally because it
 * is used both by the initial shader lowering and again after NGG lowering,
 * which can introduce new subgroup ops (e.g. ballot).
 */
const nir_lower_subgroups_options si_nir_subgroups_options = {
.subgroup_size = 64,
.ballot_bit_size = 64,
.ballot_components = 1,
.lower_to_scalar = true,
.lower_subgroup_masks = true,
.lower_vote_trivial = false,
.lower_vote_eq = true,
};
/**
* Perform "lowering" operations on the NIR that are run once when the shader
* selector is created.
@ -269,16 +279,7 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
NIR_PASS_V(nir, si_lower_intrinsics);
const nir_lower_subgroups_options subgroups_options = {
.subgroup_size = 64,
.ballot_bit_size = 64,
.ballot_components = 1,
.lower_to_scalar = true,
.lower_subgroup_masks = true,
.lower_vote_trivial = false,
.lower_vote_eq = true,
};
NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);
NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);
NIR_PASS_V(nir, nir_lower_discard_or_demote,
(sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) ||