mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-25 04:20:08 +01:00
radeonsi: replace llvm ngg vs/tes with nir lowering
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Reviewed-by: Marek Olšák <marek.olsak@amd.com> Signed-off-by: Qiang Yu <yuq825@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17109>
This commit is contained in:
parent
3542d5ce6b
commit
028d0590f8
8 changed files with 216 additions and 207 deletions
|
|
@ -119,6 +119,37 @@ static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, uns
|
|||
}
|
||||
}
|
||||
|
||||
unsigned gfx10_ngg_get_vertices_per_prim(struct si_shader *shader)
|
||||
{
|
||||
const struct si_shader_info *info = &shader->selector->info;
|
||||
|
||||
if (shader->selector->stage == MESA_SHADER_GEOMETRY)
|
||||
return u_vertices_per_prim(info->base.gs.output_primitive);
|
||||
else if (shader->selector->stage == MESA_SHADER_VERTEX) {
|
||||
if (info->base.vs.blit_sgprs_amd) {
|
||||
/* Blits always use axis-aligned rectangles with 3 vertices. */
|
||||
return 3;
|
||||
} else if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES)
|
||||
return 2;
|
||||
else {
|
||||
/* We always build up all three indices for the prim export
|
||||
* independent of the primitive type. The additional garbage
|
||||
* data shouldn't hurt. This is used by exports and streamout.
|
||||
*/
|
||||
return 3;
|
||||
}
|
||||
} else {
|
||||
assert(shader->selector->stage == MESA_SHADER_TESS_EVAL);
|
||||
|
||||
if (info->base.tess.point_mode)
|
||||
return 1;
|
||||
else if (info->base.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
|
||||
return 2;
|
||||
else
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
|
||||
bool gfx10_ngg_export_prim_early(struct si_shader *shader)
|
||||
{
|
||||
struct si_shader_selector *sel = shader->selector;
|
||||
|
|
@ -2398,11 +2429,17 @@ static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts
|
|||
unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
|
||||
{
|
||||
const struct si_shader_selector *sel = shader->selector;
|
||||
bool uses_streamout = si_shader_uses_streamout(shader);
|
||||
|
||||
if (sel->stage == MESA_SHADER_GEOMETRY && si_shader_uses_streamout(shader))
|
||||
return 44;
|
||||
|
||||
return 8;
|
||||
if (sel->stage == MESA_SHADER_GEOMETRY) {
|
||||
return uses_streamout ? 44 : 8;
|
||||
} else {
|
||||
return ac_ngg_get_scratch_lds_size(sel->stage,
|
||||
si_get_max_workgroup_size(shader),
|
||||
shader->wave_size,
|
||||
uses_streamout,
|
||||
shader->key.ge.opt.ngg_culling) / 4;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -2469,8 +2506,25 @@ retry_select_mode:
|
|||
}
|
||||
} else {
|
||||
/* VS and TES. */
|
||||
/* LDS size for passing data from ES to GS. */
|
||||
esvert_lds_size = ngg_nogs_vertex_size(shader);
|
||||
|
||||
bool uses_instance_id = gs_sel->info.uses_instanceid;
|
||||
bool uses_primitive_id = gs_sel->info.uses_primid;
|
||||
if (gs_stage == MESA_SHADER_VERTEX) {
|
||||
uses_instance_id |=
|
||||
shader->key.ge.part.vs.prolog.instance_divisor_is_one ||
|
||||
shader->key.ge.part.vs.prolog.instance_divisor_is_fetched;
|
||||
} else {
|
||||
uses_primitive_id |= shader->key.ge.mono.u.vs_export_prim_id;
|
||||
}
|
||||
|
||||
esvert_lds_size = ac_ngg_nogs_get_pervertex_lds_size(
|
||||
gs_stage, gs_sel->info.num_outputs,
|
||||
si_shader_uses_streamout(shader),
|
||||
shader->key.ge.mono.u.vs_export_prim_id,
|
||||
gfx10_ngg_writes_user_edgeflags(shader),
|
||||
shader->key.ge.opt.ngg_culling,
|
||||
uses_instance_id,
|
||||
uses_primitive_id) / 4;
|
||||
}
|
||||
|
||||
unsigned max_gsprims = max_gsprims_base;
|
||||
|
|
|
|||
|
|
@ -227,7 +227,7 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader)
|
|||
switch (shader->selector->stage) {
|
||||
case MESA_SHADER_VERTEX:
|
||||
case MESA_SHADER_TESS_EVAL:
|
||||
return shader->key.ge.as_ngg ? 128 : 0;
|
||||
return shader->key.ge.as_ngg ? shader->selector->screen->ngg_subgroup_size : 0;
|
||||
|
||||
case MESA_SHADER_TESS_CTRL:
|
||||
/* Return this so that LLVM doesn't remove s_barrier
|
||||
|
|
@ -397,7 +397,7 @@ void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, u
|
|||
ac_add_arg(args, file, registers, type, arg);
|
||||
}
|
||||
|
||||
void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
|
||||
void si_init_shader_args(struct si_shader_context *ctx)
|
||||
{
|
||||
struct si_shader *shader = ctx->shader;
|
||||
unsigned i, num_returns, num_return_sgprs;
|
||||
|
|
@ -613,36 +613,12 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
|
|||
declare_tes_input_vgprs(ctx);
|
||||
}
|
||||
|
||||
if ((ctx->shader->key.ge.as_es || ngg_cull_shader) &&
|
||||
if (ctx->shader->key.ge.as_es &&
|
||||
(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
|
||||
unsigned num_user_sgprs, num_vgprs;
|
||||
|
||||
if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) {
|
||||
/* For the NGG cull shader, add 1 SGPR to hold
|
||||
* the vertex buffer pointer.
|
||||
*/
|
||||
num_user_sgprs = GFX9_GS_NUM_USER_SGPR + 1;
|
||||
|
||||
if (shader->selector->info.num_vbos_in_user_sgprs) {
|
||||
assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
|
||||
num_user_sgprs =
|
||||
SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->info.num_vbos_in_user_sgprs * 4;
|
||||
}
|
||||
} else {
|
||||
num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
|
||||
}
|
||||
|
||||
/* The NGG cull shader has to return all 9 VGPRs.
|
||||
*
|
||||
* The normal merged ESGS shader only has to return the 5 VGPRs
|
||||
* for the GS stage.
|
||||
*/
|
||||
num_vgprs = ngg_cull_shader ? 9 : 5;
|
||||
|
||||
/* ES return values are inputs to GS. */
|
||||
for (i = 0; i < 8 + num_user_sgprs; i++)
|
||||
for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
|
||||
ac_add_return(&ctx->args, AC_ARG_SGPR);
|
||||
for (i = 0; i < num_vgprs; i++)
|
||||
for (i = 0; i < 5; i++)
|
||||
ac_add_return(&ctx->args, AC_ARG_VGPR);
|
||||
}
|
||||
break;
|
||||
|
|
@ -1403,17 +1379,13 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
|
|||
}
|
||||
|
||||
bool si_vs_needs_prolog(const struct si_shader_selector *sel,
|
||||
const struct si_vs_prolog_bits *prolog_key,
|
||||
const union si_shader_key *key, bool ngg_cull_shader,
|
||||
bool is_gs)
|
||||
const struct si_vs_prolog_bits *prolog_key)
|
||||
{
|
||||
assert(sel->stage == MESA_SHADER_VERTEX);
|
||||
|
||||
/* VGPR initialization fixup for Vega10 and Raven is always done in the
|
||||
* VS prolog. */
|
||||
return sel->info.vs_needs_prolog || prolog_key->ls_vgpr_fix ||
|
||||
/* The 2nd VS prolog loads input VGPRs from LDS */
|
||||
(key->ge.opt.ngg_culling && !ngg_cull_shader && !is_gs);
|
||||
return sel->info.vs_needs_prolog || prolog_key->ls_vgpr_fix;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -1422,13 +1394,12 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel,
|
|||
*
|
||||
* \param info Shader info of the vertex shader.
|
||||
* \param num_input_sgprs Number of input SGPRs for the vertex shader.
|
||||
* \param has_old_ Whether the preceding shader part is the NGG cull shader.
|
||||
* \param prolog_key Key of the VS prolog
|
||||
* \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
|
||||
* \param key Output shader part key.
|
||||
*/
|
||||
void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
|
||||
bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
|
||||
const struct si_vs_prolog_bits *prolog_key,
|
||||
struct si_shader *shader_out, union si_shader_part_key *key)
|
||||
{
|
||||
memset(key, 0, sizeof(*key));
|
||||
|
|
@ -1440,10 +1411,6 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
|
|||
key->vs_prolog.as_es = shader_out->key.ge.as_es;
|
||||
key->vs_prolog.as_ngg = shader_out->key.ge.as_ngg;
|
||||
|
||||
if (shader_out->selector->stage != MESA_SHADER_GEOMETRY &&
|
||||
!ngg_cull_shader && shader_out->key.ge.opt.ngg_culling)
|
||||
key->vs_prolog.load_vgprs_after_culling = 1;
|
||||
|
||||
if (shader_out->selector->stage == MESA_SHADER_TESS_CTRL) {
|
||||
key->vs_prolog.as_ls = 1;
|
||||
key->vs_prolog.num_merged_next_stage_vgprs = 2;
|
||||
|
|
@ -1647,6 +1614,68 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
|
|||
return false;
|
||||
}
|
||||
|
||||
static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
|
||||
{
|
||||
struct si_shader_selector *sel = shader->selector;
|
||||
const union si_shader_key *key = &shader->key;
|
||||
assert(key->ge.as_ngg);
|
||||
|
||||
ac_nir_lower_ngg_options options = {
|
||||
.family = sel->screen->info.family,
|
||||
.gfx_level = sel->screen->info.gfx_level,
|
||||
.max_workgroup_size = si_get_max_workgroup_size(shader),
|
||||
.wave_size = shader->wave_size,
|
||||
.can_cull = !!key->ge.opt.ngg_culling,
|
||||
.disable_streamout = key->ge.opt.remove_streamout,
|
||||
.vs_output_param_offset = shader->info.vs_output_param_offset,
|
||||
};
|
||||
|
||||
if (nir->info.stage == MESA_SHADER_VERTEX ||
|
||||
nir->info.stage == MESA_SHADER_TESS_EVAL) {
|
||||
/* Per instance inputs, used to remove instance load after culling. */
|
||||
unsigned instance_rate_inputs = 0;
|
||||
|
||||
if (nir->info.stage == MESA_SHADER_VERTEX) {
|
||||
instance_rate_inputs =
|
||||
key->ge.part.vs.prolog.instance_divisor_is_one |
|
||||
key->ge.part.vs.prolog.instance_divisor_is_fetched;
|
||||
|
||||
/* Manually mark the instance ID used, so the shader can repack it. */
|
||||
if (instance_rate_inputs)
|
||||
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
|
||||
} else {
|
||||
/* Manually mark the primitive ID used, so the shader can repack it. */
|
||||
if (key->ge.mono.u.vs_export_prim_id)
|
||||
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
|
||||
}
|
||||
|
||||
unsigned clip_plane_enable =
|
||||
SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
|
||||
unsigned clipdist_mask =
|
||||
(sel->info.clipdist_mask & clip_plane_enable) | sel->info.culldist_mask;
|
||||
|
||||
options.num_vertices_per_primitive = gfx10_ngg_get_vertices_per_prim(shader);
|
||||
options.early_prim_export = gfx10_ngg_export_prim_early(shader);
|
||||
options.passthrough = gfx10_is_ngg_passthrough(shader);
|
||||
options.use_edgeflags = gfx10_edgeflags_have_effect(shader);
|
||||
options.has_gen_prim_query = options.has_xfb_prim_query =
|
||||
sel->screen->use_ngg_streamout && !sel->info.base.vs.blit_sgprs_amd;
|
||||
options.primitive_id_location =
|
||||
key->ge.mono.u.vs_export_prim_id ? sel->info.num_outputs : -1;
|
||||
options.instance_rate_inputs = instance_rate_inputs;
|
||||
options.clipdist_enable_mask = clipdist_mask;
|
||||
options.user_clip_plane_enable_mask = clip_plane_enable;
|
||||
|
||||
NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
|
||||
}
|
||||
|
||||
/* may generate some subgroup op like ballot */
|
||||
NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);
|
||||
|
||||
/* may generate some vector output store */
|
||||
NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
|
||||
}
|
||||
|
||||
struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
|
||||
{
|
||||
struct pipe_screen *screen = &sel->screen->b;
|
||||
|
|
@ -1878,6 +1907,12 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, bool *free_nir,
|
|||
if (is_last_vgt_stage)
|
||||
si_assign_param_offsets(nir, shader);
|
||||
|
||||
/* Only lower last VGT NGG shader stage. */
|
||||
if (sel->stage < MESA_SHADER_GEOMETRY && key->ge.as_ngg && !key->ge.as_es) {
|
||||
si_lower_ngg(shader, nir);
|
||||
opt_offsets = true;
|
||||
}
|
||||
|
||||
if (progress2 || opt_offsets)
|
||||
si_nir_opts(sel->screen, nir, false);
|
||||
|
||||
|
|
@ -2176,13 +2211,12 @@ static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler
|
|||
{
|
||||
struct si_shader_selector *vs = main_part->selector;
|
||||
|
||||
if (!si_vs_needs_prolog(vs, key, &shader->key, false,
|
||||
shader->selector->stage == MESA_SHADER_GEOMETRY))
|
||||
if (!si_vs_needs_prolog(vs, key))
|
||||
return true;
|
||||
|
||||
/* Get the prolog. */
|
||||
union si_shader_part_key prolog_key;
|
||||
si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
|
||||
si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, key, shader,
|
||||
&prolog_key);
|
||||
|
||||
shader->prolog =
|
||||
|
|
|
|||
|
|
@ -613,7 +613,6 @@ union si_shader_part_key {
|
|||
unsigned as_ls : 1;
|
||||
unsigned as_es : 1;
|
||||
unsigned as_ngg : 1;
|
||||
unsigned load_vgprs_after_culling : 1;
|
||||
/* Prologs for monolithic shaders shouldn't set EXEC. */
|
||||
unsigned is_monolithic : 1;
|
||||
} vs_prolog;
|
||||
|
|
@ -1002,6 +1001,8 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
|
|||
struct util_debug_callback *debug);
|
||||
|
||||
/* si_shader_nir.c */
|
||||
extern const nir_lower_subgroups_options si_nir_subgroups_options;
|
||||
|
||||
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
|
||||
void si_nir_late_opts(nir_shader *nir);
|
||||
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
|
||||
|
|
|
|||
|
|
@ -161,13 +161,12 @@ bool si_is_multi_part_shader(struct si_shader *shader);
|
|||
bool si_is_merged_shader(struct si_shader *shader);
|
||||
void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
|
||||
enum ac_arg_type type, struct ac_arg *arg, unsigned idx);
|
||||
void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader);
|
||||
void si_init_shader_args(struct si_shader_context *ctx);
|
||||
unsigned si_get_max_workgroup_size(const struct si_shader *shader);
|
||||
bool si_vs_needs_prolog(const struct si_shader_selector *sel,
|
||||
const struct si_vs_prolog_bits *prolog_key,
|
||||
const union si_shader_key *key, bool ngg_cull_shader, bool is_gs);
|
||||
const struct si_vs_prolog_bits *prolog_key);
|
||||
void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
|
||||
bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
|
||||
const struct si_vs_prolog_bits *prolog_key,
|
||||
struct si_shader *shader_out, union si_shader_part_key *key);
|
||||
struct nir_shader *si_get_nir_shader(struct si_shader *shader, bool *free_nir,
|
||||
uint64_t tcs_vgpr_only_inputs);
|
||||
|
|
@ -180,6 +179,7 @@ void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
|
|||
|
||||
/* gfx10_shader_ngg.c */
|
||||
LLVMValueRef gfx10_get_thread_id_in_tg(struct si_shader_context *ctx);
|
||||
unsigned gfx10_ngg_get_vertices_per_prim(struct si_shader *shader);
|
||||
bool gfx10_ngg_export_prim_early(struct si_shader *shader);
|
||||
void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
|
||||
void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
|
||||
|
|
@ -205,7 +205,7 @@ void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscre
|
|||
struct ac_llvm_compiler *compiler, unsigned wave_size);
|
||||
void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
|
||||
unsigned num_return_elems, unsigned max_workgroup_size);
|
||||
void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader);
|
||||
void si_llvm_create_main_func(struct si_shader_context *ctx);
|
||||
void si_llvm_optimize_module(struct si_shader_context *ctx);
|
||||
void si_llvm_dispose(struct si_shader_context *ctx);
|
||||
LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
|
||||
|
|
@ -228,7 +228,7 @@ void si_build_wrapper_function(struct si_shader_context *ctx, struct ac_llvm_poi
|
|||
enum ac_arg_type *main_arg_types,
|
||||
bool same_thread_count);
|
||||
bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
|
||||
struct nir_shader *nir, bool free_nir, bool ngg_cull_shader);
|
||||
struct nir_shader *nir, bool free_nir);
|
||||
bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
|
||||
struct si_shader *shader, const struct pipe_stream_output_info *so,
|
||||
struct util_debug_callback *debug, struct nir_shader *nir,
|
||||
|
|
@ -278,6 +278,6 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_ex
|
|||
struct si_shader_output_values *outputs, unsigned noutput);
|
||||
void si_llvm_vs_build_end(struct si_shader_context *ctx);
|
||||
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
|
||||
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
|
||||
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -197,21 +197,21 @@ void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTy
|
|||
ac_llvm_set_target_features(ctx->main_fn.value, &ctx->ac);
|
||||
}
|
||||
|
||||
void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader)
|
||||
void si_llvm_create_main_func(struct si_shader_context *ctx)
|
||||
{
|
||||
struct si_shader *shader = ctx->shader;
|
||||
LLVMTypeRef returns[AC_MAX_ARGS];
|
||||
unsigned i;
|
||||
|
||||
si_init_shader_args(ctx, ngg_cull_shader);
|
||||
si_init_shader_args(ctx);
|
||||
|
||||
for (i = 0; i < ctx->args.num_sgprs_returned; i++)
|
||||
returns[i] = ctx->ac.i32; /* SGPR */
|
||||
for (; i < ctx->args.return_count; i++)
|
||||
returns[i] = ctx->ac.f32; /* VGPR */
|
||||
|
||||
si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns,
|
||||
ctx->args.return_count, si_get_max_workgroup_size(shader));
|
||||
si_llvm_create_func(ctx, "main", returns, ctx->args.return_count,
|
||||
si_get_max_workgroup_size(shader));
|
||||
|
||||
/* Reserve register locations for VGPR inputs the PS prolog may need. */
|
||||
if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
|
||||
|
|
@ -954,7 +954,7 @@ static LLVMValueRef si_llvm_load_streamout_buffer(struct ac_shader_abi *abi, uns
|
|||
}
|
||||
|
||||
bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
|
||||
struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
|
||||
struct nir_shader *nir, bool free_nir)
|
||||
{
|
||||
struct si_shader_selector *sel = shader->selector;
|
||||
const struct si_shader_info *info = &sel->info;
|
||||
|
|
@ -975,7 +975,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
|
|||
ctx->abi.atomic_add_prim_count = gfx10_ngg_atomic_add_prim_count;
|
||||
|
||||
si_llvm_init_resource_callbacks(ctx);
|
||||
si_llvm_create_main_func(ctx, ngg_cull_shader);
|
||||
si_llvm_create_main_func(ctx);
|
||||
|
||||
if (ctx->stage <= MESA_SHADER_GEOMETRY &&
|
||||
(ctx->shader->key.ge.as_es || ctx->stage == MESA_SHADER_GEOMETRY))
|
||||
|
|
@ -983,7 +983,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
|
|||
|
||||
switch (ctx->stage) {
|
||||
case MESA_SHADER_VERTEX:
|
||||
si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
|
||||
si_llvm_init_vs_callbacks(ctx);
|
||||
|
||||
/* preload instance_divisor_constbuf to be used for input load after culling */
|
||||
if (ctx->shader->key.ge.opt.ngg_culling &&
|
||||
|
|
@ -1100,57 +1100,50 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
|
|||
* determined during linking / PM4 creation.
|
||||
*/
|
||||
si_llvm_declare_esgs_ring(ctx);
|
||||
ctx->ac.lds.value = ctx->esgs_ring;
|
||||
ctx->ac.lds.pointee_type = ctx->ac.i32;
|
||||
|
||||
/* This is really only needed when streamout and / or vertex
|
||||
* compaction is enabled.
|
||||
*/
|
||||
if (!ctx->gs_ngg_scratch.value && (ctx->so.num_outputs || shader->key.ge.opt.ngg_culling)) {
|
||||
if (si_shader_uses_streamout(shader) || shader->key.ge.opt.ngg_culling) {
|
||||
LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
|
||||
ctx->gs_ngg_scratch = (struct ac_llvm_pointer) {
|
||||
.value = LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS),
|
||||
.pointee_type = asi32
|
||||
};
|
||||
LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(asi32));
|
||||
LLVMSetAlignment(ctx->gs_ngg_scratch.value, 4);
|
||||
LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8);
|
||||
}
|
||||
}
|
||||
|
||||
/* For merged shaders (VS-TCS, VS-GS, TES-GS): */
|
||||
if (ctx->screen->info.gfx_level >= GFX9 && si_is_merged_shader(shader)) {
|
||||
/* TES is special because it has only 1 shader part if NGG shader culling is disabled,
|
||||
* and therefore it doesn't use the wrapper function.
|
||||
/* Set EXEC = ~0 before the first shader. For monolithic shaders, the wrapper
|
||||
* function does this.
|
||||
*/
|
||||
bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.ge.as_es &&
|
||||
!shader->key.ge.opt.ngg_culling;
|
||||
|
||||
/* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there
|
||||
* instead. For monolithic shaders, the wrapper function does this.
|
||||
*/
|
||||
if ((!shader->is_monolithic || no_wrapper_func) &&
|
||||
(ctx->stage == MESA_SHADER_TESS_EVAL ||
|
||||
(ctx->stage == MESA_SHADER_VERTEX &&
|
||||
!si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, ngg_cull_shader,
|
||||
false))))
|
||||
ac_init_exec_full_mask(&ctx->ac);
|
||||
|
||||
/* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease
|
||||
* register usage.
|
||||
*/
|
||||
if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
|
||||
shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling) {
|
||||
/* GFX10 requires a barrier before gs_alloc_req due to a hw bug. */
|
||||
if (ctx->screen->info.gfx_level == GFX10)
|
||||
ac_build_s_barrier(&ctx->ac, ctx->stage);
|
||||
|
||||
gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
|
||||
|
||||
/* Build the primitive export at the beginning
|
||||
* of the shader if possible.
|
||||
*/
|
||||
if (gfx10_ngg_export_prim_early(shader))
|
||||
gfx10_ngg_build_export_prim(ctx, NULL, NULL);
|
||||
if (ctx->stage == MESA_SHADER_TESS_EVAL) {
|
||||
/* TES has only 1 shader part, therefore it doesn't use the wrapper function. */
|
||||
if (!shader->is_monolithic || !shader->key.ge.as_es)
|
||||
ac_init_exec_full_mask(&ctx->ac);
|
||||
} else if (ctx->stage == MESA_SHADER_VERTEX) {
|
||||
/* If the prolog is present, EXEC is set there instead. */
|
||||
if (!si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog)) {
|
||||
/* When no prolog, only mono VS with TCS/GS present has wrapper function. */
|
||||
if (!(shader->is_monolithic && (shader->key.ge.as_ls || shader->key.ge.as_es)))
|
||||
ac_init_exec_full_mask(&ctx->ac);
|
||||
}
|
||||
}
|
||||
|
||||
/* NGG VS and NGG TES: nir ngg lowering send gs_alloc_req at the beginning when culling
|
||||
* is disabled, but GFX10 may hang if not all waves are launched before gs_alloc_req.
|
||||
* We work around this HW bug by inserting a barrier before gs_alloc_req.
|
||||
*/
|
||||
if (ctx->screen->info.gfx_level == GFX10 &&
|
||||
(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
|
||||
shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling)
|
||||
ac_build_s_barrier(&ctx->ac, ctx->stage);
|
||||
|
||||
/* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */
|
||||
if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
|
||||
gfx10_ngg_gs_emit_begin(ctx);
|
||||
|
|
@ -1164,10 +1157,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
|
|||
* not here.
|
||||
*/
|
||||
thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */
|
||||
} else if (((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) ||
|
||||
(shader->key.ge.as_ngg && !shader->key.ge.as_es)) {
|
||||
/* This is NGG VS or NGG TES or VS before GS or TES before GS or VS before TCS.
|
||||
* For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
|
||||
} else if ((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) {
|
||||
/* For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
|
||||
* the if statement is inserted by the wrapper function.
|
||||
*/
|
||||
thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */
|
||||
|
|
@ -1253,11 +1244,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
|
|||
si_llvm_ls_build_end(ctx);
|
||||
else if (shader->key.ge.as_es)
|
||||
si_llvm_es_build_end(ctx);
|
||||
else if (ngg_cull_shader)
|
||||
gfx10_ngg_culling_build_end(ctx);
|
||||
else if (shader->key.ge.as_ngg)
|
||||
gfx10_ngg_build_end(ctx);
|
||||
else
|
||||
else if (!shader->key.ge.as_ngg)
|
||||
si_llvm_vs_build_end(ctx);
|
||||
break;
|
||||
|
||||
|
|
@ -1268,11 +1255,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
|
|||
case MESA_SHADER_TESS_EVAL:
|
||||
if (ctx->shader->key.ge.as_es)
|
||||
si_llvm_es_build_end(ctx);
|
||||
else if (ngg_cull_shader)
|
||||
gfx10_ngg_culling_build_end(ctx);
|
||||
else if (ctx->shader->key.ge.as_ngg)
|
||||
gfx10_ngg_build_end(ctx);
|
||||
else
|
||||
else if (!ctx->shader->key.ge.as_ngg)
|
||||
si_llvm_vs_build_end(ctx);
|
||||
break;
|
||||
|
||||
|
|
@ -1323,84 +1306,30 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
|||
si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
|
||||
ctx.so = *so;
|
||||
|
||||
struct ac_llvm_pointer ngg_cull_main_fn = {};
|
||||
if (sel->stage <= MESA_SHADER_TESS_EVAL && shader->key.ge.opt.ngg_culling) {
|
||||
if (!si_llvm_translate_nir(&ctx, shader, nir, false, true)) {
|
||||
si_llvm_dispose(&ctx);
|
||||
return false;
|
||||
}
|
||||
ngg_cull_main_fn = ctx.main_fn;
|
||||
ctx.main_fn.value = NULL;
|
||||
}
|
||||
|
||||
if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir, false)) {
|
||||
if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir)) {
|
||||
si_llvm_dispose(&ctx);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (shader->is_monolithic && sel->stage == MESA_SHADER_VERTEX) {
|
||||
struct ac_llvm_pointer parts[4];
|
||||
unsigned num_parts = 0;
|
||||
bool first_is_prolog = false;
|
||||
struct ac_llvm_pointer main_fn = ctx.main_fn;
|
||||
if (shader->is_monolithic && sel->stage == MESA_SHADER_VERTEX &&
|
||||
si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog)) {
|
||||
struct ac_llvm_pointer parts[2];
|
||||
parts[1] = ctx.main_fn;
|
||||
|
||||
/* Preserve main arguments. */
|
||||
/* Preserve main arguments. */
|
||||
enum ac_arg_type main_arg_types[AC_MAX_ARGS];
|
||||
for (int i = 0; i < ctx.args.arg_count; i++)
|
||||
main_arg_types[i] = ctx.args.args[i].type;
|
||||
main_arg_types[MIN2(AC_MAX_ARGS - 1, ctx.args.arg_count)] = AC_ARG_INVALID;
|
||||
|
||||
if (ngg_cull_main_fn.value) {
|
||||
if (si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, true, false)) {
|
||||
union si_shader_part_key prolog_key;
|
||||
si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
|
||||
&shader->key.ge.part.vs.prolog, shader, &prolog_key);
|
||||
prolog_key.vs_prolog.is_monolithic = true;
|
||||
si_llvm_build_vs_prolog(&ctx, &prolog_key);
|
||||
parts[num_parts++] = ctx.main_fn;
|
||||
first_is_prolog = true;
|
||||
}
|
||||
parts[num_parts++] = ngg_cull_main_fn;
|
||||
}
|
||||
|
||||
if (si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, false, false)) {
|
||||
union si_shader_part_key prolog_key;
|
||||
si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
|
||||
&shader->key.ge.part.vs.prolog, shader, &prolog_key);
|
||||
prolog_key.vs_prolog.is_monolithic = true;
|
||||
si_llvm_build_vs_prolog(&ctx, &prolog_key);
|
||||
parts[num_parts++] = ctx.main_fn;
|
||||
if (num_parts == 1)
|
||||
first_is_prolog = true;
|
||||
}
|
||||
parts[num_parts++] = main_fn;
|
||||
|
||||
si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, main_arg_types, false);
|
||||
} else if (shader->is_monolithic && sel->stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn.value) {
|
||||
struct ac_llvm_pointer parts[3], prolog, main_fn = ctx.main_fn;
|
||||
|
||||
/* Preserve main arguments. */
|
||||
enum ac_arg_type main_arg_types[AC_MAX_ARGS];
|
||||
for (int i = 0; i < ctx.args.arg_count; i++)
|
||||
main_arg_types[i] = ctx.args.args[i].type;
|
||||
main_arg_types[MIN2(AC_MAX_ARGS - 1, ctx.args.arg_count)] = AC_ARG_INVALID;
|
||||
|
||||
/* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. */
|
||||
union si_shader_part_key prolog_key;
|
||||
memset(&prolog_key, 0, sizeof(prolog_key));
|
||||
prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
|
||||
prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5;
|
||||
prolog_key.vs_prolog.as_ngg = 1;
|
||||
prolog_key.vs_prolog.load_vgprs_after_culling = 1;
|
||||
si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs,
|
||||
&shader->key.ge.part.vs.prolog, shader, &prolog_key);
|
||||
prolog_key.vs_prolog.is_monolithic = true;
|
||||
si_llvm_build_vs_prolog(&ctx, &prolog_key);
|
||||
prolog = ctx.main_fn;
|
||||
parts[0] = ctx.main_fn;
|
||||
|
||||
parts[0] = ngg_cull_main_fn;
|
||||
parts[1] = prolog;
|
||||
parts[2] = main_fn;
|
||||
|
||||
si_build_wrapper_function(&ctx, parts, 3, 0, 0, main_arg_types, false);
|
||||
si_build_wrapper_function(&ctx, parts, 2, 1, 0, main_arg_types, false);
|
||||
} else if (shader->is_monolithic && sel->stage == MESA_SHADER_TESS_CTRL) {
|
||||
/* Preserve main arguments. */
|
||||
enum ac_arg_type main_arg_types[AC_MAX_ARGS];
|
||||
|
|
@ -1409,7 +1338,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
|||
struct si_shader_selector *ls = shader->key.ge.part.tcs.ls;
|
||||
struct ac_llvm_pointer parts[4];
|
||||
bool vs_needs_prolog =
|
||||
si_vs_needs_prolog(ls, &shader->key.ge.part.tcs.ls_prolog, &shader->key, false, false);
|
||||
si_vs_needs_prolog(ls, &shader->key.ge.part.tcs.ls_prolog);
|
||||
|
||||
/* TCS main part */
|
||||
parts[2] = ctx.main_fn;
|
||||
|
|
@ -1432,7 +1361,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
|||
nir = si_get_nir_shader(&shader_ls, &free_nir, sel->info.tcs_vgpr_only_inputs);
|
||||
si_update_shader_binary_info(shader, nir);
|
||||
|
||||
if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir, false)) {
|
||||
if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir)) {
|
||||
si_llvm_dispose(&ctx);
|
||||
return false;
|
||||
}
|
||||
|
|
@ -1446,7 +1375,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
|||
/* LS prolog */
|
||||
if (vs_needs_prolog) {
|
||||
union si_shader_part_key vs_prolog_key;
|
||||
si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
|
||||
si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs,
|
||||
&shader->key.ge.part.tcs.ls_prolog, shader, &vs_prolog_key);
|
||||
vs_prolog_key.vs_prolog.is_monolithic = true;
|
||||
si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
|
||||
|
|
@ -1503,7 +1432,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
|||
nir = si_get_nir_shader(&shader_es, &free_nir, 0);
|
||||
si_update_shader_binary_info(shader, nir);
|
||||
|
||||
if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir, false)) {
|
||||
if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir)) {
|
||||
si_llvm_dispose(&ctx);
|
||||
return false;
|
||||
}
|
||||
|
|
@ -1517,9 +1446,9 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
|||
|
||||
/* ES prolog */
|
||||
if (es->stage == MESA_SHADER_VERTEX &&
|
||||
si_vs_needs_prolog(es, &shader->key.ge.part.gs.vs_prolog, &shader->key, false, true)) {
|
||||
si_vs_needs_prolog(es, &shader->key.ge.part.gs.vs_prolog)) {
|
||||
union si_shader_part_key vs_prolog_key;
|
||||
si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
|
||||
si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs,
|
||||
&shader->key.ge.part.gs.vs_prolog, shader, &vs_prolog_key);
|
||||
vs_prolog_key.vs_prolog.is_monolithic = true;
|
||||
si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
|
||||
|
|
|
|||
|
|
@ -462,7 +462,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
|
|||
builder = ctx.ac.builder;
|
||||
|
||||
/* Build the main function. */
|
||||
si_llvm_create_main_func(&ctx, false);
|
||||
si_llvm_create_main_func(&ctx);
|
||||
|
||||
ctx.gsvs_ring[0] =
|
||||
ac_build_load_to_sgpr(&ctx.ac,
|
||||
|
|
|
|||
|
|
@ -698,7 +698,9 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_ex
|
|||
ac_build_export(&ctx->ac, &pos_args[i]);
|
||||
}
|
||||
|
||||
if (!shader->info.nr_param_exports)
|
||||
if (!shader->info.nr_param_exports ||
|
||||
/* GFX11 VS/TES param export is handled in nir */
|
||||
(ctx->screen->info.gfx_level >= GFX11 && ctx->stage != MESA_SHADER_GEOMETRY))
|
||||
return;
|
||||
|
||||
/* Build parameter exports. Use 2 loops to export params in ascending order.
|
||||
|
|
@ -895,18 +897,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
|
|||
}
|
||||
}
|
||||
|
||||
/* The culling code stored the LDS addresses of the VGPRs into those VGPRs. Load them. */
|
||||
if (key->vs_prolog.load_vgprs_after_culling) {
|
||||
for (i = 5; i <= 8; i++) {
|
||||
bool is_tes_rel_patch_id = i == 7;
|
||||
LLVMTypeRef t = is_tes_rel_patch_id ? ctx->ac.i8 : ctx->ac.i32;
|
||||
input_vgprs[i] = LLVMBuildIntToPtr(ctx->ac.builder, input_vgprs[i], LLVMPointerType(t, AC_ADDR_SPACE_LDS), "");
|
||||
input_vgprs[i] = LLVMBuildLoad2(ctx->ac.builder, t, input_vgprs[i], "");
|
||||
if (is_tes_rel_patch_id)
|
||||
input_vgprs[i] = LLVMBuildZExt(ctx->ac.builder, input_vgprs[i], ctx->ac.i32, "");
|
||||
}
|
||||
}
|
||||
|
||||
unsigned vertex_id_vgpr = first_vs_vgpr;
|
||||
unsigned instance_id_vgpr = ctx->screen->info.gfx_level >= GFX10
|
||||
? first_vs_vgpr + 3
|
||||
|
|
@ -960,7 +950,7 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
|
|||
si_llvm_build_ret(ctx, ret);
|
||||
}
|
||||
|
||||
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
|
||||
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx)
|
||||
{
|
||||
ctx->abi.load_inputs = si_load_vs_input;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -240,6 +240,16 @@ static bool si_lower_intrinsics(nir_shader *nir)
|
|||
NULL);
|
||||
}
|
||||
|
||||
const nir_lower_subgroups_options si_nir_subgroups_options = {
|
||||
.subgroup_size = 64,
|
||||
.ballot_bit_size = 64,
|
||||
.ballot_components = 1,
|
||||
.lower_to_scalar = true,
|
||||
.lower_subgroup_masks = true,
|
||||
.lower_vote_trivial = false,
|
||||
.lower_vote_eq = true,
|
||||
};
|
||||
|
||||
/**
|
||||
* Perform "lowering" operations on the NIR that are run once when the shader
|
||||
* selector is created.
|
||||
|
|
@ -269,16 +279,7 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
|
|||
|
||||
NIR_PASS_V(nir, si_lower_intrinsics);
|
||||
|
||||
const nir_lower_subgroups_options subgroups_options = {
|
||||
.subgroup_size = 64,
|
||||
.ballot_bit_size = 64,
|
||||
.ballot_components = 1,
|
||||
.lower_to_scalar = true,
|
||||
.lower_subgroup_masks = true,
|
||||
.lower_vote_trivial = false,
|
||||
.lower_vote_eq = true,
|
||||
};
|
||||
NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);
|
||||
NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_discard_or_demote,
|
||||
(sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) ||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue