mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-30 18:40:13 +01:00
radeonsi/gfx10: enable GS fast launch for triangles and strips with NGG culling
Only non-indexed triangle lists and triangle strips are supported. This increases performance when there are primitives to cull. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
This commit is contained in:
parent
c377f45c18
commit
735a3ba007
6 changed files with 221 additions and 53 deletions
|
|
@ -667,6 +667,20 @@ static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx,
|
|||
return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
|
||||
}
|
||||
|
||||
/*
 * Copy the four elements of the shader argument "param" into four
 * consecutive slots of the aggregate value "ret".
 *
 * ctx:          shader context; supplies the LLVM builder (ctx->ac.builder)
 * ret:          aggregate value being built up
 * param:        argument whose value is fetched with ac_get_arg()
 * return_index: slot of "ret" that receives element 0; elements 1..3 go
 *               into the following three slots
 *
 * Returns the aggregate value with the four slots filled in.
 *
 * NOTE(review): the name implies "param" is a <4 x i32> value - confirm
 * against the callers.
 */
static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx,
|
||||
LLVMValueRef ret, struct ac_arg param,
|
||||
unsigned return_index)
|
||||
{
|
||||
LLVMValueRef v = ac_get_arg(&ctx->ac, param);
|
||||
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
|
||||
ac_llvm_extract_elem(&ctx->ac, v, i),
|
||||
return_index + i, "");
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void load_bitmasks_2x64(struct si_shader_context *ctx,
|
||||
LLVMValueRef lds_ptr, unsigned dw_offset,
|
||||
LLVMValueRef mask[2], LLVMValueRef *total_bitcount)
|
||||
|
|
@ -874,10 +888,18 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
|
|||
* - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
|
||||
*/
|
||||
|
||||
LLVMValueRef vtxindex[] = {
|
||||
si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16),
|
||||
si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16),
|
||||
si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16),
|
||||
LLVMValueRef vtxindex[3];
|
||||
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
|
||||
/* For the GS fast launch, the VS prolog simply puts the Vertex IDs
|
||||
* into these VGPRs.
|
||||
*/
|
||||
vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
|
||||
vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
|
||||
vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
|
||||
} else {
|
||||
vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
|
||||
vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
|
||||
vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
|
||||
};
|
||||
LLVMValueRef gs_vtxptr[] = {
|
||||
ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
|
||||
|
|
@ -1143,6 +1165,11 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
|
|||
8 + SI_SGPR_DRAWID);
|
||||
ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers,
|
||||
8 + SI_VS_NUM_USER_SGPR);
|
||||
|
||||
for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
|
||||
ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
|
||||
8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
|
||||
}
|
||||
} else {
|
||||
assert(ctx->type == PIPE_SHADER_TESS_EVAL);
|
||||
ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout,
|
||||
|
|
@ -1152,10 +1179,16 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
|
|||
}
|
||||
|
||||
unsigned vgpr;
|
||||
if (ctx->type == PIPE_SHADER_VERTEX)
|
||||
vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
|
||||
else
|
||||
if (ctx->type == PIPE_SHADER_VERTEX) {
|
||||
if (shader->selector->num_vbos_in_user_sgprs) {
|
||||
vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
|
||||
shader->selector->num_vbos_in_user_sgprs * 4;
|
||||
} else {
|
||||
vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
|
||||
}
|
||||
} else {
|
||||
vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
|
||||
}
|
||||
|
||||
val = LLVMBuildLoad(builder, new_vgpr0, "");
|
||||
ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
|
||||
|
|
@ -1986,8 +2019,16 @@ void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
|
|||
|
||||
/* All these are per subgroup: */
|
||||
bool max_vert_out_per_gs_instance = false;
|
||||
unsigned max_esverts_base = 128;
|
||||
unsigned max_gsprims_base = 128; /* default prim group size clamp */
|
||||
unsigned max_esverts_base = 128;
|
||||
|
||||
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
|
||||
max_gsprims_base = 128 / 3;
|
||||
max_esverts_base = max_gsprims_base * 3;
|
||||
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
|
||||
max_gsprims_base = 126;
|
||||
max_esverts_base = 128;
|
||||
}
|
||||
|
||||
/* Hardware has the following non-natural restrictions on the value
|
||||
* of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
|
||||
|
|
|
|||
|
|
@ -802,7 +802,7 @@ union si_vgt_param_key {
|
|||
uint32_t index;
|
||||
};
|
||||
|
||||
#define SI_NUM_VGT_STAGES_KEY_BITS 5
|
||||
#define SI_NUM_VGT_STAGES_KEY_BITS 6
|
||||
#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
|
||||
|
||||
/* The VGT_SHADER_STAGES key used to index the table of precomputed values.
|
||||
|
|
@ -813,6 +813,7 @@ union si_vgt_stages_key {
|
|||
#if UTIL_ARCH_LITTLE_ENDIAN
|
||||
unsigned tess:1;
|
||||
unsigned gs:1;
|
||||
unsigned ngg_gs_fast_launch:1;
|
||||
unsigned ngg_passthrough:1;
|
||||
unsigned ngg:1; /* gfx10+ */
|
||||
unsigned streamout:1; /* only used with NGG */
|
||||
|
|
@ -822,6 +823,7 @@ union si_vgt_stages_key {
|
|||
unsigned streamout:1;
|
||||
unsigned ngg:1;
|
||||
unsigned ngg_passthrough:1;
|
||||
unsigned ngg_gs_fast_launch:1;
|
||||
unsigned gs:1;
|
||||
unsigned tess:1;
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1474,11 +1474,20 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
|
|||
ctx->type == PIPE_SHADER_TESS_EVAL)) {
|
||||
unsigned num_user_sgprs, num_vgprs;
|
||||
|
||||
/* For the NGG cull shader, add 1 SGPR to hold the vertex buffer pointer. */
|
||||
if (ctx->type == PIPE_SHADER_VERTEX)
|
||||
if (ctx->type == PIPE_SHADER_VERTEX) {
|
||||
/* For the NGG cull shader, add 1 SGPR to hold
|
||||
* the vertex buffer pointer.
|
||||
*/
|
||||
num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
|
||||
else
|
||||
|
||||
if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
|
||||
assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
|
||||
num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
|
||||
shader->selector->num_vbos_in_user_sgprs * 4;
|
||||
}
|
||||
} else {
|
||||
num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
|
||||
}
|
||||
|
||||
/* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
|
||||
*
|
||||
|
|
@ -2278,13 +2287,16 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
|
|||
}
|
||||
|
||||
static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
|
||||
const struct si_vs_prolog_bits *key)
|
||||
const struct si_vs_prolog_bits *prolog_key,
|
||||
const struct si_shader_key *key,
|
||||
bool ngg_cull_shader)
|
||||
{
|
||||
/* VGPR initialization fixup for Vega10 and Raven is always done in the
|
||||
* VS prolog. */
|
||||
return sel->vs_needs_prolog ||
|
||||
key->ls_vgpr_fix ||
|
||||
key->unpack_instance_id_from_vertex_id;
|
||||
prolog_key->ls_vgpr_fix ||
|
||||
prolog_key->unpack_instance_id_from_vertex_id ||
|
||||
(ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
|
||||
}
|
||||
|
||||
static bool si_build_main_function(struct si_shader_context *ctx,
|
||||
|
|
@ -2436,7 +2448,8 @@ static bool si_build_main_function(struct si_shader_context *ctx,
|
|||
(shader->key.as_es || shader->key.as_ls) &&
|
||||
(ctx->type == PIPE_SHADER_TESS_EVAL ||
|
||||
(ctx->type == PIPE_SHADER_VERTEX &&
|
||||
!si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
|
||||
!si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
|
||||
&shader->key, ngg_cull_shader)))) {
|
||||
si_init_exec_from_input(ctx,
|
||||
ctx->merged_wave_info, 0);
|
||||
} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
|
||||
|
|
@ -2551,8 +2564,14 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info,
|
|||
key->vs_prolog.as_es = shader_out->key.as_es;
|
||||
key->vs_prolog.as_ngg = shader_out->key.as_ngg;
|
||||
|
||||
if (!ngg_cull_shader)
|
||||
if (ngg_cull_shader) {
|
||||
key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
|
||||
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
|
||||
key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
|
||||
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
|
||||
} else {
|
||||
key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
|
||||
}
|
||||
|
||||
if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
|
||||
key->vs_prolog.as_ls = 1;
|
||||
|
|
@ -2937,11 +2956,12 @@ int si_compile_shader(struct si_screen *sscreen,
|
|||
if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
|
||||
LLVMValueRef parts[4];
|
||||
unsigned num_parts = 0;
|
||||
bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);
|
||||
bool has_prolog = false;
|
||||
LLVMValueRef main_fn = ctx.main_fn;
|
||||
|
||||
if (ngg_cull_main_fn) {
|
||||
if (need_prolog) {
|
||||
if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
|
||||
&shader->key, true)) {
|
||||
union si_shader_part_key prolog_key;
|
||||
si_get_vs_prolog_key(&sel->info,
|
||||
shader->info.num_input_sgprs,
|
||||
|
|
@ -2951,11 +2971,13 @@ int si_compile_shader(struct si_screen *sscreen,
|
|||
prolog_key.vs_prolog.is_monolithic = true;
|
||||
si_build_vs_prolog_function(&ctx, &prolog_key);
|
||||
parts[num_parts++] = ctx.main_fn;
|
||||
has_prolog = true;
|
||||
}
|
||||
parts[num_parts++] = ngg_cull_main_fn;
|
||||
}
|
||||
|
||||
if (need_prolog) {
|
||||
if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
|
||||
&shader->key, false)) {
|
||||
union si_shader_part_key prolog_key;
|
||||
si_get_vs_prolog_key(&sel->info,
|
||||
shader->info.num_input_sgprs,
|
||||
|
|
@ -2965,11 +2987,12 @@ int si_compile_shader(struct si_screen *sscreen,
|
|||
prolog_key.vs_prolog.is_monolithic = true;
|
||||
si_build_vs_prolog_function(&ctx, &prolog_key);
|
||||
parts[num_parts++] = ctx.main_fn;
|
||||
has_prolog = true;
|
||||
}
|
||||
parts[num_parts++] = main_fn;
|
||||
|
||||
si_build_wrapper_function(&ctx, parts, num_parts,
|
||||
need_prolog ? 1 : 0, 0);
|
||||
has_prolog ? 1 : 0, 0);
|
||||
|
||||
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
|
||||
si_build_prim_discard_compute_shader(&ctx);
|
||||
|
|
@ -2986,7 +3009,8 @@ int si_compile_shader(struct si_screen *sscreen,
|
|||
struct si_shader_selector *ls = shader->key.part.tcs.ls;
|
||||
LLVMValueRef parts[4];
|
||||
bool vs_needs_prolog =
|
||||
si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
|
||||
si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
|
||||
&shader->key, false);
|
||||
|
||||
/* TCS main part */
|
||||
parts[2] = ctx.main_fn;
|
||||
|
|
@ -3086,7 +3110,8 @@ int si_compile_shader(struct si_screen *sscreen,
|
|||
|
||||
/* ES prolog */
|
||||
if (es->type == PIPE_SHADER_VERTEX &&
|
||||
si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) {
|
||||
si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
|
||||
&shader->key, false)) {
|
||||
union si_shader_part_key vs_prolog_key;
|
||||
si_get_vs_prolog_key(&es->info,
|
||||
shader_es.info.num_input_sgprs,
|
||||
|
|
@ -3391,6 +3416,72 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
|
|||
}
|
||||
}
|
||||
|
||||
if (key->vs_prolog.gs_fast_launch_tri_list ||
|
||||
key->vs_prolog.gs_fast_launch_tri_strip) {
|
||||
LLVMValueRef wave_id, thread_id_in_tg;
|
||||
|
||||
wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
|
||||
thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
|
||||
LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
|
||||
ac_get_thread_id(&ctx->ac));
|
||||
|
||||
/* The GS fast launch initializes all VGPRs to the value of
|
||||
* the first thread, so we have to add the thread ID.
|
||||
*
|
||||
* Only these are initialized by the hw:
|
||||
* VGPR2: Base Primitive ID
|
||||
* VGPR5: Base Vertex ID
|
||||
* VGPR6: Instance ID
|
||||
*/
|
||||
|
||||
/* Put the vertex thread IDs into VGPRs as-is instead of packing them.
|
||||
* The NGG cull shader will read them from there.
|
||||
*/
|
||||
if (key->vs_prolog.gs_fast_launch_tri_list) {
|
||||
input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
|
||||
LLVMConstInt(ctx->i32, 3, 0), /* Vertex 0 */
|
||||
LLVMConstInt(ctx->i32, 0, 0));
|
||||
input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
|
||||
LLVMConstInt(ctx->i32, 3, 0), /* Vertex 1 */
|
||||
LLVMConstInt(ctx->i32, 1, 0));
|
||||
input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
|
||||
LLVMConstInt(ctx->i32, 3, 0), /* Vertex 2 */
|
||||
LLVMConstInt(ctx->i32, 2, 0));
|
||||
} else {
|
||||
assert(key->vs_prolog.gs_fast_launch_tri_strip);
|
||||
LLVMBuilderRef builder = ctx->ac.builder;
|
||||
/* Triangle indices: */
|
||||
LLVMValueRef index[3] = {
|
||||
thread_id_in_tg,
|
||||
LLVMBuildAdd(builder, thread_id_in_tg,
|
||||
LLVMConstInt(ctx->i32, 1, 0), ""),
|
||||
LLVMBuildAdd(builder, thread_id_in_tg,
|
||||
LLVMConstInt(ctx->i32, 2, 0), ""),
|
||||
};
|
||||
LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
|
||||
thread_id_in_tg, ctx->i1, "");
|
||||
LLVMValueRef flatshade_first =
|
||||
LLVMBuildICmp(builder, LLVMIntEQ,
|
||||
si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
|
||||
ctx->i32_0, "");
|
||||
|
||||
ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
|
||||
flatshade_first, index);
|
||||
input_vgprs[0] = index[0];
|
||||
input_vgprs[1] = index[1];
|
||||
input_vgprs[4] = index[2];
|
||||
}
|
||||
|
||||
/* Triangles always have all edge flags set initially. */
|
||||
input_vgprs[3] = LLVMConstInt(ctx->i32, 0x7 << 8, 0);
|
||||
|
||||
input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
|
||||
thread_id_in_tg, ""); /* PrimID */
|
||||
input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
|
||||
thread_id_in_tg, ""); /* VertexID */
|
||||
input_vgprs[8] = input_vgprs[6]; /* InstanceID */
|
||||
}
|
||||
|
||||
unsigned vertex_id_vgpr = first_vs_vgpr;
|
||||
unsigned instance_id_vgpr =
|
||||
ctx->screen->info.chip_class >= GFX10 ?
|
||||
|
|
@ -3498,7 +3589,7 @@ static bool si_get_vs_prolog(struct si_screen *sscreen,
|
|||
{
|
||||
struct si_shader_selector *vs = main_part->selector;
|
||||
|
||||
if (!si_vs_needs_prolog(vs, key))
|
||||
if (!si_vs_needs_prolog(vs, key, &shader->key, false))
|
||||
return true;
|
||||
|
||||
/* Get the prolog. */
|
||||
|
|
|
|||
|
|
@ -273,9 +273,12 @@ enum {
|
|||
SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
|
||||
};
|
||||
|
||||
#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */
|
||||
#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
|
||||
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
|
||||
#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */
|
||||
#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
|
||||
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
|
||||
#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */
|
||||
#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */
|
||||
#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */
|
||||
|
||||
/**
|
||||
* For VS shader keys, describe any fixups required for vertex fetch.
|
||||
|
|
@ -564,6 +567,8 @@ union si_shader_part_key {
|
|||
unsigned as_es:1;
|
||||
unsigned as_ngg:1;
|
||||
unsigned has_ngg_cull_inputs:1; /* from the NGG cull shader */
|
||||
unsigned gs_fast_launch_tri_list:1; /* for NGG culling */
|
||||
unsigned gs_fast_launch_tri_strip:1; /* for NGG culling */
|
||||
/* Prologs for monolithic shaders shouldn't set EXEC. */
|
||||
unsigned is_monolithic:1;
|
||||
} vs_prolog;
|
||||
|
|
@ -655,7 +660,7 @@ struct si_shader_key {
|
|||
unsigned clip_disable:1;
|
||||
|
||||
/* For NGG VS and TES. */
|
||||
unsigned ngg_culling:3; /* SI_NGG_CULL_* */
|
||||
unsigned ngg_culling:5; /* SI_NGG_CULL_* */
|
||||
|
||||
/* For shaders where monolithic variants have better code.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -2042,12 +2042,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
|
|||
if (sctx->ngg &&
|
||||
rast_prim == PIPE_PRIM_TRIANGLES &&
|
||||
(sctx->screen->always_use_ngg_culling ||
|
||||
/* At least 1500 non-indexed triangles (4500 vertices) are needed
|
||||
* per draw call (no TES/GS) to enable NGG culling. Triangle strips
|
||||
* don't need this, because they have good reuse and therefore
|
||||
* perform the same as indexed triangles.
|
||||
/* At least 1024 non-indexed vertices (8 subgroups) are needed
|
||||
* per draw call (no TES/GS) to enable NGG culling.
|
||||
*/
|
||||
(!index_size && prim == PIPE_PRIM_TRIANGLES && direct_count > 4500 &&
|
||||
(!index_size && direct_count >= 1024 &&
|
||||
(prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
|
||||
!sctx->tes_shader.cso && !sctx->gs_shader.cso)) &&
|
||||
si_get_vs(sctx)->cso->ngg_culling_allowed) {
|
||||
unsigned ngg_culling = 0;
|
||||
|
|
@ -2068,6 +2067,18 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
|
|||
if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
|
||||
ngg_culling |= SI_NGG_CULL_BACK_FACE;
|
||||
}
|
||||
|
||||
/* Use NGG fast launch for certain non-indexed primitive types.
|
||||
* A draw must have at least 1 full primitive.
|
||||
*/
|
||||
if (ngg_culling && !index_size && direct_count >= 3 &&
|
||||
!sctx->tes_shader.cso && !sctx->gs_shader.cso) {
|
||||
if (prim == PIPE_PRIM_TRIANGLES)
|
||||
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
|
||||
else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
|
||||
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
|
||||
}
|
||||
|
||||
if (ngg_culling != sctx->ngg_culling) {
|
||||
sctx->ngg_culling = ngg_culling;
|
||||
sctx->do_update_shaders = true;
|
||||
|
|
|
|||
|
|
@ -1234,6 +1234,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
|||
late_alloc_wave64 = 0;
|
||||
else if (num_cu_per_sh <= 6)
|
||||
late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
|
||||
else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
|
||||
late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
|
||||
else
|
||||
late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
|
||||
|
||||
|
|
@ -1316,26 +1318,36 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
|||
shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(1) |
|
||||
S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
|
||||
|
||||
shader->ge_cntl =
|
||||
S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
|
||||
S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
|
||||
S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
|
||||
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
|
||||
shader->ge_cntl =
|
||||
S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
|
||||
S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
|
||||
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
|
||||
shader->ge_cntl =
|
||||
S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
|
||||
S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
|
||||
} else {
|
||||
shader->ge_cntl =
|
||||
S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
|
||||
S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
|
||||
S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
|
||||
|
||||
/* Bug workaround for a possible hang with non-tessellation cases.
|
||||
* Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
|
||||
*
|
||||
* Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
|
||||
*/
|
||||
if ((sscreen->info.family == CHIP_NAVI10 ||
|
||||
sscreen->info.family == CHIP_NAVI12 ||
|
||||
sscreen->info.family == CHIP_NAVI14) &&
|
||||
(es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
|
||||
shader->ngg.hw_max_esverts != 256) {
|
||||
shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
|
||||
/* Bug workaround for a possible hang with non-tessellation cases.
|
||||
* Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
|
||||
*
|
||||
* Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
|
||||
*/
|
||||
if ((sscreen->info.family == CHIP_NAVI10 ||
|
||||
sscreen->info.family == CHIP_NAVI12 ||
|
||||
sscreen->info.family == CHIP_NAVI14) &&
|
||||
(es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
|
||||
shader->ngg.hw_max_esverts != 256) {
|
||||
shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
|
||||
|
||||
if (shader->ngg.hw_max_esverts > 5) {
|
||||
shader->ge_cntl |=
|
||||
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
|
||||
if (shader->ngg.hw_max_esverts > 5) {
|
||||
shader->ge_cntl |=
|
||||
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -3954,6 +3966,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
|
|||
|
||||
if (key.u.ngg) {
|
||||
stages |= S_028B54_PRIMGEN_EN(1) |
|
||||
S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
|
||||
S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
|
||||
S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
|
||||
} else if (key.u.gs)
|
||||
|
|
@ -4109,8 +4122,13 @@ bool si_update_shaders(struct si_context *sctx)
|
|||
}
|
||||
|
||||
/* This must be done after the shader variant is selected. */
|
||||
if (sctx->ngg)
|
||||
key.u.ngg_passthrough = gfx10_is_ngg_passthrough(si_get_vs(sctx)->current);
|
||||
if (sctx->ngg) {
|
||||
struct si_shader *vs = si_get_vs(sctx)->current;
|
||||
|
||||
key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
|
||||
key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling &
|
||||
SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
|
||||
}
|
||||
|
||||
si_update_vgt_shader_config(sctx, key);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue