radeonsi/gfx10: enable GS fast launch for triangles and strips with NGG culling

Only non-indexed triangle lists and strips are supported. This increases
performance if there is something to cull.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
This commit is contained in:
Marek Olšák 2020-01-08 20:21:04 -05:00
parent c377f45c18
commit 735a3ba007
6 changed files with 221 additions and 53 deletions

View file

@ -667,6 +667,20 @@ static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx,
return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
}
/**
 * Copy the first four elements of a shader argument into consecutive slots
 * of an aggregate return value.
 *
 * \param ctx           shader context providing the LLVM builder
 * \param ret           aggregate value being built up
 * \param param         argument whose elements 0..3 are extracted
 * \param return_index  slot in \p ret that receives element 0; elements
 *                      1..3 go into the three slots that follow
 * \return the aggregate with the four elements inserted
 */
static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx,
					  LLVMValueRef ret, struct ac_arg param,
					  unsigned return_index)
{
	LLVMValueRef vec = ac_get_arg(&ctx->ac, param);
	unsigned chan = 0;

	while (chan < 4) {
		/* Extract first, then insert, one element at a time. */
		LLVMValueRef elem = ac_llvm_extract_elem(&ctx->ac, vec, chan);

		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, elem,
					   return_index + chan, "");
		chan++;
	}

	return ret;
}
static void load_bitmasks_2x64(struct si_shader_context *ctx,
LLVMValueRef lds_ptr, unsigned dw_offset,
LLVMValueRef mask[2], LLVMValueRef *total_bitcount)
@ -874,10 +888,18 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
* - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
*/
LLVMValueRef vtxindex[] = {
si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16),
si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16),
si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16),
LLVMValueRef vtxindex[3];
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
/* For the GS fast launch, the VS prolog simply puts the Vertex IDs
* into these VGPRs.
*/
vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
} else {
vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
};
LLVMValueRef gs_vtxptr[] = {
ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
@ -1143,6 +1165,11 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
8 + SI_SGPR_DRAWID);
ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers,
8 + SI_VS_NUM_USER_SGPR);
for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
}
} else {
assert(ctx->type == PIPE_SHADER_TESS_EVAL);
ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout,
@ -1152,10 +1179,16 @@ void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
}
unsigned vgpr;
if (ctx->type == PIPE_SHADER_VERTEX)
vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
else
if (ctx->type == PIPE_SHADER_VERTEX) {
if (shader->selector->num_vbos_in_user_sgprs) {
vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
shader->selector->num_vbos_in_user_sgprs * 4;
} else {
vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
}
} else {
vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
}
val = LLVMBuildLoad(builder, new_vgpr0, "");
ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
@ -1986,8 +2019,16 @@ void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
/* All these are per subgroup: */
bool max_vert_out_per_gs_instance = false;
unsigned max_esverts_base = 128;
unsigned max_gsprims_base = 128; /* default prim group size clamp */
unsigned max_esverts_base = 128;
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
max_gsprims_base = 128 / 3;
max_esverts_base = max_gsprims_base * 3;
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
max_gsprims_base = 126;
max_esverts_base = 128;
}
/* Hardware has the following non-natural restrictions on the value
* of GE_CNTL.VERT_GRP_SIZE based on the primitive type of

View file

@ -802,7 +802,7 @@ union si_vgt_param_key {
uint32_t index;
};
#define SI_NUM_VGT_STAGES_KEY_BITS 5
#define SI_NUM_VGT_STAGES_KEY_BITS 6
#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
/* The VGT_SHADER_STAGES key used to index the table of precomputed values.
@ -813,6 +813,7 @@ union si_vgt_stages_key {
#if UTIL_ARCH_LITTLE_ENDIAN
unsigned tess:1;
unsigned gs:1;
unsigned ngg_gs_fast_launch:1;
unsigned ngg_passthrough:1;
unsigned ngg:1; /* gfx10+ */
unsigned streamout:1; /* only used with NGG */
@ -822,6 +823,7 @@ union si_vgt_stages_key {
unsigned streamout:1;
unsigned ngg:1;
unsigned ngg_passthrough:1;
unsigned ngg_gs_fast_launch:1;
unsigned gs:1;
unsigned tess:1;
#endif

View file

@ -1474,11 +1474,20 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
ctx->type == PIPE_SHADER_TESS_EVAL)) {
unsigned num_user_sgprs, num_vgprs;
/* For the NGG cull shader, add 1 SGPR to hold the vertex buffer pointer. */
if (ctx->type == PIPE_SHADER_VERTEX)
if (ctx->type == PIPE_SHADER_VERTEX) {
/* For the NGG cull shader, add 1 SGPR to hold
* the vertex buffer pointer.
*/
num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
else
if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
shader->selector->num_vbos_in_user_sgprs * 4;
}
} else {
num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
}
/* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
*
@ -2278,13 +2287,16 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
}
static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
const struct si_vs_prolog_bits *key)
const struct si_vs_prolog_bits *prolog_key,
const struct si_shader_key *key,
bool ngg_cull_shader)
{
/* VGPR initialization fixup for Vega10 and Raven is always done in the
* VS prolog. */
return sel->vs_needs_prolog ||
key->ls_vgpr_fix ||
key->unpack_instance_id_from_vertex_id;
prolog_key->ls_vgpr_fix ||
prolog_key->unpack_instance_id_from_vertex_id ||
(ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
}
static bool si_build_main_function(struct si_shader_context *ctx,
@ -2436,7 +2448,8 @@ static bool si_build_main_function(struct si_shader_context *ctx,
(shader->key.as_es || shader->key.as_ls) &&
(ctx->type == PIPE_SHADER_TESS_EVAL ||
(ctx->type == PIPE_SHADER_VERTEX &&
!si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
!si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
&shader->key, ngg_cull_shader)))) {
si_init_exec_from_input(ctx,
ctx->merged_wave_info, 0);
} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
@ -2551,8 +2564,14 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info,
key->vs_prolog.as_es = shader_out->key.as_es;
key->vs_prolog.as_ngg = shader_out->key.as_ngg;
if (!ngg_cull_shader)
if (ngg_cull_shader) {
key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
} else {
key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
}
if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
key->vs_prolog.as_ls = 1;
@ -2937,11 +2956,12 @@ int si_compile_shader(struct si_screen *sscreen,
if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
LLVMValueRef parts[4];
unsigned num_parts = 0;
bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);
bool has_prolog = false;
LLVMValueRef main_fn = ctx.main_fn;
if (ngg_cull_main_fn) {
if (need_prolog) {
if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
&shader->key, true)) {
union si_shader_part_key prolog_key;
si_get_vs_prolog_key(&sel->info,
shader->info.num_input_sgprs,
@ -2951,11 +2971,13 @@ int si_compile_shader(struct si_screen *sscreen,
prolog_key.vs_prolog.is_monolithic = true;
si_build_vs_prolog_function(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
has_prolog = true;
}
parts[num_parts++] = ngg_cull_main_fn;
}
if (need_prolog) {
if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
&shader->key, false)) {
union si_shader_part_key prolog_key;
si_get_vs_prolog_key(&sel->info,
shader->info.num_input_sgprs,
@ -2965,11 +2987,12 @@ int si_compile_shader(struct si_screen *sscreen,
prolog_key.vs_prolog.is_monolithic = true;
si_build_vs_prolog_function(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
has_prolog = true;
}
parts[num_parts++] = main_fn;
si_build_wrapper_function(&ctx, parts, num_parts,
need_prolog ? 1 : 0, 0);
has_prolog ? 1 : 0, 0);
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
si_build_prim_discard_compute_shader(&ctx);
@ -2986,7 +3009,8 @@ int si_compile_shader(struct si_screen *sscreen,
struct si_shader_selector *ls = shader->key.part.tcs.ls;
LLVMValueRef parts[4];
bool vs_needs_prolog =
si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
&shader->key, false);
/* TCS main part */
parts[2] = ctx.main_fn;
@ -3086,7 +3110,8 @@ int si_compile_shader(struct si_screen *sscreen,
/* ES prolog */
if (es->type == PIPE_SHADER_VERTEX &&
si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) {
si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
&shader->key, false)) {
union si_shader_part_key vs_prolog_key;
si_get_vs_prolog_key(&es->info,
shader_es.info.num_input_sgprs,
@ -3391,6 +3416,72 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
}
}
if (key->vs_prolog.gs_fast_launch_tri_list ||
key->vs_prolog.gs_fast_launch_tri_strip) {
LLVMValueRef wave_id, thread_id_in_tg;
wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
ac_get_thread_id(&ctx->ac));
/* The GS fast launch initializes all VGPRs to the value of
* the first thread, so we have to add the thread ID.
*
* Only these are initialized by the hw:
* VGPR2: Base Primitive ID
* VGPR5: Base Vertex ID
* VGPR6: Instance ID
*/
/* Put the vertex thread IDs into VGPRs as-is instead of packing them.
* The NGG cull shader will read them from there.
*/
if (key->vs_prolog.gs_fast_launch_tri_list) {
input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
LLVMConstInt(ctx->i32, 3, 0), /* Vertex 0 */
LLVMConstInt(ctx->i32, 0, 0));
input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
LLVMConstInt(ctx->i32, 3, 0), /* Vertex 1 */
LLVMConstInt(ctx->i32, 1, 0));
input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
LLVMConstInt(ctx->i32, 3, 0), /* Vertex 2 */
LLVMConstInt(ctx->i32, 2, 0));
} else {
assert(key->vs_prolog.gs_fast_launch_tri_strip);
LLVMBuilderRef builder = ctx->ac.builder;
/* Triangle indices: */
LLVMValueRef index[3] = {
thread_id_in_tg,
LLVMBuildAdd(builder, thread_id_in_tg,
LLVMConstInt(ctx->i32, 1, 0), ""),
LLVMBuildAdd(builder, thread_id_in_tg,
LLVMConstInt(ctx->i32, 2, 0), ""),
};
LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
thread_id_in_tg, ctx->i1, "");
LLVMValueRef flatshade_first =
LLVMBuildICmp(builder, LLVMIntEQ,
si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
ctx->i32_0, "");
ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
flatshade_first, index);
input_vgprs[0] = index[0];
input_vgprs[1] = index[1];
input_vgprs[4] = index[2];
}
/* Triangles always have all edge flags set initially. */
input_vgprs[3] = LLVMConstInt(ctx->i32, 0x7 << 8, 0);
input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
thread_id_in_tg, ""); /* PrimID */
input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
thread_id_in_tg, ""); /* VertexID */
input_vgprs[8] = input_vgprs[6]; /* InstanceID */
}
unsigned vertex_id_vgpr = first_vs_vgpr;
unsigned instance_id_vgpr =
ctx->screen->info.chip_class >= GFX10 ?
@ -3498,7 +3589,7 @@ static bool si_get_vs_prolog(struct si_screen *sscreen,
{
struct si_shader_selector *vs = main_part->selector;
if (!si_vs_needs_prolog(vs, key))
if (!si_vs_needs_prolog(vs, key, &shader->key, false))
return true;
/* Get the prolog. */

View file

@ -273,9 +273,12 @@ enum {
SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
};
#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */
#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */
#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */
#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */
#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */
/**
* For VS shader keys, describe any fixups required for vertex fetch.
@ -564,6 +567,8 @@ union si_shader_part_key {
unsigned as_es:1;
unsigned as_ngg:1;
unsigned has_ngg_cull_inputs:1; /* from the NGG cull shader */
unsigned gs_fast_launch_tri_list:1; /* for NGG culling */
unsigned gs_fast_launch_tri_strip:1; /* for NGG culling */
/* Prologs for monolithic shaders shouldn't set EXEC. */
unsigned is_monolithic:1;
} vs_prolog;
@ -655,7 +660,7 @@ struct si_shader_key {
unsigned clip_disable:1;
/* For NGG VS and TES. */
unsigned ngg_culling:3; /* SI_NGG_CULL_* */
unsigned ngg_culling:5; /* SI_NGG_CULL_* */
/* For shaders where monolithic variants have better code.
*

View file

@ -2042,12 +2042,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
if (sctx->ngg &&
rast_prim == PIPE_PRIM_TRIANGLES &&
(sctx->screen->always_use_ngg_culling ||
/* At least 1500 non-indexed triangles (4500 vertices) are needed
* per draw call (no TES/GS) to enable NGG culling. Triangle strips
* don't need this, because they have good reuse and therefore
* perform the same as indexed triangles.
/* At least 1024 non-indexed vertices (8 subgroups) are needed
* per draw call (no TES/GS) to enable NGG culling.
*/
(!index_size && prim == PIPE_PRIM_TRIANGLES && direct_count > 4500 &&
(!index_size && direct_count >= 1024 &&
(prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
!sctx->tes_shader.cso && !sctx->gs_shader.cso)) &&
si_get_vs(sctx)->cso->ngg_culling_allowed) {
unsigned ngg_culling = 0;
@ -2068,6 +2067,18 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
ngg_culling |= SI_NGG_CULL_BACK_FACE;
}
/* Use NGG fast launch for certain non-indexed primitive types.
* A draw must have at least 1 full primitive.
*/
if (ngg_culling && !index_size && direct_count >= 3 &&
!sctx->tes_shader.cso && !sctx->gs_shader.cso) {
if (prim == PIPE_PRIM_TRIANGLES)
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
}
if (ngg_culling != sctx->ngg_culling) {
sctx->ngg_culling = ngg_culling;
sctx->do_update_shaders = true;

View file

@ -1234,6 +1234,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
late_alloc_wave64 = 0;
else if (num_cu_per_sh <= 6)
late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
else
late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
@ -1316,26 +1318,36 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(1) |
S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
shader->ge_cntl =
S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
shader->ge_cntl =
S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
shader->ge_cntl =
S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
} else {
shader->ge_cntl =
S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
/* Bug workaround for a possible hang with non-tessellation cases.
* Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
*
* Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
*/
if ((sscreen->info.family == CHIP_NAVI10 ||
sscreen->info.family == CHIP_NAVI12 ||
sscreen->info.family == CHIP_NAVI14) &&
(es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
shader->ngg.hw_max_esverts != 256) {
shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
/* Bug workaround for a possible hang with non-tessellation cases.
* Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
*
* Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
*/
if ((sscreen->info.family == CHIP_NAVI10 ||
sscreen->info.family == CHIP_NAVI12 ||
sscreen->info.family == CHIP_NAVI14) &&
(es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
shader->ngg.hw_max_esverts != 256) {
shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
if (shader->ngg.hw_max_esverts > 5) {
shader->ge_cntl |=
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
if (shader->ngg.hw_max_esverts > 5) {
shader->ge_cntl |=
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
}
}
}
@ -3954,6 +3966,7 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
if (key.u.ngg) {
stages |= S_028B54_PRIMGEN_EN(1) |
S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
} else if (key.u.gs)
@ -4109,8 +4122,13 @@ bool si_update_shaders(struct si_context *sctx)
}
/* This must be done after the shader variant is selected. */
if (sctx->ngg)
key.u.ngg_passthrough = gfx10_is_ngg_passthrough(si_get_vs(sctx)->current);
if (sctx->ngg) {
struct si_shader *vs = si_get_vs(sctx)->current;
key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling &
SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
}
si_update_vgt_shader_config(sctx, key);