diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 9ad81bd30ae..49823ac9c67 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -2103,12 +2103,36 @@ retry_select_mode: prim_amp_factor = gs_sel->info.base.gs.vertices_out; } + /* Fix up the thread counts for fast launch. */ + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + /* The vertex count must be a multiple of 3. */ + max_esverts -= max_esverts % 3; + /* We can only decrease the size, not increase it. */ + if (max_gsprims * 3 < max_esverts) { + max_esverts = max_gsprims * 3; + } else { + max_gsprims = max_esverts / 3; + } + } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { + /* The primitive count must be even to get correct winding for triangle strips. */ + max_gsprims &= ~1; + if (max_gsprims - 2 < max_esverts) { + max_esverts = max_gsprims + 2; + } else { + max_gsprims = max_esverts - 2; + max_gsprims &= ~1; + max_esverts = max_gsprims + 2; + } + } + /* On gfx10, the GE only checks against the maximum number of ES verts after * allocating a full GS primitive. So we need to ensure that whenever * this check passes, there is enough space for a full primitive without * vertex reuse. */ - if (gs_sel->screen->info.chip_class == GFX10) + if (gs_sel->screen->info.chip_class == GFX10 && + !(shader->key.opt.ngg_culling & (SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST | + SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP))) shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1; else shader->ngg.hw_max_esverts = max_esverts; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 221d28eecf3..29400f1e96b 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1305,12 +1305,10 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1); - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST || + shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3); - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2); + S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts); } else { shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |