ac: add NGG subgroup size computation from radeonsi

RADV will use it.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35352>
This commit is contained in:
Marek Olšák 2025-05-30 22:19:24 -04:00 committed by Marge Bot
parent 4263b49778
commit fa8db1ccd3
7 changed files with 224 additions and 183 deletions

View file

@ -1258,3 +1258,180 @@ ac_shader_io_get_unique_index_patch(unsigned semantic)
return 0;
}
}
static void
clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
bool use_adjacency)
{
unsigned max_reuse = max_esverts - min_verts_per_prim;
if (use_adjacency)
max_reuse /= 2;
*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
}
/**
* Determine subgroup information like maximum number of vertices and prims.
*
* This happens before the shader is uploaded, since LDS relocations during
* upload depend on the subgroup size.
*/
bool
ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, gl_shader_stage es_stage, bool is_gs,
enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations,
unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride,
unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
ac_ngg_subgroup_info *out)
{
const unsigned gs_num_invocations = MAX2(gs_invocations, 1);
const bool use_adjacency = mesa_prim_has_adjacency(input_prim);
const unsigned max_verts_per_prim = mesa_vertices_per_prim(input_prim);
const unsigned min_verts_per_prim = is_gs ? max_verts_per_prim : 1;
/* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */
/* The LDS scratch is at the beginning of LDS space. */
const unsigned max_lds_size = 16 * 1024 - ngg_lds_scratch_size / 4;
const unsigned target_lds_size = max_lds_size;
unsigned esvert_lds_size = 0;
unsigned gsprim_lds_size = 0;
/* All these are per subgroup: */
const unsigned min_esverts =
gfx_level >= GFX11 ? max_verts_per_prim : /* gfx11 requires at least 1 primitive per TG */
gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim);
bool max_vert_out_per_gs_instance = false;
unsigned max_gsprims_base, max_esverts_base;
max_gsprims_base = max_esverts_base = max_workgroup_size;
if (is_gs) {
bool force_multi_cycling = false;
unsigned max_out_verts_per_gsprim = gs_vertices_out * gs_num_invocations;
retry_select_mode:
if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
if (max_out_verts_per_gsprim) {
max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
}
} else {
/* Use special multi-cycling mode in which each GS
* instance gets its own subgroup. Does not work with
* tessellation. */
max_vert_out_per_gs_instance = true;
max_gsprims_base = 1;
max_out_verts_per_gsprim = gs_vertices_out;
}
esvert_lds_size = esgs_vertex_stride / 4;
gsprim_lds_size = (ngg_lds_vertex_size / 4) * max_out_verts_per_gsprim;
if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
if (tess_turns_off_ngg || es_stage != MESA_SHADER_TESS_EVAL) {
force_multi_cycling = true;
goto retry_select_mode;
}
}
} else {
/* VS and TES. */
esvert_lds_size = ngg_lds_vertex_size / 4;
}
unsigned max_gsprims = max_gsprims_base;
unsigned max_esverts = max_esverts_base;
if (esvert_lds_size)
max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
if (gsprim_lds_size)
max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
if (esvert_lds_size || gsprim_lds_size) {
/* Now that we have a rough proportionality between esverts
* and gsprims based on the primitive type, scale both of them
* down simultaneously based on required LDS space.
*
* We could be smarter about this if we knew how much vertex
* reuse to expect.
*/
unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
if (lds_total > target_lds_size) {
max_esverts = max_esverts * target_lds_size / lds_total;
max_gsprims = max_gsprims * target_lds_size / lds_total;
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
}
}
/* Round up towards full wave sizes for better ALU utilization. */
if (!max_vert_out_per_gs_instance) {
unsigned orig_max_esverts;
unsigned orig_max_gsprims;
do {
orig_max_esverts = max_esverts;
orig_max_gsprims = max_gsprims;
max_esverts = align(max_esverts, wave_size);
max_esverts = MIN2(max_esverts, max_esverts_base);
if (esvert_lds_size)
max_esverts =
MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
/* Hardware restriction: minimum value of max_esverts */
max_esverts = MAX2(max_esverts, min_esverts);
max_gsprims = align(max_gsprims, wave_size);
max_gsprims = MIN2(max_gsprims, max_gsprims_base);
if (gsprim_lds_size) {
/* Don't count unusable vertices to the LDS size. Those are vertices above
* the maximum number of vertices that can occur in the workgroup,
* which is e.g. max_gsprims * 3 for triangles.
*/
unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
max_gsprims =
MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
}
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
/* Verify the restriction. */
assert(max_esverts >= min_esverts);
} else {
max_esverts = MAX2(max_esverts, min_esverts);
}
unsigned max_out_vertices =
max_vert_out_per_gs_instance
? gs_vertices_out
: is_gs
? max_gsprims * gs_num_invocations * gs_vertices_out
: max_esverts;
assert(max_out_vertices <= 256);
out->hw_max_esverts = max_esverts;
out->max_gsprims = max_gsprims;
out->max_out_verts = max_out_vertices;
out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
/* Don't count unusable vertices. */
out->esgs_lds_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
esvert_lds_size;
out->ngg_out_lds_size = max_gsprims * gsprim_lds_size;
if (is_gs)
out->ngg_out_lds_size += ngg_lds_scratch_size / 4;
else
out->esgs_lds_size += ngg_lds_scratch_size / 4;
assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
/* If asserts are disabled, we use the same conditions to return false */
return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
max_out_vertices <= 256 &&
out->hw_max_esverts >= min_esverts;
}

View file

@ -321,6 +321,22 @@ unsigned ac_get_all_edge_flag_bits(enum amd_gfx_level gfx_level);
unsigned ac_shader_io_get_unique_index_patch(unsigned semantic);
typedef struct {
uint16_t esgs_lds_size; /* in dwords */
uint16_t ngg_out_lds_size; /* in dwords */
uint16_t hw_max_esverts;
uint16_t max_gsprims;
uint16_t max_out_verts;
bool max_vert_out_per_gs_instance;
} ac_ngg_subgroup_info;
bool
ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, gl_shader_stage es_stage, bool is_gs,
enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations,
unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride,
unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
ac_ngg_subgroup_info *out);
#ifdef __cplusplus
}
#endif

View file

@ -25,15 +25,6 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader)
sel->screen->info.gfx_level < GFX11;
}
static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
unsigned min_verts_per_prim, bool use_adjacency)
{
unsigned max_reuse = max_esverts - min_verts_per_prim;
if (use_adjacency)
max_reuse /= 2;
*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
}
/**
* Determine subgroup information like maximum number of vertices and prims.
*
@ -46,157 +37,15 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
const struct si_shader_selector *es_sel =
shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
const gl_shader_stage gs_stage = gs_sel->stage;
const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key, false);
const bool use_adjacency = mesa_prim_has_adjacency(input_prim);
const unsigned max_verts_per_prim = mesa_vertices_per_prim(input_prim);
const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
unsigned gs_vertices_out = gs_stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.vertices_out : 0;
unsigned gs_invocations = gs_stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.invocations : 0;
/* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */
/* The LDS scratch is at the beginning of LDS space. */
const unsigned max_lds_size = 16 * 1024 - shader->info.ngg_lds_scratch_size / 4;
const unsigned target_lds_size = max_lds_size;
unsigned esvert_lds_size = 0;
unsigned gsprim_lds_size = 0;
/* All these are per subgroup: */
const unsigned min_esverts =
gs_sel->screen->info.gfx_level >= GFX11 ? max_verts_per_prim : /* gfx11 requires at least 1 primitive per TG */
gs_sel->screen->info.gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim);
bool max_vert_out_per_gs_instance = false;
unsigned max_gsprims_base, max_esverts_base;
max_gsprims_base = max_esverts_base = si_get_max_workgroup_size(shader);
if (gs_stage == MESA_SHADER_GEOMETRY) {
bool force_multi_cycling = false;
unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
retry_select_mode:
if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
if (max_out_verts_per_gsprim) {
max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
}
} else {
/* Use special multi-cycling mode in which each GS
* instance gets its own subgroup. Does not work with
* tessellation. */
max_vert_out_per_gs_instance = true;
max_gsprims_base = 1;
max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out;
}
esvert_lds_size = es_sel->info.esgs_vertex_stride / 4;
gsprim_lds_size = (shader->info.ngg_lds_vertex_size / 4) * max_out_verts_per_gsprim;
if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
if (gs_sel->tess_turns_off_ngg || es_sel->stage != MESA_SHADER_TESS_EVAL) {
force_multi_cycling = true;
goto retry_select_mode;
}
}
} else {
/* VS and TES. */
esvert_lds_size = shader->info.ngg_lds_vertex_size / 4;
}
unsigned max_gsprims = max_gsprims_base;
unsigned max_esverts = max_esverts_base;
if (esvert_lds_size)
max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
if (gsprim_lds_size)
max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
if (esvert_lds_size || gsprim_lds_size) {
/* Now that we have a rough proportionality between esverts
* and gsprims based on the primitive type, scale both of them
* down simultaneously based on required LDS space.
*
* We could be smarter about this if we knew how much vertex
* reuse to expect.
*/
unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
if (lds_total > target_lds_size) {
max_esverts = max_esverts * target_lds_size / lds_total;
max_gsprims = max_gsprims * target_lds_size / lds_total;
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
}
}
/* Round up towards full wave sizes for better ALU utilization. */
if (!max_vert_out_per_gs_instance) {
unsigned orig_max_esverts;
unsigned orig_max_gsprims;
do {
orig_max_esverts = max_esverts;
orig_max_gsprims = max_gsprims;
max_esverts = align(max_esverts, shader->wave_size);
max_esverts = MIN2(max_esverts, max_esverts_base);
if (esvert_lds_size)
max_esverts =
MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
/* Hardware restriction: minimum value of max_esverts */
max_esverts = MAX2(max_esverts, min_esverts);
max_gsprims = align(max_gsprims, shader->wave_size);
max_gsprims = MIN2(max_gsprims, max_gsprims_base);
if (gsprim_lds_size) {
/* Don't count unusable vertices to the LDS size. Those are vertices above
* the maximum number of vertices that can occur in the workgroup,
* which is e.g. max_gsprims * 3 for triangles.
*/
unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
max_gsprims =
MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
}
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
/* Verify the restriction. */
assert(max_esverts >= min_esverts);
} else {
max_esverts = MAX2(max_esverts, min_esverts);
}
unsigned max_out_vertices =
max_vert_out_per_gs_instance
? gs_sel->info.base.gs.vertices_out
: gs_stage == MESA_SHADER_GEOMETRY
? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out
: max_esverts;
assert(max_out_vertices <= 256);
shader->ngg.hw_max_esverts = max_esverts;
shader->ngg.max_gsprims = max_gsprims;
shader->ngg.max_out_verts = max_out_vertices;
shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
/* Don't count unusable vertices. */
shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
esvert_lds_size;
shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
if (gs_stage == MESA_SHADER_GEOMETRY)
shader->ngg.ngg_emit_size += shader->info.ngg_lds_scratch_size / 4;
else
shader->gs_info.esgs_ring_size += shader->info.ngg_lds_scratch_size / 4;
assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */
/* If asserts are disabled, we use the same conditions to return false */
return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
max_out_vertices <= 256 &&
shader->ngg.hw_max_esverts >= min_esverts;
return ac_ngg_compute_subgroup_info(gs_sel->screen->info.gfx_level, es_sel->stage,
gs_sel->stage == MESA_SHADER_GEOMETRY,
input_prim, gs_vertices_out, gs_invocations,
si_get_max_workgroup_size(shader), shader->wave_size,
es_sel->info.esgs_vertex_stride, shader->info.ngg_lds_vertex_size,
shader->info.ngg_lds_scratch_size, gs_sel->tess_turns_off_ngg,
&shader->ngg.info);
}

View file

@ -196,14 +196,15 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh
(sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
sym->name = "esgs_ring";
sym->size = shader->gs_info.esgs_ring_size * 4;
sym->size = (shader->key.ge.as_ngg ? shader->ngg.info.esgs_lds_size
: shader->gs_info.esgs_ring_size) * 4;
sym->align = 64 * 1024;
}
if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
sym->name = "ngg_emit";
sym->size = shader->ngg.ngg_emit_size * 4;
sym->size = shader->ngg.info.ngg_out_lds_size * 4;
sym->align = 4;
}
@ -454,10 +455,11 @@ static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shade
if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
(stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
unsigned size_in_dw = shader->gs_info.esgs_ring_size;
unsigned size_in_dw = shader->key.ge.as_ngg ? shader->ngg.info.esgs_lds_size
: shader->gs_info.esgs_ring_size;
if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
size_in_dw += shader->ngg.ngg_emit_size;
size_in_dw += shader->ngg.info.ngg_out_lds_size;
shader->config.lds_size =
DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));

View file

@ -826,12 +826,8 @@ struct si_shader {
} gs;
struct {
/* Computed by gfx10_ngg_calculate_subgroup_info. */
uint16_t ngg_emit_size; /* in dwords */
uint16_t hw_max_esverts;
uint16_t max_gsprims;
uint16_t max_out_verts;
bool max_vert_out_per_gs_instance;
/* Computed by ac_ngg_calculate_subgroup_info. */
ac_ngg_subgroup_info info;
/* Register values. */
unsigned ge_max_output_per_subgroup;
unsigned ge_ngg_subgrp_cntl;

View file

@ -209,7 +209,8 @@ si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code_for_write,
break;
case aco_symbol_lds_ngg_gs_out_vertex_base:
assert(sel->stage == MESA_SHADER_GEOMETRY && key->ge.as_ngg);
value = shader->gs_info.esgs_ring_size * 4;
value = (key->ge.as_ngg ? shader->ngg.info.esgs_lds_size
: shader->gs_info.esgs_ring_size) * 4;
break;
case aco_symbol_const_data_addr:
if (!const_offset)

View file

@ -1603,11 +1603,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
: V_02870C_SPI_SHADER_NONE) |
S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
: V_02870C_SPI_SHADER_NONE);
shader->ngg.ge_max_output_per_subgroup = S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts);
shader->ngg.ge_max_output_per_subgroup = S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.info.max_out_verts);
shader->ngg.vgt_gs_instance_cnt =
S_028B90_ENABLE(gs_num_invocations > 1) |
S_028B90_CNT(gs_num_invocations) |
S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance);
S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.info.max_vert_out_per_gs_instance);
shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, true);
if (gs_stage == MESA_SHADER_GEOMETRY) {
@ -1710,19 +1710,19 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
unsigned prim_amp_factor = gs_stage == MESA_SHADER_GEOMETRY ?
gs_sel->info.base.gs.vertices_out : 1;
shader->ge_cntl = S_03096C_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
S_03096C_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
shader->ge_cntl = S_03096C_PRIMS_PER_SUBGRP(shader->ngg.info.max_gsprims) |
S_03096C_VERTS_PER_SUBGRP(shader->ngg.info.hw_max_esverts) |
S_03096C_PRIM_GRP_SIZE_GFX11(
CLAMP(max_prim_grp_size / MAX2(prim_amp_factor, 1), 1, 256)) |
S_03096C_DIS_PG_SIZE_ADJUST_FOR_STRIP(sscreen->info.gfx_level >= GFX12);
} else {
shader->ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(shader->ngg.max_gsprims) |
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts);
shader->ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(shader->ngg.info.max_gsprims) |
S_03096C_VERT_GRP_SIZE(shader->ngg.info.hw_max_esverts);
shader->ngg.vgt_gs_onchip_cntl =
S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.info.hw_max_esverts) |
S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.info.max_gsprims) |
S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.info.max_gsprims * gs_num_invocations);
/* On gfx10, the GE only checks against the maximum number of ES verts after
* allocating a full GS primitive. So we need to ensure that whenever
@ -1734,13 +1734,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
*/
if ((sscreen->info.gfx_level == GFX10) &&
(es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
shader->ngg.hw_max_esverts != 256 &&
shader->ngg.hw_max_esverts > 5) {
shader->ngg.info.hw_max_esverts != 256 &&
shader->ngg.info.hw_max_esverts > 5) {
/* This could be based on the input primitive type. 5 is the worst case
* for primitive types with adjacency.
*/
shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.info.hw_max_esverts - 5);
}
}