From fa8db1ccd3cc518a78e69697ca177777cdfadb68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 30 May 2025 22:19:24 -0400 Subject: [PATCH] ac: add NGG subgroup size computation from radeonsi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RADV will use it. Reviewed-by: Timur Kristóf Part-of: --- src/amd/common/ac_shader_util.c | 177 ++++++++++++++++++ src/amd/common/ac_shader_util.h | 16 ++ .../drivers/radeonsi/gfx10_shader_ngg.c | 169 +---------------- src/gallium/drivers/radeonsi/si_shader.c | 10 +- src/gallium/drivers/radeonsi/si_shader.h | 8 +- src/gallium/drivers/radeonsi/si_shader_aco.c | 3 +- .../drivers/radeonsi/si_state_shaders.cpp | 24 +-- 7 files changed, 224 insertions(+), 183 deletions(-) diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index ca868cf28e8..3354935ddcb 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -1258,3 +1258,180 @@ ac_shader_io_get_unique_index_patch(unsigned semantic) return 0; } } + +static void +clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim, + bool use_adjacency) +{ + unsigned max_reuse = max_esverts - min_verts_per_prim; + if (use_adjacency) + max_reuse /= 2; + *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); +} + +/** + * Determine subgroup information like maximum number of vertices and prims. + * + * This happens before the shader is uploaded, since LDS relocations during + * upload depend on the subgroup size. + */ +bool +ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, gl_shader_stage es_stage, bool is_gs, + enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations, + unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride, + unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg, + ac_ngg_subgroup_info *out) +{ + const unsigned gs_num_invocations = MAX2(gs_invocations, 1); + const bool use_adjacency = mesa_prim_has_adjacency(input_prim); + const unsigned max_verts_per_prim = mesa_vertices_per_prim(input_prim); + const unsigned min_verts_per_prim = is_gs ? max_verts_per_prim : 1; + + /* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */ + /* The LDS scratch is at the beginning of LDS space. */ + const unsigned max_lds_size = 16 * 1024 - ngg_lds_scratch_size / 4; + const unsigned target_lds_size = max_lds_size; + unsigned esvert_lds_size = 0; + unsigned gsprim_lds_size = 0; + + /* All these are per subgroup: */ + const unsigned min_esverts = + gfx_level >= GFX11 ? max_verts_per_prim : /* gfx11 requires at least 1 primitive per TG */ + gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim); + bool max_vert_out_per_gs_instance = false; + unsigned max_gsprims_base, max_esverts_base; + + max_gsprims_base = max_esverts_base = max_workgroup_size; + + if (is_gs) { + bool force_multi_cycling = false; + unsigned max_out_verts_per_gsprim = gs_vertices_out * gs_num_invocations; + +retry_select_mode: + if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) { + if (max_out_verts_per_gsprim) { + max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim); + } + } else { + /* Use special multi-cycling mode in which each GS + * instance gets its own subgroup. Does not work with + * tessellation. */ + max_vert_out_per_gs_instance = true; + max_gsprims_base = 1; + max_out_verts_per_gsprim = gs_vertices_out; + } + + esvert_lds_size = esgs_vertex_stride / 4; + gsprim_lds_size = (ngg_lds_vertex_size / 4) * max_out_verts_per_gsprim; + + if (gsprim_lds_size > target_lds_size && !force_multi_cycling) { + if (tess_turns_off_ngg || es_stage != MESA_SHADER_TESS_EVAL) { + force_multi_cycling = true; + goto retry_select_mode; + } + } + } else { + /* VS and TES. */ + esvert_lds_size = ngg_lds_vertex_size / 4; + } + + unsigned max_gsprims = max_gsprims_base; + unsigned max_esverts = max_esverts_base; + + if (esvert_lds_size) + max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); + if (gsprim_lds_size) + max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + + if (esvert_lds_size || gsprim_lds_size) { + /* Now that we have a rough proportionality between esverts + * and gsprims based on the primitive type, scale both of them + * down simultaneously based on required LDS space. + * + * We could be smarter about this if we knew how much vertex + * reuse to expect. + */ + unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size; + if (lds_total > target_lds_size) { + max_esverts = max_esverts * target_lds_size / lds_total; + max_gsprims = max_gsprims * target_lds_size / lds_total; + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } + } + + /* Round up towards full wave sizes for better ALU utilization. */ + if (!max_vert_out_per_gs_instance) { + unsigned orig_max_esverts; + unsigned orig_max_gsprims; + do { + orig_max_esverts = max_esverts; + orig_max_gsprims = max_gsprims; + + max_esverts = align(max_esverts, wave_size); + max_esverts = MIN2(max_esverts, max_esverts_base); + if (esvert_lds_size) + max_esverts = + MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size); + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + + /* Hardware restriction: minimum value of max_esverts */ + max_esverts = MAX2(max_esverts, min_esverts); + + max_gsprims = align(max_gsprims, wave_size); + max_gsprims = MIN2(max_gsprims, max_gsprims_base); + if (gsprim_lds_size) { + /* Don't count unusable vertices to the LDS size. Those are vertices above + * the maximum number of vertices that can occur in the workgroup, + * which is e.g. max_gsprims * 3 for triangles. + */ + unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + max_gsprims = + MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size); + } + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); + + /* Verify the restriction. */ + assert(max_esverts >= min_esverts); + } else { + max_esverts = MAX2(max_esverts, min_esverts); + } + + unsigned max_out_vertices = + max_vert_out_per_gs_instance + ? gs_vertices_out + : is_gs + ? max_gsprims * gs_num_invocations * gs_vertices_out + : max_esverts; + assert(max_out_vertices <= 256); + + out->hw_max_esverts = max_esverts; + out->max_gsprims = max_gsprims; + out->max_out_verts = max_out_vertices; + out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; + + /* Don't count unusable vertices. */ + out->esgs_lds_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * + esvert_lds_size; + out->ngg_out_lds_size = max_gsprims * gsprim_lds_size; + + if (is_gs) + out->ngg_out_lds_size += ngg_lds_scratch_size / 4; + else + out->esgs_lds_size += ngg_lds_scratch_size / 4; + + assert(out->hw_max_esverts >= min_esverts); /* HW limitation */ + + /* If asserts are disabled, we use the same conditions to return false */ + return max_esverts >= max_verts_per_prim && max_gsprims >= 1 && + max_out_vertices <= 256 && + out->hw_max_esverts >= min_esverts; +} diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 8848b8ed6fe..33a74b53b37 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -321,6 +321,22 @@ unsigned ac_get_all_edge_flag_bits(enum amd_gfx_level gfx_level); unsigned ac_shader_io_get_unique_index_patch(unsigned semantic); +typedef struct { + uint16_t esgs_lds_size; /* in dwords */ + uint16_t ngg_out_lds_size; /* in dwords */ + uint16_t hw_max_esverts; + uint16_t max_gsprims; + uint16_t max_out_verts; + bool max_vert_out_per_gs_instance; +} ac_ngg_subgroup_info; + +bool +ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, gl_shader_stage es_stage, bool is_gs, + enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations, + unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride, + unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg, + ac_ngg_subgroup_info *out); + #ifdef __cplusplus } #endif diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 22998c2594d..ae2038c911a 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -25,15 +25,6 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader) sel->screen->info.gfx_level < GFX11; } -static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, - unsigned min_verts_per_prim, bool use_adjacency) -{ - unsigned max_reuse = max_esverts - min_verts_per_prim; - if (use_adjacency) - max_reuse /= 2; - *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); -} - /** * Determine subgroup information like maximum number of vertices and prims. * @@ -46,157 +37,15 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) const struct si_shader_selector *es_sel = shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel; const gl_shader_stage gs_stage = gs_sel->stage; - const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1); const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key, false); - const bool use_adjacency = mesa_prim_has_adjacency(input_prim); - const unsigned max_verts_per_prim = mesa_vertices_per_prim(input_prim); - const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1; + unsigned gs_vertices_out = gs_stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.vertices_out : 0; + unsigned gs_invocations = gs_stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.invocations : 0; - /* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */ - /* The LDS scratch is at the beginning of LDS space. */ - const unsigned max_lds_size = 16 * 1024 - shader->info.ngg_lds_scratch_size / 4; - const unsigned target_lds_size = max_lds_size; - unsigned esvert_lds_size = 0; - unsigned gsprim_lds_size = 0; - - /* All these are per subgroup: */ - const unsigned min_esverts = - gs_sel->screen->info.gfx_level >= GFX11 ? max_verts_per_prim : /* gfx11 requires at least 1 primitive per TG */ - gs_sel->screen->info.gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim); - bool max_vert_out_per_gs_instance = false; - unsigned max_gsprims_base, max_esverts_base; - - max_gsprims_base = max_esverts_base = si_get_max_workgroup_size(shader); - - if (gs_stage == MESA_SHADER_GEOMETRY) { - bool force_multi_cycling = false; - unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations; - -retry_select_mode: - if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) { - if (max_out_verts_per_gsprim) { - max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim); - } - } else { - /* Use special multi-cycling mode in which each GS - * instance gets its own subgroup. Does not work with - * tessellation. */ - max_vert_out_per_gs_instance = true; - max_gsprims_base = 1; - max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out; - } - - esvert_lds_size = es_sel->info.esgs_vertex_stride / 4; - gsprim_lds_size = (shader->info.ngg_lds_vertex_size / 4) * max_out_verts_per_gsprim; - - if (gsprim_lds_size > target_lds_size && !force_multi_cycling) { - if (gs_sel->tess_turns_off_ngg || es_sel->stage != MESA_SHADER_TESS_EVAL) { - force_multi_cycling = true; - goto retry_select_mode; - } - } - } else { - /* VS and TES. */ - esvert_lds_size = shader->info.ngg_lds_vertex_size / 4; - } - - unsigned max_gsprims = max_gsprims_base; - unsigned max_esverts = max_esverts_base; - - if (esvert_lds_size) - max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); - if (gsprim_lds_size) - max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); - - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - - if (esvert_lds_size || gsprim_lds_size) { - /* Now that we have a rough proportionality between esverts - * and gsprims based on the primitive type, scale both of them - * down simultaneously based on required LDS space. - * - * We could be smarter about this if we knew how much vertex - * reuse to expect. - */ - unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size; - if (lds_total > target_lds_size) { - max_esverts = max_esverts * target_lds_size / lds_total; - max_gsprims = max_gsprims * target_lds_size / lds_total; - - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - } - } - - /* Round up towards full wave sizes for better ALU utilization. */ - if (!max_vert_out_per_gs_instance) { - unsigned orig_max_esverts; - unsigned orig_max_gsprims; - do { - orig_max_esverts = max_esverts; - orig_max_gsprims = max_gsprims; - - max_esverts = align(max_esverts, shader->wave_size); - max_esverts = MIN2(max_esverts, max_esverts_base); - if (esvert_lds_size) - max_esverts = - MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size); - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - - /* Hardware restriction: minimum value of max_esverts */ - max_esverts = MAX2(max_esverts, min_esverts); - - max_gsprims = align(max_gsprims, shader->wave_size); - max_gsprims = MIN2(max_gsprims, max_gsprims_base); - if (gsprim_lds_size) { - /* Don't count unusable vertices to the LDS size. Those are vertices above - * the maximum number of vertices that can occur in the workgroup, - * which is e.g. max_gsprims * 3 for triangles. - */ - unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - max_gsprims = - MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size); - } - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); - - /* Verify the restriction. */ - assert(max_esverts >= min_esverts); - } else { - max_esverts = MAX2(max_esverts, min_esverts); - } - - unsigned max_out_vertices = - max_vert_out_per_gs_instance - ? gs_sel->info.base.gs.vertices_out - : gs_stage == MESA_SHADER_GEOMETRY - ? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out - : max_esverts; - assert(max_out_vertices <= 256); - - shader->ngg.hw_max_esverts = max_esverts; - shader->ngg.max_gsprims = max_gsprims; - shader->ngg.max_out_verts = max_out_vertices; - shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; - - /* Don't count unusable vertices. */ - shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * - esvert_lds_size; - shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size; - - if (gs_stage == MESA_SHADER_GEOMETRY) - shader->ngg.ngg_emit_size += shader->info.ngg_lds_scratch_size / 4; - else - shader->gs_info.esgs_ring_size += shader->info.ngg_lds_scratch_size / 4; - - assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */ - - /* If asserts are disabled, we use the same conditions to return false */ - return max_esverts >= max_verts_per_prim && max_gsprims >= 1 && - max_out_vertices <= 256 && - shader->ngg.hw_max_esverts >= min_esverts; + return ac_ngg_compute_subgroup_info(gs_sel->screen->info.gfx_level, es_sel->stage, + gs_sel->stage == MESA_SHADER_GEOMETRY, + input_prim, gs_vertices_out, gs_invocations, + si_get_max_workgroup_size(shader), shader->wave_size, + es_sel->info.esgs_vertex_stride, shader->info.ngg_lds_vertex_size, + shader->info.ngg_lds_scratch_size, gs_sel->tess_turns_off_ngg, + &shader->ngg.info); } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 108520f492e..b7821d0cf45 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -196,14 +196,15 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) { struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; sym->name = "esgs_ring"; - sym->size = shader->gs_info.esgs_ring_size * 4; + sym->size = (shader->key.ge.as_ngg ? shader->ngg.info.esgs_lds_size + : shader->gs_info.esgs_ring_size) * 4; sym->align = 64 * 1024; } if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) { struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; sym->name = "ngg_emit"; - sym->size = shader->ngg.ngg_emit_size * 4; + sym->size = shader->ngg.info.ngg_out_lds_size * 4; sym->align = 4; } @@ -454,10 +455,11 @@ static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shade if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY && (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) { - unsigned size_in_dw = shader->gs_info.esgs_ring_size; + unsigned size_in_dw = shader->key.ge.as_ngg ? shader->ngg.info.esgs_lds_size + : shader->gs_info.esgs_ring_size; if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) - size_in_dw += shader->ngg.ngg_emit_size; + size_in_dw += shader->ngg.info.ngg_out_lds_size; shader->config.lds_size = DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage)); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index be3baa3d1bd..022615efc47 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -826,12 +826,8 @@ struct si_shader { } gs; struct { - /* Computed by gfx10_ngg_calculate_subgroup_info. */ - uint16_t ngg_emit_size; /* in dwords */ - uint16_t hw_max_esverts; - uint16_t max_gsprims; - uint16_t max_out_verts; - bool max_vert_out_per_gs_instance; + /* Computed by ac_ngg_calculate_subgroup_info. */ + ac_ngg_subgroup_info info; /* Register values. */ unsigned ge_max_output_per_subgroup; unsigned ge_ngg_subgrp_cntl; diff --git a/src/gallium/drivers/radeonsi/si_shader_aco.c b/src/gallium/drivers/radeonsi/si_shader_aco.c index 0ce57cf3e7a..900a64aa800 100644 --- a/src/gallium/drivers/radeonsi/si_shader_aco.c +++ b/src/gallium/drivers/radeonsi/si_shader_aco.c @@ -209,7 +209,8 @@ si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code_for_write, break; case aco_symbol_lds_ngg_gs_out_vertex_base: assert(sel->stage == MESA_SHADER_GEOMETRY && key->ge.as_ngg); - value = shader->gs_info.esgs_ring_size * 4; + value = (key->ge.as_ngg ? shader->ngg.info.esgs_lds_size + : shader->gs_info.esgs_ring_size) * 4; break; case aco_symbol_const_data_addr: if (!const_offset) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index ed2babf5963..cbfb31d37c0 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -1603,11 +1603,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader : V_02870C_SPI_SHADER_NONE) | S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE); - shader->ngg.ge_max_output_per_subgroup = S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts); + shader->ngg.ge_max_output_per_subgroup = S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.info.max_out_verts); shader->ngg.vgt_gs_instance_cnt = S_028B90_ENABLE(gs_num_invocations > 1) | S_028B90_CNT(gs_num_invocations) | - S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance); + S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.info.max_vert_out_per_gs_instance); shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, true); if (gs_stage == MESA_SHADER_GEOMETRY) { @@ -1710,19 +1710,19 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader unsigned prim_amp_factor = gs_stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.vertices_out : 1; - shader->ge_cntl = S_03096C_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) | - S_03096C_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | + shader->ge_cntl = S_03096C_PRIMS_PER_SUBGRP(shader->ngg.info.max_gsprims) | + S_03096C_VERTS_PER_SUBGRP(shader->ngg.info.hw_max_esverts) | S_03096C_PRIM_GRP_SIZE_GFX11( CLAMP(max_prim_grp_size / MAX2(prim_amp_factor, 1), 1, 256)) | S_03096C_DIS_PG_SIZE_ADJUST_FOR_STRIP(sscreen->info.gfx_level >= GFX12); } else { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts); + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(shader->ngg.info.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.info.hw_max_esverts); shader->ngg.vgt_gs_onchip_cntl = - S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | - S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations); + S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.info.hw_max_esverts) | + S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.info.max_gsprims) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.info.max_gsprims * gs_num_invocations); /* On gfx10, the GE only checks against the maximum number of ES verts after * allocating a full GS primitive. So we need to ensure that whenever @@ -1734,13 +1734,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader */ if ((sscreen->info.gfx_level == GFX10) && (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */ - shader->ngg.hw_max_esverts != 256 && - shader->ngg.hw_max_esverts > 5) { + shader->ngg.info.hw_max_esverts != 256 && + shader->ngg.info.hw_max_esverts > 5) { /* This could be based on the input primitive type. 5 is the worst case * for primitive types with adjacency. */ shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; - shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); + shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.info.hw_max_esverts - 5); } }