mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-04 09:10:12 +01:00
ac: add NGG subgroup size computation from radeonsi
RADV will use it. Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35352>
This commit is contained in:
parent
4263b49778
commit
fa8db1ccd3
7 changed files with 224 additions and 183 deletions
|
|
@ -1258,3 +1258,180 @@ ac_shader_io_get_unique_index_patch(unsigned semantic)
|
|||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
|
||||
bool use_adjacency)
|
||||
{
|
||||
unsigned max_reuse = max_esverts - min_verts_per_prim;
|
||||
if (use_adjacency)
|
||||
max_reuse /= 2;
|
||||
*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine subgroup information like maximum number of vertices and prims.
|
||||
*
|
||||
* This happens before the shader is uploaded, since LDS relocations during
|
||||
* upload depend on the subgroup size.
|
||||
*/
|
||||
bool
|
||||
ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, gl_shader_stage es_stage, bool is_gs,
|
||||
enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations,
|
||||
unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride,
|
||||
unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
|
||||
ac_ngg_subgroup_info *out)
|
||||
{
|
||||
const unsigned gs_num_invocations = MAX2(gs_invocations, 1);
|
||||
const bool use_adjacency = mesa_prim_has_adjacency(input_prim);
|
||||
const unsigned max_verts_per_prim = mesa_vertices_per_prim(input_prim);
|
||||
const unsigned min_verts_per_prim = is_gs ? max_verts_per_prim : 1;
|
||||
|
||||
/* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */
|
||||
/* The LDS scratch is at the beginning of LDS space. */
|
||||
const unsigned max_lds_size = 16 * 1024 - ngg_lds_scratch_size / 4;
|
||||
const unsigned target_lds_size = max_lds_size;
|
||||
unsigned esvert_lds_size = 0;
|
||||
unsigned gsprim_lds_size = 0;
|
||||
|
||||
/* All these are per subgroup: */
|
||||
const unsigned min_esverts =
|
||||
gfx_level >= GFX11 ? max_verts_per_prim : /* gfx11 requires at least 1 primitive per TG */
|
||||
gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim);
|
||||
bool max_vert_out_per_gs_instance = false;
|
||||
unsigned max_gsprims_base, max_esverts_base;
|
||||
|
||||
max_gsprims_base = max_esverts_base = max_workgroup_size;
|
||||
|
||||
if (is_gs) {
|
||||
bool force_multi_cycling = false;
|
||||
unsigned max_out_verts_per_gsprim = gs_vertices_out * gs_num_invocations;
|
||||
|
||||
retry_select_mode:
|
||||
if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
|
||||
if (max_out_verts_per_gsprim) {
|
||||
max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
|
||||
}
|
||||
} else {
|
||||
/* Use special multi-cycling mode in which each GS
|
||||
* instance gets its own subgroup. Does not work with
|
||||
* tessellation. */
|
||||
max_vert_out_per_gs_instance = true;
|
||||
max_gsprims_base = 1;
|
||||
max_out_verts_per_gsprim = gs_vertices_out;
|
||||
}
|
||||
|
||||
esvert_lds_size = esgs_vertex_stride / 4;
|
||||
gsprim_lds_size = (ngg_lds_vertex_size / 4) * max_out_verts_per_gsprim;
|
||||
|
||||
if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
|
||||
if (tess_turns_off_ngg || es_stage != MESA_SHADER_TESS_EVAL) {
|
||||
force_multi_cycling = true;
|
||||
goto retry_select_mode;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* VS and TES. */
|
||||
esvert_lds_size = ngg_lds_vertex_size / 4;
|
||||
}
|
||||
|
||||
unsigned max_gsprims = max_gsprims_base;
|
||||
unsigned max_esverts = max_esverts_base;
|
||||
|
||||
if (esvert_lds_size)
|
||||
max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
|
||||
if (gsprim_lds_size)
|
||||
max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
|
||||
|
||||
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
|
||||
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
|
||||
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
|
||||
|
||||
if (esvert_lds_size || gsprim_lds_size) {
|
||||
/* Now that we have a rough proportionality between esverts
|
||||
* and gsprims based on the primitive type, scale both of them
|
||||
* down simultaneously based on required LDS space.
|
||||
*
|
||||
* We could be smarter about this if we knew how much vertex
|
||||
* reuse to expect.
|
||||
*/
|
||||
unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
|
||||
if (lds_total > target_lds_size) {
|
||||
max_esverts = max_esverts * target_lds_size / lds_total;
|
||||
max_gsprims = max_gsprims * target_lds_size / lds_total;
|
||||
|
||||
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
|
||||
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
|
||||
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Round up towards full wave sizes for better ALU utilization. */
|
||||
if (!max_vert_out_per_gs_instance) {
|
||||
unsigned orig_max_esverts;
|
||||
unsigned orig_max_gsprims;
|
||||
do {
|
||||
orig_max_esverts = max_esverts;
|
||||
orig_max_gsprims = max_gsprims;
|
||||
|
||||
max_esverts = align(max_esverts, wave_size);
|
||||
max_esverts = MIN2(max_esverts, max_esverts_base);
|
||||
if (esvert_lds_size)
|
||||
max_esverts =
|
||||
MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
|
||||
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
|
||||
|
||||
/* Hardware restriction: minimum value of max_esverts */
|
||||
max_esverts = MAX2(max_esverts, min_esverts);
|
||||
|
||||
max_gsprims = align(max_gsprims, wave_size);
|
||||
max_gsprims = MIN2(max_gsprims, max_gsprims_base);
|
||||
if (gsprim_lds_size) {
|
||||
/* Don't count unusable vertices to the LDS size. Those are vertices above
|
||||
* the maximum number of vertices that can occur in the workgroup,
|
||||
* which is e.g. max_gsprims * 3 for triangles.
|
||||
*/
|
||||
unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
|
||||
max_gsprims =
|
||||
MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
|
||||
}
|
||||
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
|
||||
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
|
||||
} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
|
||||
|
||||
/* Verify the restriction. */
|
||||
assert(max_esverts >= min_esverts);
|
||||
} else {
|
||||
max_esverts = MAX2(max_esverts, min_esverts);
|
||||
}
|
||||
|
||||
unsigned max_out_vertices =
|
||||
max_vert_out_per_gs_instance
|
||||
? gs_vertices_out
|
||||
: is_gs
|
||||
? max_gsprims * gs_num_invocations * gs_vertices_out
|
||||
: max_esverts;
|
||||
assert(max_out_vertices <= 256);
|
||||
|
||||
out->hw_max_esverts = max_esverts;
|
||||
out->max_gsprims = max_gsprims;
|
||||
out->max_out_verts = max_out_vertices;
|
||||
out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
|
||||
|
||||
/* Don't count unusable vertices. */
|
||||
out->esgs_lds_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
|
||||
esvert_lds_size;
|
||||
out->ngg_out_lds_size = max_gsprims * gsprim_lds_size;
|
||||
|
||||
if (is_gs)
|
||||
out->ngg_out_lds_size += ngg_lds_scratch_size / 4;
|
||||
else
|
||||
out->esgs_lds_size += ngg_lds_scratch_size / 4;
|
||||
|
||||
assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
|
||||
|
||||
/* If asserts are disabled, we use the same conditions to return false */
|
||||
return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
|
||||
max_out_vertices <= 256 &&
|
||||
out->hw_max_esverts >= min_esverts;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -321,6 +321,22 @@ unsigned ac_get_all_edge_flag_bits(enum amd_gfx_level gfx_level);
|
|||
|
||||
unsigned ac_shader_io_get_unique_index_patch(unsigned semantic);
|
||||
|
||||
typedef struct {
|
||||
uint16_t esgs_lds_size; /* in dwords */
|
||||
uint16_t ngg_out_lds_size; /* in dwords */
|
||||
uint16_t hw_max_esverts;
|
||||
uint16_t max_gsprims;
|
||||
uint16_t max_out_verts;
|
||||
bool max_vert_out_per_gs_instance;
|
||||
} ac_ngg_subgroup_info;
|
||||
|
||||
bool
|
||||
ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, gl_shader_stage es_stage, bool is_gs,
|
||||
enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations,
|
||||
unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride,
|
||||
unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
|
||||
ac_ngg_subgroup_info *out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -25,15 +25,6 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader)
|
|||
sel->screen->info.gfx_level < GFX11;
|
||||
}
|
||||
|
||||
static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
|
||||
unsigned min_verts_per_prim, bool use_adjacency)
|
||||
{
|
||||
unsigned max_reuse = max_esverts - min_verts_per_prim;
|
||||
if (use_adjacency)
|
||||
max_reuse /= 2;
|
||||
*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine subgroup information like maximum number of vertices and prims.
|
||||
*
|
||||
|
|
@ -46,157 +37,15 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
|
|||
const struct si_shader_selector *es_sel =
|
||||
shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
|
||||
const gl_shader_stage gs_stage = gs_sel->stage;
|
||||
const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
|
||||
const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key, false);
|
||||
const bool use_adjacency = mesa_prim_has_adjacency(input_prim);
|
||||
const unsigned max_verts_per_prim = mesa_vertices_per_prim(input_prim);
|
||||
const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
|
||||
unsigned gs_vertices_out = gs_stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.vertices_out : 0;
|
||||
unsigned gs_invocations = gs_stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.invocations : 0;
|
||||
|
||||
/* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */
|
||||
/* The LDS scratch is at the beginning of LDS space. */
|
||||
const unsigned max_lds_size = 16 * 1024 - shader->info.ngg_lds_scratch_size / 4;
|
||||
const unsigned target_lds_size = max_lds_size;
|
||||
unsigned esvert_lds_size = 0;
|
||||
unsigned gsprim_lds_size = 0;
|
||||
|
||||
/* All these are per subgroup: */
|
||||
const unsigned min_esverts =
|
||||
gs_sel->screen->info.gfx_level >= GFX11 ? max_verts_per_prim : /* gfx11 requires at least 1 primitive per TG */
|
||||
gs_sel->screen->info.gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim);
|
||||
bool max_vert_out_per_gs_instance = false;
|
||||
unsigned max_gsprims_base, max_esverts_base;
|
||||
|
||||
max_gsprims_base = max_esverts_base = si_get_max_workgroup_size(shader);
|
||||
|
||||
if (gs_stage == MESA_SHADER_GEOMETRY) {
|
||||
bool force_multi_cycling = false;
|
||||
unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
|
||||
|
||||
retry_select_mode:
|
||||
if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
|
||||
if (max_out_verts_per_gsprim) {
|
||||
max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
|
||||
}
|
||||
} else {
|
||||
/* Use special multi-cycling mode in which each GS
|
||||
* instance gets its own subgroup. Does not work with
|
||||
* tessellation. */
|
||||
max_vert_out_per_gs_instance = true;
|
||||
max_gsprims_base = 1;
|
||||
max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out;
|
||||
}
|
||||
|
||||
esvert_lds_size = es_sel->info.esgs_vertex_stride / 4;
|
||||
gsprim_lds_size = (shader->info.ngg_lds_vertex_size / 4) * max_out_verts_per_gsprim;
|
||||
|
||||
if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
|
||||
if (gs_sel->tess_turns_off_ngg || es_sel->stage != MESA_SHADER_TESS_EVAL) {
|
||||
force_multi_cycling = true;
|
||||
goto retry_select_mode;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* VS and TES. */
|
||||
esvert_lds_size = shader->info.ngg_lds_vertex_size / 4;
|
||||
}
|
||||
|
||||
unsigned max_gsprims = max_gsprims_base;
|
||||
unsigned max_esverts = max_esverts_base;
|
||||
|
||||
if (esvert_lds_size)
|
||||
max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
|
||||
if (gsprim_lds_size)
|
||||
max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
|
||||
|
||||
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
|
||||
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
|
||||
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
|
||||
|
||||
if (esvert_lds_size || gsprim_lds_size) {
|
||||
/* Now that we have a rough proportionality between esverts
|
||||
* and gsprims based on the primitive type, scale both of them
|
||||
* down simultaneously based on required LDS space.
|
||||
*
|
||||
* We could be smarter about this if we knew how much vertex
|
||||
* reuse to expect.
|
||||
*/
|
||||
unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
|
||||
if (lds_total > target_lds_size) {
|
||||
max_esverts = max_esverts * target_lds_size / lds_total;
|
||||
max_gsprims = max_gsprims * target_lds_size / lds_total;
|
||||
|
||||
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
|
||||
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
|
||||
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Round up towards full wave sizes for better ALU utilization. */
|
||||
if (!max_vert_out_per_gs_instance) {
|
||||
unsigned orig_max_esverts;
|
||||
unsigned orig_max_gsprims;
|
||||
do {
|
||||
orig_max_esverts = max_esverts;
|
||||
orig_max_gsprims = max_gsprims;
|
||||
|
||||
max_esverts = align(max_esverts, shader->wave_size);
|
||||
max_esverts = MIN2(max_esverts, max_esverts_base);
|
||||
if (esvert_lds_size)
|
||||
max_esverts =
|
||||
MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
|
||||
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
|
||||
|
||||
/* Hardware restriction: minimum value of max_esverts */
|
||||
max_esverts = MAX2(max_esverts, min_esverts);
|
||||
|
||||
max_gsprims = align(max_gsprims, shader->wave_size);
|
||||
max_gsprims = MIN2(max_gsprims, max_gsprims_base);
|
||||
if (gsprim_lds_size) {
|
||||
/* Don't count unusable vertices to the LDS size. Those are vertices above
|
||||
* the maximum number of vertices that can occur in the workgroup,
|
||||
* which is e.g. max_gsprims * 3 for triangles.
|
||||
*/
|
||||
unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
|
||||
max_gsprims =
|
||||
MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
|
||||
}
|
||||
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
|
||||
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
|
||||
} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
|
||||
|
||||
/* Verify the restriction. */
|
||||
assert(max_esverts >= min_esverts);
|
||||
} else {
|
||||
max_esverts = MAX2(max_esverts, min_esverts);
|
||||
}
|
||||
|
||||
unsigned max_out_vertices =
|
||||
max_vert_out_per_gs_instance
|
||||
? gs_sel->info.base.gs.vertices_out
|
||||
: gs_stage == MESA_SHADER_GEOMETRY
|
||||
? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out
|
||||
: max_esverts;
|
||||
assert(max_out_vertices <= 256);
|
||||
|
||||
shader->ngg.hw_max_esverts = max_esverts;
|
||||
shader->ngg.max_gsprims = max_gsprims;
|
||||
shader->ngg.max_out_verts = max_out_vertices;
|
||||
shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
|
||||
|
||||
/* Don't count unusable vertices. */
|
||||
shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
|
||||
esvert_lds_size;
|
||||
shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
|
||||
|
||||
if (gs_stage == MESA_SHADER_GEOMETRY)
|
||||
shader->ngg.ngg_emit_size += shader->info.ngg_lds_scratch_size / 4;
|
||||
else
|
||||
shader->gs_info.esgs_ring_size += shader->info.ngg_lds_scratch_size / 4;
|
||||
|
||||
assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */
|
||||
|
||||
/* If asserts are disabled, we use the same conditions to return false */
|
||||
return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
|
||||
max_out_vertices <= 256 &&
|
||||
shader->ngg.hw_max_esverts >= min_esverts;
|
||||
return ac_ngg_compute_subgroup_info(gs_sel->screen->info.gfx_level, es_sel->stage,
|
||||
gs_sel->stage == MESA_SHADER_GEOMETRY,
|
||||
input_prim, gs_vertices_out, gs_invocations,
|
||||
si_get_max_workgroup_size(shader), shader->wave_size,
|
||||
es_sel->info.esgs_vertex_stride, shader->info.ngg_lds_vertex_size,
|
||||
shader->info.ngg_lds_scratch_size, gs_sel->tess_turns_off_ngg,
|
||||
&shader->ngg.info);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -196,14 +196,15 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh
|
|||
(sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
|
||||
struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
|
||||
sym->name = "esgs_ring";
|
||||
sym->size = shader->gs_info.esgs_ring_size * 4;
|
||||
sym->size = (shader->key.ge.as_ngg ? shader->ngg.info.esgs_lds_size
|
||||
: shader->gs_info.esgs_ring_size) * 4;
|
||||
sym->align = 64 * 1024;
|
||||
}
|
||||
|
||||
if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
|
||||
struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
|
||||
sym->name = "ngg_emit";
|
||||
sym->size = shader->ngg.ngg_emit_size * 4;
|
||||
sym->size = shader->ngg.info.ngg_out_lds_size * 4;
|
||||
sym->align = 4;
|
||||
}
|
||||
|
||||
|
|
@ -454,10 +455,11 @@ static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shade
|
|||
|
||||
if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
|
||||
(stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
|
||||
unsigned size_in_dw = shader->gs_info.esgs_ring_size;
|
||||
unsigned size_in_dw = shader->key.ge.as_ngg ? shader->ngg.info.esgs_lds_size
|
||||
: shader->gs_info.esgs_ring_size;
|
||||
|
||||
if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
|
||||
size_in_dw += shader->ngg.ngg_emit_size;
|
||||
size_in_dw += shader->ngg.info.ngg_out_lds_size;
|
||||
|
||||
shader->config.lds_size =
|
||||
DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
|
||||
|
|
|
|||
|
|
@ -826,12 +826,8 @@ struct si_shader {
|
|||
} gs;
|
||||
|
||||
struct {
|
||||
/* Computed by gfx10_ngg_calculate_subgroup_info. */
|
||||
uint16_t ngg_emit_size; /* in dwords */
|
||||
uint16_t hw_max_esverts;
|
||||
uint16_t max_gsprims;
|
||||
uint16_t max_out_verts;
|
||||
bool max_vert_out_per_gs_instance;
|
||||
/* Computed by ac_ngg_calculate_subgroup_info. */
|
||||
ac_ngg_subgroup_info info;
|
||||
/* Register values. */
|
||||
unsigned ge_max_output_per_subgroup;
|
||||
unsigned ge_ngg_subgrp_cntl;
|
||||
|
|
|
|||
|
|
@ -209,7 +209,8 @@ si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code_for_write,
|
|||
break;
|
||||
case aco_symbol_lds_ngg_gs_out_vertex_base:
|
||||
assert(sel->stage == MESA_SHADER_GEOMETRY && key->ge.as_ngg);
|
||||
value = shader->gs_info.esgs_ring_size * 4;
|
||||
value = (key->ge.as_ngg ? shader->ngg.info.esgs_lds_size
|
||||
: shader->gs_info.esgs_ring_size) * 4;
|
||||
break;
|
||||
case aco_symbol_const_data_addr:
|
||||
if (!const_offset)
|
||||
|
|
|
|||
|
|
@ -1603,11 +1603,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
|||
: V_02870C_SPI_SHADER_NONE) |
|
||||
S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
|
||||
: V_02870C_SPI_SHADER_NONE);
|
||||
shader->ngg.ge_max_output_per_subgroup = S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts);
|
||||
shader->ngg.ge_max_output_per_subgroup = S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.info.max_out_verts);
|
||||
shader->ngg.vgt_gs_instance_cnt =
|
||||
S_028B90_ENABLE(gs_num_invocations > 1) |
|
||||
S_028B90_CNT(gs_num_invocations) |
|
||||
S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance);
|
||||
S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.info.max_vert_out_per_gs_instance);
|
||||
shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, true);
|
||||
|
||||
if (gs_stage == MESA_SHADER_GEOMETRY) {
|
||||
|
|
@ -1710,19 +1710,19 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
|||
unsigned prim_amp_factor = gs_stage == MESA_SHADER_GEOMETRY ?
|
||||
gs_sel->info.base.gs.vertices_out : 1;
|
||||
|
||||
shader->ge_cntl = S_03096C_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
|
||||
S_03096C_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
|
||||
shader->ge_cntl = S_03096C_PRIMS_PER_SUBGRP(shader->ngg.info.max_gsprims) |
|
||||
S_03096C_VERTS_PER_SUBGRP(shader->ngg.info.hw_max_esverts) |
|
||||
S_03096C_PRIM_GRP_SIZE_GFX11(
|
||||
CLAMP(max_prim_grp_size / MAX2(prim_amp_factor, 1), 1, 256)) |
|
||||
S_03096C_DIS_PG_SIZE_ADJUST_FOR_STRIP(sscreen->info.gfx_level >= GFX12);
|
||||
} else {
|
||||
shader->ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(shader->ngg.max_gsprims) |
|
||||
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts);
|
||||
shader->ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(shader->ngg.info.max_gsprims) |
|
||||
S_03096C_VERT_GRP_SIZE(shader->ngg.info.hw_max_esverts);
|
||||
|
||||
shader->ngg.vgt_gs_onchip_cntl =
|
||||
S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
|
||||
S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
|
||||
S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
|
||||
S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.info.hw_max_esverts) |
|
||||
S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.info.max_gsprims) |
|
||||
S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.info.max_gsprims * gs_num_invocations);
|
||||
|
||||
/* On gfx10, the GE only checks against the maximum number of ES verts after
|
||||
* allocating a full GS primitive. So we need to ensure that whenever
|
||||
|
|
@ -1734,13 +1734,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
|
|||
*/
|
||||
if ((sscreen->info.gfx_level == GFX10) &&
|
||||
(es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
|
||||
shader->ngg.hw_max_esverts != 256 &&
|
||||
shader->ngg.hw_max_esverts > 5) {
|
||||
shader->ngg.info.hw_max_esverts != 256 &&
|
||||
shader->ngg.info.hw_max_esverts > 5) {
|
||||
/* This could be based on the input primitive type. 5 is the worst case
|
||||
* for primitive types with adjacency.
|
||||
*/
|
||||
shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
|
||||
shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
|
||||
shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.info.hw_max_esverts - 5);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue