radeonsi: consider both stages to determine merged shader wave_size

Previously we determined the wave_size of each merged shader stage
separately, and ignored the conditions which may cause them to differ.

Now we determine the wave_size of the TCS/GS part first, then use that
wave_size for the VS/TES part. This way we can consider the previous
shader stage's information when determining the wave_size of TCS/GS, so
the two stages in the merged shader can affect each other's wave_size.

This requires si_shader_selector to keep two main parts, one for wave32
and one for wave64, when in part mode, so that each can be combined with
other shader parts of the matching wave size.

This also enables merged shader stages with different
si_shader_info->has_divergent_loop to use wave32. We'll add another
condition for KHR_shader_subgroup later.

Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30610>
This commit is contained in:
Qiang Yu 2024-08-06 14:30:54 +08:00
parent 196d91ed78
commit a78d1d49e6
3 changed files with 74 additions and 53 deletions

View file

@ -3144,8 +3144,11 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
struct si_shader *shader, struct util_debug_callback *debug)
{
if (sscreen->info.gfx_level >= GFX9)
shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls;
if (sscreen->info.gfx_level >= GFX9) {
assert(shader->wave_size == 32 || shader->wave_size == 64);
unsigned index = shader->wave_size / 32 - 1;
shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls[index];
}
return true;
}
@ -3157,10 +3160,13 @@ static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_
struct si_shader *shader, struct util_debug_callback *debug)
{
if (sscreen->info.gfx_level >= GFX9) {
if (shader->key.ge.as_ngg)
shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es;
else
if (shader->key.ge.as_ngg) {
assert(shader->wave_size == 32 || shader->wave_size == 64);
unsigned index = shader->wave_size / 32 - 1;
shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es[index];
} else {
shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es;
}
}
return true;
@ -3382,7 +3388,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
struct si_shader *shader, struct util_debug_callback *debug)
{
struct si_shader_selector *sel = shader->selector;
struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key, shader->wave_size);
if (sel->stage == MESA_SHADER_FRAGMENT) {
shader->ps.writes_samplemask = sel->info.writes_samplemask &&
@ -3445,14 +3451,13 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
* by multiple contexts.
*/
if (!shader->key.ge.as_ngg) {
assert(sel->main_shader_part == mainp);
assert(sel->main_shader_part->gs_copy_shader);
assert(sel->main_shader_part->gs_copy_shader->bo);
assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel);
assert(!sel->main_shader_part->gs_copy_shader->scratch_va);
assert(mainp->gs_copy_shader);
assert(mainp->gs_copy_shader->bo);
assert(!mainp->gs_copy_shader->previous_stage_sel);
assert(!mainp->gs_copy_shader->scratch_va);
shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader,
memcpy(shader->gs_copy_shader, mainp->gs_copy_shader,
sizeof(*shader->gs_copy_shader));
/* Increase the reference count. */
pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);

View file

@ -552,12 +552,14 @@ struct si_shader_selector {
/* The compiled NIR shader without a prolog and/or epilog (not
* uploaded to a buffer object).
*
* [0] for wave32, [1] for wave64.
*/
struct si_shader *main_shader_part;
struct si_shader *main_shader_part_ls; /* as_ls is set in the key */
struct si_shader *main_shader_part_es; /* as_es is set in the key */
struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */
struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
struct si_shader *main_shader_part[2];
struct si_shader *main_shader_part_ls[2]; /* as_ls is set in the key */
struct si_shader *main_shader_part_es; /* as_es && !as_ngg in the key */
struct si_shader *main_shader_part_ngg[2]; /* !as_es && as_ngg in the key */
struct si_shader *main_shader_part_ngg_es[2]; /* as_es && as_ngg in the key */
struct nir_shader *nir;
void *nir_binary;
@ -1039,19 +1041,26 @@ bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *sha
/* Return the pointer to the main shader part's pointer. */
static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
const union si_shader_key *key)
const union si_shader_key *key,
unsigned wave_size)
{
assert(wave_size == 32 || wave_size == 64);
unsigned index = wave_size / 32 - 1;
if (sel->stage <= MESA_SHADER_GEOMETRY) {
if (key->ge.as_ls)
return &sel->main_shader_part_ls;
return &sel->main_shader_part_ls[index];
if (key->ge.as_es && key->ge.as_ngg)
return &sel->main_shader_part_ngg_es;
if (key->ge.as_es)
return &sel->main_shader_part_ngg_es[index];
if (key->ge.as_es) {
/* legacy GS only supports wave64 */
assert(wave_size == 64);
return &sel->main_shader_part_es;
}
if (key->ge.as_ngg)
return &sel->main_shader_part_ngg;
return &sel->main_shader_part_ngg[index];
}
return &sel->main_shader_part;
return &sel->main_shader_part[index];
}
static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector)

View file

@ -31,6 +31,12 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
struct si_shader_info *info = &shader->selector->info;
gl_shader_stage stage = shader->selector->stage;
struct si_shader_selector *prev_sel = NULL;
if (stage == MESA_SHADER_TESS_CTRL)
prev_sel = shader->key.ge.part.tcs.ls;
else if (stage == MESA_SHADER_GEOMETRY)
prev_sel = shader->key.ge.part.gs.es;
if (sscreen->info.gfx_level < GFX10)
return 64;
@ -87,20 +93,15 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
!(sscreen->info.gfx_level == GFX10 && shader->key.ge.opt.ngg_culling))
return 32;
/* TODO: Merged shaders must use the same wave size because the driver doesn't recompile
* individual shaders of merged shaders to match the wave size between them.
*/
bool merged_shader = stage <= MESA_SHADER_GEOMETRY && !shader->is_gs_copy_shader &&
(shader->key.ge.as_ls || shader->key.ge.as_es ||
stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_GEOMETRY);
/* Divergent loops in Wave64 can end up having too many iterations in one half of the wave
* while the other half is idling but occupying VGPRs, preventing other waves from launching.
* Wave32 eliminates the idling half to allow the next wave to start.
*
* Gfx11: Wave32 continues to be faster with divergent loops despite worse VALU performance.
*/
if (!merged_shader && info->has_divergent_loop)
if (info->has_divergent_loop ||
/* Merged shader has to use same wave size for two shader stages. */
(prev_sel && prev_sel->info.has_divergent_loop))
return 32;
return 64;
@ -2867,9 +2868,9 @@ static union si_shader_key zeroed;
static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel,
struct si_compiler_ctx_state *compiler_state,
const union si_shader_key *key)
const union si_shader_key *key, unsigned wave_size)
{
struct si_shader **mainp = si_get_main_shader_part(sel, key);
struct si_shader **mainp = si_get_main_shader_part(sel, key, wave_size);
if (!*mainp) {
struct si_shader *main_part = CALLOC_STRUCT(si_shader);
@ -2889,7 +2890,7 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad
main_part->key.ge.as_ngg = key->ge.as_ngg;
}
main_part->is_monolithic = false;
main_part->wave_size = si_determine_wave_size(sscreen, main_part);
main_part->wave_size = wave_size;
if (!si_compile_shader(sscreen, compiler_state->compiler, main_part,
&compiler_state->debug)) {
@ -3104,13 +3105,13 @@ current_not_ready:
simple_mtx_lock(&previous_stage_sel->mutex);
ok = si_check_missing_main_part(sscreen, previous_stage_sel, &shader->compiler_ctx_state,
&shader1_key);
&shader1_key, shader->wave_size);
simple_mtx_unlock(&previous_stage_sel->mutex);
}
if (ok) {
ok = si_check_missing_main_part(sscreen, sel, &shader->compiler_ctx_state,
(union si_shader_key*)key);
(union si_shader_key*)key, shader->wave_size);
}
if (!ok) {
@ -3346,7 +3347,7 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind
simple_mtx_unlock(&sscreen->shader_cache_mutex);
}
*si_get_main_shader_part(sel, &shader->key) = shader;
*si_get_main_shader_part(sel, &shader->key, shader->wave_size) = shader;
/* Unset "outputs_written" flags for outputs converted to
* DEFAULT_VAL, so that later inter-shader optimizations don't
@ -3575,16 +3576,19 @@ static void *si_create_shader(struct pipe_context *ctx, const struct pipe_shader
ctx, &sscreen->live_shader_cache, state, &cache_hit);
if (sel && cache_hit && sctx->debug.debug_message) {
if (sel->main_shader_part)
si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part, &sctx->debug);
if (sel->main_shader_part_ls)
si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ls, &sctx->debug);
for (unsigned i = 0; i < 2; i++) {
if (sel->main_shader_part[i])
si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part[i], &sctx->debug);
if (sel->main_shader_part_ls[i])
si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ls[i], &sctx->debug);
if (sel->main_shader_part_ngg[i])
si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg[i], &sctx->debug);
if (sel->main_shader_part_ngg_es[i])
si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg_es[i], &sctx->debug);
}
if (sel->main_shader_part_es)
si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_es, &sctx->debug);
if (sel->main_shader_part_ngg)
si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg, &sctx->debug);
if (sel->main_shader_part_ngg_es)
si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg_es, &sctx->debug);
}
return sel;
}
@ -4017,16 +4021,19 @@ static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso)
si_delete_shader(sctx, sel->variants[i]);
}
if (sel->main_shader_part)
si_delete_shader(sctx, sel->main_shader_part);
if (sel->main_shader_part_ls)
si_delete_shader(sctx, sel->main_shader_part_ls);
for (unsigned i = 0; i < 2; i++) {
if (sel->main_shader_part[i])
si_delete_shader(sctx, sel->main_shader_part[i]);
if (sel->main_shader_part_ls[i])
si_delete_shader(sctx, sel->main_shader_part_ls[i]);
if (sel->main_shader_part_ngg[i])
si_delete_shader(sctx, sel->main_shader_part_ngg[i]);
if (sel->main_shader_part_ngg_es[i])
si_delete_shader(sctx, sel->main_shader_part_ngg_es[i]);
}
if (sel->main_shader_part_es)
si_delete_shader(sctx, sel->main_shader_part_es);
if (sel->main_shader_part_ngg)
si_delete_shader(sctx, sel->main_shader_part_ngg);
if (sel->main_shader_part_ngg_es)
si_delete_shader(sctx, sel->main_shader_part_ngg_es);
free(sel->keys);
free(sel->variants);