diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 2e2ecc8d63e..30d521732bb 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3144,8 +3144,11 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, struct si_shader *shader, struct util_debug_callback *debug) { - if (sscreen->info.gfx_level >= GFX9) - shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls; + if (sscreen->info.gfx_level >= GFX9) { + assert(shader->wave_size == 32 || shader->wave_size == 64); + unsigned index = shader->wave_size / 32 - 1; + shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls[index]; + } return true; } @@ -3157,10 +3160,13 @@ static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_ struct si_shader *shader, struct util_debug_callback *debug) { if (sscreen->info.gfx_level >= GFX9) { - if (shader->key.ge.as_ngg) - shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es; - else + if (shader->key.ge.as_ngg) { + assert(shader->wave_size == 32 || shader->wave_size == 64); + unsigned index = shader->wave_size / 32 - 1; + shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es[index]; + } else { shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es; + } } return true; @@ -3382,7 +3388,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler struct si_shader *shader, struct util_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; - struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); + struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key, shader->wave_size); if (sel->stage == MESA_SHADER_FRAGMENT) { shader->ps.writes_samplemask = sel->info.writes_samplemask && @@ 
-3445,14 +3451,13 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler * by multiple contexts. */ if (!shader->key.ge.as_ngg) { - assert(sel->main_shader_part == mainp); - assert(sel->main_shader_part->gs_copy_shader); - assert(sel->main_shader_part->gs_copy_shader->bo); - assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel); - assert(!sel->main_shader_part->gs_copy_shader->scratch_va); + assert(mainp->gs_copy_shader); + assert(mainp->gs_copy_shader->bo); + assert(!mainp->gs_copy_shader->previous_stage_sel); + assert(!mainp->gs_copy_shader->scratch_va); shader->gs_copy_shader = CALLOC_STRUCT(si_shader); - memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader, + memcpy(shader->gs_copy_shader, mainp->gs_copy_shader, sizeof(*shader->gs_copy_shader)); /* Increase the reference count. */ pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 3fca476cceb..a0ba146f150 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -552,12 +552,14 @@ struct si_shader_selector { /* The compiled NIR shader without a prolog and/or epilog (not * uploaded to a buffer object). + * + * [0] for wave32, [1] for wave64. 
 */ - struct si_shader *main_shader_part; - struct si_shader *main_shader_part_ls; /* as_ls is set in the key */ - struct si_shader *main_shader_part_es; /* as_es is set in the key */ - struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */ - struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */ + struct si_shader *main_shader_part[2]; + struct si_shader *main_shader_part_ls[2]; /* as_ls is set in the key */ + struct si_shader *main_shader_part_es; /* as_es && !as_ngg in the key */ + struct si_shader *main_shader_part_ngg[2]; /* !as_es && as_ngg in the key */ + struct si_shader *main_shader_part_ngg_es[2]; /* as_es && as_ngg in the key */ struct nir_shader *nir; void *nir_binary; @@ -1039,19 +1041,26 @@ bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *sha /* Return the pointer to the main shader part's pointer. */ static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, - const union si_shader_key *key) + const union si_shader_key *key, + unsigned wave_size) { + assert(wave_size == 32 || wave_size == 64); + unsigned index = wave_size / 32 - 1; + if (sel->stage <= MESA_SHADER_GEOMETRY) { if (key->ge.as_ls) - return &sel->main_shader_part_ls; + return &sel->main_shader_part_ls[index]; if (key->ge.as_es && key->ge.as_ngg) - return &sel->main_shader_part_ngg_es; - if (key->ge.as_es) + return &sel->main_shader_part_ngg_es[index]; + if (key->ge.as_es) { + /* Legacy GS only supports Wave64. */ + assert(wave_size == 64); return &sel->main_shader_part_es; + } if (key->ge.as_ngg) - return &sel->main_shader_part_ngg; + return &sel->main_shader_part_ngg[index]; } - return &sel->main_shader_part; + return &sel->main_shader_part[index]; } static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 92abebc719f..32e0c33febc 100644 --- 
a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -31,6 +31,12 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha struct si_shader_info *info = &shader->selector->info; gl_shader_stage stage = shader->selector->stage; + struct si_shader_selector *prev_sel = NULL; + if (stage == MESA_SHADER_TESS_CTRL) + prev_sel = shader->key.ge.part.tcs.ls; + else if (stage == MESA_SHADER_GEOMETRY) + prev_sel = shader->key.ge.part.gs.es; + if (sscreen->info.gfx_level < GFX10) return 64; @@ -87,20 +93,15 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha !(sscreen->info.gfx_level == GFX10 && shader->key.ge.opt.ngg_culling)) return 32; - /* TODO: Merged shaders must use the same wave size because the driver doesn't recompile - * individual shaders of merged shaders to match the wave size between them. - */ - bool merged_shader = stage <= MESA_SHADER_GEOMETRY && !shader->is_gs_copy_shader && - (shader->key.ge.as_ls || shader->key.ge.as_es || - stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_GEOMETRY); - /* Divergent loops in Wave64 can end up having too many iterations in one half of the wave * while the other half is idling but occupying VGPRs, preventing other waves from launching. * Wave32 eliminates the idling half to allow the next wave to start. * * Gfx11: Wave32 continues to be faster with divergent loops despite worse VALU performance. */ - if (!merged_shader && info->has_divergent_loop) + if (info->has_divergent_loop || + /* A merged shader has to use the same wave size for both of its shader stages. 
*/ + (prev_sel && prev_sel->info.has_divergent_loop)) return 32; return 64; @@ -2867,9 +2868,9 @@ static union si_shader_key zeroed; static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel, struct si_compiler_ctx_state *compiler_state, - const union si_shader_key *key) + const union si_shader_key *key, unsigned wave_size) { - struct si_shader **mainp = si_get_main_shader_part(sel, key); + struct si_shader **mainp = si_get_main_shader_part(sel, key, wave_size); if (!*mainp) { struct si_shader *main_part = CALLOC_STRUCT(si_shader); @@ -2889,7 +2890,7 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shad main_part->key.ge.as_ngg = key->ge.as_ngg; } main_part->is_monolithic = false; - main_part->wave_size = si_determine_wave_size(sscreen, main_part); + main_part->wave_size = wave_size; if (!si_compile_shader(sscreen, compiler_state->compiler, main_part, &compiler_state->debug)) { @@ -3104,13 +3105,13 @@ current_not_ready: simple_mtx_lock(&previous_stage_sel->mutex); ok = si_check_missing_main_part(sscreen, previous_stage_sel, &shader->compiler_ctx_state, - &shader1_key); + &shader1_key, shader->wave_size); simple_mtx_unlock(&previous_stage_sel->mutex); } if (ok) { ok = si_check_missing_main_part(sscreen, sel, &shader->compiler_ctx_state, - (union si_shader_key*)key); + (union si_shader_key*)key, shader->wave_size); } if (!ok) { @@ -3346,7 +3347,7 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind simple_mtx_unlock(&sscreen->shader_cache_mutex); } - *si_get_main_shader_part(sel, &shader->key) = shader; + *si_get_main_shader_part(sel, &shader->key, shader->wave_size) = shader; /* Unset "outputs_written" flags for outputs converted to * DEFAULT_VAL, so that later inter-shader optimizations don't @@ -3575,16 +3576,19 @@ static void *si_create_shader(struct pipe_context *ctx, const struct pipe_shader ctx, &sscreen->live_shader_cache, state, &cache_hit); if (sel && 
cache_hit && sctx->debug.debug_message) { - if (sel->main_shader_part) - si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part, &sctx->debug); - if (sel->main_shader_part_ls) - si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ls, &sctx->debug); + for (unsigned i = 0; i < 2; i++) { + if (sel->main_shader_part[i]) + si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part[i], &sctx->debug); + if (sel->main_shader_part_ls[i]) + si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ls[i], &sctx->debug); + if (sel->main_shader_part_ngg[i]) + si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg[i], &sctx->debug); + if (sel->main_shader_part_ngg_es[i]) + si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg_es[i], &sctx->debug); + } + if (sel->main_shader_part_es) si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_es, &sctx->debug); - if (sel->main_shader_part_ngg) - si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg, &sctx->debug); - if (sel->main_shader_part_ngg_es) - si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg_es, &sctx->debug); } return sel; } @@ -4017,16 +4021,19 @@ static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) si_delete_shader(sctx, sel->variants[i]); } - if (sel->main_shader_part) - si_delete_shader(sctx, sel->main_shader_part); - if (sel->main_shader_part_ls) - si_delete_shader(sctx, sel->main_shader_part_ls); + for (unsigned i = 0; i < 2; i++) { + if (sel->main_shader_part[i]) + si_delete_shader(sctx, sel->main_shader_part[i]); + if (sel->main_shader_part_ls[i]) + si_delete_shader(sctx, sel->main_shader_part_ls[i]); + if (sel->main_shader_part_ngg[i]) + si_delete_shader(sctx, sel->main_shader_part_ngg[i]); + if (sel->main_shader_part_ngg_es[i]) + si_delete_shader(sctx, sel->main_shader_part_ngg_es[i]); + } + if (sel->main_shader_part_es) si_delete_shader(sctx, sel->main_shader_part_es); - if 
(sel->main_shader_part_ngg) - si_delete_shader(sctx, sel->main_shader_part_ngg); - if (sel->main_shader_part_ngg_es) - si_delete_shader(sctx, sel->main_shader_part_ngg_es); free(sel->keys); free(sel->variants);