nir/lower_shader_calls: move scratch loads closer to where they're needed

The Intel backend compiler does not deal very well with the scratch
loads emitted by this pass. There are two reasons for this:

  - all loads are at the top of the shader

  - the loads are global load intrinsics (they cannot be differentiated
    from SSBO loads, for example)

This leads the backend to generate a ridiculous amount of spills.

To help a bit (actually quite a lot), we can move the scratch loads into
the blocks where they're needed, using dominance information. Quite often
that also ends up moving a load into a block that might not be reached by
all lanes, so we potentially avoid some loads entirely.
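
To illustrate the idea (a schematic C analogue; load_stack()/shade()/miss()
are made-up stand-ins, not the actual NIR emitted by the pass):

/* Before: every spilled value is reloaded at the top of the resume shader. */
extern float load_stack(unsigned offset);
extern float shade(float v), miss(float v);

float resume_before(int hit)
{
   float a = load_stack(0);
   float b = load_stack(16);
   return hit ? shade(a) : miss(b);
}

/* After nir_opt_stack_loads: each reload is moved into the branch that uses
 * it, so lanes that never take a branch never pay for its load. */
float resume_after(int hit)
{
   if (hit)
      return shade(load_stack(0));
   else
      return miss(load_stack(16));
}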

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16556>
Lionel Landwerlin 2022-05-18 18:31:27 +03:00 committed by Marge Bot
parent 5717f13dff
commit 3c242e551d
2 changed files with 104 additions and 2 deletions

@@ -4840,6 +4840,11 @@ typedef struct nir_lower_shader_calls_options {
   /* Stack alignment */
   unsigned stack_alignment;

   /* Put loads from the stack as close as possible to where they're needed.
    * You might want to disable combined_loads for best effect.
    */
   bool localized_loads;
} nir_lower_shader_calls_options;

bool

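A hedged sketch of how a driver might opt in (the address_format and
stack_alignment values below are illustrative assumptions; only
localized_loads is the field added by this commit):

/* Illustrative driver-side setup; values other than localized_loads are
 * assumptions. The struct is then passed to nir_lower_shader_calls(). */
const nir_lower_shader_calls_options opts = {
   .address_format  = nir_address_format_64bit_global,
   .stack_alignment = 16,
   .localized_loads = true,
};
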
@@ -1685,6 +1685,96 @@ nir_opt_sort_and_pack_stack(nir_shader *shader,
   return true;
}

/* Find the last block dominating all the uses of a SSA value. */
static nir_block *
find_last_dominant_use_block(nir_function_impl *impl, nir_ssa_def *value)
{
   nir_foreach_block_reverse_safe(block, impl) {
      bool fits = true;

      /* Store on the current block of the value */
      if (block == value->parent_instr->block)
         return block;

      nir_foreach_if_use(src, value) {
         nir_block *block_before_if =
            nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node));
         if (!nir_block_dominates(block, block_before_if)) {
            fits = false;
            break;
         }
      }
      if (!fits)
         continue;

      nir_foreach_use(src, value) {
         if (src->parent_instr->type == nir_instr_type_phi &&
             block == src->parent_instr->block) {
            fits = false;
            break;
         }

         if (!nir_block_dominates(block, src->parent_instr->block)) {
            fits = false;
            break;
         }
      }
      if (!fits)
         continue;

      return block;
   }

   unreachable("Cannot find block");
}

/* Put the scratch loads in the branches where they're needed. */
static bool
nir_opt_stack_loads(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(func, shader) {
      if (!func->impl)
         continue;

      nir_metadata_require(func->impl, nir_metadata_dominance |
                                       nir_metadata_block_index);

      bool func_progress = false;
      nir_foreach_block_safe(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_stack)
               continue;

            nir_ssa_def *value = &intrin->dest.ssa;
            nir_block *new_block = find_last_dominant_use_block(func->impl, value);
            if (new_block == block)
               continue;

            /* Move the scratch load in the new block, after the phis. */
            nir_instr_remove(instr);
            nir_instr_insert(nir_before_block_after_phis(new_block), instr);

            func_progress = true;
         }
      }

      nir_metadata_preserve(func->impl,
                            func_progress ? (nir_metadata_block_index |
                                             nir_metadata_dominance |
                                             nir_metadata_loop_analysis) :
                                            nir_metadata_all);
      progress |= func_progress;
   }

   return progress;
}

/** Lower shader call instructions to split shaders.
 *
 * Shader calls can be split into an initial shader and a series of "resume"
@@ -1785,8 +1875,15 @@ nir_lower_shader_calls(nir_shader *shader,
   for (unsigned i = 0; i < num_calls; i++)
      NIR_PASS_V(resume_shaders[i], nir_opt_remove_respills);

   NIR_PASS_V(shader, nir_lower_stack_to_scratch,
              options->address_format);

   if (options->localized_loads) {
      /* Once loads have been combined we can try to put them closer to where
       * they're needed.
       */
      for (unsigned i = 0; i < num_calls; i++)
         NIR_PASS_V(resume_shaders[i], nir_opt_stack_loads);
   }

   NIR_PASS_V(shader, nir_lower_stack_to_scratch, options->address_format);

   for (unsigned i = 0; i < num_calls; i++) {
      NIR_PASS_V(resume_shaders[i], nir_lower_stack_to_scratch,
                 options->address_format);