radeonsi: do opt_large_constants & lower_indirect_derefs after uniform inlining

Loop unrolling caused by uniform inlining can eliminate large constants and
indirect derefs, so these passes now run after uniform inlining.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14266>
This commit is contained in:
Marek Olšák 2021-12-11 23:51:03 -05:00 committed by Marge Bot
parent 198ad7e4dc
commit 3fb77ef2e0
2 changed files with 34 additions and 25 deletions

View file

@ -23,9 +23,10 @@
*/
#include "ac_exp_param.h"
#include "ac_nir.h"
#include "ac_rtld.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_serialize.h"
#include "nir.h"
#include "nir_serialize.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
@ -1386,6 +1387,8 @@ struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel,
return NULL;
}
bool progress = false;
bool inline_uniforms = false;
uint32_t *inlined_uniform_values;
si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type,
@ -1437,14 +1440,37 @@ struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel,
nir->info.num_inlinable_uniforms,
inlined_uniform_values,
nir->info.inlinable_uniform_dw_offsets);
progress = true;
}
if (progress)
si_nir_opts(sel->screen, nir, true);
/* Lower large variables that are always constant with load_constant intrinsics, which
* get turned into PC-relative loads from a data section next to the shader.
*
* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
* this should be done after that.
*
* The pass crashes if there are dead temps of lowered IO interface types, so remove
* them first.
*/
bool progress2 = false;
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
NIR_PASS(progress2, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
/* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
* this should be done after that.
*/
progress2 |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.chip_class);
if (progress2)
si_nir_opts(sel->screen, nir, false);
if (progress || progress2)
si_nir_late_opts(nir);
/* This must be done again. */
NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in |
nir_var_shader_out);
}
NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);
/* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
* 200 is tuned for Viewperf. It should be done last.

View file

@ -23,7 +23,6 @@
*/
#include "ac_nir_to_llvm.h"
#include "ac_nir.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_deref.h"
@ -412,6 +411,8 @@ static void scan_instruction(const struct nir_shader *nir, struct si_shader_info
break;
case nir_intrinsic_load_deref:
case nir_intrinsic_store_deref:
/* These can only occur if there is indirect temp indexing. */
break;
case nir_intrinsic_interp_deref_at_centroid:
case nir_intrinsic_interp_deref_at_sample:
case nir_intrinsic_interp_deref_at_offset:
@ -936,24 +937,6 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
}
si_nir_opts(sscreen, nir, true);
/* Lower large variables that are always constant with load_constant
* intrinsics, which get turned into PC-relative loads from a data
* section next to the shader.
*
* st/mesa calls finalize_nir twice, but we can't call this pass twice.
*/
bool changed = false;
if (!nir->constant_data) {
/* The pass crashes if there are dead temps of lowered IO interface types. */
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
NIR_PASS(changed, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
}
changed |= ac_nir_lower_indirect_derefs(nir, sscreen->info.chip_class);
if (changed)
si_nir_opts(sscreen, nir, false);
/* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
si_nir_late_opts(nir);