diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index e908efe8077..6be802d1425 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -377,7 +377,7 @@ radv_postprocess_nir(const struct radv_compiler_info *compiler_info, const struc /* Always load all VS inputs at the top to eliminate needless VMEM->s_wait->VMEM sequences. * Each s_wait can cost 1000 cycles, so make sure all VS input loads are grouped. */ - NIR_PASS(_, stage->nir, nir_opt_move_to_top, nir_move_to_top_input_loads); + NIR_PASS(_, stage->nir, nir_opt_move_to_top, nir_move_to_top_input_loads_simple); NIR_PASS(_, stage->nir, nir_opt_sink, sink_opts); NIR_PASS(_, stage->nir, nir_opt_move, sink_opts); } else { diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index f79f5104af4..60b59063f17 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -5892,8 +5892,21 @@ typedef enum { nir_move_to_entry_block_only = BITFIELD_BIT(0), /* Instruction options. */ - nir_move_to_top_input_loads = BITFIELD_BIT(1), - nir_move_to_top_load_smem_amd = BITFIELD_BIT(2), + + /* Simple input loads are non-interpolated loads and interpolated loads + * with pixel, centroid, and sample barycentrics. Other barycentrics are + * excluded. + */ + nir_move_to_top_input_loads_simple = BITFIELD_BIT(1), + + /* Interpolated loads with non-trivial barycentrics, such as at_offset and + * at_sample. 
(This option is not recommended for the game Control because + * it moves at_sample with complex ALU perspective-correct interpolation + * out of conditional blocks.) + */ + nir_move_to_top_input_loads_complex_baryc = BITFIELD_BIT(2), + + nir_move_to_top_load_smem_amd = BITFIELD_BIT(3), } nir_opt_move_to_top_options; bool nir_opt_move_to_top(nir_shader *nir, nir_opt_move_to_top_options options); diff --git a/src/compiler/nir/nir_opt_move_to_top.c b/src/compiler/nir/nir_opt_move_to_top.c index 747ef749446..ffe30449b85 100644 --- a/src/compiler/nir/nir_opt_move_to_top.c +++ b/src/compiler/nir/nir_opt_move_to_top.c @@ -12,9 +12,9 @@ * of instructions that are moved. * * Used either as a scheduling optimization or to accommodate hw or compiler - * backend limitations. You would typically use this if you don't use - * nir_lower_io_vars_to_temporaries and want to move input loads to top, - * but note that such global code motion passes often increase register usage. + * backend limitations. It would typically be used if + * nir_lower_io_vars_to_temporaries isn't used and it's desirable to move input + * loads to top, but such global code motion often increases register usage. */ #include "nir.h" @@ -138,10 +138,29 @@ handle_load(nir_builder *b, nir_intrinsic_instr *intr, void *_state) * an input load. The specific intrinsics that are moved are * listed in can_move_src_to_top. 
*/ - move |= state->options & nir_move_to_top_input_loads && - nir_intrinsic_has_io_semantics(intr) && - nir_intrinsic_infos[intr->intrinsic].has_dest && - !nir_is_output_load(intr); + if (state->options & (nir_move_to_top_input_loads_simple | + nir_move_to_top_input_loads_complex_baryc) && + nir_intrinsic_has_io_semantics(intr) && + nir_intrinsic_infos[intr->intrinsic].has_dest && + !nir_is_output_load(intr)) { + + if (intr->intrinsic == nir_intrinsic_load_interpolated_input) { + nir_intrinsic_instr *baryc = + nir_def_as_intrinsic_or_null(intr->src[0].ssa); + + nir_opt_move_to_top_options baryc_option = + baryc && + (baryc->intrinsic == nir_intrinsic_load_barycentric_pixel || + baryc->intrinsic == nir_intrinsic_load_barycentric_centroid || + baryc->intrinsic == nir_intrinsic_load_barycentric_sample) ? + nir_move_to_top_input_loads_simple : + nir_move_to_top_input_loads_complex_baryc; + + move |= !!(state->options & baryc_option); + } else { + move |= !!(state->options & nir_move_to_top_input_loads_simple); + } + } move |= state->options & nir_move_to_top_load_smem_amd && (intr->intrinsic == nir_intrinsic_load_global_amd && diff --git a/src/compiler/nir/nir_opt_varyings.c b/src/compiler/nir/nir_opt_varyings.c index 4a2fe57c266..54ee5bbc611 100644 --- a/src/compiler/nir/nir_opt_varyings.c +++ b/src/compiler/nir/nir_opt_varyings.c @@ -5604,7 +5604,7 @@ nir_opt_varyings_bulk(nir_shader **shaders, uint32_t num_shaders, bool spirv, if (nir->info.stage == MESA_SHADER_FRAGMENT) { NIR_PASS(_, nir, nir_opt_move_to_top, nir_move_to_entry_block_only | - nir_move_to_top_input_loads); + nir_move_to_top_input_loads_simple); } /* nir_opt_varyings requires scalar IO. Scalarize all varyings (not just diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index a04061e8106..988c133c422 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -728,8 +728,11 @@ ir3_finalize_nir(struct ir3_compiler *compiler, * more optimal at the top. 
*/ if (s->info.stage == MESA_SHADER_VERTEX || - s->info.stage == MESA_SHADER_FRAGMENT) - NIR_PASS(_, s, nir_opt_move_to_top, nir_move_to_top_input_loads); + s->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, s, nir_opt_move_to_top, + nir_move_to_top_input_loads_simple | + nir_move_to_top_input_loads_complex_baryc); + } if (s->info.stage == MESA_SHADER_GEOMETRY) { /* nir_unlower_io_to_vars expects constant indirect offsets to be folded diff --git a/src/gallium/drivers/radeonsi/gfx/si_shader.c b/src/gallium/drivers/radeonsi/gfx/si_shader.c index f3e8e1ac95c..6fe325ee52c 100644 --- a/src/gallium/drivers/radeonsi/gfx/si_shader.c +++ b/src/gallium/drivers/radeonsi/gfx/si_shader.c @@ -914,7 +914,7 @@ static void si_preprocess_nir(struct si_nir_shader_ctx *ctx) */ if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_FRAGMENT) - NIR_PASS(progress, nir, nir_opt_move_to_top, nir_move_to_top_input_loads); + NIR_PASS(progress, nir, nir_opt_move_to_top, nir_move_to_top_input_loads_simple); /* Remove dead temps before we lower indirect indexing. */ NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_function_temp, NULL);