mesa/src/compiler/nir/nir_opt_sink.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

269 lines
8.5 KiB
C
Raw Normal View History

/*
* Copyright © 2018 Red Hat
* Copyright © 2019 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Rob Clark (robdclark@gmail.com)
* Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
* Rhys Perry (pendingchaos02@gmail.com)
*
*/
#include "nir.h"
/*
* A simple pass that moves some instructions into the least common
* ancestor of consuming instructions.
*/
nir/opt_sink: Also consider load_preamble as const Acts like constants, schedule them like constants. This lets us move lowered frag coord code down. Results on dolphin ubers: total instructions in shared programs: 195144 -> 196633 (0.76%) instructions in affected programs: 175737 -> 177226 (0.85%) helped: 28 HURT: 27 Instructions are HURT. total bytes in shared programs: 1379980 -> 1388308 (0.60%) bytes in affected programs: 1244250 -> 1252578 (0.67%) helped: 28 HURT: 27 Bytes are HURT. total halfregs in shared programs: 13591 -> 13557 (-0.25%) halfregs in affected programs: 2176 -> 2142 (-1.56%) helped: 12 HURT: 2 Inconclusive result (%-change mean confidence interval includes 0). total threads in shared programs: 233728 -> 234112 (0.16%) threads in affected programs: 3264 -> 3648 (11.76%) helped: 6 HURT: 0 Threads are helped. Results on Android shader-db: total instructions in shared programs: 1775324 -> 1775912 (0.03%) instructions in affected programs: 155305 -> 155893 (0.38%) helped: 353 HURT: 548 Instructions are HURT. total bytes in shared programs: 11676650 -> 11678454 (0.02%) bytes in affected programs: 1058924 -> 1060728 (0.17%) helped: 370 HURT: 547 Inconclusive result (value mean confidence interval includes 0). total halfregs in shared programs: 484143 -> 471212 (-2.67%) halfregs in affected programs: 98833 -> 85902 (-13.08%) helped: 2478 HURT: 674 Halfregs are helped. Instr count changes due to losing the RA lottery. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24833>
2023-08-27 17:38:02 -04:00
/*
 * Report whether a source should be treated like a constant when reasoning
 * about register pressure, i.e. whether it can effectively be rematerialized
 * anywhere for free.
 *
 * A source qualifies if it is an actual constant, or if it is produced by a
 * load_preamble intrinsic.
 */
static bool
is_constant_like(nir_src *src)
{
   bool const_like = nir_src_is_const(*src);

   if (!const_like) {
      /* Not a literal constant: accept the constant-like intrinsic too. */
      nir_instr *producer = src->ssa->parent_instr;

      const_like =
         producer->type == nir_instr_type_intrinsic &&
         nir_instr_as_intrinsic(producer)->intrinsic ==
            nir_intrinsic_load_preamble;
   }

   return const_like;
}
/*
 * Decide whether the sink pass may move an instruction.
 *
 * instr:               instruction under consideration.
 * options:             mask of nir_move_* flags selecting which classes of
 *                      instruction are allowed to move.
 * can_mov_out_of_loop: output, always written. Set to false for the
 *                      intrinsics below that must not be sunk out of the
 *                      loop they are defined in, true for everything else.
 *
 * Returns true if the instruction belongs to a class that may be sunk under
 * the given options, false otherwise.
 */
static bool
can_sink_instr(nir_instr *instr, nir_move_options options, bool *can_mov_out_of_loop)
{
   /* Some intrinsic might require uniform sources and
    * moving out of loops can add divergence.
    */
   *can_mov_out_of_loop = true;

   switch (instr->type) {
   case nir_instr_type_load_const:
   case nir_instr_type_undef:
      return (options & nir_move_const_undef) != 0;

   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);

      if (nir_op_is_vec_or_mov(alu->op) || alu->op == nir_op_b2i32)
         return (options & nir_move_copies) != 0;

      if (nir_alu_instr_is_comparison(alu))
         return (options & nir_move_comparisons) != 0;

      /* Sinking an ALU instruction extends the live ranges of its sources
       * while shrinking the live range of its destination. Assuming that
       * constants do not contribute to register pressure (they can often be
       * rematerialized, inlined or promoted to uniform registers, depending
       * on the backend), it is beneficial to sink ALU instructions where all
       * non-constant sources are the same: the move either leaves register
       * pressure unchanged or improves it. As a special case this also
       * covers unary ALU ops.
       */
      if (!(options & nir_move_alu))
         return false;

      const unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
      int first_non_const = -1;

      for (unsigned i = 0; i < num_srcs; ++i) {
         if (is_constant_like(&alu->src[i].src))
            continue;

         if (first_non_const < 0) {
            /* Remember the first non-constant source; every later
             * non-constant source has to match it.
             */
            first_non_const = i;
         } else if (!nir_alu_srcs_equal(alu, alu, first_non_const, i)) {
            return false;
         }
      }

      return true;
   }

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ubo_vec4:
         *can_mov_out_of_loop = false;
         return (options & nir_move_load_ubo) != 0;

      case nir_intrinsic_load_ssbo:
         /* SSBO loads may only move when they are also reorderable. */
         *can_mov_out_of_loop = false;
         return (options & nir_move_load_ssbo) && nir_intrinsic_can_reorder(intrin);

      case nir_intrinsic_load_input:
      case nir_intrinsic_load_per_primitive_input:
      case nir_intrinsic_load_interpolated_input:
      case nir_intrinsic_load_per_vertex_input:
      case nir_intrinsic_load_frag_coord:
      case nir_intrinsic_load_frag_coord_zw:
      case nir_intrinsic_load_pixel_coord:
      case nir_intrinsic_load_attribute_pan:
         return (options & nir_move_load_input) != 0;

      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_kernel_input:
         return (options & nir_move_load_uniform) != 0;

      case nir_intrinsic_inverse_ballot:
      case nir_intrinsic_is_subgroup_invocation_lt_amd:
         *can_mov_out_of_loop = false;
         return (options & nir_move_copies) != 0;

      case nir_intrinsic_load_constant_agx:
      case nir_intrinsic_load_local_pixel_agx:
         /* Always movable, independent of the requested options. */
         return true;

      default:
         return false;
      }
   }

   default:
      return false;
   }
}
/*
 * Public query: returns whether can_sink_instr() accepts the instruction
 * under the given move options. The loop-related output of can_sink_instr()
 * is discarded.
 */
bool
nir_can_move_instr(nir_instr *instr, nir_move_options options)
{
   bool ignored_loop_flag;
   const bool movable = can_sink_instr(instr, options, &ignored_loop_flag);

   return movable;
}
/*
 * Starting at node and walking up through its parents, return the first
 * loop whose first block has more than one predecessor. Returns NULL when
 * no such loop is found.
 *
 * NOTE(review): the predecessor-count test presumably skips loops that have
 * no back edge -- confirm.
 */
static nir_loop *
get_innermost_loop(nir_cf_node *node)
{
   while (node != NULL) {
      if (node->type == nir_cf_node_loop) {
         nir_loop *candidate = nir_cf_node_as_loop(node);

         if (nir_loop_first_block(candidate)->predecessors->entries > 1)
            return candidate;
      }

      node = node->parent;
   }

   return NULL;
}
/*
 * Return true if block lies inside loop, judged by block index: the block's
 * index must fall strictly between the indices of the blocks immediately
 * before and immediately after the loop node.
 *
 * Asserts that the loop has no continue construct.
 */
static bool
loop_contains_block(nir_loop *loop, nir_block *block)
{
   assert(!nir_loop_has_continue_construct(loop));

   nir_cf_node *loop_node = &loop->cf_node;
   nir_block *preceding = nir_cf_node_as_block(nir_cf_node_prev(loop_node));
   nir_block *following = nir_cf_node_as_block(nir_cf_node_next(loop_node));

   if (block->index <= preceding->index)
      return false;

   return block->index < following->index;
}
/* Starting from the LCA of all uses, walk up the dominance tree towards the
 * definition and pick a block on that path that sits outside of as many
 * loops as possible.
 *
 * use_block:         LCA of all uses of the definition.
 * def_block:         block containing the definition.
 * sink_out_of_loops: when false, the definition is never sunk outside of
 *                    the loop it is defined in (if any).
 *
 * Returns the adjusted destination block.
 */
static nir_block *
adjust_block_for_loops(nir_block *use_block, nir_block *def_block,
                       bool sink_out_of_loops)
{
   /* Only look up the defining loop when we have to stay inside it. */
   nir_loop *def_loop =
      sink_out_of_loops ? NULL : get_innermost_loop(&def_block->cf_node);

   nir_block *const stop = def_block->imm_dom;

   for (nir_block *walk = use_block; walk != stop; walk = walk->imm_dom) {
      bool take_walk;

      if (def_loop != NULL && !loop_contains_block(def_loop, use_block)) {
         /* The current candidate has left the defining loop: keep moving
          * the candidate up the dominance tree.
          */
         take_walk = true;
      } else {
         /* If this block is directly followed by a loop (whose first block
          * has more than one predecessor) that contains the current
          * candidate, move the candidate up to this block, i.e. in front of
          * that loop.
          */
         nir_cf_node *following = nir_cf_node_next(&walk->cf_node);

         take_walk =
            following != NULL &&
            following->type == nir_cf_node_loop &&
            nir_block_cf_tree_next(walk)->predecessors->entries > 1 &&
            loop_contains_block(nir_cf_node_as_loop(following), use_block);
      }

      if (take_walk)
         use_block = walk;
   }

   return use_block;
}
/* Iterate over the uses of an SSA def (including uses by if-conditions) and
 * try to find a better block to move its instruction to, using the
 * dominance tree. In short: if all of the uses are contained in a single
 * block, the instruction is moved there; otherwise it is moved to the least
 * common ancestor block of all the uses.
 *
 * Returns NULL when no reachable user was found.
 */
static nir_block *
get_preferred_block(nir_def *def, bool sink_out_of_loops)
{
   nir_block *target = NULL;

   nir_foreach_use_including_if(use_src, def) {
      target = nir_dominance_lca(target, nir_src_get_block(use_src));
   }

   /* No reachable user: nothing to sink towards. */
   if (target == NULL)
      return NULL;

   nir_block *def_block = def->parent_instr->block;

   /* We don't sink any instructions into loops to avoid repeated
    * executions. This might occasionally increase register pressure, but
    * seems overall the better choice.
    */
   target = adjust_block_for_loops(target, def_block, sink_out_of_loops);
   assert(nir_block_dominates(def_block, target));

   return target;
}
/*
 * Pass entry point. For every function implementation in the shader, visit
 * blocks and instructions in reverse order and move each instruction that
 * can_sink_instr() accepts into the block chosen by get_preferred_block().
 *
 * shader:  shader to process.
 * options: mask of nir_move_* flags selecting which instructions may move.
 *
 * Returns true if at least one instruction was moved.
 */
bool
nir_opt_sink(nir_shader *shader, nir_move_options options)
{
   bool any_moved = false;

   nir_foreach_function_impl(impl, shader) {
      nir_metadata_require(impl, nir_metadata_control_flow);

      nir_foreach_block_reverse(block, impl) {
         nir_foreach_instr_reverse_safe(instr, block) {
            bool may_leave_loop;

            if (can_sink_instr(instr, options, &may_leave_loop)) {
               nir_block *target =
                  get_preferred_block(nir_instr_def(instr), may_leave_loop);

               /* Only move when there is a destination and it differs from
                * the block the instruction already lives in.
                */
               if (target != NULL && target != instr->block) {
                  nir_instr_remove(instr);
                  nir_instr_insert(nir_after_phis(target), instr);
                  any_moved = true;
               }
            }
         }
      }

      nir_metadata_preserve(impl, nir_metadata_control_flow);
   }

   return any_moved;
}