2019-05-22 20:23:03 +01:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2018 Red Hat
|
|
|
|
|
* Copyright © 2019 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
* Authors:
|
|
|
|
|
* Rob Clark (robdclark@gmail.com)
|
|
|
|
|
* Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
|
|
|
|
|
* Rhys Perry (pendingchaos02@gmail.com)
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "nir.h"
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* A simple pass that moves some instructions into the least common
|
|
|
|
|
* ancestor of consuming instructions.
|
|
|
|
|
*/
|
|
|
|
|
|
2023-08-27 17:38:02 -04:00
|
|
|
/*
|
|
|
|
|
* Detect whether a source is like a constant for the purposes of register
|
|
|
|
|
* pressure calculations (e.g. can be remat anywhere effectively for free).
|
|
|
|
|
*/
|
|
|
|
|
static bool
|
|
|
|
|
is_constant_like(nir_src *src)
|
|
|
|
|
{
|
|
|
|
|
/* Constants are constants */
|
|
|
|
|
if (nir_src_is_const(*src))
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* Otherwise, look for constant-like intrinsics */
|
|
|
|
|
nir_instr *parent = src->ssa->parent_instr;
|
|
|
|
|
if (parent->type != nir_instr_type_intrinsic)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return (nir_instr_as_intrinsic(parent)->intrinsic ==
|
|
|
|
|
nir_intrinsic_load_preamble);
|
|
|
|
|
}
|
|
|
|
|
|
2024-08-28 20:53:27 +02:00
|
|
|
static bool
|
|
|
|
|
can_sink_instr(nir_instr *instr, nir_move_options options, bool *can_mov_out_of_loop)
|
2019-05-22 20:23:03 +01:00
|
|
|
{
|
2024-08-28 20:53:27 +02:00
|
|
|
/* Some intrinsic might require uniform sources and
|
|
|
|
|
* moving out of loops can add divergence.
|
|
|
|
|
*/
|
|
|
|
|
*can_mov_out_of_loop = true;
|
2020-06-24 11:23:05 +01:00
|
|
|
switch (instr->type) {
|
|
|
|
|
case nir_instr_type_load_const:
|
2023-08-15 09:59:06 -05:00
|
|
|
case nir_instr_type_undef: {
|
2020-06-24 11:23:05 +01:00
|
|
|
return options & nir_move_const_undef;
|
2019-05-22 20:23:03 +01:00
|
|
|
}
|
2020-06-24 11:23:05 +01:00
|
|
|
case nir_instr_type_alu: {
|
2023-08-24 07:14:28 -04:00
|
|
|
nir_alu_instr *alu = nir_instr_as_alu(instr);
|
|
|
|
|
|
|
|
|
|
if (nir_op_is_vec_or_mov(alu->op) || alu->op == nir_op_b2i32)
|
2020-06-24 11:23:05 +01:00
|
|
|
return options & nir_move_copies;
|
2023-08-24 07:14:28 -04:00
|
|
|
if (nir_alu_instr_is_comparison(alu))
|
2020-06-24 11:23:05 +01:00
|
|
|
return options & nir_move_comparisons;
|
nir/opt_sink: Move ALU with constant sources
In general, sinking ALU instructions can negatively impact register pressure,
since it extends the live ranges of the sources, although it does shrink the live range
of the destination.
However, constants do not usually contribute to register pressure. This is not a
totally true assumption, but it's pretty good in practice, since...
* constants can be rematerialized (backend-dependent)
* constants can often be inlined (ISA-dependent)
* constants can sometimes be promoted to free uniform registers (ISA-dependent)
* constants can live in scalar registers although the ALU destination might need
a vector register (and vector registers are assumed to be much more expensive
than scalar registers, again ISA-dependent)
So, assume that constants have zero effect on register pressure. Now consider an
ALU instruction where all but one source is a constant. Then there are two
cases:
1. The ALU instruction is moved past when its source was otherwise killed. Then
there is no effect on register pressure, since the source live range is
extended exactly as much as the destination live range shrinks.
2. The ALU instruction is moved down but its source is still alive where it's
moved to. Then register pressure is improved, since the source live range is
unchanged while the destination live range shrinks.
So, as a heuristic, we always move ALU instructions where n-1 sources are
constant. As an inevitable special case, this also (necessarily) moves unary ALU
ops, which should be beneficial by the same justification. This is not 100%
perfect but it is well-motivated. Results on AGX are decent:
total instructions in shared programs: 1796101 -> 1795652 (-0.02%)
instructions in affected programs: 326822 -> 326373 (-0.14%)
helped: 800
HURT: 371
Inconclusive result (%-change mean confidence interval includes 0).
total bytes in shared programs: 11805004 -> 11801424 (-0.03%)
bytes in affected programs: 2610630 -> 2607050 (-0.14%)
helped: 912
HURT: 462
Inconclusive result (%-change mean confidence interval includes 0).
total halfregs in shared programs: 525818 -> 515399 (-1.98%)
halfregs in affected programs: 118197 -> 107778 (-8.81%)
helped: 2095
HURT: 804
Halfregs are helped.
total threads in shared programs: 18916608 -> 18917056 (<.01%)
threads in affected programs: 4800 -> 5248 (9.33%)
helped: 7
HURT: 0
Threads are helped.
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24833>
2023-08-20 12:19:55 -04:00
|
|
|
|
|
|
|
|
/* Assuming that constants do not contribute to register pressure, it is
|
nir: sink/move alu with two identical, non constant sources.
Foz-DB Navi21:
Totals from 32363 (40.76% of 79395) affected shaders:
MaxWaves: 787499 -> 787675 (+0.02%); split: +0.02%, -0.00%
Instrs: 28783404 -> 28783464 (+0.00%); split: -0.01%, +0.01%
CodeSize: 156763536 -> 156765148 (+0.00%); split: -0.01%, +0.02%
VGPRs: 1493304 -> 1492848 (-0.03%); split: -0.04%, +0.01%
Latency: 243022511 -> 243051994 (+0.01%); split: -0.08%, +0.09%
InvThroughput: 57827398 -> 57828129 (+0.00%); split: -0.05%, +0.05%
VClause: 582208 -> 582298 (+0.02%); split: -0.07%, +0.08%
SClause: 959634 -> 959312 (-0.03%); split: -0.07%, +0.04%
Copies: 1965821 -> 1965826 (+0.00%); split: -0.17%, +0.17%
Branches: 710593 -> 710596 (+0.00%); split: -0.00%, +0.01%
PreSGPRs: 1313513 -> 1313632 (+0.01%); split: -0.00%, +0.01%
PreVGPRs: 1210596 -> 1209103 (-0.12%); split: -0.12%, +0.00%
VALU: 19463445 -> 19463497 (+0.00%); split: -0.02%, +0.02%
SALU: 3319529 -> 3319500 (-0.00%); split: -0.01%, +0.01%
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32783>
2024-12-26 18:57:09 +01:00
|
|
|
* beneficial to sink ALU instructions where all non constant sources
|
|
|
|
|
* are the same.
|
nir/opt_sink: Move ALU with constant sources
In general, sinking ALU instructions can negatively impact register pressure,
since it extends the live ranges of the sources, although it does shrink the live range
of the destination.
However, constants do not usually contribute to register pressure. This is not a
totally true assumption, but it's pretty good in practice, since...
* constants can be rematerialized (backend-dependent)
* constants can often be inlined (ISA-dependent)
* constants can sometimes be promoted to free uniform registers (ISA-dependent)
* constants can live in scalar registers although the ALU destination might need
a vector register (and vector registers are assumed to be much more expensive
than scalar registers, again ISA-dependent)
So, assume that constants have zero effect on register pressure. Now consider an
ALU instruction where all but one source is a constant. Then there are two
cases:
1. The ALU instruction is moved past when its source was otherwise killed. Then
there is no effect on register pressure, since the source live range is
extended exactly as much as the destination live range shrinks.
2. The ALU instruction is moved down but its source is still alive where it's
moved to. Then register pressure is improved, since the source live range is
unchanged while the destination live range shrinks.
So, as a heuristic, we always move ALU instructions where n-1 sources are
constant. As an inevitable special case, this also (necessarily) moves unary ALU
ops, which should be beneficial by the same justification. This is not 100%
perfect but it is well-motivated. Results on AGX are decent:
total instructions in shared programs: 1796101 -> 1795652 (-0.02%)
instructions in affected programs: 326822 -> 326373 (-0.14%)
helped: 800
HURT: 371
Inconclusive result (%-change mean confidence interval includes 0).
total bytes in shared programs: 11805004 -> 11801424 (-0.03%)
bytes in affected programs: 2610630 -> 2607050 (-0.14%)
helped: 912
HURT: 462
Inconclusive result (%-change mean confidence interval includes 0).
total halfregs in shared programs: 525818 -> 515399 (-1.98%)
halfregs in affected programs: 118197 -> 107778 (-8.81%)
helped: 2095
HURT: 804
Halfregs are helped.
total threads in shared programs: 18916608 -> 18917056 (<.01%)
threads in affected programs: 4800 -> 5248 (9.33%)
helped: 7
HURT: 0
Threads are helped.
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24833>
2023-08-20 12:19:55 -04:00
|
|
|
*/
|
|
|
|
|
if (!(options & nir_move_alu))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
unsigned inputs = nir_op_infos[alu->op].num_inputs;
|
nir: sink/move alu with two identical, non constant sources.
Foz-DB Navi21:
Totals from 32363 (40.76% of 79395) affected shaders:
MaxWaves: 787499 -> 787675 (+0.02%); split: +0.02%, -0.00%
Instrs: 28783404 -> 28783464 (+0.00%); split: -0.01%, +0.01%
CodeSize: 156763536 -> 156765148 (+0.00%); split: -0.01%, +0.02%
VGPRs: 1493304 -> 1492848 (-0.03%); split: -0.04%, +0.01%
Latency: 243022511 -> 243051994 (+0.01%); split: -0.08%, +0.09%
InvThroughput: 57827398 -> 57828129 (+0.00%); split: -0.05%, +0.05%
VClause: 582208 -> 582298 (+0.02%); split: -0.07%, +0.08%
SClause: 959634 -> 959312 (-0.03%); split: -0.07%, +0.04%
Copies: 1965821 -> 1965826 (+0.00%); split: -0.17%, +0.17%
Branches: 710593 -> 710596 (+0.00%); split: -0.00%, +0.01%
PreSGPRs: 1313513 -> 1313632 (+0.01%); split: -0.00%, +0.01%
PreVGPRs: 1210596 -> 1209103 (-0.12%); split: -0.12%, +0.00%
VALU: 19463445 -> 19463497 (+0.00%); split: -0.02%, +0.02%
SALU: 3319529 -> 3319500 (-0.00%); split: -0.01%, +0.01%
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32783>
2024-12-26 18:57:09 +01:00
|
|
|
int non_const = -1;
|
nir/opt_sink: Move ALU with constant sources
In general, sinking ALU instructions can negatively impact register pressure,
since it extends the live ranges of the sources, although it does shrink the live range
of the destination.
However, constants do not usually contribute to register pressure. This is not a
totally true assumption, but it's pretty good in practice, since...
* constants can be rematerialized (backend-dependent)
* constants can often be inlined (ISA-dependent)
* constants can sometimes be promoted to free uniform registers (ISA-dependent)
* constants can live in scalar registers although the ALU destination might need
a vector register (and vector registers are assumed to be much more expensive
than scalar registers, again ISA-dependent)
So, assume that constants have zero effect on register pressure. Now consider an
ALU instruction where all but one source is a constant. Then there are two
cases:
1. The ALU instruction is moved past when its source was otherwise killed. Then
there is no effect on register pressure, since the source live range is
extended exactly as much as the destination live range shrinks.
2. The ALU instruction is moved down but its source is still alive where it's
moved to. Then register pressure is improved, since the source live range is
unchanged while the destination live range shrinks.
So, as a heuristic, we always move ALU instructions where n-1 sources are
constant. As an inevitable special case, this also (necessarily) moves unary ALU
ops, which should be beneficial by the same justification. This is not 100%
perfect but it is well-motivated. Results on AGX are decent:
total instructions in shared programs: 1796101 -> 1795652 (-0.02%)
instructions in affected programs: 326822 -> 326373 (-0.14%)
helped: 800
HURT: 371
Inconclusive result (%-change mean confidence interval includes 0).
total bytes in shared programs: 11805004 -> 11801424 (-0.03%)
bytes in affected programs: 2610630 -> 2607050 (-0.14%)
helped: 912
HURT: 462
Inconclusive result (%-change mean confidence interval includes 0).
total halfregs in shared programs: 525818 -> 515399 (-1.98%)
halfregs in affected programs: 118197 -> 107778 (-8.81%)
helped: 2095
HURT: 804
Halfregs are helped.
total threads in shared programs: 18916608 -> 18917056 (<.01%)
threads in affected programs: 4800 -> 5248 (9.33%)
helped: 7
HURT: 0
Threads are helped.
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24833>
2023-08-20 12:19:55 -04:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < inputs; ++i) {
|
2023-08-27 17:38:02 -04:00
|
|
|
if (is_constant_like(&alu->src[i].src))
|
nir: sink/move alu with two identical, non constant sources.
Foz-DB Navi21:
Totals from 32363 (40.76% of 79395) affected shaders:
MaxWaves: 787499 -> 787675 (+0.02%); split: +0.02%, -0.00%
Instrs: 28783404 -> 28783464 (+0.00%); split: -0.01%, +0.01%
CodeSize: 156763536 -> 156765148 (+0.00%); split: -0.01%, +0.02%
VGPRs: 1493304 -> 1492848 (-0.03%); split: -0.04%, +0.01%
Latency: 243022511 -> 243051994 (+0.01%); split: -0.08%, +0.09%
InvThroughput: 57827398 -> 57828129 (+0.00%); split: -0.05%, +0.05%
VClause: 582208 -> 582298 (+0.02%); split: -0.07%, +0.08%
SClause: 959634 -> 959312 (-0.03%); split: -0.07%, +0.04%
Copies: 1965821 -> 1965826 (+0.00%); split: -0.17%, +0.17%
Branches: 710593 -> 710596 (+0.00%); split: -0.00%, +0.01%
PreSGPRs: 1313513 -> 1313632 (+0.01%); split: -0.00%, +0.01%
PreVGPRs: 1210596 -> 1209103 (-0.12%); split: -0.12%, +0.00%
VALU: 19463445 -> 19463497 (+0.00%); split: -0.02%, +0.02%
SALU: 3319529 -> 3319500 (-0.00%); split: -0.01%, +0.01%
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32783>
2024-12-26 18:57:09 +01:00
|
|
|
continue;
|
|
|
|
|
else if (non_const < 0)
|
|
|
|
|
non_const = i;
|
|
|
|
|
else if (!nir_alu_srcs_equal(alu, alu, non_const, i))
|
|
|
|
|
return false;
|
nir/opt_sink: Move ALU with constant sources
In general, sinking ALU instructions can negatively impact register pressure,
since it extends the live ranges of the sources, although it does shrink the live range
of the destination.
However, constants do not usually contribute to register pressure. This is not a
totally true assumption, but it's pretty good in practice, since...
* constants can be rematerialized (backend-dependent)
* constants can often be inlined (ISA-dependent)
* constants can sometimes be promoted to free uniform registers (ISA-dependent)
* constants can live in scalar registers although the ALU destination might need
a vector register (and vector registers are assumed to be much more expensive
than scalar registers, again ISA-dependent)
So, assume that constants have zero effect on register pressure. Now consider an
ALU instruction where all but one source is a constant. Then there are two
cases:
1. The ALU instruction is moved past when its source was otherwise killed. Then
there is no effect on register pressure, since the source live range is
extended exactly as much as the destination live range shrinks.
2. The ALU instruction is moved down but its source is still alive where it's
moved to. Then register pressure is improved, since the source live range is
unchanged while the destination live range shrinks.
So, as a heuristic, we always move ALU instructions where n-1 sources are
constant. As an inevitable special case, this also (necessarily) moves unary ALU
ops, which should be beneficial by the same justification. This is not 100%
perfect but it is well-motivated. Results on AGX are decent:
total instructions in shared programs: 1796101 -> 1795652 (-0.02%)
instructions in affected programs: 326822 -> 326373 (-0.14%)
helped: 800
HURT: 371
Inconclusive result (%-change mean confidence interval includes 0).
total bytes in shared programs: 11805004 -> 11801424 (-0.03%)
bytes in affected programs: 2610630 -> 2607050 (-0.14%)
helped: 912
HURT: 462
Inconclusive result (%-change mean confidence interval includes 0).
total halfregs in shared programs: 525818 -> 515399 (-1.98%)
halfregs in affected programs: 118197 -> 107778 (-8.81%)
helped: 2095
HURT: 804
Halfregs are helped.
total threads in shared programs: 18916608 -> 18917056 (<.01%)
threads in affected programs: 4800 -> 5248 (9.33%)
helped: 7
HURT: 0
Threads are helped.
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24833>
2023-08-20 12:19:55 -04:00
|
|
|
}
|
|
|
|
|
|
nir: sink/move alu with two identical, non constant sources.
Foz-DB Navi21:
Totals from 32363 (40.76% of 79395) affected shaders:
MaxWaves: 787499 -> 787675 (+0.02%); split: +0.02%, -0.00%
Instrs: 28783404 -> 28783464 (+0.00%); split: -0.01%, +0.01%
CodeSize: 156763536 -> 156765148 (+0.00%); split: -0.01%, +0.02%
VGPRs: 1493304 -> 1492848 (-0.03%); split: -0.04%, +0.01%
Latency: 243022511 -> 243051994 (+0.01%); split: -0.08%, +0.09%
InvThroughput: 57827398 -> 57828129 (+0.00%); split: -0.05%, +0.05%
VClause: 582208 -> 582298 (+0.02%); split: -0.07%, +0.08%
SClause: 959634 -> 959312 (-0.03%); split: -0.07%, +0.04%
Copies: 1965821 -> 1965826 (+0.00%); split: -0.17%, +0.17%
Branches: 710593 -> 710596 (+0.00%); split: -0.00%, +0.01%
PreSGPRs: 1313513 -> 1313632 (+0.01%); split: -0.00%, +0.01%
PreVGPRs: 1210596 -> 1209103 (-0.12%); split: -0.12%, +0.00%
VALU: 19463445 -> 19463497 (+0.00%); split: -0.02%, +0.02%
SALU: 3319529 -> 3319500 (-0.00%); split: -0.01%, +0.01%
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32783>
2024-12-26 18:57:09 +01:00
|
|
|
return true;
|
2019-05-22 20:23:03 +01:00
|
|
|
}
|
2020-06-24 11:23:05 +01:00
|
|
|
case nir_instr_type_intrinsic: {
|
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
|
|
|
switch (intrin->intrinsic) {
|
|
|
|
|
case nir_intrinsic_load_ubo:
|
2021-12-09 13:09:35 -08:00
|
|
|
case nir_intrinsic_load_ubo_vec4:
|
2024-08-28 20:53:27 +02:00
|
|
|
*can_mov_out_of_loop = false;
|
2020-06-24 11:23:05 +01:00
|
|
|
return options & nir_move_load_ubo;
|
2021-01-21 17:01:07 +00:00
|
|
|
case nir_intrinsic_load_ssbo:
|
2024-08-28 20:53:27 +02:00
|
|
|
*can_mov_out_of_loop = false;
|
2021-01-21 17:01:07 +00:00
|
|
|
return (options & nir_move_load_ssbo) && nir_intrinsic_can_reorder(intrin);
|
2020-06-24 11:23:05 +01:00
|
|
|
case nir_intrinsic_load_input:
|
2024-07-06 04:24:31 -04:00
|
|
|
case nir_intrinsic_load_per_primitive_input:
|
2020-06-24 11:23:05 +01:00
|
|
|
case nir_intrinsic_load_interpolated_input:
|
|
|
|
|
case nir_intrinsic_load_per_vertex_input:
|
2023-08-20 12:55:02 -04:00
|
|
|
case nir_intrinsic_load_frag_coord:
|
|
|
|
|
case nir_intrinsic_load_frag_coord_zw:
|
2024-12-11 19:07:10 -08:00
|
|
|
case nir_intrinsic_load_frag_coord_zw_pan:
|
2023-08-20 12:55:02 -04:00
|
|
|
case nir_intrinsic_load_pixel_coord:
|
2024-12-12 21:22:11 +00:00
|
|
|
case nir_intrinsic_load_attribute_pan:
|
2020-06-24 11:23:05 +01:00
|
|
|
return options & nir_move_load_input;
|
2022-02-17 10:14:45 +01:00
|
|
|
case nir_intrinsic_load_uniform:
|
2024-05-28 23:43:29 +02:00
|
|
|
case nir_intrinsic_load_kernel_input:
|
2022-02-17 10:14:45 +01:00
|
|
|
return options & nir_move_load_uniform;
|
2024-05-31 16:44:52 +02:00
|
|
|
case nir_intrinsic_inverse_ballot:
|
2024-09-15 12:16:54 +02:00
|
|
|
case nir_intrinsic_is_subgroup_invocation_lt_amd:
|
2024-08-28 20:53:27 +02:00
|
|
|
*can_mov_out_of_loop = false;
|
2024-05-31 16:44:52 +02:00
|
|
|
return options & nir_move_copies;
|
2023-08-19 19:49:09 -04:00
|
|
|
case nir_intrinsic_load_constant_agx:
|
2023-08-20 12:51:02 -04:00
|
|
|
case nir_intrinsic_load_local_pixel_agx:
|
2023-08-19 19:49:09 -04:00
|
|
|
return true;
|
2020-06-24 11:23:05 +01:00
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2019-05-22 20:23:03 +01:00
|
|
|
}
|
2020-06-24 11:23:05 +01:00
|
|
|
default:
|
|
|
|
|
return false;
|
2019-10-14 17:15:04 +01:00
|
|
|
}
|
2019-05-22 20:23:03 +01:00
|
|
|
}
|
|
|
|
|
|
2024-08-28 20:53:27 +02:00
|
|
|
bool
|
|
|
|
|
nir_can_move_instr(nir_instr *instr, nir_move_options options)
|
|
|
|
|
{
|
|
|
|
|
bool out_of_loop;
|
|
|
|
|
return can_sink_instr(instr, options, &out_of_loop);
|
|
|
|
|
}
|
|
|
|
|
|
2019-05-22 20:23:03 +01:00
|
|
|
static nir_loop *
|
|
|
|
|
get_innermost_loop(nir_cf_node *node)
|
|
|
|
|
{
|
|
|
|
|
for (; node != NULL; node = node->parent) {
|
2024-04-16 17:32:29 +02:00
|
|
|
if (node->type == nir_cf_node_loop) {
|
|
|
|
|
nir_loop *loop = nir_cf_node_as_loop(node);
|
|
|
|
|
if (nir_loop_first_block(loop)->predecessors->entries > 1)
|
|
|
|
|
return loop;
|
|
|
|
|
}
|
2019-05-22 20:23:03 +01:00
|
|
|
}
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-25 14:02:48 +02:00
|
|
|
static bool
|
|
|
|
|
loop_contains_block(nir_loop *loop, nir_block *block)
|
2019-05-22 20:23:03 +01:00
|
|
|
{
|
2021-12-02 10:31:56 +01:00
|
|
|
assert(!nir_loop_has_continue_construct(loop));
|
2019-09-25 14:02:48 +02:00
|
|
|
nir_block *before = nir_cf_node_as_block(nir_cf_node_prev(&loop->cf_node));
|
|
|
|
|
nir_block *after = nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node));
|
2019-05-22 20:23:03 +01:00
|
|
|
|
2019-09-25 14:02:48 +02:00
|
|
|
return block->index > before->index && block->index < after->index;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Given the LCA of all uses and the definition, find a block on the path
|
|
|
|
|
* between them in the dominance tree that is outside of as many loops as
|
2019-09-25 14:17:23 +02:00
|
|
|
* possible. If "sink_out_of_loops" is false, then we disallow sinking the
|
|
|
|
|
* definition outside of the loop it's defined in (if any).
|
2019-09-25 14:02:48 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
static nir_block *
|
2019-09-25 14:17:23 +02:00
|
|
|
adjust_block_for_loops(nir_block *use_block, nir_block *def_block,
|
|
|
|
|
bool sink_out_of_loops)
|
2019-09-25 14:02:48 +02:00
|
|
|
{
|
2019-09-25 14:17:23 +02:00
|
|
|
nir_loop *def_loop = NULL;
|
|
|
|
|
if (!sink_out_of_loops)
|
|
|
|
|
def_loop = get_innermost_loop(&def_block->cf_node);
|
|
|
|
|
|
2019-09-25 14:02:48 +02:00
|
|
|
for (nir_block *cur_block = use_block; cur_block != def_block->imm_dom;
|
|
|
|
|
cur_block = cur_block->imm_dom) {
|
2019-09-25 14:17:23 +02:00
|
|
|
if (!sink_out_of_loops && def_loop &&
|
|
|
|
|
!loop_contains_block(def_loop, use_block)) {
|
|
|
|
|
use_block = cur_block;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-25 14:02:48 +02:00
|
|
|
nir_cf_node *next = nir_cf_node_next(&cur_block->cf_node);
|
2024-04-16 17:32:29 +02:00
|
|
|
if (next && next->type == nir_cf_node_loop &&
|
|
|
|
|
nir_block_cf_tree_next(cur_block)->predecessors->entries > 1) {
|
2019-09-25 14:02:48 +02:00
|
|
|
nir_loop *following_loop = nir_cf_node_as_loop(next);
|
|
|
|
|
if (loop_contains_block(following_loop, use_block)) {
|
2023-08-08 12:00:35 -05:00
|
|
|
use_block = cur_block;
|
|
|
|
|
continue;
|
2019-09-25 14:02:48 +02:00
|
|
|
}
|
|
|
|
|
}
|
2019-05-22 20:23:03 +01:00
|
|
|
}
|
2019-09-25 14:02:48 +02:00
|
|
|
|
|
|
|
|
return use_block;
|
2019-05-22 20:23:03 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* iterate a ssa def's use's and try to find a more optimal block to
|
|
|
|
|
* move it to, using the dominance tree. In short, if all of the uses
|
|
|
|
|
* are contained in a single block, the load will be moved there,
|
|
|
|
|
* otherwise it will be move to the least common ancestor block of all
|
|
|
|
|
* the uses
|
|
|
|
|
*/
|
|
|
|
|
static nir_block *
|
2023-08-12 16:17:15 -04:00
|
|
|
get_preferred_block(nir_def *def, bool sink_out_of_loops)
|
2019-05-22 20:23:03 +01:00
|
|
|
{
|
|
|
|
|
nir_block *lca = NULL;
|
|
|
|
|
|
2023-04-06 13:19:31 -04:00
|
|
|
nir_foreach_use_including_if(use, def) {
|
2024-08-27 09:21:41 +02:00
|
|
|
lca = nir_dominance_lca(lca, nir_src_get_block(use));
|
2019-05-22 20:23:03 +01:00
|
|
|
}
|
|
|
|
|
|
2020-12-02 15:34:26 +01:00
|
|
|
/* return in case, we didn't find a reachable user */
|
|
|
|
|
if (!lca)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
nir: don't sink instructions into loops
Repeatedly loading constants or evaluating ALU operations
in loops doesn't seem beneficial. This might increase the register
pressure, but the tradeoff seems worth it.
Totals from 13629 (9.77% of 139517) affected shaders (RAVEN):
SGPRs: 1179481 -> 1184697 (+0.44%); split: -0.03%, +0.47%
VGPRs: 978776 -> 978732 (-0.00%); split: -0.02%, +0.02%
SpillSGPRs: 51036 -> 50943 (-0.18%); split: -1.35%, +1.17%
CodeSize: 113775020 -> 113428812 (-0.30%); split: -0.34%, +0.04%
MaxWaves: 49877 -> 49881 (+0.01%); split: +0.02%, -0.01%
Instrs: 22295979 -> 22204936 (-0.41%); split: -0.42%, +0.02%
Cycles: 1637198832 -> 1626916048 (-0.63%); split: -0.64%, +0.01%
VMEM: 2403434 -> 2507645 (+4.34%); split: +4.76%, -0.42%
SMEM: 849676 -> 834576 (-1.78%); split: +0.60%, -2.38%
VClause: 412396 -> 398139 (-3.46%); split: -3.46%, +0.01%
SClause: 810480 -> 817349 (+0.85%); split: -0.19%, +1.04%
Copies: 2188260 -> 2166716 (-0.98%); split: -1.18%, +0.19%
Branches: 761204 -> 760475 (-0.10%); split: -0.15%, +0.05%
PreSGPRs: 972892 -> 981054 (+0.84%); split: -0.05%, +0.89%
PreVGPRs: 925390 -> 925420 (+0.00%); split: -0.02%, +0.02%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7694>
2020-11-19 12:21:17 +01:00
|
|
|
/* We don't sink any instructions into loops to avoid repeated executions
|
|
|
|
|
* This might occasionally increase register pressure, but seems overall
|
|
|
|
|
* the better choice.
|
2019-09-25 14:02:48 +02:00
|
|
|
*/
|
nir: don't sink instructions into loops
Repeatedly loading constants or evaluating ALU operations
in loops doesn't seem beneficial. This might increase the register
pressure, but the tradeoff seems worth it.
Totals from 13629 (9.77% of 139517) affected shaders (RAVEN):
SGPRs: 1179481 -> 1184697 (+0.44%); split: -0.03%, +0.47%
VGPRs: 978776 -> 978732 (-0.00%); split: -0.02%, +0.02%
SpillSGPRs: 51036 -> 50943 (-0.18%); split: -1.35%, +1.17%
CodeSize: 113775020 -> 113428812 (-0.30%); split: -0.34%, +0.04%
MaxWaves: 49877 -> 49881 (+0.01%); split: +0.02%, -0.01%
Instrs: 22295979 -> 22204936 (-0.41%); split: -0.42%, +0.02%
Cycles: 1637198832 -> 1626916048 (-0.63%); split: -0.64%, +0.01%
VMEM: 2403434 -> 2507645 (+4.34%); split: +4.76%, -0.42%
SMEM: 849676 -> 834576 (-1.78%); split: +0.60%, -2.38%
VClause: 412396 -> 398139 (-3.46%); split: -3.46%, +0.01%
SClause: 810480 -> 817349 (+0.85%); split: -0.19%, +1.04%
Copies: 2188260 -> 2166716 (-0.98%); split: -1.18%, +0.19%
Branches: 761204 -> 760475 (-0.10%); split: -0.15%, +0.05%
PreSGPRs: 972892 -> 981054 (+0.84%); split: -0.05%, +0.89%
PreVGPRs: 925390 -> 925420 (+0.00%); split: -0.02%, +0.02%
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7694>
2020-11-19 12:21:17 +01:00
|
|
|
lca = adjust_block_for_loops(lca, def->parent_instr->block,
|
|
|
|
|
sink_out_of_loops);
|
|
|
|
|
assert(nir_block_dominates(def->parent_instr->block, lca));
|
2019-09-25 14:02:48 +02:00
|
|
|
|
2019-05-22 20:23:03 +01:00
|
|
|
return lca;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
nir_opt_sink(nir_shader *shader, nir_move_options options)
|
|
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
2023-06-22 13:27:59 -04:00
|
|
|
nir_foreach_function_impl(impl, shader) {
|
|
|
|
|
nir_metadata_require(impl,
|
2024-06-16 16:32:01 -04:00
|
|
|
nir_metadata_control_flow);
|
2019-05-22 20:23:03 +01:00
|
|
|
|
2023-06-22 13:27:59 -04:00
|
|
|
nir_foreach_block_reverse(block, impl) {
|
2019-05-22 20:23:03 +01:00
|
|
|
nir_foreach_instr_reverse_safe(instr, block) {
|
2024-08-28 20:53:27 +02:00
|
|
|
bool sink_out_of_loops;
|
|
|
|
|
if (!can_sink_instr(instr, options, &sink_out_of_loops))
|
2019-05-22 20:23:03 +01:00
|
|
|
continue;
|
|
|
|
|
|
2023-08-15 12:05:54 -05:00
|
|
|
nir_def *def = nir_instr_def(instr);
|
2019-09-25 14:17:23 +02:00
|
|
|
|
2019-05-22 20:23:03 +01:00
|
|
|
nir_block *use_block =
|
2023-08-08 12:00:35 -05:00
|
|
|
get_preferred_block(def, sink_out_of_loops);
|
2019-05-22 20:23:03 +01:00
|
|
|
|
|
|
|
|
if (!use_block || use_block == instr->block)
|
|
|
|
|
continue;
|
|
|
|
|
|
2020-12-02 16:03:32 +00:00
|
|
|
nir_instr_remove(instr);
|
|
|
|
|
nir_instr_insert(nir_after_phis(use_block), instr);
|
2019-05-22 20:23:03 +01:00
|
|
|
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-22 13:27:59 -04:00
|
|
|
nir_metadata_preserve(impl,
|
2024-06-16 16:32:01 -04:00
|
|
|
nir_metadata_control_flow);
|
2019-05-22 20:23:03 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|