2024-01-04 17:31:42 -08:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2010 Intel Corporation
|
|
|
|
|
* SPDX-License-Identifier: MIT
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "brw_eu.h"
|
2025-02-05 14:25:15 -08:00
|
|
|
#include "brw_shader.h"
|
2025-01-15 08:20:46 -08:00
|
|
|
#include "brw_builder.h"
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-11-20 08:12:52 -08:00
|
|
|
#include "dev/intel_debug.h"
|
|
|
|
|
|
2024-01-04 17:31:42 -08:00
|
|
|
void
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_optimize(brw_shader &s)
|
2024-01-04 17:31:42 -08:00
|
|
|
{
|
|
|
|
|
const nir_shader *nir = s.nir;
|
|
|
|
|
|
|
|
|
|
s.debug_optimizer(nir, "start", 0, 0);
|
|
|
|
|
|
|
|
|
|
/* Start by validating the shader we currently have. */
|
2024-12-29 17:39:39 -08:00
|
|
|
brw_validate(s);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-08-16 21:29:48 -07:00
|
|
|
/* Track how much non-SSA at this point. */
|
|
|
|
|
{
|
2024-12-06 21:20:58 -08:00
|
|
|
const brw_def_analysis &defs = s.def_analysis.require();
|
2024-08-16 21:29:48 -07:00
|
|
|
s.shader_stats.non_ssa_registers_after_nir =
|
|
|
|
|
defs.count() - defs.ssa_count();
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-04 17:31:42 -08:00
|
|
|
bool progress = false;
|
|
|
|
|
int iteration = 0;
|
|
|
|
|
int pass_num = 0;
|
|
|
|
|
|
|
|
|
|
#define OPT(pass, ...) ({ \
|
|
|
|
|
pass_num++; \
|
|
|
|
|
bool this_progress = pass(s, ##__VA_ARGS__); \
|
|
|
|
|
\
|
|
|
|
|
if (this_progress) \
|
|
|
|
|
s.debug_optimizer(nir, #pass, iteration, pass_num); \
|
|
|
|
|
\
|
2024-12-29 17:39:39 -08:00
|
|
|
brw_validate(s); \
|
2024-01-04 17:31:42 -08:00
|
|
|
\
|
|
|
|
|
progress = progress || this_progress; \
|
|
|
|
|
this_progress; \
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
if (s.compiler->lower_dpas)
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_dpas);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_split_virtual_grfs);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
|
|
|
|
/* Before anything else, eliminate dead code. The results of some NIR
|
|
|
|
|
* instructions may effectively be calculated twice. Once when the
|
|
|
|
|
* instruction is encountered, and again when the user of that result is
|
|
|
|
|
* encountered. Wipe those away before algebraic optimizations and
|
|
|
|
|
* especially copy propagation can mix things up.
|
|
|
|
|
*/
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_dead_code_eliminate);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_remove_extra_rounding_modes);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_eliminate_find_live_channel);
|
intel/brw: Eliminate top-level FIND_LIVE_CHANNEL & BROADCAST once
brw_fs_opt_eliminate_find_live_channel eliminates FIND_LIVE_CHANNEL
outside of control flow. None of our optimization passes generate
additional cases of that instruction, so once it's gone, we shouldn't
ever have to run the pass again. Moving it out of the loop should
save a bit of CPU time.
While we're at it, also clean up adjacent BROADCAST instructions that
consume the result of our FIND_LIVE_CHANNEL. Without this, we have
to perform copy propagation to get the MOV 0 immediate into the
BROADCAST, then algebraic to turn it into a MOV, which enables more
copy propagation...not to mention CSE gets involved. Since this
FIND_LIVE_CHANNEL + BROADCAST pattern from emit_uniformize() is
really common, and it's trivial to clean up, we can do that. This
lets the initial copy prop in the loop see MOV instead of BROADCAST.
Zero impact on fossil-db, but less work in the optimization loop.
Together with the previous patches, this cuts compile time in
Borderlands 3 on Alchemist by -1.38539% +/- 0.1632% (n = 24).
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28286>
2024-03-14 00:32:25 -07:00
|
|
|
|
2024-01-04 17:31:42 -08:00
|
|
|
do {
|
|
|
|
|
progress = false;
|
|
|
|
|
pass_num = 0;
|
|
|
|
|
iteration++;
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_algebraic);
|
|
|
|
|
OPT(brw_opt_cse_defs);
|
|
|
|
|
if (!OPT(brw_opt_copy_propagation_defs))
|
|
|
|
|
OPT(brw_opt_copy_propagation);
|
|
|
|
|
OPT(brw_opt_cmod_propagation);
|
|
|
|
|
OPT(brw_opt_dead_code_eliminate);
|
|
|
|
|
OPT(brw_opt_saturate_propagation);
|
|
|
|
|
OPT(brw_opt_register_coalesce);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_compact_virtual_grfs);
|
2024-01-04 17:31:42 -08:00
|
|
|
} while (progress);
|
|
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_OPT_LOOP);
|
|
|
|
|
|
2024-01-04 17:31:42 -08:00
|
|
|
progress = false;
|
|
|
|
|
pass_num = 0;
|
|
|
|
|
|
brw: Combine convergent texture buffer fetches into fewer loads
Borderlands 3 (both DX11 and DX12 renderers) have a common pattern
across many shaders:
con 32x4 %510 = (uint32)txf %2 (handle), %1191 (0x10) (coord), %1 (0x0) (lod), 0 (texture)
con 32x4 %512 = (uint32)txf %2 (handle), %1511 (0x11) (coord), %1 (0x0) (lod), 0 (texture)
...
con 32x4 %550 = (uint32)txf %2 (handle), %1549 (0x25) (coord), %1 (0x0) (lod), 0 (texture)
con 32x4 %552 = (uint32)txf %2 (handle), %1551 (0x26) (coord), %1 (0x0) (lod), 0 (texture)
A single basic block contains piles of texelFetches from a 1D buffer
texture, with constant coordinates. In most cases, only the .x channel
of the result is read. So we have something on the order of 28 sampler
messages, each asking for...a single uint32_t scalar value. Because our
sampler doesn't have any support for convergent block loads (like the
untyped LSC transpose messages for SSBOs)...this means we were emitting
SIMD8/16 (or SIMD16/32 on Xe2) sampler messages for every single scalar,
replicating what's effectively a SIMD1 value to the entire register.
This is hugely wasteful, both in terms of register pressure, and also in
back-and-forth sending and receiving memory messages.
The good news is we can take advantage of our explicit SIMD model to
handle this more efficiently. This patch adds a new optimization pass
that detects a series of SHADER_OPCODE_TXF_LOGICAL, in the same basic
block, with constant offsets, from the same texture. It constructs a
new divergent coordinate where each channel is one of the constants
(i.e <10, 11, 12, ..., 26> in the above example). It issues a new
NoMask divergent texel fetch which loads N useful channels in one go,
and replaces the rest with expansion MOVs that splat the SIMD1 result
back to the full SIMD width. (These get copy propagated away.)
We can pick the SIMD size of the load independently of the native shader
width as well. On Xe2, those 28 convergent loads become a single SIMD32
ld message. On earlier hardware, we use 2 SIMD16 messages. Or we can
use a smaller size when there aren't many to combine.
In fossil-db, this cuts 27% of send messages in affected shaders, 3-6%
of cycles, 2-3% of instructions, and 8-12% of live registers. On A770,
this improves performance of Borderlands 3 by roughly 2.5-3.5%.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32573>
2024-12-09 13:25:18 -08:00
|
|
|
if (OPT(brw_opt_combine_convergent_txf))
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_copy_propagation_defs);
|
brw: Combine convergent texture buffer fetches into fewer loads
Borderlands 3 (both DX11 and DX12 renderers) have a common pattern
across many shaders:
con 32x4 %510 = (uint32)txf %2 (handle), %1191 (0x10) (coord), %1 (0x0) (lod), 0 (texture)
con 32x4 %512 = (uint32)txf %2 (handle), %1511 (0x11) (coord), %1 (0x0) (lod), 0 (texture)
...
con 32x4 %550 = (uint32)txf %2 (handle), %1549 (0x25) (coord), %1 (0x0) (lod), 0 (texture)
con 32x4 %552 = (uint32)txf %2 (handle), %1551 (0x26) (coord), %1 (0x0) (lod), 0 (texture)
A single basic block contains piles of texelFetches from a 1D buffer
texture, with constant coordinates. In most cases, only the .x channel
of the result is read. So we have something on the order of 28 sampler
messages, each asking for...a single uint32_t scalar value. Because our
sampler doesn't have any support for convergent block loads (like the
untyped LSC transpose messages for SSBOs)...this means we were emitting
SIMD8/16 (or SIMD16/32 on Xe2) sampler messages for every single scalar,
replicating what's effectively a SIMD1 value to the entire register.
This is hugely wasteful, both in terms of register pressure, and also in
back-and-forth sending and receiving memory messages.
The good news is we can take advantage of our explicit SIMD model to
handle this more efficiently. This patch adds a new optimization pass
that detects a series of SHADER_OPCODE_TXF_LOGICAL, in the same basic
block, with constant offsets, from the same texture. It constructs a
new divergent coordinate where each channel is one of the constants
(i.e <10, 11, 12, ..., 26> in the above example). It issues a new
NoMask divergent texel fetch which loads N useful channels in one go,
and replaces the rest with expansion MOVs that splat the SIMD1 result
back to the full SIMD width. (These get copy propagated away.)
We can pick the SIMD size of the load independently of the native shader
width as well. On Xe2, those 28 convergent loads become a single SIMD32
ld message. On earlier hardware, we use 2 SIMD16 messages. Or we can
use a smaller size when there aren't many to combine.
In fossil-db, this cuts 27% of send messages in affected shaders, 3-6%
of cycles, 2-3% of instructions, and 8-12% of live registers. On A770,
this improves performance of Borderlands 3 by roughly 2.5-3.5%.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32573>
2024-12-09 13:25:18 -08:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
if (OPT(brw_lower_pack)) {
|
|
|
|
|
OPT(brw_opt_register_coalesce);
|
|
|
|
|
OPT(brw_opt_dead_code_eliminate);
|
2024-01-04 17:31:42 -08:00
|
|
|
}
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_subgroup_ops);
|
|
|
|
|
OPT(brw_lower_csel);
|
|
|
|
|
OPT(brw_lower_simd_width);
|
2024-02-20 22:23:07 -08:00
|
|
|
OPT(brw_lower_scalar_fp64_MAD);
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_barycentrics);
|
|
|
|
|
OPT(brw_lower_logical_sends);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);
|
|
|
|
|
|
2024-01-04 17:31:42 -08:00
|
|
|
/* After logical SEND lowering. */
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
if (!OPT(brw_opt_copy_propagation_defs))
|
|
|
|
|
OPT(brw_opt_copy_propagation);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
|
|
|
|
/* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
|
|
|
|
|
* Do this before splitting SENDs.
|
|
|
|
|
*/
|
2024-12-06 11:37:57 -08:00
|
|
|
if (OPT(brw_opt_zero_samples)) {
|
|
|
|
|
if (!OPT(brw_opt_copy_propagation_defs)) {
|
|
|
|
|
OPT(brw_opt_copy_propagation);
|
2024-10-10 14:11:00 -07:00
|
|
|
}
|
|
|
|
|
}
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-11-20 08:12:52 -08:00
|
|
|
if (s.devinfo->ver >= 30)
|
|
|
|
|
OPT(brw_opt_send_to_send_gather);
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_split_sends);
|
|
|
|
|
OPT(brw_workaround_nomask_control_flow);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
|
|
|
|
if (progress) {
|
brw/opt: Always do both kinds of copy propagation before lower_load_payload
shader-db:
All Intel platforms except Skylake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18092932 -> 18092713 (<.01%)
instructions in affected programs: 139290 -> 139071 (-0.16%)
helped: 103
HURT: 18
helped stats (abs) min: 1 max: 8 x̄: 2.43 x̃: 2
helped stats (rel) min: 0.02% max: 9.09% x̄: 0.73% x̃: 0.29%
HURT stats (abs) min: 1 max: 5 x̄: 1.72 x̃: 1
HURT stats (rel) min: 0.02% max: 0.55% x̄: 0.10% x̃: 0.08%
95% mean confidence interval for instructions value: -2.17 -1.45
95% mean confidence interval for instructions %-change: -0.83% -0.38%
Instructions are helped.
total cycles in shared programs: 922792268 -> 921495900 (-0.14%)
cycles in affected programs: 400296984 -> 399000616 (-0.32%)
helped: 765
HURT: 635
helped stats (abs) min: 2 max: 77018 x̄: 6739.33 x̃: 60
helped stats (rel) min: <.01% max: 35.59% x̄: 1.98% x̃: 0.32%
HURT stats (abs) min: 2 max: 88658 x̄: 6077.51 x̃: 152
HURT stats (rel) min: <.01% max: 51.33% x̄: 2.75% x̃: 0.63%
95% mean confidence interval for cycles value: -1620.41 -231.54
95% mean confidence interval for cycles %-change: -0.10% 0.44%
Inconclusive result (%-change mean confidence interval includes 0).
LOST: 4
GAINED: 3
Skylake
total instructions in shared programs: 18658324 -> 18579715 (-0.42%)
instructions in affected programs: 2089957 -> 2011348 (-3.76%)
helped: 9842
HURT: 23
helped stats (abs) min: 1 max: 24 x̄: 7.99 x̃: 8
helped stats (rel) min: 0.05% max: 40.00% x̄: 5.37% x̃: 4.52%
HURT stats (abs) min: 1 max: 5 x̄: 1.57 x̃: 1
HURT stats (rel) min: 0.02% max: 1.28% x̄: 0.36% x̃: 0.24%
95% mean confidence interval for instructions value: -7.98 -7.95
95% mean confidence interval for instructions %-change: -5.43% -5.29%
Instructions are helped.
total cycles in shared programs: 860031654 -> 860237548 (0.02%)
cycles in affected programs: 449175235 -> 449381129 (0.05%)
helped: 7895
HURT: 4416
helped stats (abs) min: 1 max: 14129 x̄: 113.70 x̃: 22
helped stats (rel) min: <.01% max: 40.95% x̄: 1.31% x̃: 0.56%
HURT stats (abs) min: 1 max: 33397 x̄: 249.89 x̃: 34
HURT stats (rel) min: <.01% max: 67.47% x̄: 2.65% x̃: 0.65%
95% mean confidence interval for cycles value: 1.46 31.98
95% mean confidence interval for cycles %-change: 0.02% 0.19%
Cycles are HURT.
LOST: 557
GAINED: 900
fossil-db:
Lunar Lake
Totals:
Instrs: 141933621 -> 141884681 (-0.03%); split: -0.03%, +0.00%
Cycle count: 21990657282 -> 21990200212 (-0.00%); split: -0.14%, +0.14%
Spill count: 69754 -> 69732 (-0.03%); split: -0.05%, +0.02%
Fill count: 128559 -> 128521 (-0.03%); split: -0.05%, +0.02%
Scratch Memory Size: 5934080 -> 5925888 (-0.14%)
Max live registers: 48021653 -> 48051253 (+0.06%); split: -0.00%, +0.06%
Totals from 13510 (2.45% of 551410) affected shaders:
Instrs: 19497180 -> 19448240 (-0.25%); split: -0.25%, +0.00%
Cycle count: 2455370202 -> 2454913132 (-0.02%); split: -1.25%, +1.23%
Spill count: 10975 -> 10953 (-0.20%); split: -0.32%, +0.12%
Fill count: 21709 -> 21671 (-0.18%); split: -0.28%, +0.10%
Scratch Memory Size: 674816 -> 666624 (-1.21%)
Max live registers: 2502653 -> 2532253 (+1.18%); split: -0.01%, +1.19%
Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 152763523 -> 152772716 (+0.01%); split: -0.00%, +0.01%
Cycle count: 17188701887 -> 17187510768 (-0.01%); split: -0.10%, +0.09%
Spill count: 79280 -> 79279 (-0.00%); split: -0.00%, +0.00%
Fill count: 148809 -> 148803 (-0.00%)
Max live registers: 31879240 -> 31879093 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5559984 -> 5559712 (-0.00%); split: +0.00%, -0.01%
Totals from 20524 (3.24% of 633183) affected shaders:
Instrs: 20366964 -> 20376157 (+0.05%); split: -0.01%, +0.05%
Cycle count: 2406162382 -> 2404971263 (-0.05%); split: -0.68%, +0.63%
Spill count: 19935 -> 19934 (-0.01%); split: -0.02%, +0.01%
Fill count: 34487 -> 34481 (-0.02%)
Max live registers: 1745598 -> 1745451 (-0.01%); split: -0.01%, +0.01%
Max dispatch width: 117992 -> 117720 (-0.23%); split: +0.03%, -0.26%
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150694108 -> 150683859 (-0.01%); split: -0.01%, +0.00%
Cycle count: 15526754059 -> 15529031079 (+0.01%); split: -0.10%, +0.12%
Max live registers: 31791599 -> 31791441 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5569488 -> 5569296 (-0.00%); split: +0.00%, -0.01%
Totals from 15000 (2.37% of 632406) affected shaders:
Instrs: 10965577 -> 10955328 (-0.09%); split: -0.11%, +0.02%
Cycle count: 2025347115 -> 2027624135 (+0.11%); split: -0.80%, +0.91%
Max live registers: 983373 -> 983215 (-0.02%); split: -0.02%, +0.00%
Max dispatch width: 83064 -> 82872 (-0.23%); split: +0.12%, -0.35%
Skylake
Totals:
Instrs: 140588784 -> 140413758 (-0.12%); split: -0.13%, +0.00%
Cycle count: 14724286265 -> 14723402393 (-0.01%); split: -0.04%, +0.04%
Fill count: 100130 -> 100129 (-0.00%)
Max live registers: 31418029 -> 31417146 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5513400 -> 5535192 (+0.40%); split: +0.89%, -0.49%
Totals from 39733 (6.35% of 625986) affected shaders:
Instrs: 17240737 -> 17065711 (-1.02%); split: -1.02%, +0.01%
Cycle count: 1994668203 -> 1993784331 (-0.04%); split: -0.31%, +0.27%
Fill count: 44481 -> 44480 (-0.00%)
Max live registers: 2766781 -> 2765898 (-0.03%); split: -0.03%, +0.00%
Max dispatch width: 210600 -> 232392 (+10.35%); split: +23.23%, -12.89%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32041>
2024-10-30 19:41:02 -07:00
|
|
|
/* Do both forms of copy propagation because it is important to
|
|
|
|
|
* eliminate as many cases of load_payload-of-load_payload as possible.
|
|
|
|
|
*/
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_copy_propagation_defs);
|
|
|
|
|
OPT(brw_opt_copy_propagation);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
|
|
|
|
/* Run after logical send lowering to give it a chance to CSE the
|
|
|
|
|
* LOAD_PAYLOAD instructions created to construct the payloads of
|
|
|
|
|
* e.g. texturing messages in cases where it wasn't possible to CSE the
|
|
|
|
|
* whole logical instruction.
|
|
|
|
|
*/
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_cse_defs);
|
|
|
|
|
OPT(brw_opt_register_coalesce);
|
|
|
|
|
OPT(brw_opt_dead_code_eliminate);
|
2024-01-04 17:31:42 -08:00
|
|
|
}
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_remove_redundant_halts);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
if (OPT(brw_lower_load_payload)) {
|
|
|
|
|
OPT(brw_opt_split_virtual_grfs);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_register_coalesce);
|
|
|
|
|
OPT(brw_lower_simd_width);
|
|
|
|
|
OPT(brw_opt_dead_code_eliminate);
|
2024-01-04 17:31:42 -08:00
|
|
|
}
|
|
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_alu_restrictions);
|
2024-03-18 22:52:35 -07:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_combine_constants);
|
|
|
|
|
if (OPT(brw_lower_integer_multiplication)) {
|
2024-01-04 17:31:42 -08:00
|
|
|
/* If lower_integer_multiplication made progress, it may have produced
|
|
|
|
|
* some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
|
|
|
|
|
* one more time to clean those up if they exist.
|
|
|
|
|
*/
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_integer_multiplication);
|
2024-01-04 17:31:42 -08:00
|
|
|
}
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_sub_sat);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
|
|
|
|
progress = false;
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_derivatives);
|
|
|
|
|
OPT(brw_lower_regioning);
|
brw/opt: Always do copy prop, DCE, and register coalesce after lower_regioning
shader-db:
Lunar Lake
total instructions in shared programs: 18100289 -> 18083853 (-0.09%)
instructions in affected programs: 790048 -> 773612 (-2.08%)
helped: 3058 / HURT: 1
total cycles in shared programs: 921691992 -> 921293816 (-0.04%)
cycles in affected programs: 37210762 -> 36812586 (-1.07%)
helped: 2329 / HURT: 624
LOST: 27
GAINED: 26
Meteor Lake, DG2, Tiger Lake, and Ice Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19825635 -> 19821391 (-0.02%)
instructions in affected programs: 138675 -> 134431 (-3.06%)
helped: 877 / HURT: 0
total cycles in shared programs: 907900598 -> 907885713 (<.01%)
cycles in affected programs: 7127161 -> 7112276 (-0.21%)
helped: 318 / HURT: 242
total spills in shared programs: 5790 -> 5758 (-0.55%)
spills in affected programs: 660 -> 628 (-4.85%)
helped: 8 / HURT: 0
total fills in shared programs: 6744 -> 6712 (-0.47%)
fills in affected programs: 708 -> 676 (-4.52%)
helped: 8 / HURT: 0
LOST: 10
GAINED: 0
Skylake
total instructions in shared programs: 18722197 -> 18637637 (-0.45%)
instructions in affected programs: 2757553 -> 2672993 (-3.07%)
helped: 12290 / HURT: 1
total cycles in shared programs: 859716039 -> 859432560 (-0.03%)
cycles in affected programs: 113731837 -> 113448358 (-0.25%)
helped: 9555 / HURT: 2422
LOST: 265
GAINED: 714
fossil-db:
Lunar Lake, Meteor Lake, and DG2 had similar results. (Lunar Lake shown)
Totals:
Instrs: 142000618 -> 141928331 (-0.05%); split: -0.05%, +0.00%
Subgroup size: 10995136 -> 10995072 (-0.00%)
Cycle count: 21994723230 -> 21990481140 (-0.02%); split: -0.08%, +0.06%
Spill count: 69911 -> 69754 (-0.22%); split: -0.23%, +0.00%
Fill count: 128723 -> 128559 (-0.13%); split: -0.15%, +0.02%
Scratch Memory Size: 5936128 -> 5934080 (-0.03%)
Max live registers: 48006880 -> 48020936 (+0.03%); split: -0.01%, +0.04%
Totals from 17450 (3.16% of 551410) affected shaders:
Instrs: 14984149 -> 14911862 (-0.48%); split: -0.48%, +0.00%
Subgroup size: 365744 -> 365680 (-0.02%)
Cycle count: 2585095128 -> 2580853038 (-0.16%); split: -0.71%, +0.54%
Spill count: 20893 -> 20736 (-0.75%); split: -0.76%, +0.00%
Fill count: 44181 -> 44017 (-0.37%); split: -0.44%, +0.07%
Scratch Memory Size: 995328 -> 993280 (-0.21%)
Max live registers: 2378069 -> 2392125 (+0.59%); split: -0.20%, +0.79%
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150719758 -> 150676269 (-0.03%); split: -0.04%, +0.01%
Subgroup size: 7764560 -> 7764632 (+0.00%)
Cycle count: 15526689814 -> 15525687740 (-0.01%); split: -0.03%, +0.02%
Spill count: 60120 -> 59472 (-1.08%); split: -1.17%, +0.10%
Fill count: 105973 -> 104675 (-1.22%); split: -1.40%, +0.17%
Scratch Memory Size: 2396160 -> 2381824 (-0.60%); split: -0.73%, +0.13%
Max live registers: 31782879 -> 31788857 (+0.02%); split: -0.01%, +0.03%
Max dispatch width: 5569200 -> 5569344 (+0.00%); split: +0.00%, -0.00%
Totals from 10089 (1.60% of 632405) affected shaders:
Instrs: 6389866 -> 6346377 (-0.68%); split: -0.87%, +0.19%
Subgroup size: 102912 -> 102984 (+0.07%)
Cycle count: 681310278 -> 680308204 (-0.15%); split: -0.65%, +0.51%
Spill count: 19571 -> 18923 (-3.31%); split: -3.61%, +0.30%
Fill count: 38229 -> 36931 (-3.40%); split: -3.88%, +0.48%
Scratch Memory Size: 808960 -> 794624 (-1.77%); split: -2.15%, +0.38%
Max live registers: 677473 -> 683451 (+0.88%); split: -0.45%, +1.33%
Max dispatch width: 88672 -> 88816 (+0.16%); split: +0.27%, -0.11%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32041>
2024-10-30 09:56:48 -07:00
|
|
|
|
|
|
|
|
/* Try both copy propagation passes. The defs one will likely not be
|
|
|
|
|
* able to handle everything at this point.
|
|
|
|
|
*/
|
2024-12-06 11:37:57 -08:00
|
|
|
const bool cp1 = OPT(brw_opt_copy_propagation_defs);
|
|
|
|
|
const bool cp2 = OPT(brw_opt_copy_propagation);
|
brw/opt: Always do copy prop, DCE, and register coalesce after lower_regioning
shader-db:
Lunar Lake
total instructions in shared programs: 18100289 -> 18083853 (-0.09%)
instructions in affected programs: 790048 -> 773612 (-2.08%)
helped: 3058 / HURT: 1
total cycles in shared programs: 921691992 -> 921293816 (-0.04%)
cycles in affected programs: 37210762 -> 36812586 (-1.07%)
helped: 2329 / HURT: 624
LOST: 27
GAINED: 26
Meteor Lake, DG2, Tiger Lake, and Ice Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19825635 -> 19821391 (-0.02%)
instructions in affected programs: 138675 -> 134431 (-3.06%)
helped: 877 / HURT: 0
total cycles in shared programs: 907900598 -> 907885713 (<.01%)
cycles in affected programs: 7127161 -> 7112276 (-0.21%)
helped: 318 / HURT: 242
total spills in shared programs: 5790 -> 5758 (-0.55%)
spills in affected programs: 660 -> 628 (-4.85%)
helped: 8 / HURT: 0
total fills in shared programs: 6744 -> 6712 (-0.47%)
fills in affected programs: 708 -> 676 (-4.52%)
helped: 8 / HURT: 0
LOST: 10
GAINED: 0
Skylake
total instructions in shared programs: 18722197 -> 18637637 (-0.45%)
instructions in affected programs: 2757553 -> 2672993 (-3.07%)
helped: 12290 / HURT: 1
total cycles in shared programs: 859716039 -> 859432560 (-0.03%)
cycles in affected programs: 113731837 -> 113448358 (-0.25%)
helped: 9555 / HURT: 2422
LOST: 265
GAINED: 714
fossil-db:
Lunar Lake, Meteor Lake, and DG2 had similar results. (Lunar Lake shown)
Totals:
Instrs: 142000618 -> 141928331 (-0.05%); split: -0.05%, +0.00%
Subgroup size: 10995136 -> 10995072 (-0.00%)
Cycle count: 21994723230 -> 21990481140 (-0.02%); split: -0.08%, +0.06%
Spill count: 69911 -> 69754 (-0.22%); split: -0.23%, +0.00%
Fill count: 128723 -> 128559 (-0.13%); split: -0.15%, +0.02%
Scratch Memory Size: 5936128 -> 5934080 (-0.03%)
Max live registers: 48006880 -> 48020936 (+0.03%); split: -0.01%, +0.04%
Totals from 17450 (3.16% of 551410) affected shaders:
Instrs: 14984149 -> 14911862 (-0.48%); split: -0.48%, +0.00%
Subgroup size: 365744 -> 365680 (-0.02%)
Cycle count: 2585095128 -> 2580853038 (-0.16%); split: -0.71%, +0.54%
Spill count: 20893 -> 20736 (-0.75%); split: -0.76%, +0.00%
Fill count: 44181 -> 44017 (-0.37%); split: -0.44%, +0.07%
Scratch Memory Size: 995328 -> 993280 (-0.21%)
Max live registers: 2378069 -> 2392125 (+0.59%); split: -0.20%, +0.79%
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150719758 -> 150676269 (-0.03%); split: -0.04%, +0.01%
Subgroup size: 7764560 -> 7764632 (+0.00%)
Cycle count: 15526689814 -> 15525687740 (-0.01%); split: -0.03%, +0.02%
Spill count: 60120 -> 59472 (-1.08%); split: -1.17%, +0.10%
Fill count: 105973 -> 104675 (-1.22%); split: -1.40%, +0.17%
Scratch Memory Size: 2396160 -> 2381824 (-0.60%); split: -0.73%, +0.13%
Max live registers: 31782879 -> 31788857 (+0.02%); split: -0.01%, +0.03%
Max dispatch width: 5569200 -> 5569344 (+0.00%); split: +0.00%, -0.00%
Totals from 10089 (1.60% of 632405) affected shaders:
Instrs: 6389866 -> 6346377 (-0.68%); split: -0.87%, +0.19%
Subgroup size: 102912 -> 102984 (+0.07%)
Cycle count: 681310278 -> 680308204 (-0.15%); split: -0.65%, +0.51%
Spill count: 19571 -> 18923 (-3.31%); split: -3.61%, +0.30%
Fill count: 38229 -> 36931 (-3.40%); split: -3.88%, +0.48%
Scratch Memory Size: 808960 -> 794624 (-1.77%); split: -2.15%, +0.38%
Max live registers: 677473 -> 683451 (+0.88%); split: -0.45%, +1.33%
Max dispatch width: 88672 -> 88816 (+0.16%); split: +0.27%, -0.11%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32041>
2024-10-30 09:56:48 -07:00
|
|
|
if (cp1 || cp2)
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_combine_constants);
|
brw/opt: Always do copy prop, DCE, and register coalesce after lower_regioning
shader-db:
Lunar Lake
total instructions in shared programs: 18100289 -> 18083853 (-0.09%)
instructions in affected programs: 790048 -> 773612 (-2.08%)
helped: 3058 / HURT: 1
total cycles in shared programs: 921691992 -> 921293816 (-0.04%)
cycles in affected programs: 37210762 -> 36812586 (-1.07%)
helped: 2329 / HURT: 624
LOST: 27
GAINED: 26
Meteor Lake, DG2, Tiger Lake, and Ice Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19825635 -> 19821391 (-0.02%)
instructions in affected programs: 138675 -> 134431 (-3.06%)
helped: 877 / HURT: 0
total cycles in shared programs: 907900598 -> 907885713 (<.01%)
cycles in affected programs: 7127161 -> 7112276 (-0.21%)
helped: 318 / HURT: 242
total spills in shared programs: 5790 -> 5758 (-0.55%)
spills in affected programs: 660 -> 628 (-4.85%)
helped: 8 / HURT: 0
total fills in shared programs: 6744 -> 6712 (-0.47%)
fills in affected programs: 708 -> 676 (-4.52%)
helped: 8 / HURT: 0
LOST: 10
GAINED: 0
Skylake
total instructions in shared programs: 18722197 -> 18637637 (-0.45%)
instructions in affected programs: 2757553 -> 2672993 (-3.07%)
helped: 12290 / HURT: 1
total cycles in shared programs: 859716039 -> 859432560 (-0.03%)
cycles in affected programs: 113731837 -> 113448358 (-0.25%)
helped: 9555 / HURT: 2422
LOST: 265
GAINED: 714
fossil-db:
Lunar Lake, Meteor Lake, and DG2 had similar results. (Lunar Lake shown)
Totals:
Instrs: 142000618 -> 141928331 (-0.05%); split: -0.05%, +0.00%
Subgroup size: 10995136 -> 10995072 (-0.00%)
Cycle count: 21994723230 -> 21990481140 (-0.02%); split: -0.08%, +0.06%
Spill count: 69911 -> 69754 (-0.22%); split: -0.23%, +0.00%
Fill count: 128723 -> 128559 (-0.13%); split: -0.15%, +0.02%
Scratch Memory Size: 5936128 -> 5934080 (-0.03%)
Max live registers: 48006880 -> 48020936 (+0.03%); split: -0.01%, +0.04%
Totals from 17450 (3.16% of 551410) affected shaders:
Instrs: 14984149 -> 14911862 (-0.48%); split: -0.48%, +0.00%
Subgroup size: 365744 -> 365680 (-0.02%)
Cycle count: 2585095128 -> 2580853038 (-0.16%); split: -0.71%, +0.54%
Spill count: 20893 -> 20736 (-0.75%); split: -0.76%, +0.00%
Fill count: 44181 -> 44017 (-0.37%); split: -0.44%, +0.07%
Scratch Memory Size: 995328 -> 993280 (-0.21%)
Max live registers: 2378069 -> 2392125 (+0.59%); split: -0.20%, +0.79%
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150719758 -> 150676269 (-0.03%); split: -0.04%, +0.01%
Subgroup size: 7764560 -> 7764632 (+0.00%)
Cycle count: 15526689814 -> 15525687740 (-0.01%); split: -0.03%, +0.02%
Spill count: 60120 -> 59472 (-1.08%); split: -1.17%, +0.10%
Fill count: 105973 -> 104675 (-1.22%); split: -1.40%, +0.17%
Scratch Memory Size: 2396160 -> 2381824 (-0.60%); split: -0.73%, +0.13%
Max live registers: 31782879 -> 31788857 (+0.02%); split: -0.01%, +0.03%
Max dispatch width: 5569200 -> 5569344 (+0.00%); split: +0.00%, -0.00%
Totals from 10089 (1.60% of 632405) affected shaders:
Instrs: 6389866 -> 6346377 (-0.68%); split: -0.87%, +0.19%
Subgroup size: 102912 -> 102984 (+0.07%)
Cycle count: 681310278 -> 680308204 (-0.15%); split: -0.65%, +0.51%
Spill count: 19571 -> 18923 (-3.31%); split: -3.61%, +0.30%
Fill count: 38229 -> 36931 (-3.40%); split: -3.88%, +0.48%
Scratch Memory Size: 808960 -> 794624 (-1.77%); split: -2.15%, +0.38%
Max live registers: 677473 -> 683451 (+0.88%); split: -0.45%, +1.33%
Max dispatch width: 88672 -> 88816 (+0.16%); split: +0.27%, -0.11%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32041>
2024-10-30 09:56:48 -07:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_opt_dead_code_eliminate);
|
|
|
|
|
OPT(brw_opt_register_coalesce);
|
brw/opt: Always do copy prop, DCE, and register coalesce after lower_regioning
shader-db:
Lunar Lake
total instructions in shared programs: 18100289 -> 18083853 (-0.09%)
instructions in affected programs: 790048 -> 773612 (-2.08%)
helped: 3058 / HURT: 1
total cycles in shared programs: 921691992 -> 921293816 (-0.04%)
cycles in affected programs: 37210762 -> 36812586 (-1.07%)
helped: 2329 / HURT: 624
LOST: 27
GAINED: 26
Meteor Lake, DG2, Tiger Lake, and Ice Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19825635 -> 19821391 (-0.02%)
instructions in affected programs: 138675 -> 134431 (-3.06%)
helped: 877 / HURT: 0
total cycles in shared programs: 907900598 -> 907885713 (<.01%)
cycles in affected programs: 7127161 -> 7112276 (-0.21%)
helped: 318 / HURT: 242
total spills in shared programs: 5790 -> 5758 (-0.55%)
spills in affected programs: 660 -> 628 (-4.85%)
helped: 8 / HURT: 0
total fills in shared programs: 6744 -> 6712 (-0.47%)
fills in affected programs: 708 -> 676 (-4.52%)
helped: 8 / HURT: 0
LOST: 10
GAINED: 0
Skylake
total instructions in shared programs: 18722197 -> 18637637 (-0.45%)
instructions in affected programs: 2757553 -> 2672993 (-3.07%)
helped: 12290 / HURT: 1
total cycles in shared programs: 859716039 -> 859432560 (-0.03%)
cycles in affected programs: 113731837 -> 113448358 (-0.25%)
helped: 9555 / HURT: 2422
LOST: 265
GAINED: 714
fossil-db:
Lunar Lake, Meteor Lake, and DG2 had similar results. (Lunar Lake shown)
Totals:
Instrs: 142000618 -> 141928331 (-0.05%); split: -0.05%, +0.00%
Subgroup size: 10995136 -> 10995072 (-0.00%)
Cycle count: 21994723230 -> 21990481140 (-0.02%); split: -0.08%, +0.06%
Spill count: 69911 -> 69754 (-0.22%); split: -0.23%, +0.00%
Fill count: 128723 -> 128559 (-0.13%); split: -0.15%, +0.02%
Scratch Memory Size: 5936128 -> 5934080 (-0.03%)
Max live registers: 48006880 -> 48020936 (+0.03%); split: -0.01%, +0.04%
Totals from 17450 (3.16% of 551410) affected shaders:
Instrs: 14984149 -> 14911862 (-0.48%); split: -0.48%, +0.00%
Subgroup size: 365744 -> 365680 (-0.02%)
Cycle count: 2585095128 -> 2580853038 (-0.16%); split: -0.71%, +0.54%
Spill count: 20893 -> 20736 (-0.75%); split: -0.76%, +0.00%
Fill count: 44181 -> 44017 (-0.37%); split: -0.44%, +0.07%
Scratch Memory Size: 995328 -> 993280 (-0.21%)
Max live registers: 2378069 -> 2392125 (+0.59%); split: -0.20%, +0.79%
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150719758 -> 150676269 (-0.03%); split: -0.04%, +0.01%
Subgroup size: 7764560 -> 7764632 (+0.00%)
Cycle count: 15526689814 -> 15525687740 (-0.01%); split: -0.03%, +0.02%
Spill count: 60120 -> 59472 (-1.08%); split: -1.17%, +0.10%
Fill count: 105973 -> 104675 (-1.22%); split: -1.40%, +0.17%
Scratch Memory Size: 2396160 -> 2381824 (-0.60%); split: -0.73%, +0.13%
Max live registers: 31782879 -> 31788857 (+0.02%); split: -0.01%, +0.03%
Max dispatch width: 5569200 -> 5569344 (+0.00%); split: +0.00%, -0.00%
Totals from 10089 (1.60% of 632405) affected shaders:
Instrs: 6389866 -> 6346377 (-0.68%); split: -0.87%, +0.19%
Subgroup size: 102912 -> 102984 (+0.07%)
Cycle count: 681310278 -> 680308204 (-0.15%); split: -0.65%, +0.51%
Spill count: 19571 -> 18923 (-3.31%); split: -3.61%, +0.30%
Fill count: 38229 -> 36931 (-3.40%); split: -3.88%, +0.48%
Scratch Memory Size: 808960 -> 794624 (-1.77%); split: -2.15%, +0.38%
Max live registers: 677473 -> 683451 (+0.88%); split: -0.45%, +1.33%
Max dispatch width: 88672 -> 88816 (+0.16%); split: +0.27%, -0.11%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32041>
2024-10-30 09:56:48 -07:00
|
|
|
|
|
|
|
|
if (progress)
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_simd_width);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
2025-01-14 10:49:51 -08:00
|
|
|
if (s.devinfo->ver >= 30)
|
|
|
|
|
OPT(brw_opt_send_gather_to_send);
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_uniform_pull_constant_loads);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
brw: move final send lowering up into the IR
Because we do emit the final send message form in code generation, a
lot of emissions look like this :
add(8) vgrf0, u0, 0x100
mov(1) a0.1, vgrf0 # emitted by the generator
send(8) ..., a0.1
By moving address register manipulation in the IR, we can get this
down to :
add(1) a0.1, u0, 0x100
send(8) ..., a0.1
This reduce register pressure around some send messages by 1 vgrf.
All lost shaders in the below results are fragment SIMD32, due to the
throughput estimator. If turned off, we loose no SIMD32 shaders with
this change.
DG2 results:
Assassin's Creed Valhalla:
Totals from 2044 (96.87% of 2110) affected shaders:
Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00%
Subgroup size: 23832 -> 23824 (-0.03%)
Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82%
Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39%
Fill count: 2005 -> 1256 (-37.36%)
Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00%
Max live registers: 116765 -> 115058 (-1.46%)
Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67%
Cyberpunk 2077:
Totals from 1181 (93.43% of 1264) affected shaders:
Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01%
Subgroup size: 13016 -> 13032 (+0.12%)
Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39%
Spill count: 12 -> 8 (-33.33%)
Fill count: 9 -> 6 (-33.33%)
Dota2:
Totals from 173 (11.59% of 1493) affected shaders:
Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34%
Max live registers: 5787 -> 5779 (-0.14%)
Max dispatch width: 1344 -> 1152 (-14.29%)
Hitman3:
Totals from 5072 (95.39% of 5317) affected shaders:
Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00%
Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48%
Spill count: 3942 -> 3200 (-18.82%)
Fill count: 10158 -> 8846 (-12.92%)
Scratch Memory Size: 257024 -> 223232 (-13.15%)
Max live registers: 328467 -> 324631 (-1.17%)
Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73%
Fortnite:
Totals from 360 (4.82% of 7472) affected shaders:
Instrs: 778068 -> 777925 (-0.02%)
Subgroup size: 3128 -> 3136 (+0.26%)
Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19%
Max live registers: 50689 -> 50658 (-0.06%)
Hogwarts Legacy:
Totals from 1376 (84.00% of 1638) affected shaders:
Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03%
Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12%
Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36%
Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23%
Scratch Memory Size: 99328 -> 89088 (-10.31%)
Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23%
Max dispatch width: 11848 -> 11920 (+0.61%)
Metro Exodus:
Totals from 92 (0.21% of 43072) affected shaders:
Instrs: 262995 -> 262968 (-0.01%)
Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25%
Max live registers: 11152 -> 11140 (-0.11%)
Red Dead Redemption 2 :
Totals from 451 (7.71% of 5847) affected shaders:
Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00%
Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00%
Max live registers: 42294 -> 42185 (-0.26%)
Spiderman Remastered:
Totals from 6820 (98.02% of 6958) affected shaders:
Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65%
Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25%
Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61%
Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58%
Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74%
Max live registers: 493149 -> 487458 (-1.15%)
Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20%
Strange Brigade:
Totals from 3769 (91.21% of 4132) affected shaders:
Instrs: 1354476 -> 1321474 (-2.44%)
Cycle count: 25351530 -> 25339190 (-0.05%); split: -1.64%, +1.59%
Max live registers: 199057 -> 193656 (-2.71%)
Max dispatch width: 30272 -> 30240 (-0.11%)
Witcher 3:
Totals from 25 (2.40% of 1041) affected shaders:
Instrs: 24621 -> 24606 (-0.06%)
Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05%
Max live registers: 1963 -> 1955 (-0.41%)
LNL results:
Assassin's Creed Valhalla:
Totals from 1928 (98.02% of 1967) affected shaders:
Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11%
Subgroup size: 41264 -> 41280 (+0.04%)
Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11%
Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90%
Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60%
Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56%
Max live registers: 205483 -> 202192 (-1.60%)
Cyberpunk 2077:
Totals from 1177 (96.40% of 1221) affected shaders:
Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03%
Subgroup size: 24912 -> 24944 (+0.13%)
Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81%
Spill count: 8 -> 3 (-62.50%)
Fill count: 6 -> 3 (-50.00%)
Max live registers: 126922 -> 125472 (-1.14%)
Dota2:
Totals from 428 (32.47% of 1318) affected shaders:
Instrs: 89355 -> 89740 (+0.43%)
Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55%
Max live registers: 32863 -> 32847 (-0.05%)
Fortnite:
Totals from 5354 (81.72% of 6552) affected shaders:
Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53%
Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65%
Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72%
Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35%
Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71%
Hitman3:
Totals from 4912 (97.09% of 5059) affected shaders:
Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00%
Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55%
Spill count: 3739 -> 3136 (-16.13%)
Fill count: 10657 -> 9564 (-10.26%)
Scratch Memory Size: 373760 -> 318464 (-14.79%)
Max live registers: 597566 -> 589460 (-1.36%)
Hogwarts Legacy:
Totals from 1471 (96.33% of 1527) affected shaders:
Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05%
Cycle count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68%
Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95%
Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83%
Scratch Memory Size: 251904 -> 217088 (-13.82%)
Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12%
Metro Exodus:
Totals from 18356 (49.81% of 36854) affected shaders:
Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83%
Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84%
Spill count: 595 -> 546 (-8.24%)
Fill count: 1604 -> 1408 (-12.22%)
Max live registers: 2086937 -> 2086933 (-0.00%)
Red Dead Redemption 2:
Totals from 4171 (79.31% of 5259) affected shaders:
Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83%
Subgroup size: 86416 -> 86432 (+0.02%)
Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53%
Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59%
Scratch Memory Size: 401408 -> 385024 (-4.08%)
Spiderman Remastered:
Totals from 6639 (98.94% of 6710) affected shaders:
Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98%
Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59%
Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82%
Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76%
Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17%
Max live registers: 918240 -> 906604 (-1.27%)
Strange Brigade:
Totals from 3675 (92.24% of 3984) affected shaders:
Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00%
Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09%
Max live registers: 361849 -> 351265 (-2.92%)
Witcher 3:
Totals from 13 (46.43% of 28) affected shaders:
Instrs: 593 -> 660 (+11.30%)
Cycle count: 28302 -> 28714 (+1.46%)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
2024-02-29 20:51:50 +02:00
|
|
|
if (OPT(brw_lower_send_descriptors)) {
|
|
|
|
|
/* No need for standard copy_propagation since
|
2024-12-07 10:54:40 -08:00
|
|
|
* brw_opt_address_reg_load will only optimize defs.
|
brw: move final send lowering up into the IR
Because we do emit the final send message form in code generation, a
lot of emissions look like this :
add(8) vgrf0, u0, 0x100
mov(1) a0.1, vgrf0 # emitted by the generator
send(8) ..., a0.1
By moving address register manipulation in the IR, we can get this
down to :
add(1) a0.1, u0, 0x100
send(8) ..., a0.1
This reduce register pressure around some send messages by 1 vgrf.
All lost shaders in the below results are fragment SIMD32, due to the
throughput estimator. If turned off, we loose no SIMD32 shaders with
this change.
DG2 results:
Assassin's Creed Valhalla:
Totals from 2044 (96.87% of 2110) affected shaders:
Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00%
Subgroup size: 23832 -> 23824 (-0.03%)
Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82%
Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39%
Fill count: 2005 -> 1256 (-37.36%)
Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00%
Max live registers: 116765 -> 115058 (-1.46%)
Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67%
Cyberpunk 2077:
Totals from 1181 (93.43% of 1264) affected shaders:
Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01%
Subgroup size: 13016 -> 13032 (+0.12%)
Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39%
Spill count: 12 -> 8 (-33.33%)
Fill count: 9 -> 6 (-33.33%)
Dota2:
Totals from 173 (11.59% of 1493) affected shaders:
Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34%
Max live registers: 5787 -> 5779 (-0.14%)
Max dispatch width: 1344 -> 1152 (-14.29%)
Hitman3:
Totals from 5072 (95.39% of 5317) affected shaders:
Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00%
Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48%
Spill count: 3942 -> 3200 (-18.82%)
Fill count: 10158 -> 8846 (-12.92%)
Scratch Memory Size: 257024 -> 223232 (-13.15%)
Max live registers: 328467 -> 324631 (-1.17%)
Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73%
Fortnite:
Totals from 360 (4.82% of 7472) affected shaders:
Instrs: 778068 -> 777925 (-0.02%)
Subgroup size: 3128 -> 3136 (+0.26%)
Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19%
Max live registers: 50689 -> 50658 (-0.06%)
Hogwarts Legacy:
Totals from 1376 (84.00% of 1638) affected shaders:
Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03%
Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12%
Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36%
Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23%
Scratch Memory Size: 99328 -> 89088 (-10.31%)
Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23%
Max dispatch width: 11848 -> 11920 (+0.61%)
Metro Exodus:
Totals from 92 (0.21% of 43072) affected shaders:
Instrs: 262995 -> 262968 (-0.01%)
Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25%
Max live registers: 11152 -> 11140 (-0.11%)
Red Dead Redemption 2 :
Totals from 451 (7.71% of 5847) affected shaders:
Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00%
Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00%
Max live registers: 42294 -> 42185 (-0.26%)
Spiderman Remastered:
Totals from 6820 (98.02% of 6958) affected shaders:
Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65%
Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25%
Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61%
Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58%
Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74%
Max live registers: 493149 -> 487458 (-1.15%)
Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20%
Strange Brigade:
Totals from 3769 (91.21% of 4132) affected shaders:
Instrs: 1354476 -> 1321474 (-2.44%)
Cycle count: 25351530 -> 25339190 (-0.05%); split: -1.64%, +1.59%
Max live registers: 199057 -> 193656 (-2.71%)
Max dispatch width: 30272 -> 30240 (-0.11%)
Witcher 3:
Totals from 25 (2.40% of 1041) affected shaders:
Instrs: 24621 -> 24606 (-0.06%)
Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05%
Max live registers: 1963 -> 1955 (-0.41%)
LNL results:
Assassin's Creed Valhalla:
Totals from 1928 (98.02% of 1967) affected shaders:
Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11%
Subgroup size: 41264 -> 41280 (+0.04%)
Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11%
Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90%
Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60%
Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56%
Max live registers: 205483 -> 202192 (-1.60%)
Cyberpunk 2077:
Totals from 1177 (96.40% of 1221) affected shaders:
Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03%
Subgroup size: 24912 -> 24944 (+0.13%)
Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81%
Spill count: 8 -> 3 (-62.50%)
Fill count: 6 -> 3 (-50.00%)
Max live registers: 126922 -> 125472 (-1.14%)
Dota2:
Totals from 428 (32.47% of 1318) affected shaders:
Instrs: 89355 -> 89740 (+0.43%)
Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55%
Max live registers: 32863 -> 32847 (-0.05%)
Fortnite:
Totals from 5354 (81.72% of 6552) affected shaders:
Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53%
Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65%
Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72%
Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35%
Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71%
Hitman3:
Totals from 4912 (97.09% of 5059) affected shaders:
Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00%
Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55%
Spill count: 3739 -> 3136 (-16.13%)
Fill count: 10657 -> 9564 (-10.26%)
Scratch Memory Size: 373760 -> 318464 (-14.79%)
Max live registers: 597566 -> 589460 (-1.36%)
Hogwarts Legacy:
Totals from 1471 (96.33% of 1527) affected shaders:
Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05%
Cycle count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68%
Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95%
Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83%
Scratch Memory Size: 251904 -> 217088 (-13.82%)
Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12%
Metro Exodus:
Totals from 18356 (49.81% of 36854) affected shaders:
Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83%
Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84%
Spill count: 595 -> 546 (-8.24%)
Fill count: 1604 -> 1408 (-12.22%)
Max live registers: 2086937 -> 2086933 (-0.00%)
Red Dead Redemption 2:
Totals from 4171 (79.31% of 5259) affected shaders:
Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83%
Subgroup size: 86416 -> 86432 (+0.02%)
Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53%
Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59%
Scratch Memory Size: 401408 -> 385024 (-4.08%)
Spiderman Remastered:
Totals from 6639 (98.94% of 6710) affected shaders:
Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98%
Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59%
Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82%
Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76%
Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17%
Max live registers: 918240 -> 906604 (-1.27%)
Strange Brigade:
Totals from 3675 (92.24% of 3984) affected shaders:
Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00%
Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09%
Max live registers: 361849 -> 351265 (-2.92%)
Witcher 3:
Totals from 13 (46.43% of 28) affected shaders:
Instrs: 593 -> 660 (+11.30%)
Cycle count: 28302 -> 28714 (+1.46%)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
2024-02-29 20:51:50 +02:00
|
|
|
*/
|
|
|
|
|
if (OPT(brw_opt_copy_propagation_defs))
|
|
|
|
|
OPT(brw_opt_algebraic);
|
|
|
|
|
OPT(brw_opt_address_reg_load);
|
|
|
|
|
OPT(brw_opt_dead_code_eliminate);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
OPT(brw_lower_sends_overlapping_payload);
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_indirect_mov);
|
2024-05-21 12:56:50 -07:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_find_live_channel);
|
2024-02-24 01:24:03 -08:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
OPT(brw_lower_load_subgroup_invocation);
|
2024-08-27 10:16:11 -07:00
|
|
|
|
|
|
|
|
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_LATE_LOWERING);
|
2024-01-04 17:31:42 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static unsigned
|
2024-12-07 00:23:07 -08:00
|
|
|
load_payload_sources_read_for_size(brw_inst *lp, unsigned size_read)
|
2024-01-04 17:31:42 -08:00
|
|
|
{
|
|
|
|
|
assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
|
|
|
|
|
assert(size_read >= lp->header_size * REG_SIZE);
|
|
|
|
|
|
|
|
|
|
unsigned i;
|
|
|
|
|
unsigned size = lp->header_size * REG_SIZE;
|
|
|
|
|
for (i = lp->header_size; size < size_read && i < lp->sources; i++)
|
2024-04-21 00:57:59 -07:00
|
|
|
size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
|
|
|
|
/* Size read must cover exactly a subset of sources. */
|
|
|
|
|
assert(size == size_read);
|
|
|
|
|
return i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Optimize sample messages that have constant zero values for the trailing
|
|
|
|
|
* parameters. We can just reduce the message length for these
|
|
|
|
|
* instructions instead of reserving a register for it. Trailing parameters
|
|
|
|
|
* that aren't sent default to zero anyway. This will cause the dead code
|
|
|
|
|
* eliminator to remove the MOV instruction that would otherwise be emitted to
|
|
|
|
|
* set up the zero value.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
bool
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_opt_zero_samples(brw_shader &s)
{
   /* Trim trailing all-zero parameters from sampler SEND payloads by
    * shrinking the message length (mlen).  The hardware substitutes zero
    * for the missing trailing parameters, so dropping them saves payload
    * setup and register pressure.  Returns true if any SEND was shrunk.
    */
   bool progress = false;

   foreach_block_and_inst(block, brw_inst, send, s.cfg) {
      /* Only sampler messages are eligible; other SFIDs do not zero-fill
       * missing trailing parameters.
       */
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      /* The payload must be built by the LOAD_PAYLOAD immediately before
       * this SEND so we can inspect the individual sources.
       */
      brw_inst *lp = (brw_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload are actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      /* Walk backwards over the trailing sources, accumulating the byte
       * size of the run of zero (or unset) parameters.  Stops at the first
       * non-zero source.  NOTE(review): assumes params > 0 here — if
       * load_payload_sources_read_for_size() could return 0 the unsigned
       * "params - 1" would wrap; confirm against that helper's contract.
       */
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
      }

      /* Round down to ensure to only consider full registers. */
      const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
      if (zero_len > 0) {
         /* Note mlen is in REG_SIZE units. */
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated. If we find a SEND message with a single payload,
 * we can split that payload in two. This results in smaller contiguous
 * register blocks for us to allocate. But it can help beyond that, too.
 *
 * We try and split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains a x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparitor,
 * or array index, which comes from elsewhere. In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF. So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
bool
brw_opt_split_sends(brw_shader &s)
{
   bool progress = false;

   foreach_block_and_inst(block, brw_inst, send, s.cfg) {
      /* Only unsplit SENDs (ex_mlen == 0) with a VGRF payload larger than
       * one physical register are candidates.
       */
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
          send->src[2].file != VGRF)
         continue;

      /* Currently don't split sends that reuse a previously used payload. */
      brw_inst *lp = (brw_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* The LOAD_PAYLOAD must be the one producing this SEND's payload. */
      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload does it really need.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      /* Emit two new LOAD_PAYLOADs before the original; the header (if any)
       * stays with the first half.
       */
      const brw_builder ibld(&s, block, lp);
      brw_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      brw_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      /* Give each half its own freshly allocated destination VGRF. */
      lp1->dst = retype(brw_allocate_vgrf_units(s, lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = retype(brw_allocate_vgrf_units(s, lp2->size_written / REG_SIZE), lp2->dst.type);

      /* Rewrite the SEND as a split send: src[2]/mlen for the first half,
       * src[3]/ex_mlen for the second.  The original LOAD_PAYLOAD becomes
       * dead and is expected to be cleaned up by dead code elimination.
       */
      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
                            BRW_DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Remove redundant or useless halts.
|
|
|
|
|
*
|
|
|
|
|
* For example, we can eliminate halts in the following sequence:
|
|
|
|
|
*
|
|
|
|
|
* halt (redundant with the next halt)
|
|
|
|
|
* halt (useless; jumps to the next instruction)
|
|
|
|
|
* halt-target
|
|
|
|
|
*/
|
|
|
|
|
bool
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_opt_remove_redundant_halts(brw_shader &s)
|
2024-01-04 17:31:42 -08:00
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
unsigned halt_count = 0;
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *halt_target = NULL;
|
2024-01-04 17:31:42 -08:00
|
|
|
bblock_t *halt_target_block = NULL;
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
|
2024-01-04 17:31:42 -08:00
|
|
|
if (inst->opcode == BRW_OPCODE_HALT)
|
|
|
|
|
halt_count++;
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
|
|
|
|
|
halt_target = inst;
|
|
|
|
|
halt_target_block = block;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!halt_target) {
|
|
|
|
|
assert(halt_count == 0);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Delete any HALTs immediately before the halt target. */
|
2024-12-07 00:23:07 -08:00
|
|
|
for (brw_inst *prev = (brw_inst *) halt_target->prev;
|
2024-01-04 17:31:42 -08:00
|
|
|
!prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
|
2024-12-07 00:23:07 -08:00
|
|
|
prev = (brw_inst *) halt_target->prev) {
|
2024-01-04 17:31:42 -08:00
|
|
|
prev->remove(halt_target_block);
|
|
|
|
|
halt_count--;
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (halt_count == 0) {
|
|
|
|
|
halt_target->remove(halt_target_block);
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2024-12-06 20:52:05 -08:00
|
|
|
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow. We could probably do better here with some form of divergence
 * analysis.
 */
bool
brw_opt_eliminate_find_live_channel(brw_shader &s)
{
   bool progress = false;
   /* Structured control-flow nesting depth; FIND_LIVE_CHANNEL can only be
    * folded to channel 0 when depth == 0 (uniform control flow).
    */
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            /* Uniform control flow: channel 0 is guaranteed live, so the
             * result is simply the immediate 0.
             */
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->force_writemask_all = true;

            /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
             * size_written set by hand to a smaller value. In this case,
             * munge the exec_size to match.
             */
            if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
               inst->exec_size = 8 * reg_unit(s.devinfo);

            inst->resize_sources(1);
            progress = true;

            /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
             * with a BROADCAST. Save some work for opt_copy_propagation
             * and opt_algebraic by trivially cleaning up both together.
             */
            assert(!inst->next->is_tail_sentinel());
            brw_inst *bcast = (brw_inst *) inst->next;

            /* Ignore stride when comparing */
            if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                inst->dst.file == VGRF &&
                inst->dst.file == bcast->src[1].file &&
                inst->dst.nr == bcast->src[1].nr &&
                inst->dst.offset == bcast->src[1].offset) {
               /* The BROADCAST's channel index is the value we just turned
                * into 0, so it degenerates to a MOV of component 0.
                */
               bcast->opcode = BRW_OPCODE_MOV;
               if (!is_uniform(bcast->src[0]))
                  bcast->src[0] = component(bcast->src[0], 0);
               bcast->force_writemask_all = true;
               bcast->exec_size = 8 * reg_unit(s.devinfo);
               assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
               bcast->resize_sources(1);
            }
         }
         break;

      default:
         break;
      }
   }

 out:
   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Rounding modes for conversion instructions are included for each
|
|
|
|
|
* conversion, but right now it is a state. So once it is set,
|
|
|
|
|
* we don't need to call it again for subsequent calls.
|
|
|
|
|
*
|
|
|
|
|
* This is useful for vector/matrices conversions, as setting the
|
|
|
|
|
* mode once is enough for the full vector/matrix
|
|
|
|
|
*/
|
|
|
|
|
bool
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_opt_remove_extra_rounding_modes(brw_shader &s)
|
2024-01-04 17:31:42 -08:00
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
unsigned execution_mode = s.nir->info.float_controls_execution_mode;
|
|
|
|
|
|
|
|
|
|
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
|
|
|
|
|
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
|
|
|
|
|
execution_mode)
|
|
|
|
|
base_mode = BRW_RND_MODE_RTNE;
|
|
|
|
|
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
|
|
|
|
|
execution_mode)
|
|
|
|
|
base_mode = BRW_RND_MODE_RTZ;
|
|
|
|
|
|
|
|
|
|
foreach_block (block, s.cfg) {
|
|
|
|
|
brw_rnd_mode prev_mode = base_mode;
|
|
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_inst_in_block_safe (brw_inst, inst, block) {
|
2024-01-04 17:31:42 -08:00
|
|
|
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
|
2024-08-20 11:48:54 -07:00
|
|
|
assert(inst->src[0].file == IMM);
|
2024-01-04 17:31:42 -08:00
|
|
|
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
|
|
|
|
|
if (mode == prev_mode) {
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
progress = true;
|
|
|
|
|
} else {
|
|
|
|
|
prev_mode = mode;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2024-12-06 20:52:05 -08:00
|
|
|
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
|
2024-01-04 17:31:42 -08:00
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
2024-11-20 08:12:52 -08:00
|
|
|
|
|
|
|
|
/* Convert eligible split SENDs into SEND_GATHER, whose payload is described
 * as a list of individual physical-register sources (filled into src[2]
 * after register allocation) instead of two contiguous payload blocks.
 * Xe3+ only.  Returns true if any instruction was converted.
 */
bool
brw_opt_send_to_send_gather(brw_shader &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   /* Opportunities skipped because of INTEL_DEBUG=no-send-gather. */
   unsigned count = 0;

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND)
         continue;

      /* For 1-2 registers, send-gather offers no benefits over split-send. */
      if (inst->mlen + inst->ex_mlen <= 2 * unit)
         continue;

      assert(inst->mlen % unit == 0);
      assert(inst->ex_mlen % unit == 0);

      /* The two payload halves of the split send, with lengths converted
       * from REG_SIZE units to physical registers.
       */
      struct {
         brw_reg src;
         unsigned phys_len;
      } payload[2] = {
         { inst->src[2], inst->mlen / unit },
         { inst->src[3], inst->ex_mlen / unit },
      };

      const unsigned num_payload_sources = payload[0].phys_len + payload[1].phys_len;

      /* Limited by Src0.Length in the SEND instruction. */
      if (num_payload_sources > 15)
         continue;

      if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
         count++;
         continue;
      }

      inst->resize_sources(3 + num_payload_sources);
      /* Sources 0 and 1 remain the same. Source 2 will be filled
       * after register allocation.
       */
      inst->src[2] = {};

      /* Append one source per physical register of each payload half. */
      int idx = 3;
      for (unsigned p = 0; p < ARRAY_SIZE(payload); p++) {
         for (unsigned i = 0; i < payload[p].phys_len; i++) {
            inst->src[idx++] = byte_offset(payload[p].src,
                                           i * reg_unit(devinfo) * REG_SIZE);
         }
      }
      assert(idx == inst->sources);

      /* SEND_GATHER carries no contiguous payload, so both lengths go to
       * zero; the payload is now described by the per-register sources.
       */
      inst->opcode = SHADER_OPCODE_SEND_GATHER;
      inst->mlen = 0;
      inst->ex_mlen = 0;

      progress = true;
   }

   if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
      fprintf(stderr, "Ignored %u opportunities to try SEND_GATHER in %s shader.\n",
              count, _mesa_shader_stage_to_string(s.stage));
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
                            BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW);

   return progress;
}
|
2025-01-14 10:49:51 -08:00
|
|
|
|
|
|
|
|
/* If after optimizations, the sources are *still* contiguous in a
 * SEND_GATHER, prefer to use the regular SEND, which would save
 * having to write the ARF scalar register.
 *
 * Inverse of brw_opt_send_to_send_gather: only fires when every payload
 * source of the SEND_GATHER forms at most two contiguous VGRF spans, so
 * the message can be expressed as an ordinary (split-)SEND again.
 */
bool
brw_opt_send_gather_to_send(brw_shader &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   /* SEND_GATHER only exists on Xe3+ (ver >= 30), where a physical
    * register is two logical GRFs wide.
    */
   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
         continue;

      /* Source 2 is a placeholder filled after register allocation;
       * the actual payload registers start at src[3].
       */
      assert(inst->sources > 2);
      assert(inst->src[2].file == BAD_FILE);

      const int num_payload_sources = inst->sources - 3;
      assert(num_payload_sources > 0);

      /* Limited by Src0.Length in the SEND instruction. */
      assert(num_payload_sources < 16);

      /* Determine whether the sources are still spread in either one or two
       * spans. In those cases the regular SEND instruction can be used
       * and there's no need to use SEND_GATHER (which would set ARF scalar register
       * adding an extra instruction).
       *
       * NOTE(review): the offset comparison below requires each span to
       * start at byte offset 0 of its VGRF; spans at a nonzero base offset
       * fail the match and the instruction is conservatively left alone.
       */
      const brw_reg *payload = &inst->src[3];
      brw_reg payload1 = payload[0];
      brw_reg payload2 = {};   /* BAD_FILE until a second span is seen. */
      int payload1_len = 0;    /* Lengths in physical registers. */
      int payload2_len = 0;

      /* Count how many leading sources continue the first span: same VGRF,
       * offsets advancing by one physical register per source.
       */
      for (int i = 0; i < num_payload_sources; i++) {
         if (payload[i].file == VGRF &&
             payload[i].nr == payload1.nr &&
             payload[i].offset == payload1_len * REG_SIZE * unit)
            payload1_len++;
         else {
            payload2 = payload[i];
            break;
         }
      }

      /* If a second span started, count its contiguous run the same way;
       * otherwise the ex-payload slot becomes null (no second span).
       */
      if (payload2.file == VGRF) {
         for (int i = payload1_len; i < num_payload_sources; i++) {
            if (payload[i].file == VGRF &&
                payload[i].nr == payload2.nr &&
                payload[i].offset == payload2_len * REG_SIZE * unit)
               payload2_len++;
            else
               break;
         }
      } else {
         payload2 = brw_null_reg();
      }

      /* Any leftover sources mean more than two spans (or a non-contiguous
       * layout), so the gather form is still required.
       */
      if (payload1_len + payload2_len != num_payload_sources)
         continue;

      /* Bspec 57058 (r64705) says
       *
       * When a source data payload is used in dataport message, that payload
       * must be specified as Source 1 portion of a Split Send message.
       *
       * But at this point the split point is not guaranteed to respect that.
       *
       * TODO: Pass LSC address length or infer it so valid splits can work.
       */
      if (payload2_len && (inst->sfid == GFX12_SFID_UGM ||
                           inst->sfid == GFX12_SFID_TGM ||
                           inst->sfid == GFX12_SFID_SLM ||
                           inst->sfid == BRW_SFID_URB)) {
         enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
         if (lsc_op_num_data_values(lsc_op) > 0)
            continue;
      }

      /* Rewrite as a regular split-SEND: two payload base registers and
       * their lengths in logical GRFs (hence the * unit scaling).
       */
      inst->resize_sources(4);
      inst->opcode = SHADER_OPCODE_SEND;
      inst->src[2] = payload1;
      inst->src[3] = payload2;
      inst->mlen = payload1_len * unit;
      inst->ex_mlen = payload2_len * unit;

      progress = true;
   }

   if (progress) {
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
                            BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW);
   }

   return progress;
}
|