/*
 * Copyright © 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#define XXH_INLINE_ALL
#include "util/xxhash.h"

#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"

/** @file brw_fs_cse.cpp
 *
 * Support for SSA-based global Common Subexpression Elimination (CSE).
 */
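
/* Design notes, condensed from the commit that introduced this pass:
 *
 * Working on SSA defs lets the pass do either global or block-local CSE
 * without hand-rolled dataflow analysis, and lets it reuse an existing
 * value directly instead of emitting copies for copy propagation to
 * clean up later.  Matching uses hash tables (in the style of
 * nir_instr_set and nir_opt_cse) rather than linear list walks, and def
 * analysis is much cheaper than liveness analysis.
 *
 * Flag writes are still CSE'd, but only block-locally, by tracking the
 * last flag write in the current block.
 */
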
using namespace brw;
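
/* Whether `inst` computes a pure value from its sources and is therefore a
 * candidate for CSE.  Sends qualify only if they come from a GRF, have no
 * side effects, and aren't volatile; a LOAD_PAYLOAD qualifies unless it's
 * just coalescing into its destination.
 */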
static bool
is_expression(const fs_visitor *v, const fs_inst *const inst)
{
   switch (inst->opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_SEL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_CSEL:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_MUL:
   case SHADER_OPCODE_MULH:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_ADD3:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_LINE:
   case BRW_OPCODE_PLN:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
   case FS_OPCODE_FB_READ_LOGICAL:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
   case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
   case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
   case FS_OPCODE_LOAD_LIVE_CHANNELS:
   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
   case SHADER_OPCODE_GET_BUFFER_SIZE:
   case FS_OPCODE_PACK:
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return true;
   case SHADER_OPCODE_LOAD_PAYLOAD:
      return !is_coalescing_payload(v->alloc, inst);
   default:
      return inst->is_send_from_grf() && !inst->has_side_effects() &&
             !inst->is_volatile();
   }
}

/**
 * True if the instruction should only be CSE'd within its local block.
 */
bool
local_only(const fs_inst *inst)
{
   switch (inst->opcode) {
   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
   case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
   case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
   case FS_OPCODE_LOAD_LIVE_CHANNELS:
      /* These depend on the current channel enables, so the same opcode
       * in another block will likely return a different value.
       */
      return true;
   case BRW_OPCODE_MOV:
      /* Global CSE of MOVs is likely not worthwhile. It can increase
       * register pressure by extending the lifetime of simple constants.
       */
      return true;
   case SHADER_OPCODE_LOAD_PAYLOAD:
      /* This is basically a MOV */
      return inst->sources == 1;
   case BRW_OPCODE_CMP:
      /* Seems to increase spilling a lot without much benefit */
      return true;
   default:
      return false;
   }
}
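
/* Source comparison for CSE, honoring commutativity.  For float MUL,
 * negations are temporarily stripped so that a*b can match (-a)*b;
 * *negate is set when the two products differ only in sign (and such
 * matches are rejected if either instruction saturates).
 */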
static bool
operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
{
   fs_reg *xs = a->src;
   fs_reg *ys = b->src;

   if (a->opcode == BRW_OPCODE_MAD) {
      return xs[0].equals(ys[0]) &&
             ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
              (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
   } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_TYPE_F) {
      bool xs0_negate = xs[0].negate;
      bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f
                                          : xs[1].negate;
      bool ys0_negate = ys[0].negate;
      bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f
                                          : ys[1].negate;
      float xs1_imm = xs[1].f;
      float ys1_imm = ys[1].f;

      xs[0].negate = false;
      xs[1].negate = false;
      ys[0].negate = false;
      ys[1].negate = false;
      xs[1].f = fabsf(xs[1].f);
      ys[1].f = fabsf(ys[1].f);

      bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
                 (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));

      xs[0].negate = xs0_negate;
      xs[1].negate = xs[1].file == IMM ? false : xs1_negate;
      ys[0].negate = ys0_negate;
      ys[1].negate = ys[1].file == IMM ? false : ys1_negate;
      xs[1].f = xs1_imm;
      ys[1].f = ys1_imm;

      *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate);
      if (*negate && (a->saturate || b->saturate))
         return false;
      return ret;
   } else if (!a->is_commutative()) {
      bool match = true;
      for (int i = 0; i < a->sources; i++) {
         if (!xs[i].equals(ys[i])) {
            match = false;
            break;
         }
      }
      return match;
   } else if (a->sources == 3) {
      return (xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
             (xs[0].equals(ys[0]) && xs[1].equals(ys[2]) && xs[2].equals(ys[1])) ||
             (xs[0].equals(ys[1]) && xs[1].equals(ys[0]) && xs[2].equals(ys[2])) ||
             (xs[0].equals(ys[1]) && xs[1].equals(ys[2]) && xs[2].equals(ys[0])) ||
             (xs[0].equals(ys[2]) && xs[1].equals(ys[0]) && xs[2].equals(ys[1])) ||
             (xs[0].equals(ys[2]) && xs[1].equals(ys[1]) && xs[2].equals(ys[0]));
   } else {
      return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
             (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
   }
}
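
/* Full equivalence check for CSE: opcode, execution parameters, message
 * descriptors, and sources (via operands_match, which also reports whether
 * the two values differ only by negation).
 */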
static bool
instructions_match(fs_inst *a, fs_inst *b, bool *negate)
{
   return a->opcode == b->opcode &&
          a->exec_size == b->exec_size &&
          a->group == b->group &&
          a->predicate == b->predicate &&
          a->conditional_mod == b->conditional_mod &&
          a->dst.type == b->dst.type &&
          a->offset == b->offset &&
          a->mlen == b->mlen &&
          a->ex_mlen == b->ex_mlen &&
          a->sfid == b->sfid &&
          a->desc == b->desc &&
          a->ex_desc == b->ex_desc &&
          a->size_written == b->size_written &&
          a->check_tdr == b->check_tdr &&
          a->header_size == b->header_size &&
          a->target == b->target &&
          a->sources == b->sources &&
          a->bits == b->bits &&
          operands_match(a, b, negate);
}

/* -------------------------------------------------------------------- */

#define HASH(hash, data) XXH32(&(data), sizeof(data), hash)
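
/* Mix an fs_reg into the running hash.  The local struct packs the
 * register's bits into exactly 16 bytes (checked by the STATIC_ASSERT)
 * so that XXH32 never hashes uninitialized padding.
 */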
uint32_t
hash_reg(uint32_t hash, const fs_reg &r)
{
   struct {
      uint64_t u64;
      uint32_t u32;
      uint16_t u16a;
      uint16_t u16b;
   } data = {
      .u64 = r.u64, .u32 = r.bits, .u16a = r.offset, .u16b = r.stride
   };
   STATIC_ASSERT(sizeof(data) == 16); /* ensure there's no padding */
   hash = HASH(hash, data);
   return hash;
}
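
/* Hash an instruction for the CSE set.  Only fields that
 * instructions_match() also compares are mixed in, so matching
 * instructions always land in the same bucket.  Commutative sources are
 * combined by multiplying their hashes, which is order-independent.
 */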
static uint32_t
hash_inst(const void *v)
{
   const fs_inst *inst = static_cast<const fs_inst *>(v);
   uint32_t hash = 0;

   /* Skip dst - that would make nothing ever match */

   /* Skip ir and annotation - we don't care for equivalency purposes. */

   const uint8_t u8data[] = {
      inst->sources,
      inst->exec_size,
      inst->group,
      inst->mlen,
      inst->ex_mlen,
      inst->sfid,
      inst->header_size,
      inst->target,

      inst->conditional_mod,
      inst->predicate,
   };
   const uint32_t u32data[] = {
      inst->desc,
      inst->ex_desc,
      inst->offset,
      inst->size_written,
      inst->opcode,
      inst->bits,
   };

   hash = HASH(hash, u8data);
   hash = HASH(hash, u32data);

   /* Skip hashing sched - we shouldn't be CSE'ing after that SWSB */

   if (inst->opcode == BRW_OPCODE_MAD) {
      /* Commutatively combine the hashes for the multiplicands */
      hash = hash_reg(hash, inst->src[0]);
      uint32_t hash1 = hash_reg(hash, inst->src[1]);
      uint32_t hash2 = hash_reg(hash, inst->src[2]);
      hash = hash1 * hash2;
   } else if (inst->opcode == BRW_OPCODE_MUL &&
              inst->dst.type == BRW_TYPE_F) {
      /* Canonicalize negations on either source (or both) and commutatively
       * combine the hashes for both sources.
       */
      fs_reg src[2] = { inst->src[0], inst->src[1] };
      uint32_t src_hash[2];

      for (int i = 0; i < 2; i++) {
         src[i].negate = false;
         if (src[i].file == IMM)
            src[i].f = fabs(src[i].f);

         src_hash[i] = hash_reg(hash, src[i]);
      }

      hash = src_hash[0] * src_hash[1];
   } else if (inst->is_commutative()) {
      /* Commutatively combine the sources */
      uint32_t hash0 = hash_reg(hash, inst->src[0]);
      uint32_t hash1 = hash_reg(hash, inst->src[1]);
      uint32_t hash2 = inst->sources > 2 ? hash_reg(hash, inst->src[2]) : 1;
      hash = hash0 * hash1 * hash2;
   } else {
      /* Just hash all the sources */
      for (int i = 0; i < inst->sources; i++)
         hash = hash_reg(hash, inst->src[i]);
   }

   return hash;
}

/* -------------------------------------------------------------------- */
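
/* Key-equality callback for the _mesa_set; the negate result is discarded
 * here and recomputed by the caller when it actually matters.
 */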
static bool
cmp_func(const void *data1, const void *data2)
{
   bool negate;
   return instructions_match((fs_inst *) data1, (fs_inst *) data2, &negate);
}

/* We set bit 31 in remap_table entries if it needs to be negated. */
#define REMAP_NEGATE (0x80000000u)
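
/* Rewrite the VGRF sources of `inst` through remap_table.  When a remap
 * carries REMAP_NEGATE but the source modifier can't express the negation,
 * a negating MOV is emitted right after the def and reused from then on.
 */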
static void
remap_sources(fs_visitor &s, const brw::def_analysis &defs,
              fs_inst *inst, unsigned *remap_table)
{
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == VGRF &&
          inst->src[i].nr < defs.count() &&
          remap_table[inst->src[i].nr] != ~0u) {
         const unsigned old_nr = inst->src[i].nr;
         unsigned new_nr = remap_table[old_nr];
         const bool need_negate = new_nr & REMAP_NEGATE;
         new_nr &= ~REMAP_NEGATE;
         inst->src[i].nr = new_nr;

         if (need_negate) {
            if ((inst->src[i].type != BRW_TYPE_F &&
                 !inst->can_change_types()) ||
                !inst->can_do_source_mods(s.devinfo)) {
               /* We can't use the negate directly, resolve it just after the
                * def and use that for any future uses.
                */
               fs_inst *def = defs.get(inst->src[i]);
               bblock_t *def_block = defs.get_block(inst->src[i]);
               const fs_builder dbld =
                  fs_builder(&s, def_block, def).at(def_block, def->next);

               /* Resolve any deferred block IP changes before inserting */
               if (def_block->end_ip_delta)
                  s.cfg->adjust_block_ips();

               fs_reg neg(VGRF, new_nr, BRW_TYPE_F);
               fs_reg tmp = dbld.MOV(negate(neg));
               inst->src[i].nr = tmp.nr;
               remap_table[old_nr] = tmp.nr;
            } else {
               inst->src[i].negate = !inst->src[i].negate;
               inst->src[i].type = BRW_TYPE_F;
            }
         }
      }
   }
}
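
/* The pass itself: walk the CFG in order, hashing every eligible
 * expression into a set.  A later instruction is dropped in favor of an
 * earlier match only when the match's def dominates it; the replaced def
 * is recorded in remap_table and its uses are rewritten as they are
 * reached.  Redundant flag writes are handled block-locally, e.g. (from
 * the original CMP(N) commit message):
 *
 *    cmp.ge.f0(8) null g45<8,8,1>F 0F
 *    (+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
 *    cmp.ge.f0(8) null g45<8,8,1>F 0F
 *    (+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
 *
 * becomes a single cmp followed by both sels.
 */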
bool
brw_fs_opt_cse_defs(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   const idom_tree &idom = s.idom_analysis.require();
   const brw::def_analysis &defs = s.def_analysis.require();
   bool progress = false;
   bool need_remaps = false;

   unsigned *remap_table = new unsigned[defs.count()];
   memset(remap_table, ~0u, defs.count() * sizeof(int));
   struct set *set = _mesa_set_create(NULL, NULL, cmp_func);

   foreach_block(block, s.cfg) {
      fs_inst *last_flag_write = NULL;
      fs_inst *last = NULL;

      foreach_inst_in_block_safe(fs_inst, inst, block) {
         if (need_remaps)
            remap_sources(s, defs, inst, remap_table);

         /* Updating last_flag_written should be at the bottom of the loop,
          * but doing it this way lets us use "continue" more easily.
          */
         if (last && last->flags_written(devinfo))
            last_flag_write = last;
         last = inst;

         if (inst->dst.is_null()) {
            bool ignored;
            if (last_flag_write && !inst->writes_accumulator &&
                instructions_match(last_flag_write, inst, &ignored)) {
               /* This instruction has no destination but has a flag write
                * which is redundant with the previous flag write in our
                * basic block. So we can simply remove it.
                */
               inst->remove(block, true);
               last = NULL;
               progress = true;
            }
         } else if (is_expression(&s, inst) && defs.get(inst->dst)) {
            assert(!inst->writes_accumulator);
            assert(!inst->reads_accumulator_implicitly());

            uint32_t hash = hash_inst(inst);
            if (inst->flags_read(devinfo)) {
               hash = last_flag_write ? HASH(hash, last_flag_write)
                                      : HASH(hash, block);
            }

            struct set_entry *e =
               _mesa_set_search_or_add_pre_hashed(set, hash, inst, NULL);
            if (!e) goto out; /* out of memory error */
            fs_inst *match = (fs_inst *) e->key;

            /* If there was no match, move on */
            if (match == inst)
               continue;

            bblock_t *def_block = defs.get_block(match->dst);
            if (block != def_block && (local_only(inst) ||
                                       !idom.dominates(def_block, block))) {
               /* If `match` doesn't dominate `inst` then remove it from
                * the set and add `inst` instead so future lookups see that.
                */
               e->key = inst;
               continue;
            }

            /* We can replace inst with match or negate(match). */
            bool negate = false;
            if (inst->opcode == BRW_OPCODE_MUL &&
                inst->dst.type == BRW_TYPE_F) {
               /* Determine whether inst is actually negate(match) */
               bool ops_must_match = operands_match(inst, match, &negate);
               assert(ops_must_match);
            }

            progress = true;
            need_remaps = true;
            remap_table[inst->dst.nr] =
               match->dst.nr | (negate ? REMAP_NEGATE : 0);

            inst->remove(block, true);
         }
      }
   }

out:
   delete [] remap_table;
   _mesa_set_destroy(set, NULL);

   if (progress) {
      s.cfg->adjust_block_ips();
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                            DEPENDENCY_INSTRUCTION_DETAIL);
   }

   return progress;
}

#undef HASH