/*
 * Copyright © 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#define XXH_INLINE_ALL
#include "util/xxhash.h"

#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"

/** @file brw_fs_cse.cpp
 *
 * Support for local and global common subexpression elimination.
 *
 * See Muchnick's Advanced Compiler Design and Implementation, section
 * 13.1 (p378).
 */

using namespace brw;

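/* Two implementations live in this file: a classic block-local pass
 * (brw_fs_opt_cse) built around an "available expressions" list, and a
 * newer defs-based pass (brw_fs_opt_cse_defs) that can also eliminate
 * redundancies across basic blocks.
 */
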
namespace {
struct aeb_entry : public exec_node {
   /** The instruction that generates the expression value. */
   fs_inst *generator;

   /** The temporary where the value is stored. */
   fs_reg tmp;
};
}

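/* Returns true if the instruction computes a pure value: one determined
 * entirely by its sources, so that a second computation of the same
 * expression can safely be replaced with the first result.  Sends qualify
 * only when they have no side effects and are not volatile.
 */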
static bool
is_expression(const fs_visitor *v, const fs_inst *const inst)
{
   switch (inst->opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_SEL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_CSEL:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_MUL:
   case SHADER_OPCODE_MULH:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_LINE:
   case BRW_OPCODE_PLN:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
   case FS_OPCODE_FB_READ_LOGICAL:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
   case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
   case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
   case FS_OPCODE_LOAD_LIVE_CHANNELS:
   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
   case SHADER_OPCODE_GET_BUFFER_SIZE:
   case FS_OPCODE_PACK:
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return true;
   case SHADER_OPCODE_LOAD_PAYLOAD:
      return !is_coalescing_payload(v->alloc, inst);
   default:
      return inst->is_send_from_grf() && !inst->has_side_effects() &&
             !inst->is_volatile();
   }
}

/**
 * True if the instruction should only be CSE'd within its local block.
 */
bool
local_only(const fs_inst *inst)
{
   switch (inst->opcode) {
   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
   case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
   case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
   case FS_OPCODE_LOAD_LIVE_CHANNELS:
      /* These depend on the current channel enables, so the same opcode
       * in another block will likely return a different value.
       */
      return true;
   case BRW_OPCODE_MOV:
      /* Global CSE of MOVs is likely not worthwhile.  It can increase
       * register pressure by extending the lifetime of simple constants.
       */
      return true;
   case SHADER_OPCODE_LOAD_PAYLOAD:
      /* This is basically a MOV */
      return inst->sources == 1;
   case BRW_OPCODE_CMP:
      /* Seems to increase spilling a lot without much benefit */
      return true;
   default:
      return false;
   }
}

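/* Compares the sources of two instructions, honoring commutativity where
 * legal: MAD may swap its two multiplicands, and float MUL may additionally
 * match modulo a sign flip on one source, in which case *negate is set so
 * the caller knows to negate the reused value.
 */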
static bool
operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
{
   fs_reg *xs = a->src;
   fs_reg *ys = b->src;

   if (a->opcode == BRW_OPCODE_MAD) {
      return xs[0].equals(ys[0]) &&
             ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
              (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
   } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_TYPE_F) {
      bool xs0_negate = xs[0].negate;
      bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f
                                          : xs[1].negate;
      bool ys0_negate = ys[0].negate;
      bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f
                                          : ys[1].negate;
      float xs1_imm = xs[1].f;
      float ys1_imm = ys[1].f;

      xs[0].negate = false;
      xs[1].negate = false;
      ys[0].negate = false;
      ys[1].negate = false;
      xs[1].f = fabsf(xs[1].f);
      ys[1].f = fabsf(ys[1].f);

      bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
                 (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));

      xs[0].negate = xs0_negate;
      xs[1].negate = xs[1].file == IMM ? false : xs1_negate;
      ys[0].negate = ys0_negate;
      ys[1].negate = ys[1].file == IMM ? false : ys1_negate;
      xs[1].f = xs1_imm;
      ys[1].f = ys1_imm;

      *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate);
      if (*negate && (a->saturate || b->saturate))
         return false;
      return ret;
   } else if (!a->is_commutative()) {
      bool match = true;
      for (int i = 0; i < a->sources; i++) {
         if (!xs[i].equals(ys[i])) {
            match = false;
            break;
         }
      }
      return match;
   } else {
      return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
             (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
   }
}

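/* Two instructions may be CSE'd only if every field that affects their
 * result matches: opcode, execution size and group, predication, message
 * descriptors for sends, and (via operands_match) their sources.
 */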
static bool
instructions_match(fs_inst *a, fs_inst *b, bool *negate)
{
   return a->opcode == b->opcode &&
          a->exec_size == b->exec_size &&
          a->group == b->group &&
          a->predicate == b->predicate &&
          a->conditional_mod == b->conditional_mod &&
          a->dst.type == b->dst.type &&
          a->offset == b->offset &&
          a->mlen == b->mlen &&
          a->ex_mlen == b->ex_mlen &&
          a->sfid == b->sfid &&
          a->desc == b->desc &&
          a->ex_desc == b->ex_desc &&
          a->size_written == b->size_written &&
          a->check_tdr == b->check_tdr &&
          a->header_size == b->header_size &&
          a->target == b->target &&
          a->sources == b->sources &&
          a->bits == b->bits &&
          operands_match(a, b, negate);
}

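/* Emits a copy from the temporary holding a previously-computed value into
 * inst's original destination: header sources of a LOAD_PAYLOAD are copied
 * register by register, multi-register values are rebuilt with a fresh
 * LOAD_PAYLOAD, and everything else becomes a plain (optionally negated) MOV.
 */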
static void
create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
{
   unsigned written = regs_written(inst);
   unsigned dst_width =
      DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
   fs_inst *copy;

   if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
      assert(src.file == VGRF);
      fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg,
                                     inst->sources);
      for (int i = 0; i < inst->header_size; i++) {
         payload[i] = src;
         src.offset += REG_SIZE;
      }
      for (int i = inst->header_size; i < inst->sources; i++) {
         src.type = inst->src[i].type;
         payload[i] = src;
         src = offset(src, bld, 1);
      }
      copy = bld.LOAD_PAYLOAD(inst->dst, payload, inst->sources,
                              inst->header_size);
   } else if (written != dst_width) {
      assert(src.file == VGRF);
      assert(written % dst_width == 0);
      const int sources = written / dst_width;
      fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
      for (int i = 0; i < sources; i++) {
         payload[i] = src;
         src = offset(src, bld, 1);
      }
      copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, 0);
   } else {
      copy = bld.MOV(inst->dst, src);
      copy->group = inst->group;
      copy->force_writemask_all = inst->force_writemask_all;
      copy->src[0].negate = negate;
   }
   assert(regs_written(copy) == written);
}

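/* Runs classic available-expressions CSE over a single basic block: each
 * redundant computation is replaced with a copy from its generator's
 * temporary, and AEB entries are invalidated whenever the flags or the
 * registers they read are overwritten.
 */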
static bool
brw_fs_opt_cse_local(fs_visitor &s, const fs_live_variables &live, bblock_t *block, int &ip)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;
   exec_list aeb;

   void *cse_ctx = ralloc_context(NULL);

   foreach_inst_in_block(fs_inst, inst, block) {
      /* Skip some cases. */
      if (is_expression(&s, inst) && !inst->is_partial_write() &&
          ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
           inst->dst.is_null()))
      {
         bool found = false;
         bool negate = false;

         foreach_in_list_use_after(aeb_entry, entry, &aeb) {
            /* Match current instruction's expression against those in AEB. */
            if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
                instructions_match(inst, entry->generator, &negate)) {
               found = true;
               progress = true;
               break;
            }
         }

         if (!found) {
            if (inst->opcode != BRW_OPCODE_MOV ||
                (inst->opcode == BRW_OPCODE_MOV &&
                 inst->src[0].file == IMM &&
                 inst->src[0].type == BRW_TYPE_VF)) {
               /* Our first sighting of this expression.  Create an entry. */
               aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
               entry->tmp = reg_undef;
               entry->generator = inst;
               aeb.push_tail(entry);
            }
         } else {
            /* This is at least our second sighting of this expression.
             * If we don't have a temporary already, make one.
             */
            bool no_existing_temp = entry->tmp.file == BAD_FILE;
            if (no_existing_temp && !entry->generator->dst.is_null()) {
               const fs_builder ibld = fs_builder(&s, block, entry->generator)
                                       .at(block, entry->generator->next);
               int written = regs_written(entry->generator);

               entry->tmp = fs_reg(VGRF, s.alloc.allocate(written),
                                   entry->generator->dst.type);

               create_copy_instr(ibld, entry->generator, entry->tmp, false);

               entry->generator->dst = entry->tmp;
            }

            /* dest <- temp */
            if (!inst->dst.is_null()) {
               assert(inst->size_written == entry->generator->size_written);
               assert(inst->dst.type == entry->tmp.type);
               const fs_builder ibld(&s, block, inst);

               create_copy_instr(ibld, inst, entry->tmp, negate);
            }

            /* Set our iterator so that next time through the loop inst->next
             * will get the instruction in the basic block after the one we've
             * removed.
             */
            fs_inst *prev = (fs_inst *)inst->prev;

            inst->remove(block);
            inst = prev;
         }
      }

      /* Discard jumps aren't represented in the CFG unfortunately, so we need
       * to make sure that they behave as a CSE barrier, since we lack global
       * dataflow information.  This is particularly likely to cause problems
       * with instructions dependent on the current execution mask like
       * SHADER_OPCODE_FIND_LIVE_CHANNEL.
       */
      if (inst->opcode == BRW_OPCODE_HALT ||
          inst->opcode == SHADER_OPCODE_HALT_TARGET)
         aeb.make_empty();

      foreach_in_list_safe(aeb_entry, entry, &aeb) {
         /* Kill all AEB entries that write a different value to or read from
          * the flag register if we just wrote it.
          */
         if (inst->flags_written(devinfo)) {
            bool negate; /* dummy */
            if (entry->generator->flags_read(devinfo) ||
                (entry->generator->flags_written(devinfo) &&
                 !instructions_match(inst, entry->generator, &negate))) {
               entry->remove();
               ralloc_free(entry);
               continue;
            }
         }

         for (int i = 0; i < entry->generator->sources; i++) {
            fs_reg *src_reg = &entry->generator->src[i];

            /* Kill all AEB entries that use the destination we just
             * overwrote.
             */
            if (regions_overlap(inst->dst, inst->size_written,
                                entry->generator->src[i],
                                entry->generator->size_read(i))) {
               entry->remove();
               ralloc_free(entry);
               break;
            }

            /* Kill any AEB entries using registers that don't get reused any
             * more -- a sure sign they'll fail operands_match().
             */
            if (src_reg->file == VGRF && live.vgrf_end[src_reg->nr] < ip) {
               entry->remove();
               ralloc_free(entry);
               break;
            }
         }
      }

      ip++;
   }

   ralloc_free(cse_ctx);

   return progress;
}

bool
brw_fs_opt_cse(fs_visitor &s)
{
   const fs_live_variables &live = s.live_analysis.require();
   bool progress = false;
   int ip = 0;

   foreach_block (block, s.cfg) {
      progress = brw_fs_opt_cse_local(s, live, block, ip) || progress;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

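/* Both entry points return true when they changed the program, so that a
 * driver-side optimization loop (hypothetical sketch below) can keep
 * iterating dependent passes until no further progress is made:
 *
 *    bool progress;
 *    do {
 *       progress = brw_fs_opt_cse_defs(s);
 *       // ... run other passes that may expose new CSE opportunities ...
 *    } while (progress);
 */
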
/* -------------------------------------------------------------------- */

/* What follows is a newer, global CSE pass that works on SSA defs.  It has
 * a number of advantages compared to the older pass above:
 *
 * - It can easily perform either global or block-local CSE, without needing
 *   to roll any dataflow analysis, thanks to SSA def analysis.  Global CSE
 *   is able to detect and coalesce memory loads across blocks.  Although it
 *   may increase spilling a little, the reduction in memory loads seems to
 *   more than compensate.
 *
 * - Because SSA guarantees that values are never written more than once,
 *   this pass can directly reuse an existing value.  The old pass emitted
 *   copies at the point where it discovered a value because it had no idea
 *   whether it'd be mutated later.  That generated a ton of trash for copy
 *   propagation to clean up, and a nasty fragility where CSE, register
 *   coalescing, and copy propagation could fight one another by generating
 *   and cleaning up copies, leading to infinite optimization loops unless
 *   we were really careful.
 *
 * - It uses hash tables like nir_instr_set and nir_opt_cse, instead of
 *   linearly walking lists and comparing each element, which is much more
 *   CPU efficient.
 *
 * - It doesn't use liveness analysis, one of the most expensive analysis
 *   passes we have.  Def analysis is cheaper.
 *
 * In addition to CSE'ing SSA values, it still handles flag writes, as these
 * are a huge source of CSE'able values.  Those remain block-local, tracked
 * as simply the last flag write in the block.  The only real downside is
 * that, because the backend is currently only partially SSA, the pass has
 * limited visibility and can't see all values.
 */

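/* Expression hashing for the defs-based pass.  It deliberately mirrors
 * instructions_match(): every field that must compare equal is folded into
 * the hash, and commutative source pairs are combined by multiplying their
 * individual hashes so that operand order cannot affect the result.
 */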
#define HASH(hash, data) XXH32(&(data), sizeof(data), hash)

uint32_t
hash_reg(uint32_t hash, const fs_reg &r)
{
   struct {
      uint64_t u64;
      uint32_t u32;
      uint16_t u16a;
      uint16_t u16b;
   } data = {
      .u64 = r.u64, .u32 = r.bits, .u16a = r.offset, .u16b = r.stride
   };
   STATIC_ASSERT(sizeof(data) == 16); /* ensure there's no padding */
   hash = HASH(hash, data);
   return hash;
}

static uint32_t
hash_inst(const void *v)
{
   const fs_inst *inst = static_cast<const fs_inst *>(v);
   uint32_t hash = 0;

   /* Skip dst - that would make nothing ever match */

   /* Skip ir and annotation - we don't care for equivalency purposes. */

   const uint8_t u8data[] = {
      inst->sources,
      inst->exec_size,
      inst->group,
      inst->mlen,
      inst->ex_mlen,
      inst->sfid,
      inst->header_size,
      inst->target,

      inst->conditional_mod,
      inst->predicate,
   };
   const uint32_t u32data[] = {
      inst->desc,
      inst->ex_desc,
      inst->offset,
      inst->size_written,
      inst->opcode,
      inst->bits,
   };

   hash = HASH(hash, u8data);
   hash = HASH(hash, u32data);

   /* Skip hashing sched - we shouldn't be CSE'ing after that SWSB */

   if (inst->opcode == BRW_OPCODE_MAD) {
      /* Commutatively combine the hashes for the multiplicands */
      hash = hash_reg(hash, inst->src[0]);
      uint32_t hash1 = hash_reg(hash, inst->src[1]);
      uint32_t hash2 = hash_reg(hash, inst->src[2]);
      hash = hash1 * hash2;
   } else if (inst->opcode == BRW_OPCODE_MUL &&
              inst->dst.type == BRW_TYPE_F) {
      /* Canonicalize negations on either source (or both) and commutatively
       * combine the hashes for both sources.
       */
      fs_reg src[2] = { inst->src[0], inst->src[1] };
      uint32_t src_hash[2];

      for (int i = 0; i < 2; i++) {
         src[i].negate = false;
         if (src[i].file == IMM)
            src[i].f = fabs(src[i].f);

         src_hash[i] = hash_reg(hash, src[i]);
      }

      hash = src_hash[0] * src_hash[1];
   } else if (inst->is_commutative()) {
      /* Commutatively combine both sources */
      uint32_t hash0 = hash_reg(hash, inst->src[0]);
      uint32_t hash1 = hash_reg(hash, inst->src[1]);
      hash = hash0 * hash1;
   } else {
      /* Just hash all the sources */
      for (int i = 0; i < inst->sources; i++)
         hash = hash_reg(hash, inst->src[i]);
   }

   return hash;
}

/* -------------------------------------------------------------------- */

static bool
cmp_func(const void *data1, const void *data2)
{
   bool negate;
   return instructions_match((fs_inst *) data1, (fs_inst *) data2, &negate);
}

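/* remap_table maps the VGRF number of each eliminated def to the surviving
 * def that replaces it.  When the REMAP_NEGATE bit is set, the replacement
 * must additionally be negated, either with a source modifier or, if the
 * consumer can't take one, with an explicit negating MOV after the def.
 */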
/* We set bit 31 in remap_table entries if the value needs to be negated. */
#define REMAP_NEGATE (0x80000000u)

static void
remap_sources(fs_visitor &s, const brw::def_analysis &defs,
              fs_inst *inst, unsigned *remap_table)
{
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == VGRF &&
          inst->src[i].nr < defs.count() &&
          remap_table[inst->src[i].nr] != ~0u) {
         const unsigned old_nr = inst->src[i].nr;
         unsigned new_nr = remap_table[old_nr];
         const bool need_negate = new_nr & REMAP_NEGATE;
         new_nr &= ~REMAP_NEGATE;
         inst->src[i].nr = new_nr;

         if (need_negate) {
            if ((inst->src[i].type != BRW_TYPE_F &&
                 !inst->can_change_types()) ||
                !inst->can_do_source_mods(s.devinfo)) {
               /* We can't use the negate directly, resolve it just after the
                * def and use that for any future uses.
                */
               fs_inst *def = defs.get(inst->src[i]);
               bblock_t *def_block = defs.get_block(inst->src[i]);
               const fs_builder dbld =
                  fs_builder(&s, def_block, def).at(def_block, def->next);

               /* Resolve any deferred block IP changes before inserting */
               if (def_block->end_ip_delta)
                  s.cfg->adjust_block_ips();

               fs_reg neg(VGRF, new_nr, BRW_TYPE_F);
               fs_reg tmp = dbld.MOV(negate(neg));
               inst->src[i].nr = tmp.nr;
               remap_table[old_nr] = tmp.nr;
            } else {
               inst->src[i].negate = !inst->src[i].negate;
               inst->src[i].type = BRW_TYPE_F;
            }
         }
      }
   }
}

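/* The defs-based CSE entry point.  Candidate expressions are interned in a
 * hash set; when a match that dominates the current instruction is found,
 * the instruction is deleted outright and remap_table records that later
 * uses of its def should read the matching def instead (possibly negated).
 */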
bool
brw_fs_opt_cse_defs(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   const idom_tree &idom = s.idom_analysis.require();
   const brw::def_analysis &defs = s.def_analysis.require();
   bool progress = false;
   bool need_remaps = false;

   unsigned *remap_table = new unsigned[defs.count()];
   memset(remap_table, ~0u, defs.count() * sizeof(unsigned));
   struct set *set = _mesa_set_create(NULL, NULL, cmp_func);

   foreach_block(block, s.cfg) {
      fs_inst *last_flag_write = NULL;
      fs_inst *last = NULL;

      foreach_inst_in_block_safe(fs_inst, inst, block) {
         if (need_remaps)
            remap_sources(s, defs, inst, remap_table);

         /* Updating last_flag_written should be at the bottom of the loop,
          * but doing it this way lets us use "continue" more easily.
          */
         if (last && last->flags_written(devinfo))
            last_flag_write = last;
         last = inst;

         if (inst->dst.is_null()) {
            bool ignored;
            if (last_flag_write && !inst->writes_accumulator &&
                instructions_match(last_flag_write, inst, &ignored)) {
               /* This instruction has no destination but has a flag write
                * which is redundant with the previous flag write in our
                * basic block.  So we can simply remove it.
                */
               inst->remove(block, true);
               last = NULL;
               progress = true;
            }
         } else if (is_expression(&s, inst) && defs.get(inst->dst)) {
            assert(!inst->writes_accumulator);
            assert(!inst->reads_accumulator_implicitly());

            uint32_t hash = hash_inst(inst);
            if (inst->flags_read(devinfo)) {
               hash = last_flag_write ? HASH(hash, last_flag_write)
                                      : HASH(hash, block);
            }

            struct set_entry *e =
               _mesa_set_search_or_add_pre_hashed(set, hash, inst, NULL);
            if (!e) goto out; /* out of memory error */
            fs_inst *match = (fs_inst *) e->key;

            /* If there was no match, move on */
            if (match == inst)
               continue;

            bblock_t *def_block = defs.get_block(match->dst);
            if (block != def_block && (local_only(inst) ||
                                       !idom.dominates(def_block, block))) {
               /* If `match` doesn't dominate `inst` then remove it from
                * the set and add `inst` instead so future lookups see that.
                */
               e->key = inst;
               continue;
            }

            /* We can replace inst with match or negate(match). */
            bool negate = false;
            if (inst->opcode == BRW_OPCODE_MUL &&
                inst->dst.type == BRW_TYPE_F) {
               /* Determine whether inst is actually negate(match) */
               bool ops_must_match = operands_match(inst, match, &negate);
               assert(ops_must_match);
            }

            progress = true;
            need_remaps = true;
            remap_table[inst->dst.nr] =
               match->dst.nr | (negate ? REMAP_NEGATE : 0);

            inst->remove(block, true);
         }
      }
   }

out:
   delete [] remap_table;
   _mesa_set_destroy(set, NULL);

   if (progress) {
      s.cfg->adjust_block_ips();
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                            DEPENDENCY_INSTRUCTION_DETAIL);
   }

   return progress;
}

#undef HASH