2012-05-10 16:10:15 -07:00
|
|
|
/*
 * Copyright © 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
|
|
|
|
|
|
|
|
|
|
#include "brw_fs.h"
|
2012-10-03 13:03:12 -07:00
|
|
|
#include "brw_cfg.h"
|
2012-05-10 16:10:15 -07:00
|
|
|
|
|
|
|
|
/** @file brw_fs_cse.cpp
 *
 * Support for local common subexpression elimination.
 *
 * See Muchnick's Advanced Compiler Design and Implementation, section
 * 13.1 (p378).
 */
|
|
|
|
|
|
2015-06-04 16:13:35 +03:00
|
|
|
using namespace brw;
|
|
|
|
|
|
2012-05-10 16:10:15 -07:00
|
|
|
namespace {
|
|
|
|
|
struct aeb_entry : public exec_node {
|
|
|
|
|
/** The instruction that generates the expression value. */
|
|
|
|
|
fs_inst *generator;
|
|
|
|
|
|
|
|
|
|
/** The temporary where the value is stored. */
|
|
|
|
|
fs_reg tmp;
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
|
2014-03-30 12:41:55 -07:00
|
|
|
static bool
|
2015-04-01 15:38:23 -07:00
|
|
|
is_expression(const fs_visitor *v, const fs_inst *const inst)
|
2012-05-10 16:10:15 -07:00
|
|
|
{
|
|
|
|
|
switch (inst->opcode) {
|
2014-04-03 14:29:30 -07:00
|
|
|
case BRW_OPCODE_MOV:
|
2012-05-10 16:10:15 -07:00
|
|
|
case BRW_OPCODE_SEL:
|
|
|
|
|
case BRW_OPCODE_NOT:
|
|
|
|
|
case BRW_OPCODE_AND:
|
|
|
|
|
case BRW_OPCODE_OR:
|
|
|
|
|
case BRW_OPCODE_XOR:
|
|
|
|
|
case BRW_OPCODE_SHR:
|
|
|
|
|
case BRW_OPCODE_SHL:
|
|
|
|
|
case BRW_OPCODE_ASR:
|
i965/fs: Perform CSE on CMP(N) instructions.
Optimizes
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
into
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
total instructions in shared programs: 1644938 -> 1638181 (-0.41%)
instructions in affected programs: 574955 -> 568198 (-1.18%)
Two more 16-wide programs (in L4D2). Some large (-9%) decreases in
instruction count in some of Valve's Source Engine games. No
regressions.
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Paul Berry <stereotype441@gmail.com>
2013-10-20 11:38:17 -07:00
|
|
|
case BRW_OPCODE_CMP:
|
|
|
|
|
case BRW_OPCODE_CMPN:
|
2012-05-10 16:10:15 -07:00
|
|
|
case BRW_OPCODE_ADD:
|
|
|
|
|
case BRW_OPCODE_MUL:
|
2015-08-04 19:04:55 +03:00
|
|
|
case SHADER_OPCODE_MULH:
|
2012-05-10 16:10:15 -07:00
|
|
|
case BRW_OPCODE_FRC:
|
|
|
|
|
case BRW_OPCODE_RNDU:
|
|
|
|
|
case BRW_OPCODE_RNDD:
|
|
|
|
|
case BRW_OPCODE_RNDE:
|
|
|
|
|
case BRW_OPCODE_RNDZ:
|
|
|
|
|
case BRW_OPCODE_LINE:
|
|
|
|
|
case BRW_OPCODE_PLN:
|
|
|
|
|
case BRW_OPCODE_MAD:
|
2012-12-02 00:08:15 -08:00
|
|
|
case BRW_OPCODE_LRP:
|
2016-07-21 16:55:45 -07:00
|
|
|
case FS_OPCODE_FB_READ_LOGICAL:
|
2013-02-15 19:49:32 -08:00
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
2016-05-17 23:18:38 -07:00
|
|
|
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
|
2012-05-10 16:10:15 -07:00
|
|
|
case FS_OPCODE_LINTERP:
|
2015-02-20 20:25:04 +02:00
|
|
|
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
2015-02-19 14:52:24 +02:00
|
|
|
case SHADER_OPCODE_BROADCAST:
|
2015-11-07 18:58:34 -08:00
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
2016-04-29 23:35:01 -07:00
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
|
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_UMS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_MCS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
2016-05-05 11:40:41 +02:00
|
|
|
case FS_OPCODE_PACK:
|
2012-05-10 16:10:15 -07:00
|
|
|
return true;
|
2013-07-25 00:30:05 -07:00
|
|
|
case SHADER_OPCODE_RCP:
|
|
|
|
|
case SHADER_OPCODE_RSQ:
|
|
|
|
|
case SHADER_OPCODE_SQRT:
|
|
|
|
|
case SHADER_OPCODE_EXP2:
|
|
|
|
|
case SHADER_OPCODE_LOG2:
|
|
|
|
|
case SHADER_OPCODE_POW:
|
|
|
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
|
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
|
|
|
|
case SHADER_OPCODE_SIN:
|
|
|
|
|
case SHADER_OPCODE_COS:
|
2014-10-13 23:45:07 -07:00
|
|
|
return inst->mlen < 2;
|
2014-03-30 12:41:55 -07:00
|
|
|
case SHADER_OPCODE_LOAD_PAYLOAD:
|
2015-04-01 15:38:23 -07:00
|
|
|
return !inst->is_copy_payload(v->alloc);
|
2012-05-10 16:10:15 -07:00
|
|
|
default:
|
2015-10-19 23:13:09 -07:00
|
|
|
return inst->is_send_from_grf() && !inst->has_side_effects() &&
|
|
|
|
|
!inst->is_volatile();
|
2012-05-10 16:10:15 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2013-10-18 16:02:11 -07:00
|
|
|
static bool
|
2015-01-27 19:18:46 -08:00
|
|
|
operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
|
2013-10-18 16:02:11 -07:00
|
|
|
{
|
2014-03-25 15:28:17 -07:00
|
|
|
fs_reg *xs = a->src;
|
|
|
|
|
fs_reg *ys = b->src;
|
|
|
|
|
|
2014-10-26 10:08:40 -07:00
|
|
|
if (a->opcode == BRW_OPCODE_MAD) {
|
|
|
|
|
return xs[0].equals(ys[0]) &&
|
|
|
|
|
((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
|
|
|
|
|
(xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
|
2015-01-27 19:18:46 -08:00
|
|
|
} else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) {
|
|
|
|
|
bool xs0_negate = xs[0].negate;
|
2015-10-24 14:55:57 -07:00
|
|
|
bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f
|
2015-01-27 19:18:46 -08:00
|
|
|
: xs[1].negate;
|
|
|
|
|
bool ys0_negate = ys[0].negate;
|
2015-10-24 14:55:57 -07:00
|
|
|
bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f
|
2015-01-27 19:18:46 -08:00
|
|
|
: ys[1].negate;
|
2015-10-24 14:55:57 -07:00
|
|
|
float xs1_imm = xs[1].f;
|
|
|
|
|
float ys1_imm = ys[1].f;
|
2015-01-27 19:18:46 -08:00
|
|
|
|
|
|
|
|
xs[0].negate = false;
|
|
|
|
|
xs[1].negate = false;
|
|
|
|
|
ys[0].negate = false;
|
|
|
|
|
ys[1].negate = false;
|
2015-10-24 14:55:57 -07:00
|
|
|
xs[1].f = fabsf(xs[1].f);
|
|
|
|
|
ys[1].f = fabsf(ys[1].f);
|
2015-01-27 19:18:46 -08:00
|
|
|
|
|
|
|
|
bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
|
|
|
|
|
(xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
|
|
|
|
|
|
|
|
|
|
xs[0].negate = xs0_negate;
|
|
|
|
|
xs[1].negate = xs[1].file == IMM ? false : xs1_negate;
|
|
|
|
|
ys[0].negate = ys0_negate;
|
|
|
|
|
ys[1].negate = ys[1].file == IMM ? false : ys1_negate;
|
2015-10-24 14:55:57 -07:00
|
|
|
xs[1].f = xs1_imm;
|
|
|
|
|
ys[1].f = ys1_imm;
|
2015-01-27 19:18:46 -08:00
|
|
|
|
2015-04-13 11:29:14 -07:00
|
|
|
*negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate);
|
2016-02-22 10:25:38 -08:00
|
|
|
if (*negate && (a->saturate || b->saturate))
|
|
|
|
|
return false;
|
2015-01-27 19:18:46 -08:00
|
|
|
return ret;
|
2015-03-13 14:34:06 -07:00
|
|
|
} else if (!a->is_commutative()) {
|
2014-03-25 15:28:17 -07:00
|
|
|
bool match = true;
|
|
|
|
|
for (int i = 0; i < a->sources; i++) {
|
|
|
|
|
if (!xs[i].equals(ys[i])) {
|
|
|
|
|
match = false;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return match;
|
2013-10-18 16:02:11 -07:00
|
|
|
} else {
|
|
|
|
|
return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
|
|
|
|
|
(xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
|
|
|
|
|
}
|
2012-05-10 16:10:15 -07:00
|
|
|
}
|
|
|
|
|
|
i965/fs: Perform CSE on CMP(N) instructions.
Optimizes
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
into
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
total instructions in shared programs: 1644938 -> 1638181 (-0.41%)
instructions in affected programs: 574955 -> 568198 (-1.18%)
Two more 16-wide programs (in L4D2). Some large (-9%) decreases in
instruction count in some of Valve's Source Engine games. No
regressions.
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Paul Berry <stereotype441@gmail.com>
2013-10-20 11:38:17 -07:00
|
|
|
static bool
|
2015-01-27 19:18:46 -08:00
|
|
|
instructions_match(fs_inst *a, fs_inst *b, bool *negate)
|
i965/fs: Perform CSE on CMP(N) instructions.
Optimizes
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
into
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
total instructions in shared programs: 1644938 -> 1638181 (-0.41%)
instructions in affected programs: 574955 -> 568198 (-1.18%)
Two more 16-wide programs (in L4D2). Some large (-9%) decreases in
instruction count in some of Valve's Source Engine games. No
regressions.
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Paul Berry <stereotype441@gmail.com>
2013-10-20 11:38:17 -07:00
|
|
|
{
|
|
|
|
|
return a->opcode == b->opcode &&
|
2015-06-04 15:09:10 +03:00
|
|
|
a->force_writemask_all == b->force_writemask_all &&
|
|
|
|
|
a->exec_size == b->exec_size &&
|
2016-05-20 16:14:13 -07:00
|
|
|
a->group == b->group &&
|
i965/fs: Perform CSE on CMP(N) instructions.
Optimizes
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
into
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
total instructions in shared programs: 1644938 -> 1638181 (-0.41%)
instructions in affected programs: 574955 -> 568198 (-1.18%)
Two more 16-wide programs (in L4D2). Some large (-9%) decreases in
instruction count in some of Valve's Source Engine games. No
regressions.
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Paul Berry <stereotype441@gmail.com>
2013-10-20 11:38:17 -07:00
|
|
|
a->saturate == b->saturate &&
|
|
|
|
|
a->predicate == b->predicate &&
|
|
|
|
|
a->predicate_inverse == b->predicate_inverse &&
|
|
|
|
|
a->conditional_mod == b->conditional_mod &&
|
2015-06-04 15:09:10 +03:00
|
|
|
a->flag_subreg == b->flag_subreg &&
|
i965/fs: Perform CSE on CMP(N) instructions.
Optimizes
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
into
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
total instructions in shared programs: 1644938 -> 1638181 (-0.41%)
instructions in affected programs: 574955 -> 568198 (-1.18%)
Two more 16-wide programs (in L4D2). Some large (-9%) decreases in
instruction count in some of Valve's Source Engine games. No
regressions.
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Paul Berry <stereotype441@gmail.com>
2013-10-20 11:38:17 -07:00
|
|
|
a->dst.type == b->dst.type &&
|
2015-06-04 15:09:10 +03:00
|
|
|
a->offset == b->offset &&
|
|
|
|
|
a->mlen == b->mlen &&
|
2018-10-29 15:06:14 -05:00
|
|
|
a->ex_mlen == b->ex_mlen &&
|
|
|
|
|
a->sfid == b->sfid &&
|
|
|
|
|
a->desc == b->desc &&
|
2016-09-07 13:38:20 -07:00
|
|
|
a->size_written == b->size_written &&
|
2015-06-04 15:09:10 +03:00
|
|
|
a->base_mrf == b->base_mrf &&
|
2018-10-29 15:06:14 -05:00
|
|
|
a->check_tdr == b->check_tdr &&
|
|
|
|
|
a->send_has_side_effects == b->send_has_side_effects &&
|
2015-06-04 15:09:10 +03:00
|
|
|
a->eot == b->eot &&
|
|
|
|
|
a->header_size == b->header_size &&
|
|
|
|
|
a->shadow_compare == b->shadow_compare &&
|
|
|
|
|
a->pi_noperspective == b->pi_noperspective &&
|
2016-07-06 20:49:58 -07:00
|
|
|
a->target == b->target &&
|
2014-03-25 15:28:17 -07:00
|
|
|
a->sources == b->sources &&
|
2015-01-27 19:18:46 -08:00
|
|
|
operands_match(a, b, negate);
|
i965/fs: Perform CSE on CMP(N) instructions.
Optimizes
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
into
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
total instructions in shared programs: 1644938 -> 1638181 (-0.41%)
instructions in affected programs: 574955 -> 568198 (-1.18%)
Two more 16-wide programs (in L4D2). Some large (-9%) decreases in
instruction count in some of Valve's Source Engine games. No
regressions.
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Paul Berry <stereotype441@gmail.com>
2013-10-20 11:38:17 -07:00
|
|
|
}
|
|
|
|
|
|
2015-06-04 16:13:35 +03:00
|
|
|
static void
|
|
|
|
|
create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
|
2015-03-24 15:06:24 -07:00
|
|
|
{
|
2016-09-07 16:59:35 -07:00
|
|
|
unsigned written = regs_written(inst);
|
|
|
|
|
unsigned dst_width =
|
2015-08-11 14:24:55 -07:00
|
|
|
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
|
2015-03-24 15:06:24 -07:00
|
|
|
fs_inst *copy;
|
|
|
|
|
|
2019-01-14 22:21:48 -06:00
|
|
|
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
|
2015-10-26 17:09:25 -07:00
|
|
|
assert(src.file == VGRF);
|
2019-01-14 22:21:48 -06:00
|
|
|
fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg,
|
|
|
|
|
inst->sources);
|
|
|
|
|
for (int i = 0; i < inst->header_size; i++) {
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
payload[i] = src;
|
2016-09-01 12:42:20 -07:00
|
|
|
src.offset += REG_SIZE;
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
}
|
2019-01-14 22:21:48 -06:00
|
|
|
for (int i = inst->header_size; i < inst->sources; i++) {
|
2018-11-14 22:38:23 -06:00
|
|
|
src.type = inst->src[i].type;
|
2019-01-14 22:21:48 -06:00
|
|
|
payload[i] = src;
|
|
|
|
|
src = offset(src, bld, 1);
|
|
|
|
|
}
|
|
|
|
|
copy = bld.LOAD_PAYLOAD(inst->dst, payload, inst->sources,
|
|
|
|
|
inst->header_size);
|
|
|
|
|
} else if (written != dst_width) {
|
|
|
|
|
assert(src.file == VGRF);
|
|
|
|
|
assert(written % dst_width == 0);
|
|
|
|
|
const int sources = written / dst_width;
|
|
|
|
|
fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
|
|
|
|
|
for (int i = 0; i < sources; i++) {
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
payload[i] = src;
|
2015-07-27 19:18:51 +03:00
|
|
|
src = offset(src, bld, 1);
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
}
|
2019-01-14 22:21:48 -06:00
|
|
|
copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, 0);
|
2015-03-24 15:06:24 -07:00
|
|
|
} else {
|
2015-07-27 19:18:51 +03:00
|
|
|
copy = bld.MOV(inst->dst, src);
|
2016-05-20 16:14:13 -07:00
|
|
|
copy->group = inst->group;
|
2015-08-11 14:25:36 -07:00
|
|
|
copy->force_writemask_all = inst->force_writemask_all;
|
2015-03-24 15:06:24 -07:00
|
|
|
copy->src[0].negate = negate;
|
|
|
|
|
}
|
2016-09-07 16:59:35 -07:00
|
|
|
assert(regs_written(copy) == written);
|
2015-03-24 15:06:24 -07:00
|
|
|
}
|
|
|
|
|
|
2012-05-10 16:10:15 -07:00
|
|
|
bool
|
2014-07-11 20:35:31 -07:00
|
|
|
fs_visitor::opt_cse_local(bblock_t *block)
|
2012-05-10 16:10:15 -07:00
|
|
|
{
|
|
|
|
|
bool progress = false;
|
2014-07-11 20:35:31 -07:00
|
|
|
exec_list aeb;
|
2012-05-10 16:10:15 -07:00
|
|
|
|
2014-03-26 15:58:12 -07:00
|
|
|
void *cse_ctx = ralloc_context(NULL);
|
2012-05-10 16:10:15 -07:00
|
|
|
|
2013-02-19 16:20:10 -08:00
|
|
|
int ip = block->start_ip;
|
2014-06-24 12:42:00 -07:00
|
|
|
foreach_inst_in_block(fs_inst, inst, block) {
|
2012-05-10 16:10:15 -07:00
|
|
|
/* Skip some cases. */
|
intel/compiler: split is_partial_write() into two variants
This function is used in two different scenarios that for 32-bit
instructions are the same, but for 16-bit instructions are not.
One scenario is that in which we are working at a SIMD8 register
level and we need to know if a register is fully defined or written.
This is useful, for example, in the context of liveness analysis or
register allocation, where we work with units of registers.
The other scenario is that in which we want to know if an instruction
is writing a full scalar component or just some subset of it. This is
useful, for example, in the context of some optimization passes
like copy propagation.
For 32-bit instructions (or larger), a SIMD8 dispatch will always write
at least a full SIMD8 register (32B) if the write is not partial. The
function is_partial_write() checks this to determine if we have a partial
write. However, when we deal with 16-bit instructions, that logic disables
some optimizations that should be safe. For example, a SIMD8 16-bit MOV will
only update half of a SIMD register, but it is still a complete write of the
variable for a SIMD8 dispatch, so we should not prevent copy propagation in
this scenario because we don't write all 32 bytes in the SIMD register
or because the write starts at offset 16B (wehere we pack components Y or
W of 16-bit vectors).
This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit
instructions, which lose a number of optimizations because of this, most
important of which is copy-propagation.
This patch splits is_partial_write() into is_partial_reg_write(), which
represents the current is_partial_write(), useful for things like
liveness analysis, and is_partial_var_write(), which considers
the dispatch size to check if we are writing a full variable (rather
than a full register) to decide if the write is partial or not, which
is what we really want in many optimization passes.
Then the patch goes on and rewrites all uses of is_partial_write() to use
one or the other version. Specifically, we use is_partial_var_write()
in the following places: copy propagation, cmod propagation, common
subexpression elimination, saturate propagation and sel peephole.
Notice that the semantics of is_partial_var_write() exactly match the
current implementation of is_partial_write() for anything that is
32-bit or larger, so no changes are expected for 32-bit instructions.
Tested against ~5000 tests involving 16-bit instructions in CTS produced
the following changes in instruction counts:
Patched | Master | % |
================================================
SIMD8 | 621,900 | 706,721 | -12.00% |
================================================
SIMD16 | 93,252 | 93,252 | 0.00% |
================================================
As expected, the change only affects SIMD8 dispatches.
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
2018-07-10 09:52:46 +02:00
|
|
|
if (is_expression(this, inst) &&
|
|
|
|
|
!inst->is_partial_var_write(dispatch_width) &&
|
2015-10-26 17:52:57 -07:00
|
|
|
((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
|
|
|
|
|
inst->dst.is_null()))
|
2012-05-10 16:10:15 -07:00
|
|
|
{
|
2014-06-11 13:01:31 -07:00
|
|
|
bool found = false;
|
2015-01-27 19:18:46 -08:00
|
|
|
bool negate = false;
|
2014-06-11 13:01:31 -07:00
|
|
|
|
2014-07-11 20:35:31 -07:00
|
|
|
foreach_in_list_use_after(aeb_entry, entry, &aeb) {
|
2014-06-11 13:01:31 -07:00
|
|
|
/* Match current instruction's expression against those in AEB. */
|
2015-01-12 13:58:06 -08:00
|
|
|
if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
|
2015-01-27 19:18:46 -08:00
|
|
|
instructions_match(inst, entry->generator, &negate)) {
|
2014-06-11 13:01:31 -07:00
|
|
|
found = true;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!found) {
|
2014-04-03 14:29:30 -07:00
|
|
|
if (inst->opcode != BRW_OPCODE_MOV ||
|
|
|
|
|
(inst->opcode == BRW_OPCODE_MOV &&
|
|
|
|
|
inst->src[0].file == IMM &&
|
|
|
|
|
inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
|
|
|
|
|
/* Our first sighting of this expression. Create an entry. */
|
|
|
|
|
aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
|
|
|
|
|
entry->tmp = reg_undef;
|
|
|
|
|
entry->generator = inst;
|
|
|
|
|
aeb.push_tail(entry);
|
|
|
|
|
}
|
2014-06-11 13:01:31 -07:00
|
|
|
} else {
|
|
|
|
|
/* This is at least our second sighting of this expression.
|
|
|
|
|
* If we don't have a temporary already, make one.
|
|
|
|
|
*/
|
|
|
|
|
bool no_existing_temp = entry->tmp.file == BAD_FILE;
|
|
|
|
|
if (no_existing_temp && !entry->generator->dst.is_null()) {
|
2015-07-27 19:18:51 +03:00
|
|
|
const fs_builder ibld = fs_builder(this, block, entry->generator)
|
|
|
|
|
.at(block, entry->generator->next);
|
2016-09-07 16:59:35 -07:00
|
|
|
int written = regs_written(entry->generator);
|
2013-03-15 14:43:28 -07:00
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
entry->tmp = fs_reg(VGRF, alloc.allocate(written),
|
2015-06-18 12:44:35 -07:00
|
|
|
entry->generator->dst.type);
|
2013-03-15 14:43:28 -07:00
|
|
|
|
2015-07-27 19:18:51 +03:00
|
|
|
create_copy_instr(ibld, entry->generator, entry->tmp, false);
|
2015-03-24 15:06:24 -07:00
|
|
|
|
|
|
|
|
entry->generator->dst = entry->tmp;
|
2014-06-11 13:01:31 -07:00
|
|
|
}
|
2012-05-10 16:10:15 -07:00
|
|
|
|
2014-06-11 13:01:31 -07:00
|
|
|
/* dest <- temp */
|
2013-10-22 15:40:08 -07:00
|
|
|
if (!inst->dst.is_null()) {
|
2016-09-07 13:38:20 -07:00
|
|
|
assert(inst->size_written == entry->generator->size_written);
|
2013-10-22 15:40:08 -07:00
|
|
|
assert(inst->dst.type == entry->tmp.type);
|
2015-07-27 19:18:51 +03:00
|
|
|
const fs_builder ibld(this, block, inst);
|
2015-03-24 15:06:24 -07:00
|
|
|
|
2015-07-27 19:18:51 +03:00
|
|
|
create_copy_instr(ibld, inst, entry->tmp, negate);
|
2013-03-15 14:43:28 -07:00
|
|
|
}
|
2013-10-22 15:40:08 -07:00
|
|
|
|
|
|
|
|
/* Set our iterator so that next time through the loop inst->next
|
|
|
|
|
* will get the instruction in the basic block after the one we've
|
|
|
|
|
* removed.
|
|
|
|
|
*/
|
|
|
|
|
fs_inst *prev = (fs_inst *)inst->prev;
|
|
|
|
|
|
2014-07-12 21:18:39 -07:00
|
|
|
inst->remove(block);
|
2013-10-22 15:40:08 -07:00
|
|
|
inst = prev;
|
2014-06-11 13:01:31 -07:00
|
|
|
}
|
2012-05-10 16:10:15 -07:00
|
|
|
}
|
|
|
|
|
|
2014-07-11 20:35:31 -07:00
|
|
|
foreach_in_list_safe(aeb_entry, entry, &aeb) {
|
i965/fs: Perform CSE on CMP(N) instructions.
Optimizes
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
into
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
total instructions in shared programs: 1644938 -> 1638181 (-0.41%)
instructions in affected programs: 574955 -> 568198 (-1.18%)
Two more 16-wide programs (in L4D2). Some large (-9%) decreases in
instruction count in some of Valve's Source Engine games. No
regressions.
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Paul Berry <stereotype441@gmail.com>
2013-10-20 11:38:17 -07:00
|
|
|
/* Kill all AEB entries that write a different value to or read from
|
|
|
|
|
* the flag register if we just wrote it.
|
|
|
|
|
*/
|
2016-05-18 22:40:40 -07:00
|
|
|
if (inst->flags_written()) {
|
2015-01-27 19:18:46 -08:00
|
|
|
bool negate; /* dummy */
|
2016-05-18 22:40:40 -07:00
|
|
|
if (entry->generator->flags_read(devinfo) ||
|
|
|
|
|
(entry->generator->flags_written() &&
|
2015-01-27 19:18:46 -08:00
|
|
|
!instructions_match(inst, entry->generator, &negate))) {
|
i965/fs: Perform CSE on CMP(N) instructions.
Optimizes
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
into
cmp.ge.f0(8) null g45<8,8,1>F 0F
(+f0) sel(8) g50<1>F g40<8,8,1>F g10<8,8,1>F
(+f0) sel(8) g51<1>F g41<8,8,1>F g11<8,8,1>F
(+f0) sel(8) g52<1>F g42<8,8,1>F g12<8,8,1>F
(+f0) sel(8) g53<1>F g43<8,8,1>F g13<8,8,1>F
total instructions in shared programs: 1644938 -> 1638181 (-0.41%)
instructions in affected programs: 574955 -> 568198 (-1.18%)
Two more 16-wide programs (in L4D2). Some large (-9%) decreases in
instruction count in some of Valve's Source Engine games. No
regressions.
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Paul Berry <stereotype441@gmail.com>
2013-10-20 11:38:17 -07:00
|
|
|
entry->remove();
|
|
|
|
|
ralloc_free(entry);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2014-06-11 13:01:31 -07:00
|
|
|
for (int i = 0; i < entry->generator->sources; i++) {
|
2013-02-19 16:20:10 -08:00
|
|
|
fs_reg *src_reg = &entry->generator->src[i];
|
|
|
|
|
|
|
|
|
|
/* Kill all AEB entries that use the destination we just
|
|
|
|
|
* overwrote.
|
|
|
|
|
*/
|
2016-09-01 19:34:18 -07:00
|
|
|
if (regions_overlap(inst->dst, inst->size_written,
|
|
|
|
|
entry->generator->src[i],
|
|
|
|
|
entry->generator->size_read(i))) {
|
2014-06-11 13:01:31 -07:00
|
|
|
entry->remove();
|
|
|
|
|
ralloc_free(entry);
|
|
|
|
|
break;
|
|
|
|
|
}
|
2013-02-19 16:20:10 -08:00
|
|
|
|
|
|
|
|
/* Kill any AEB entries using registers that don't get reused any
|
|
|
|
|
* more -- a sure sign they'll fail operands_match().
|
|
|
|
|
*/
|
2015-10-26 17:09:25 -07:00
|
|
|
if (src_reg->file == VGRF && virtual_grf_end[src_reg->nr] < ip) {
|
2013-02-19 16:20:10 -08:00
|
|
|
entry->remove();
|
|
|
|
|
ralloc_free(entry);
|
2014-06-11 13:01:31 -07:00
|
|
|
break;
|
2013-02-19 16:20:10 -08:00
|
|
|
}
|
2014-06-11 13:01:31 -07:00
|
|
|
}
|
2012-05-10 16:10:15 -07:00
|
|
|
}
|
2013-02-19 16:20:10 -08:00
|
|
|
|
|
|
|
|
ip++;
|
2012-05-10 16:10:15 -07:00
|
|
|
}
|
|
|
|
|
|
2014-03-26 15:58:12 -07:00
|
|
|
ralloc_free(cse_ctx);
|
2012-05-10 16:10:15 -07:00
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
fs_visitor::opt_cse()
|
|
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
2014-07-11 20:54:52 -07:00
|
|
|
calculate_live_intervals();
|
2012-05-10 16:10:15 -07:00
|
|
|
|
2014-07-11 22:31:39 -07:00
|
|
|
foreach_block (block, cfg) {
|
2014-07-11 20:35:31 -07:00
|
|
|
progress = opt_cse_local(block) || progress;
|
2012-05-10 16:10:15 -07:00
|
|
|
}
|
|
|
|
|
|
2014-07-11 20:37:04 -07:00
|
|
|
if (progress)
|
2014-09-01 10:54:00 -07:00
|
|
|
invalidate_live_intervals();
|
2014-07-11 20:37:04 -07:00
|
|
|
|
2012-05-10 16:10:15 -07:00
|
|
|
return progress;
|
|
|
|
|
}
|