mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-23 22:58:33 +02:00
brw: Replace logical operations with predication
There is more to do here. A few things I have noticed.
1. There are cases where the ideal pass cannot make progress, but the
"logic op to predicated move" pass can. Sometimes scheduling can
rearrange this to sequences like:
cmp.nz.f0.0(16) g99<1>F g98<1,1,0>F 0x3f800000F
cmp.g.f0.0(16) null<1>HF g106<16,16,1>HF 0x0000HF
(+f0.0) mov.nz.f0.0(16) null<1>UD g99<8,8,1>UD
We should be able to detect this after scheduling, and eliminate the
mov.nz.
2. We should extend post-scheduling cmod propagation to handle cases
where a predicated CMP is the only use of an ALU result. I have
observed sequences like
and(16) v5200:UD v5048+6.0:UD 134217726u
(+f0.0) cmp.z.f0.0(16) null:D v5200:D 0d
and
or(16) g113<1>UD g112<1,1,0>UD g20<1,1,0>UD
(-f0.0) mov.nz.f0.0(16) null<1>UD g113<8,8,1>UD
v2: Don't allow SEL or CSEL in is_valid_logic_source. No shader-db or
fossil-db changes here, but this prevents problems with (possible)
future commits.
v3: Replace hither and yon with nearer and farther. Find both logic
sources in one loop. Use brw_flags_written. Refactor cmod selection (for
BFN vs all other opcodes) to separate function. Suggested by Caio.
v4: Actually remove flags_written now. Suggested (twice) by Caio.
Require that flags written by the nearer logic source matches the flags
written by the logic operation. This fixes cmp_flag_subreg_mismatch and
mov_flag_subreg_mismatch (added in the next commit). Also
s/inst->src[0]/inst->src[src]/. Noticed by Caio.
v5: flags_read was also unused. Noticed by marge.
shader-db:
Lunar Lake
total instructions in shared programs: 17083282 -> 17072645 (-0.06%)
instructions in affected programs: 2076491 -> 2065854 (-0.51%)
helped: 3952 / HURT: 0
total cycles in shared programs: 887823360 -> 889080938 (0.14%)
cycles in affected programs: 472236518 -> 473494096 (0.27%)
helped: 3156 / HURT: 936
total fills in shared programs: 1778 -> 1778 (0.00%)
fills in affected programs: 286 -> 286 (0.00%)
helped: 2 / HURT: 2
LOST: 27
GAINED: 14
Meteor Lake and DG2 had similar results. (Meteor Lake shown)
total instructions in shared programs: 19980337 -> 19965369 (-0.07%)
instructions in affected programs: 2406043 -> 2391075 (-0.62%)
helped: 4621 / HURT: 7
total cycles in shared programs: 887416449 -> 887170456 (-0.03%)
cycles in affected programs: 457957623 -> 457711630 (-0.05%)
helped: 3776 / HURT: 1039
total fills in shared programs: 4371 -> 4375 (0.09%)
fills in affected programs: 798 -> 802 (0.50%)
helped: 4 / HURT: 6
LOST: 15
GAINED: 1
Tiger Lake
total instructions in shared programs: 19904512 -> 19889603 (-0.07%)
instructions in affected programs: 2405908 -> 2390999 (-0.62%)
helped: 4616 / HURT: 22
total cycles in shared programs: 864580948 -> 863953289 (-0.07%)
cycles in affected programs: 459500521 -> 458872862 (-0.14%)
helped: 3710 / HURT: 1093
total spills in shared programs: 3467 -> 3472 (0.14%)
spills in affected programs: 15 -> 20 (33.33%)
helped: 0 / HURT: 1
total fills in shared programs: 2059 -> 2069 (0.49%)
fills in affected programs: 47 -> 57 (21.28%)
helped: 0 / HURT: 1
LOST: 11
GAINED: 9
Ice Lake
total instructions in shared programs: 20821682 -> 20806373 (-0.07%)
instructions in affected programs: 2447072 -> 2431763 (-0.63%)
helped: 4741 / HURT: 1
total cycles in shared programs: 876811334 -> 876360389 (-0.05%)
cycles in affected programs: 438363075 -> 437912130 (-0.10%)
helped: 4000 / HURT: 724
total fills in shared programs: 3837 -> 3835 (-0.05%)
fills in affected programs: 302 -> 300 (-0.66%)
helped: 1 / HURT: 0
LOST: 12
GAINED: 9
Skylake
total instructions in shared programs: 19041784 -> 19026462 (-0.08%)
instructions in affected programs: 2397491 -> 2382169 (-0.64%)
helped: 4711 / HURT: 0
total cycles in shared programs: 868019298 -> 867790279 (-0.03%)
cycles in affected programs: 441110462 -> 440881443 (-0.05%)
helped: 3915 / HURT: 788
total fills in shared programs: 3767 -> 3765 (-0.05%)
fills in affected programs: 302 -> 300 (-0.66%)
helped: 1 / HURT: 0
LOST: 4
GAINED: 3
fossil-db:
Lunar Lake
Totals:
Instrs: 924697067 -> 922488661 (-0.24%); split: -0.25%, +0.01%
Subgroup size: 40939424 -> 40939744 (+0.00%)
Cycle count: 106291402322 -> 105964111203 (-0.31%); split: -0.66%, +0.35%
Spill count: 3423988 -> 3421004 (-0.09%); split: -0.34%, +0.25%
Fill count: 4877087 -> 4862981 (-0.29%); split: -1.21%, +0.92%
Max live registers: 193812217 -> 193805296 (-0.00%)
Max dispatch width: 49089184 -> 49085216 (-0.01%); split: +0.01%, -0.02%
Totals from 453746 (22.47% of 2019504) affected shaders:
Instrs: 529674876 -> 527466470 (-0.42%); split: -0.43%, +0.02%
Subgroup size: 320 -> 640 (+100.00%)
Cycle count: 87892218969 -> 87564927850 (-0.37%); split: -0.79%, +0.42%
Spill count: 3302695 -> 3299711 (-0.09%); split: -0.35%, +0.26%
Fill count: 4778154 -> 4764048 (-0.30%); split: -1.23%, +0.94%
Max live registers: 65405449 -> 65398528 (-0.01%)
Max dispatch width: 10793104 -> 10789136 (-0.04%); split: +0.04%, -0.08%
Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 998057341 -> 995683321 (-0.24%); split: -0.25%, +0.01%
Subgroup size: 27545440 -> 27545656 (+0.00%)
Cycle count: 93854696449 -> 93709099572 (-0.16%); split: -0.62%, +0.46%
Spill count: 3709547 -> 3701296 (-0.22%); split: -0.50%, +0.28%
Fill count: 5032889 -> 5014189 (-0.37%); split: -1.28%, +0.91%
Max live registers: 121823974 -> 121810927 (-0.01%)
Max dispatch width: 38021936 -> 38020536 (-0.00%); split: +0.06%, -0.07%
Totals from 505565 (22.13% of 2284025) affected shaders:
Instrs: 549480901 -> 547106881 (-0.43%); split: -0.45%, +0.02%
Subgroup size: 216 -> 432 (+100.00%)
Cycle count: 76260069937 -> 76114473060 (-0.19%); split: -0.76%, +0.57%
Spill count: 3526038 -> 3517787 (-0.23%); split: -0.53%, +0.29%
Fill count: 4844826 -> 4826126 (-0.39%); split: -1.33%, +0.94%
Max live registers: 38085235 -> 38072188 (-0.03%)
Max dispatch width: 8015432 -> 8014032 (-0.02%); split: +0.30%, -0.32%
Tiger Lake
Totals:
Instrs: 1013436935 -> 1011070083 (-0.23%); split: -0.25%, +0.02%
Cycle count: 85763486346 -> 85580242131 (-0.21%); split: -0.68%, +0.47%
Spill count: 3903905 -> 3902350 (-0.04%); split: -0.36%, +0.32%
Fill count: 6801966 -> 6787600 (-0.21%); split: -0.70%, +0.49%
Max live registers: 122298352 -> 122284634 (-0.01%)
Max dispatch width: 37957184 -> 37964608 (+0.02%); split: +0.10%, -0.08%
Totals from 525103 (23.03% of 2280298) affected shaders:
Instrs: 570013347 -> 567646495 (-0.42%); split: -0.44%, +0.03%
Cycle count: 71392808767 -> 71209564552 (-0.26%); split: -0.82%, +0.56%
Spill count: 3757751 -> 3756196 (-0.04%); split: -0.38%, +0.33%
Fill count: 6648525 -> 6634159 (-0.22%); split: -0.72%, +0.51%
Max live registers: 39876402 -> 39862684 (-0.03%)
Max dispatch width: 8453816 -> 8461240 (+0.09%); split: +0.44%, -0.36%
Ice Lake
Totals:
Instrs: 1014312031 -> 1011938992 (-0.23%); split: -0.24%, +0.01%
Cycle count: 86550003161 -> 86343662349 (-0.24%); split: -0.39%, +0.15%
Spill count: 3039497 -> 3035267 (-0.14%); split: -0.33%, +0.19%
Fill count: 5376655 -> 5370235 (-0.12%); split: -0.43%, +0.32%
Max live registers: 125551684 -> 125537675 (-0.01%)
Max dispatch width: 41300016 -> 41301552 (+0.00%); split: +0.02%, -0.02%
Totals from 537158 (23.01% of 2334535) affected shaders:
Instrs: 555656911 -> 553283872 (-0.43%); split: -0.44%, +0.01%
Cycle count: 71869799780 -> 71663458968 (-0.29%); split: -0.47%, +0.19%
Spill count: 2844469 -> 2840239 (-0.15%); split: -0.35%, +0.20%
Fill count: 5006995 -> 5000575 (-0.13%); split: -0.47%, +0.34%
Max live registers: 39809729 -> 39795720 (-0.04%)
Max dispatch width: 9226240 -> 9227776 (+0.02%); split: +0.10%, -0.08%
Skylake
Totals:
Instrs: 519584256 -> 518938991 (-0.12%); split: -0.13%, +0.00%
Cycle count: 57935410863 -> 57867852550 (-0.12%); split: -0.22%, +0.10%
Spill count: 636741 -> 636728 (-0.00%); split: -0.06%, +0.06%
Fill count: 860470 -> 860314 (-0.02%); split: -0.19%, +0.17%
Max live registers: 87895659 -> 87889485 (-0.01%)
Max dispatch width: 32565912 -> 32567080 (+0.00%); split: +0.03%, -0.03%
Totals from 235957 (13.59% of 1736653) affected shaders:
Instrs: 158020578 -> 157375313 (-0.41%); split: -0.41%, +0.00%
Cycle count: 44881056772 -> 44813498459 (-0.15%); split: -0.28%, +0.13%
Spill count: 461098 -> 461085 (-0.00%); split: -0.09%, +0.09%
Fill count: 601255 -> 601099 (-0.03%); split: -0.27%, +0.24%
Max live registers: 16143628 -> 16137454 (-0.04%)
Max dispatch width: 4664240 -> 4665408 (+0.03%); split: +0.20%, -0.17%
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39030>
This commit is contained in:
parent
539162936f
commit
67f0fc62fd
4 changed files with 504 additions and 0 deletions
|
|
@ -98,6 +98,16 @@ brw_optimize(brw_shader &s)
|
|||
OPT(brw_opt_dead_code_eliminate);
|
||||
}
|
||||
|
||||
while (OPT(brw_opt_predicate_logic)) {
|
||||
/* The dead code elimination after brw_opt_predicate_logic can cause the
|
||||
* first comparison in the set to have a NULL destination. That can make
|
||||
* it a candidate for additional brw_opt_cmod_propagation and additional
|
||||
* brw_opt_predicate_logic.
|
||||
*/
|
||||
if (OPT(brw_opt_dead_code_eliminate) && OPT(brw_opt_cmod_propagation))
|
||||
OPT(brw_opt_dead_code_eliminate);
|
||||
}
|
||||
|
||||
if (OPT(brw_lower_pack)) {
|
||||
OPT(brw_opt_register_coalesce);
|
||||
OPT(brw_opt_dead_code_eliminate);
|
||||
|
|
|
|||
492
src/intel/compiler/brw/brw_opt_predicate_logic.cpp
Normal file
492
src/intel/compiler/brw/brw_opt_predicate_logic.cpp
Normal file
|
|
@ -0,0 +1,492 @@
|
|||
/*
|
||||
* Copyright © 2025 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "brw_shader.h"
|
||||
#include "brw_analysis.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
struct logic_source {
|
||||
brw_inst *inst = NULL;
|
||||
unsigned distance = 0;
|
||||
unsigned src = 0;
|
||||
};
|
||||
|
||||
static bool
|
||||
is_used_once(brw_inst *inst, const intel_device_info *devinfo,
|
||||
const brw_live_variables &live_vars,
|
||||
const brw_def_analysis &defs)
|
||||
{
|
||||
unsigned use_count = defs.get_use_count(inst->dst);
|
||||
if (use_count != 0)
|
||||
return use_count == 1;
|
||||
|
||||
/* If there are any uses outside the block, fail. */
|
||||
if (BITSET_TEST(live_vars.block_data[inst->block->num].liveout,
|
||||
live_vars.var_from_reg(inst->dst)))
|
||||
return false;
|
||||
|
||||
foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) {
|
||||
for (unsigned i = 0; i < scan_inst->sources; i++) {
|
||||
if (regions_overlap(inst->dst, inst->size_written,
|
||||
scan_inst->src[i], scan_inst->size_read(devinfo, i))) {
|
||||
use_count++;
|
||||
}
|
||||
}
|
||||
|
||||
if (use_count > 1)
|
||||
return false;
|
||||
}
|
||||
|
||||
assert(use_count == 1);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_Boolean(brw_inst *inst, const brw_def_analysis &defs)
|
||||
{
|
||||
if (inst == NULL)
|
||||
return false;
|
||||
|
||||
switch (inst->opcode) {
|
||||
case BRW_OPCODE_CMP:
|
||||
case BRW_OPCODE_CMPN:
|
||||
return true;
|
||||
|
||||
case BRW_OPCODE_AND:
|
||||
case BRW_OPCODE_NOT:
|
||||
case BRW_OPCODE_OR:
|
||||
case BRW_OPCODE_SEL:
|
||||
case BRW_OPCODE_XOR: {
|
||||
for (unsigned i = 0; i < inst->sources; i++) {
|
||||
brw_inst *def = defs.get(inst->src[i]);
|
||||
|
||||
if (def == NULL)
|
||||
return false;
|
||||
|
||||
if (def->opcode != BRW_OPCODE_CMP &&
|
||||
def->opcode != BRW_OPCODE_CMPN)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the flags read between two instructions.
|
||||
*
|
||||
* Flags read by \c begin or \c end are \b not included in the return value.
|
||||
*/
|
||||
static unsigned
|
||||
flags_read_between(brw_inst *begin, brw_inst *end,
|
||||
const intel_device_info *devinfo)
|
||||
{
|
||||
unsigned flags_read = 0;
|
||||
|
||||
foreach_inst_in_block_starting_from(brw_inst, inst, begin) {
|
||||
if (inst == end)
|
||||
return flags_read;
|
||||
|
||||
flags_read |= inst->flags_read(devinfo);
|
||||
}
|
||||
|
||||
if (end == NULL)
|
||||
return flags_read;
|
||||
|
||||
UNREACHABLE("end does not occur after begin in the same block.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the flags written between two instructions.
|
||||
*
|
||||
* Flags written by \c begin or \c end are \b not included in the return value.
|
||||
*/
|
||||
static unsigned
|
||||
flags_written_between(brw_inst *begin, brw_inst *end,
|
||||
const intel_device_info *devinfo)
|
||||
{
|
||||
unsigned flags_written = 0;
|
||||
|
||||
foreach_inst_in_block_starting_from(brw_inst, inst, begin) {
|
||||
if (inst == end)
|
||||
return flags_written;
|
||||
|
||||
flags_written |= inst->flags_written(devinfo);
|
||||
}
|
||||
|
||||
if (end == NULL)
|
||||
return flags_written;
|
||||
|
||||
UNREACHABLE("end does not occur after begin in the same block.");
|
||||
}
|
||||
|
||||
static enum brw_conditional_mod
|
||||
required_cmod(enum opcode op)
|
||||
{
|
||||
return op == BRW_OPCODE_BFN ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_NZ;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_valid_logic_source(const brw_inst *inst)
|
||||
{
|
||||
if (inst->opcode == BRW_OPCODE_CMP || inst->opcode == BRW_OPCODE_CMPN)
|
||||
return true;
|
||||
|
||||
/* Especially CSEL.NZ can confuse some of the checks below. Rejecting SEL
|
||||
* and CSEL here keeps that code more clear.
|
||||
*/
|
||||
if (inst->opcode == BRW_OPCODE_SEL || inst->opcode == BRW_OPCODE_CSEL)
|
||||
return false;
|
||||
|
||||
/* The flags will be used as a proxy for the value produced by the
|
||||
* instruction. At the end, the instruction must have a
|
||||
* conditional modifier of NZ (G for BFN).
|
||||
*/
|
||||
const enum brw_conditional_mod req_cmod = required_cmod(inst->opcode);
|
||||
|
||||
return (inst->conditional_mod == BRW_CONDITIONAL_NONE &&
|
||||
inst->can_do_cmod(req_cmod)) || inst->conditional_mod == req_cmod;
|
||||
}
|
||||
|
||||
static void
|
||||
find_logic_sources(brw_inst *inst, const brw_def_analysis &defs,
|
||||
const intel_device_info *devinfo,
|
||||
logic_source *nearer, logic_source *farther)
|
||||
{
|
||||
unsigned distance = 0;
|
||||
const unsigned size_read[2] = {
|
||||
inst->size_read(devinfo, 0),
|
||||
inst->size_read(devinfo, 1),
|
||||
};
|
||||
int lo = 0;
|
||||
int hi = 1;
|
||||
logic_source ls[2];
|
||||
|
||||
foreach_inst_in_block_reverse_starting_from(brw_inst, scan_inst, inst) {
|
||||
distance++;
|
||||
|
||||
for (int src = lo; src <= hi; src++) {
|
||||
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
|
||||
inst->src[src], size_read[src])) {
|
||||
if (!(scan_inst->is_partial_write() ||
|
||||
scan_inst->dst.offset != inst->src[src].offset ||
|
||||
scan_inst->exec_size != inst->exec_size ||
|
||||
!is_valid_logic_source(scan_inst))) {
|
||||
ls[src] = logic_source { scan_inst, distance, (unsigned) src };
|
||||
}
|
||||
|
||||
if (src == lo)
|
||||
lo++;
|
||||
else
|
||||
hi--;
|
||||
}
|
||||
}
|
||||
|
||||
if (lo > hi)
|
||||
break;
|
||||
}
|
||||
|
||||
for (int src = lo; src <= hi; src++) {
|
||||
brw_inst *def = defs.get(inst->src[src]);
|
||||
if (def != NULL) {
|
||||
assert(def->block != inst->block);
|
||||
|
||||
if (def->is_partial_write() ||
|
||||
def->dst.offset != inst->src[src].offset ||
|
||||
def->exec_size != inst->exec_size ||
|
||||
!is_valid_logic_source(def)) {
|
||||
def = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
ls[src] = logic_source { def, UINT_MAX, (unsigned) src };
|
||||
}
|
||||
|
||||
assert(ls[0].inst == NULL || ls[0].inst != ls[1].inst);
|
||||
|
||||
if (ls[0].distance > ls[1].distance)
|
||||
SWAP(ls[0], ls[1]);
|
||||
|
||||
*nearer = ls[0];
|
||||
*farther = ls[1];
|
||||
}
|
||||
|
||||
static bool
|
||||
try_predicated_cmp(brw_shader &s, const brw_live_variables &live_vars,
|
||||
const brw_def_analysis &defs,
|
||||
brw_inst *logic_inst, logic_source &nearer,
|
||||
logic_source &farther, unsigned nearer_flags)
|
||||
{
|
||||
/* For this path, the farther instruction must also be in the same block
|
||||
* as the logic operation.
|
||||
*/
|
||||
if (farther.inst == NULL || farther.distance == UINT_MAX)
|
||||
return false;
|
||||
|
||||
/* If farther doesn't write any flags yet, determine what flags it would
|
||||
* write.
|
||||
*/
|
||||
unsigned farther_flags = farther.inst->flags_written(s.devinfo);
|
||||
if (farther_flags == 0) {
|
||||
farther_flags = brw_flags_written(farther.inst->opcode,
|
||||
required_cmod(farther.inst->opcode),
|
||||
logic_inst->flag_subreg,
|
||||
farther.inst->group,
|
||||
farther.inst->exec_size);
|
||||
}
|
||||
|
||||
/* If farther does not already write flags, there must be no readers of the flags
|
||||
* that it will write.
|
||||
*
|
||||
* A similar test for nearer is not necessary. It is already required
|
||||
* that there be no uses of the flags produced by nearer.
|
||||
*/
|
||||
if (farther.inst->conditional_mod == BRW_CONDITIONAL_NONE &&
|
||||
(flags_read_between(farther.inst, nearer.inst, s.devinfo) &
|
||||
farther_flags) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* The flags written by farther must reach nearer. */
|
||||
if ((flags_written_between(farther.inst, nearer.inst, s.devinfo) &
|
||||
farther_flags) != 0)
|
||||
return false;
|
||||
|
||||
/* The flags and the destination written by nearer must not be read by
|
||||
* any instruction other than the logic operation.
|
||||
*/
|
||||
if (!is_used_once(nearer.inst, s.devinfo, live_vars, defs))
|
||||
return false;
|
||||
|
||||
if ((nearer_flags & flags_read_between(nearer.inst, logic_inst, s.devinfo)) != 0)
|
||||
return false;
|
||||
|
||||
const unsigned flags_read_after_inst =
|
||||
flags_read_between(logic_inst, NULL, s.devinfo) |
|
||||
live_vars.block_data[logic_inst->block->num].flag_liveout[0];
|
||||
|
||||
if (flags_read_after_inst & (nearer_flags &
|
||||
~logic_inst->flags_written(s.devinfo)))
|
||||
return false;
|
||||
|
||||
/* It is safe to eliminate the logic operation. Perform the following
|
||||
* steps:
|
||||
*
|
||||
* 1. If farther doesn't already write flags, set a conditional modifier on
|
||||
* it, and set its flag_subreg.
|
||||
*
|
||||
* 2. If nearer doesn't already write flags, set a conditional modifier on
|
||||
* it, and set its flag_subreg.
|
||||
*
|
||||
* 3. Make nearer's destination be the null register.
|
||||
*
|
||||
* 4. Make nearer be predicated.
|
||||
*
|
||||
* 5. Remove the logic operation.
|
||||
*/
|
||||
if (farther.inst->conditional_mod == BRW_CONDITIONAL_NONE) {
|
||||
farther.inst->conditional_mod = required_cmod(farther.inst->opcode);
|
||||
farther.inst->flag_subreg = logic_inst->flag_subreg;
|
||||
|
||||
assert(farther_flags == farther.inst->flags_written(s.devinfo));
|
||||
}
|
||||
|
||||
if (nearer.inst->conditional_mod == BRW_CONDITIONAL_NONE) {
|
||||
nearer.inst->conditional_mod = required_cmod(nearer.inst->opcode);
|
||||
nearer.inst->flag_subreg = logic_inst->flag_subreg;
|
||||
|
||||
assert(nearer_flags == nearer.inst->flags_written(s.devinfo));
|
||||
}
|
||||
|
||||
nearer.inst->dst = retype(brw_null_reg(), nearer.inst->dst.type);
|
||||
|
||||
set_predicate_inv(BRW_PREDICATE_NORMAL,
|
||||
logic_inst->opcode == BRW_OPCODE_OR,
|
||||
nearer.inst);
|
||||
|
||||
assert((nearer.inst->flags_read(s.devinfo) &
|
||||
~farther.inst->flags_written(s.devinfo)) == 0);
|
||||
|
||||
logic_inst->remove();
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
try_predicated_mov(brw_shader &s, const brw_live_variables &live_vars,
|
||||
const brw_def_analysis &defs,
|
||||
brw_inst *logic_inst, logic_source &nearer,
|
||||
logic_source &farther, unsigned nearer_flags)
|
||||
{
|
||||
/* Cases like
|
||||
*
|
||||
* cmp.g.f0.0(8) v946:F, |v945|:F, 0f
|
||||
* and.nz.f0.0(8) null:UD, -v869:UD, v946:UD
|
||||
*
|
||||
* can be handled by replacing the AND instruction with a predicated NOT
|
||||
* instead of a predicated MOV.
|
||||
*
|
||||
* NOTE: ~x != 0 is not the same as x == 0 when x is not known to be a
|
||||
* Boolean value. Since farther may not be a CMP/CMPN, this is important.
|
||||
*
|
||||
* However, cases where the other source is negated would require more
|
||||
* complicated surgery. De Morgan's Law would have to be applied, and
|
||||
* all uses of the new predicate would have to be inverted. The
|
||||
* information is available to make that possible (e.g., the flags
|
||||
* liveness), but it's a lot more work.
|
||||
*/
|
||||
const enum opcode op = logic_inst->src[farther.src].negate ?
|
||||
BRW_OPCODE_NOT : BRW_OPCODE_MOV;
|
||||
|
||||
if (nearer.inst->conditional_mod == BRW_CONDITIONAL_NONE &&
|
||||
(flags_read_between(nearer.inst, logic_inst, s.devinfo) &
|
||||
nearer_flags) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* It is safe to eliminate the logic operation. Perform the following
|
||||
* steps:
|
||||
*
|
||||
* 1. If nearer doesn't already write flags, set a conditional modifier on
|
||||
* it, and set its flag_subreg.
|
||||
*
|
||||
* 2. Convert the logic operation to either a MOV or a NOT of the value
|
||||
* taken from farther.
|
||||
*/
|
||||
if (nearer.inst->conditional_mod == BRW_CONDITIONAL_NONE) {
|
||||
nearer.inst->conditional_mod = required_cmod(nearer.inst->opcode);
|
||||
nearer.inst->flag_subreg = logic_inst->flag_subreg;
|
||||
|
||||
assert(nearer_flags == nearer.inst->flags_written(s.devinfo));
|
||||
}
|
||||
|
||||
set_predicate_inv(BRW_PREDICATE_NORMAL,
|
||||
logic_inst->opcode == BRW_OPCODE_OR,
|
||||
logic_inst);
|
||||
logic_inst->src[0] = logic_inst->src[farther.src];
|
||||
logic_inst->src[0].negate = false;
|
||||
|
||||
brw_transform_inst(s, logic_inst, op);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
try_predicate_and(brw_shader &s, brw_inst *inst,
|
||||
const brw_live_variables &live_vars,
|
||||
const brw_def_analysis &defs)
|
||||
{
|
||||
if (inst->conditional_mod != BRW_CONDITIONAL_NZ)
|
||||
return false;
|
||||
|
||||
if (regions_overlap(inst->src[0], inst->size_read(s.devinfo, 0),
|
||||
inst->src[1], inst->size_read(s.devinfo, 1))) {
|
||||
return false;
|
||||
}
|
||||
/* These names are annoying. Some compilers secretly have "near" and "far"
|
||||
* as reserved words, so those can't be used.
|
||||
*/
|
||||
logic_source nearer;
|
||||
logic_source farther;
|
||||
|
||||
find_logic_sources(inst, defs, s.devinfo, &nearer, &farther);
|
||||
|
||||
/* The closer instruction must be in the same block. */
|
||||
if (nearer.inst == NULL || nearer.distance == UINT_MAX)
|
||||
return false;
|
||||
|
||||
/* If the logical operation is AND, one of the comparisons must be provably
|
||||
* a Boolean value (i.e., 0 or ~0). This is the only way to be sure A&B !=
|
||||
* 0 is equivalent to (A != 0) && (B != 0).
|
||||
*/
|
||||
if (inst->opcode == BRW_OPCODE_AND &&
|
||||
!is_Boolean(nearer.inst, defs) && !is_Boolean(farther.inst, defs))
|
||||
return false;
|
||||
|
||||
/* If nearer doesn't write any flags yet, determine what flags it would
|
||||
* write.
|
||||
*/
|
||||
unsigned nearer_flags = nearer.inst->flags_written(s.devinfo);
|
||||
if (nearer_flags == 0) {
|
||||
nearer_flags = brw_flags_written(nearer.inst->opcode,
|
||||
required_cmod(nearer.inst->opcode),
|
||||
inst->flag_subreg,
|
||||
nearer.inst->group,
|
||||
nearer.inst->exec_size);
|
||||
}
|
||||
|
||||
unsigned flags_written = inst->flags_written(s.devinfo);
|
||||
if ((nearer_flags & flags_written) != flags_written)
|
||||
return false;
|
||||
|
||||
/* The flags written by nearer must reach the logic operation. */
|
||||
if ((flags_written_between(nearer.inst, inst, s.devinfo) &
|
||||
nearer_flags) != 0)
|
||||
return false;
|
||||
|
||||
if (!inst->src[0].negate && !inst->src[1].negate &&
|
||||
try_predicated_cmp(s, live_vars, defs, inst, nearer, farther,
|
||||
nearer_flags)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!inst->src[nearer.src].negate &&
|
||||
try_predicated_mov(s, live_vars, defs, inst, nearer, farther,
|
||||
nearer_flags)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
opt_predicate_logic_local(brw_shader &s, bblock_t *block,
|
||||
const brw_live_variables &live_vars,
|
||||
const brw_def_analysis &defs)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_inst_in_block_reverse_safe(brw_inst, inst, block) {
|
||||
switch (inst->opcode) {
|
||||
case BRW_OPCODE_AND:
|
||||
case BRW_OPCODE_OR:
|
||||
if (inst->predicate == BRW_PREDICATE_NONE &&
|
||||
inst->dst.is_null() &&
|
||||
brw_type_size_bytes(inst->src[0].type) == 4 &&
|
||||
brw_type_size_bytes(inst->src[1].type) == 4 &&
|
||||
!inst->src[0].abs && !inst->src[1].abs) {
|
||||
if (try_predicate_and(s, inst, live_vars, defs))
|
||||
progress = true;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
brw_opt_predicate_logic(brw_shader &s)
|
||||
{
|
||||
bool progress = false;
|
||||
const brw_live_variables &live_vars = s.live_analysis.require();
|
||||
const brw_def_analysis &defs = s.def_analysis.require();
|
||||
|
||||
foreach_block (block, s.cfg) {
|
||||
if (opt_predicate_logic_local(s, block, live_vars, defs))
|
||||
progress = true;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
|
@ -367,6 +367,7 @@ bool brw_opt_cse_defs(brw_shader &s);
|
|||
bool brw_opt_dead_code_eliminate(brw_shader &s);
|
||||
bool brw_opt_eliminate_find_live_channel(brw_shader &s);
|
||||
bool brw_opt_fill_and_spill(brw_shader &s);
|
||||
bool brw_opt_predicate_logic(brw_shader &s);
|
||||
bool brw_opt_register_coalesce(brw_shader &s);
|
||||
bool brw_opt_remove_extra_rounding_modes(brw_shader &s);
|
||||
bool brw_opt_remove_redundant_halts(brw_shader &s);
|
||||
|
|
|
|||
|
|
@ -78,6 +78,7 @@ libintel_compiler_brw_files = files(
|
|||
'brw_opt_cse.cpp',
|
||||
'brw_opt_dead_code_eliminate.cpp',
|
||||
'brw_opt_fill_spill.cpp',
|
||||
'brw_opt_predicate_logic.cpp',
|
||||
'brw_opt_register_coalesce.cpp',
|
||||
'brw_opt_saturate_propagation.cpp',
|
||||
'brw_opt_txf_combiner.cpp',
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue