mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-06 09:28:07 +02:00
brw: Change the flags written by some CMP
One frustrating thing about the CMP and CMPN instructions is that they
always write the flags. Sometimes, however, it is desirable to generate
the comparison result without modifying the flags. This would,
theoretically, reduce false dependencies that restrict the scheduler's
ability to rearrange code, create more opportunities for cmod
propagation, save a kitten from a tree, and make a rainbow.
Consider this sequence:
cmp.ge.f0.0(8) g103<1>F g101<8,8,1>F g39<8,8,1>F
cmp.nz.f0.0(8) null<1>D g81<8,8,1>D 0D
(+f0.0) if(8) JIP: LABEL19 UIP: LABEL19
It would be advantageous to put the first CMP between the second CMP and
the IF, but this cannot be done since the IF depends on the flags generated
by the second CMP.
This pass enables this rescheduling by changing the first CMP to write
to a different flags register.
cmp.ge.f1.0(8) g103<1>F g101<8,8,1>F g39<8,8,1>F
cmp.nz.f0.0(8) null<1>D g81<8,8,1>D 0D
(+f0.0) if(8) JIP: LABEL19 UIP: LABEL19
Sometimes this is also possible by using a different instruction. For
example, consider
cmp.l.f0.0(8) g103<1>D g101<8,8,1>D 0D
This produces 0xffffffff when g101 negative and zero otherwise. This
instruction, which does not modifiy the flag, also produces these results:
asr(8) g103<1>D g101<8,8,1>D 31D
Gfx9 platforms take a hit on instructions due to the instruction added
at the end of short shaders by brw_workaround_source_arf_before_eot.
shader-db:
Lunar Lake, Meteor Lake, DG2, Tiger Lake, and Ice Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 17089451 -> 17088766 (<.01%)
instructions in affected programs: 766613 -> 765928 (-0.09%)
helped: 653 / HURT: 0
total cycles in shared programs: 888832986 -> 887873068 (-0.11%)
cycles in affected programs: 549441852 -> 548481934 (-0.17%)
helped: 10474 / HURT: 130
LOST: 9
GAINED: 0
Skylake
total instructions in shared programs: 19037976 -> 19049719 (0.06%)
instructions in affected programs: 3979914 -> 3991657 (0.30%)
helped: 503 / HURT: 12303
total cycles in shared programs: 867918242 -> 866930801 (-0.11%)
cycles in affected programs: 512773919 -> 511786478 (-0.19%)
helped: 13858 / HURT: 66
LOST: 32
GAINED: 0
fossil-db:
Lunar Lake
Totals:
Instrs: 925023504 -> 924950382 (-0.01%); split: -0.01%, +0.00%
Cycle count: 106348432916 -> 106116809009 (-0.22%); split: -0.22%, +0.00%
Spill count: 3423988 -> 3423930 (-0.00%); split: -0.00%, +0.00%
Fill count: 4877087 -> 4876960 (-0.00%); split: -0.01%, +0.00%
Max dispatch width: 49087552 -> 49078448 (-0.02%); split: +0.00%, -0.02%
Totals from 1099332 (54.44% of 2019443) affected shaders:
Instrs: 742670473 -> 742597351 (-0.01%); split: -0.01%, +0.00%
Cycle count: 100455549635 -> 100223925728 (-0.23%); split: -0.23%, +0.00%
Spill count: 3384366 -> 3384308 (-0.00%); split: -0.00%, +0.00%
Fill count: 4837434 -> 4837307 (-0.00%); split: -0.01%, +0.00%
Max dispatch width: 26725152 -> 26716048 (-0.03%); split: +0.00%, -0.03%
Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 997603774 -> 997529238 (-0.01%); split: -0.01%, +0.00%
Cycle count: 93904012762 -> 93646730006 (-0.27%); split: -0.28%, +0.00%
Spill count: 3710155 -> 3710125 (-0.00%); split: -0.00%, +0.00%
Fill count: 5032908 -> 5032819 (-0.00%); split: -0.01%, +0.00%
Max dispatch width: 37929640 -> 37811560 (-0.31%)
Totals from 1334920 (58.52% of 2281134) affected shaders:
Instrs: 817377787 -> 817303251 (-0.01%); split: -0.01%, +0.00%
Cycle count: 88468851658 -> 88211568902 (-0.29%); split: -0.29%, +0.00%
Spill count: 3663353 -> 3663323 (-0.00%); split: -0.00%, +0.00%
Fill count: 4991629 -> 4991540 (-0.00%); split: -0.01%, +0.00%
Max dispatch width: 20245832 -> 20127752 (-0.58%)
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 1013433769 -> 1013363273 (-0.01%); split: -0.01%, +0.00%
Cycle count: 85766921182 -> 85509316620 (-0.30%); split: -0.31%, +0.00%
Spill count: 3903923 -> 3903944 (+0.00%); split: -0.00%, +0.00%
Fill count: 6801983 -> 6801948 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 37896320 -> 37805320 (-0.24%); split: +0.00%, -0.24%
Totals from 1333814 (58.54% of 2278396) affected shaders:
Instrs: 830200531 -> 830130035 (-0.01%); split: -0.01%, +0.00%
Cycle count: 80746184101 -> 80488579539 (-0.32%); split: -0.32%, +0.01%
Spill count: 3855771 -> 3855792 (+0.00%); split: -0.00%, +0.00%
Fill count: 6755513 -> 6755478 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 20301456 -> 20210456 (-0.45%); split: +0.00%, -0.45%
Skylake
Totals:
Instrs: 519389758 -> 519874108 (+0.09%); split: -0.00%, +0.10%
Cycle count: 57932316132 -> 57789433956 (-0.25%); split: -0.25%, +0.00%
Spill count: 636741 -> 636715 (-0.00%); split: -0.01%, +0.00%
Fill count: 860470 -> 860357 (-0.01%); split: -0.02%, +0.00%
Max dispatch width: 32527800 -> 32481792 (-0.14%); split: +0.00%, -0.14%
Totals from 1080380 (62.25% of 1735462) affected shaders:
Instrs: 411976399 -> 412460749 (+0.12%); split: -0.00%, +0.12%
Cycle count: 54291447615 -> 54148565439 (-0.26%); split: -0.27%, +0.00%
Spill count: 602993 -> 602967 (-0.00%); split: -0.01%, +0.00%
Fill count: 734459 -> 734346 (-0.02%); split: -0.02%, +0.00%
Max dispatch width: 18626096 -> 18580088 (-0.25%); split: +0.00%, -0.25%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38978>
This commit is contained in:
parent
31de96d321
commit
b5e023777c
4 changed files with 191 additions and 0 deletions
169
src/intel/compiler/brw/brw_opt_cmp_flag_destination.cpp
Normal file
169
src/intel/compiler/brw/brw_opt_cmp_flag_destination.cpp
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
/*
|
||||
* Copyright © 2022 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* Change the flags written by some CMP and CMPN instructions.
|
||||
*
|
||||
* One frustrating thing about the CMP and CMPN instructions is that they
|
||||
* always write the flags. Sometimes, however, it is desirable to generate the
|
||||
* comparison result without modifying the flags. This would, theoretically,
|
||||
* reduce false dependencies that restrict the scheduler's ability to
|
||||
* rearrange code, create more opportunities for cmod propagation, save a
|
||||
* kitten from a tree, and make a rainbow.
|
||||
*
|
||||
* Consider this sequence:
|
||||
*
|
||||
* cmp.ge.f0.0(8) g103<1>F g101<8,8,1>F g39<8,8,1>F
|
||||
* cmp.nz.f0.0(8) null<1>D g81<8,8,1>D 0D
|
||||
* (+f0.0) if(8) JIP: LABEL19 UIP: LABEL19
|
||||
*
|
||||
* It would be advantageous to put the first CMP between the second CMP and
|
||||
* the IF, but this cannot be done since the IF depends on the flags generated
|
||||
* by the second CMP.
|
||||
*
|
||||
* This pass enables this rescheduling by changing the first CMP to write to a
|
||||
* different flags register.
|
||||
*
|
||||
* cmp.ge.f1.0(8) g103<1>F g101<8,8,1>F g39<8,8,1>F
|
||||
* cmp.nz.f0.0(8) null<1>D g81<8,8,1>D 0D
|
||||
* (+f0.0) if(8) JIP: LABEL19 UIP: LABEL19
|
||||
*
|
||||
* Sometimes this is also possible by using a different instruction. For
|
||||
* example, consider
|
||||
*
|
||||
* cmp.l.f0.0(8) g103<1>D g101<8,8,1>D 0D
|
||||
*
|
||||
* This produces 0xffffffff when g101 negative and zero otherwise. This
|
||||
* instruction, which does not modifiy the flag, also produces these results:
|
||||
*
|
||||
* asr(8) g103<1>D g101<8,8,1>D 31D
|
||||
*/
|
||||
#include "brw_shader.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
static bool
|
||||
opt_cmp_flag_destination_local(brw_shader &s, bblock_t *block,
|
||||
bool uses_kill)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_inst_in_block(brw_inst, inst, block) {
|
||||
if (inst->opcode != BRW_OPCODE_CMP && inst->opcode != BRW_OPCODE_CMPN)
|
||||
continue;
|
||||
|
||||
if (inst->dst.is_null() || inst->predicate != BRW_PREDICATE_NONE)
|
||||
continue;
|
||||
|
||||
if ((s.dispatch_width <= 16 && inst->flag_subreg == 1) ||
|
||||
inst->flag_subreg > 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned flags_written = inst->flags_written(s.devinfo);
|
||||
|
||||
foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) {
|
||||
if ((scan_inst->flags_read(s.devinfo) & flags_written) != 0)
|
||||
break;
|
||||
|
||||
/* If scan_inst might not modify the flags, keep scanning. */
|
||||
if (scan_inst->predicate != BRW_PREDICATE_NONE)
|
||||
continue;
|
||||
|
||||
flags_written &= ~scan_inst->flags_written(s.devinfo);
|
||||
if (flags_written == 0) {
|
||||
/* A common special case:
|
||||
*
|
||||
* cmp.l.f0.0(8) g103<1>D g101<8,8,1>D 0D
|
||||
*
|
||||
* becomes
|
||||
*
|
||||
* asr(8) g103<1>D g101<8,8,1>D 31D
|
||||
*/
|
||||
if (inst->opcode == BRW_OPCODE_CMP &&
|
||||
brw_type_is_sint(inst->src[0].type) &&
|
||||
inst->src[1].is_zero() &&
|
||||
inst->conditional_mod == BRW_CONDITIONAL_L) {
|
||||
inst->opcode = BRW_OPCODE_ASR;
|
||||
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
||||
|
||||
inst->src[1] =
|
||||
brw_imm_w(brw_type_size_bits(inst->src[0].type) - 1);
|
||||
|
||||
progress = true;
|
||||
}
|
||||
|
||||
/* A common special case:
|
||||
*
|
||||
* cmp.nz.f0.0(8) g103<1>D g101<8,8,1>D 0D
|
||||
*
|
||||
* or
|
||||
*
|
||||
* cmp.nz.f0.0(8) g103<1>UD g101<8,8,1>UD 0UD
|
||||
*
|
||||
* becomes
|
||||
*
|
||||
* asr(8) g103<1>D -(abs)g101<8,8,1>D 31D
|
||||
*/
|
||||
if (inst->opcode == BRW_OPCODE_CMP &&
|
||||
brw_type_is_int(inst->src[0].type) &&
|
||||
inst->src[1].is_zero() &&
|
||||
inst->conditional_mod == BRW_CONDITIONAL_NZ) {
|
||||
inst->opcode = BRW_OPCODE_ASR;
|
||||
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
||||
|
||||
inst->dst.type =
|
||||
brw_type_with_size(BRW_TYPE_D,
|
||||
brw_type_size_bits(inst->dst.type));
|
||||
|
||||
unsigned bits = brw_type_size_bits(inst->src[0].type);
|
||||
|
||||
inst->src[0].type = brw_type_with_size(BRW_TYPE_D, bits);
|
||||
inst->src[0].negate = true;
|
||||
inst->src[0].abs = true;
|
||||
|
||||
inst->src[1] = brw_imm_w(bits - 1);
|
||||
|
||||
progress = true;
|
||||
}
|
||||
|
||||
if (inst->conditional_mod != BRW_CONDITIONAL_NONE) {
|
||||
if (s.dispatch_width <= 16) {
|
||||
inst->flag_subreg += 1;
|
||||
progress = true;
|
||||
} else if (s.devinfo->ver >= 20) {
|
||||
/* Xe2 (and a couple exotic Xe platforms) have 4 32-bit
|
||||
* flags registers. Use f2.x.
|
||||
*/
|
||||
inst->flag_subreg += 4;
|
||||
progress = true;
|
||||
} else if (!uses_kill) {
|
||||
inst->flag_subreg += 2;
|
||||
progress = true;
|
||||
}
|
||||
|
||||
assert(s.stage != MESA_SHADER_FRAGMENT ||
|
||||
inst->flag_subreg != sample_mask_flag_subreg(s));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
brw_opt_cmp_flag_destination(brw_shader &s, bool uses_kill)
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_block (block, s.cfg) {
|
||||
if (opt_cmp_flag_destination_local(s, block, uses_kill))
|
||||
progress = true;
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
|
@ -845,6 +845,17 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
|
|||
if (OPT(brw_opt_cmod_propagation))
|
||||
OPT(brw_opt_dead_code_eliminate);
|
||||
|
||||
if (OPT(brw_opt_cmp_flag_destination,
|
||||
/* We want something like
|
||||
* brw_wm_prog_data(s.prog_data)->uses_kill, but there are
|
||||
* other things that can cause the sample_mask_flag_subreg to
|
||||
* be used.
|
||||
*/
|
||||
s.stage == MESA_SHADER_FRAGMENT)) {
|
||||
OPT(brw_opt_cmod_propagation);
|
||||
OPT(brw_opt_dead_code_eliminate);
|
||||
}
|
||||
|
||||
allocated = brw_assign_regs(s, allow_spilling, spill_all);
|
||||
}
|
||||
|
||||
|
|
@ -905,6 +916,15 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
|
|||
*/
|
||||
OPT(brw_opt_cmod_propagation);
|
||||
|
||||
if (OPT(brw_opt_cmp_flag_destination,
|
||||
/* We want something like brw_wm_prog_data(s.prog_data)->uses_kill,
|
||||
* but there are other things that can cause the
|
||||
* sample_mask_flag_subreg to be used.
|
||||
*/
|
||||
s.stage == MESA_SHADER_FRAGMENT)) {
|
||||
OPT(brw_opt_cmod_propagation);
|
||||
}
|
||||
|
||||
if (s.devinfo->ver >= 30)
|
||||
OPT(brw_lower_send_gather);
|
||||
|
||||
|
|
|
|||
|
|
@ -356,6 +356,7 @@ bool brw_opt_address_reg_load(brw_shader &s);
|
|||
bool brw_opt_algebraic(brw_shader &s);
|
||||
bool brw_opt_bank_conflicts(brw_shader &s);
|
||||
bool brw_opt_cmod_propagation(brw_shader &s);
|
||||
bool brw_opt_cmp_flag_destination(brw_shader &s, bool uses_kill);
|
||||
bool brw_opt_combine_constants(brw_shader &s);
|
||||
bool brw_opt_combine_convergent_txf(brw_shader &s);
|
||||
bool brw_opt_compact_virtual_grfs(brw_shader &s);
|
||||
|
|
|
|||
|
|
@ -77,6 +77,7 @@ libintel_compiler_brw_files = files(
|
|||
'brw_opt_algebraic.cpp',
|
||||
'brw_opt_bank_conflicts.cpp',
|
||||
'brw_opt_cmod_propagation.cpp',
|
||||
'brw_opt_cmp_flag_destination.cpp',
|
||||
'brw_opt_combine_constants.cpp',
|
||||
'brw_opt_copy_propagation.cpp',
|
||||
'brw_opt_cse.cpp',
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue