mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-04 20:38:06 +02:00
intel/brw: Move optimize and small optimizations to brw_fs_opt.cpp
Remaining optimizations in brw_fs.cpp will get their own files. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Acked-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26887>
This commit is contained in:
parent
7451c0f5d6
commit
f3b7f4726a
3 changed files with 505 additions and 491 deletions
|
|
@ -2938,296 +2938,6 @@ brw_fs_opt_algebraic(fs_visitor &s)
|
|||
return progress;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
|
||||
{
|
||||
assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
|
||||
assert(size_read >= lp->header_size * REG_SIZE);
|
||||
|
||||
unsigned i;
|
||||
unsigned size = lp->header_size * REG_SIZE;
|
||||
for (i = lp->header_size; size < size_read && i < lp->sources; i++)
|
||||
size += lp->exec_size * type_sz(lp->src[i].type);
|
||||
|
||||
/* Size read must cover exactly a subset of sources. */
|
||||
assert(size == size_read);
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these
 * instructions instead of reserving a register for it. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 */
bool
brw_fs_opt_zero_samples(fs_visitor &s)
{
   /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
   assert(s.devinfo->ver >= 7);

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      /* The payload must be built by the instruction immediately preceding
       * the SEND; otherwise leave it alone.
       */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload are actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       * "Parameter 0 is required except for the sampleinfo message, which
       * has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      /* Walk the trailing sources backwards, accumulating the byte size of
       * runs that are constant zero (or unset), and stop at the first
       * source that is actually live.
       */
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
      }

      /* Integer division rounds down: only whole registers can be trimmed
       * from the message length.
       */
      const unsigned zero_len = zero_size / (reg_unit(s.devinfo) * REG_SIZE);
      if (zero_len > 0) {
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
|
||||
|
||||
/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try and split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains a x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparitor,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
bool
brw_fs_opt_split_sends(fs_visitor &s)
{
   if (s.devinfo->ver < 9)
      return false;

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      /* Skip non-SENDs, single-register payloads, and already-split SENDs. */
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0)
         continue;

      assert(send->src[2].file == VGRF);

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload does it really need.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      /* Build two smaller LOAD_PAYLOADs covering sources [0, mid) and
       * [mid, end), inserted before the original one.
       */
      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      /* Give each half a freshly allocated VGRF destination. */
      lp1->dst = fs_reg(VGRF, s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = fs_reg(VGRF, s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      /* Point the SEND at the two new payloads and move the second half's
       * length from mlen to ex_mlen.
       */
      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||||
|
||||
/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 * halt        (redundant with the next halt)
 * halt        (useless; jumps to the next instruction)
 * halt-target
 */
bool
brw_fs_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   /* Count HALTs up to the HALT_TARGET (there is at most one), and remember
    * where the target lives so instructions can be removed from its block.
    */
   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   /* No target means there can be no HALTs either; nothing to do. */
   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   /* If no HALTs remain, the target itself serves no purpose. */
   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
||||
|
||||
/**
|
||||
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
|
||||
* flow. We could probably do better here with some form of divergence
|
||||
* analysis.
|
||||
*/
|
||||
bool
|
||||
brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
unsigned depth = 0;
|
||||
|
||||
if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
|
||||
s.stage_prog_data)) {
|
||||
/* The optimization below assumes that channel zero is live on thread
|
||||
* dispatch, which may not be the case if the fixed function dispatches
|
||||
* threads sparsely.
|
||||
*/
|
||||
return false;
|
||||
}
|
||||
|
||||
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
||||
switch (inst->opcode) {
|
||||
case BRW_OPCODE_IF:
|
||||
case BRW_OPCODE_DO:
|
||||
depth++;
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_ENDIF:
|
||||
case BRW_OPCODE_WHILE:
|
||||
depth--;
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_HALT:
|
||||
/* This can potentially make control flow non-uniform until the end
|
||||
* of the program.
|
||||
*/
|
||||
goto out;
|
||||
|
||||
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
||||
if (depth == 0) {
|
||||
inst->opcode = BRW_OPCODE_MOV;
|
||||
inst->src[0] = brw_imm_ud(0u);
|
||||
inst->sources = 1;
|
||||
inst->force_writemask_all = true;
|
||||
progress = true;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (progress)
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
/**
|
||||
* Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
|
||||
* instructions to FS_OPCODE_REP_FB_WRITE.
|
||||
|
|
@ -3301,55 +3011,6 @@ fs_visitor::emit_repclear_shader()
|
|||
brw_fs_lower_scoreboard(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Rounding modes for conversion instructions are included for each
|
||||
* conversion, but right now it is a state. So once it is set,
|
||||
* we don't need to call it again for subsequent calls.
|
||||
*
|
||||
* This is useful for vector/matrices conversions, as setting the
|
||||
* mode once is enough for the full vector/matrix
|
||||
*/
|
||||
bool
|
||||
brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
unsigned execution_mode = s.nir->info.float_controls_execution_mode;
|
||||
|
||||
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
|
||||
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
|
||||
execution_mode)
|
||||
base_mode = BRW_RND_MODE_RTNE;
|
||||
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
|
||||
execution_mode)
|
||||
base_mode = BRW_RND_MODE_RTZ;
|
||||
|
||||
foreach_block (block, s.cfg) {
|
||||
brw_rnd_mode prev_mode = base_mode;
|
||||
|
||||
foreach_inst_in_block_safe (fs_inst, inst, block) {
|
||||
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
|
||||
assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
|
||||
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
|
||||
if (mode == prev_mode) {
|
||||
inst->remove(block);
|
||||
progress = true;
|
||||
} else {
|
||||
prev_mode = mode;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (progress)
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
brw_fs_lower_load_payload(fs_visitor &s)
|
||||
{
|
||||
|
|
@ -5633,158 +5294,6 @@ fs_visitor::debug_optimizer(const nir_shader *nir,
|
|||
free(filename);
|
||||
}
|
||||
|
||||
/**
 * Top-level FS backend optimization/lowering pipeline.
 *
 * Runs the optimization passes to a fixed point, then interleaves the
 * lowering passes (SIMD width, logical sends, payloads, regioning, ...)
 * with targeted re-runs of the optimizers.  Pass order is significant.
 */
void
brw_fs_optimize(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   s.validate();

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

   /* Run a pass, dump the IR when it made progress, validate, and fold the
    * result into the loop-level progress flag.  Evaluates to the pass's own
    * progress so it can be used in conditions.
    */
#define OPT(pass, ...) ({                                            \
      pass_num++;                                                    \
      bool this_progress = pass(s, ##__VA_ARGS__);                   \
                                                                     \
      if (this_progress)                                             \
         s.debug_optimizer(nir, #pass, iteration, pass_num);         \
                                                                     \
      s.validate();                                                  \
                                                                     \
      progress = progress || this_progress;                          \
      this_progress;                                                 \
   })

   s.assign_constant_locations();
   OPT(brw_fs_lower_constant_loads);

   s.validate();

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_fs_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice.  Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_fs_opt_dead_code_eliminate);

   OPT(brw_fs_opt_remove_extra_rounding_modes);

   /* Main optimization loop: iterate until no pass makes progress. */
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_copy_propagation);
      OPT(opt_predicated_break);
      OPT(brw_fs_opt_cmod_propagation);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
      OPT(dead_control_flow_eliminate);
      OPT(brw_fs_opt_saturate_propagation);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_eliminate_find_live_channel);

      OPT(brw_fs_opt_compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(brw_fs_lower_pack)) {
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_simd_width);
   OPT(brw_fs_lower_barycentrics);
   OPT(brw_fs_lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   /* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_fs_opt_zero_samples) && OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   OPT(brw_fs_opt_split_sends);
   OPT(brw_fs_workaround_nomask_control_flow);

   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
   }

   OPT(brw_fs_opt_remove_redundant_halts);

   if (OPT(brw_fs_lower_load_payload)) {
      OPT(brw_fs_opt_split_virtual_grfs);

      /* Lower 64 bit MOVs generated by payload lowering. */
      if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
         OPT(brw_fs_opt_algebraic);

      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_lower_simd_width);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_opt_combine_constants);
   if (OPT(brw_fs_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_fs_lower_integer_multiplication);
   }
   OPT(brw_fs_lower_sub_sat);

   /* Reset progress so the cleanup below only runs if these two lowering
    * passes actually changed anything.
    */
   progress = false;
   OPT(brw_fs_lower_derivatives);
   OPT(brw_fs_lower_regioning);
   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_lower_simd_width);
   }

   OPT(brw_fs_lower_sends_overlapping_payload);

   OPT(brw_fs_lower_uniform_pull_constant_loads);

   OPT(brw_fs_lower_find_live_channel);

   s.validate();
}
|
||||
|
||||
/**
|
||||
* From the Skylake PRM Vol. 2a docs for sends:
|
||||
*
|
||||
|
|
|
|||
504
src/intel/compiler/brw_fs_opt.cpp
Normal file
504
src/intel/compiler/brw_fs_opt.cpp
Normal file
|
|
@ -0,0 +1,504 @@
|
|||
/*
|
||||
* Copyright © 2010 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "brw_dead_control_flow.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_builder.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/**
 * Top-level FS backend optimization/lowering pipeline.
 *
 * Runs the optimization passes to a fixed point, then interleaves the
 * lowering passes (SIMD width, logical sends, payloads, regioning, ...)
 * with targeted re-runs of the optimizers.  Pass order is significant.
 */
void
brw_fs_optimize(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   s.validate();

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

   /* Run a pass, dump the IR when it made progress, validate, and fold the
    * result into the loop-level progress flag.  Evaluates to the pass's own
    * progress so it can be used in conditions.
    */
#define OPT(pass, ...) ({                                            \
      pass_num++;                                                    \
      bool this_progress = pass(s, ##__VA_ARGS__);                   \
                                                                     \
      if (this_progress)                                             \
         s.debug_optimizer(nir, #pass, iteration, pass_num);         \
                                                                     \
      s.validate();                                                  \
                                                                     \
      progress = progress || this_progress;                          \
      this_progress;                                                 \
   })

   s.assign_constant_locations();
   OPT(brw_fs_lower_constant_loads);

   s.validate();

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_fs_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice.  Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_fs_opt_dead_code_eliminate);

   OPT(brw_fs_opt_remove_extra_rounding_modes);

   /* Main optimization loop: iterate until no pass makes progress. */
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_copy_propagation);
      OPT(opt_predicated_break);
      OPT(brw_fs_opt_cmod_propagation);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
      OPT(dead_control_flow_eliminate);
      OPT(brw_fs_opt_saturate_propagation);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_eliminate_find_live_channel);

      OPT(brw_fs_opt_compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(brw_fs_lower_pack)) {
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_simd_width);
   OPT(brw_fs_lower_barycentrics);
   OPT(brw_fs_lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   /* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_fs_opt_zero_samples) && OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   OPT(brw_fs_opt_split_sends);
   OPT(brw_fs_workaround_nomask_control_flow);

   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
   }

   OPT(brw_fs_opt_remove_redundant_halts);

   if (OPT(brw_fs_lower_load_payload)) {
      OPT(brw_fs_opt_split_virtual_grfs);

      /* Lower 64 bit MOVs generated by payload lowering. */
      if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
         OPT(brw_fs_opt_algebraic);

      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_lower_simd_width);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_opt_combine_constants);
   if (OPT(brw_fs_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_fs_lower_integer_multiplication);
   }
   OPT(brw_fs_lower_sub_sat);

   /* Reset progress so the cleanup below only runs if these two lowering
    * passes actually changed anything.
    */
   progress = false;
   OPT(brw_fs_lower_derivatives);
   OPT(brw_fs_lower_regioning);
   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_lower_simd_width);
   }

   OPT(brw_fs_lower_sends_overlapping_payload);

   OPT(brw_fs_lower_uniform_pull_constant_loads);

   OPT(brw_fs_lower_find_live_channel);

   s.validate();
}
|
||||
|
||||
static unsigned
|
||||
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
|
||||
{
|
||||
assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
|
||||
assert(size_read >= lp->header_size * REG_SIZE);
|
||||
|
||||
unsigned i;
|
||||
unsigned size = lp->header_size * REG_SIZE;
|
||||
for (i = lp->header_size; size < size_read && i < lp->sources; i++)
|
||||
size += lp->exec_size * type_sz(lp->src[i].type);
|
||||
|
||||
/* Size read must cover exactly a subset of sources. */
|
||||
assert(size == size_read);
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these
 * instructions instead of reserving a register for it. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 */
bool
brw_fs_opt_zero_samples(fs_visitor &s)
{
   /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
   assert(s.devinfo->ver >= 7);

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      /* The payload must be built by the instruction immediately preceding
       * the SEND; otherwise leave it alone.
       */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload are actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       * "Parameter 0 is required except for the sampleinfo message, which
       * has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      /* Walk the trailing sources backwards, accumulating the byte size of
       * runs that are constant zero (or unset), and stop at the first
       * source that is actually live.
       */
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
      }

      /* Integer division rounds down: only whole registers can be trimmed
       * from the message length.
       */
      const unsigned zero_len = zero_size / (reg_unit(s.devinfo) * REG_SIZE);
      if (zero_len > 0) {
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
|
||||
|
||||
/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try and split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains a x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparitor,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
bool
brw_fs_opt_split_sends(fs_visitor &s)
{
   if (s.devinfo->ver < 9)
      return false;

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      /* Skip non-SENDs, single-register payloads, and already-split SENDs. */
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0)
         continue;

      assert(send->src[2].file == VGRF);

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload does it really need.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      /* Build two smaller LOAD_PAYLOADs covering sources [0, mid) and
       * [mid, end), inserted before the original one.
       */
      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      /* Give each half a freshly allocated VGRF destination. */
      lp1->dst = fs_reg(VGRF, s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = fs_reg(VGRF, s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      /* Point the SEND at the two new payloads and move the second half's
       * length from mlen to ex_mlen.
       */
      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||||
|
||||
/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 * halt        (redundant with the next halt)
 * halt        (useless; jumps to the next instruction)
 * halt-target
 */
bool
brw_fs_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   /* Count HALTs up to the HALT_TARGET (there is at most one), and remember
    * where the target lives so instructions can be removed from its block.
    */
   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   /* No target means there can be no HALTs either; nothing to do. */
   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   /* If no HALTs remain, the target itself serves no purpose. */
   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
||||
|
||||
/**
|
||||
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
|
||||
* flow. We could probably do better here with some form of divergence
|
||||
* analysis.
|
||||
*/
|
||||
bool
|
||||
brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
unsigned depth = 0;
|
||||
|
||||
if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
|
||||
s.stage_prog_data)) {
|
||||
/* The optimization below assumes that channel zero is live on thread
|
||||
* dispatch, which may not be the case if the fixed function dispatches
|
||||
* threads sparsely.
|
||||
*/
|
||||
return false;
|
||||
}
|
||||
|
||||
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
||||
switch (inst->opcode) {
|
||||
case BRW_OPCODE_IF:
|
||||
case BRW_OPCODE_DO:
|
||||
depth++;
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_ENDIF:
|
||||
case BRW_OPCODE_WHILE:
|
||||
depth--;
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_HALT:
|
||||
/* This can potentially make control flow non-uniform until the end
|
||||
* of the program.
|
||||
*/
|
||||
goto out;
|
||||
|
||||
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
||||
if (depth == 0) {
|
||||
inst->opcode = BRW_OPCODE_MOV;
|
||||
inst->src[0] = brw_imm_ud(0u);
|
||||
inst->sources = 1;
|
||||
inst->force_writemask_all = true;
|
||||
progress = true;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (progress)
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rounding modes for conversion instructions are included for each
|
||||
* conversion, but right now it is a state. So once it is set,
|
||||
* we don't need to call it again for subsequent calls.
|
||||
*
|
||||
* This is useful for vector/matrices conversions, as setting the
|
||||
* mode once is enough for the full vector/matrix
|
||||
*/
|
||||
bool
|
||||
brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
unsigned execution_mode = s.nir->info.float_controls_execution_mode;
|
||||
|
||||
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
|
||||
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
|
||||
execution_mode)
|
||||
base_mode = BRW_RND_MODE_RTNE;
|
||||
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
|
||||
execution_mode)
|
||||
base_mode = BRW_RND_MODE_RTZ;
|
||||
|
||||
foreach_block (block, s.cfg) {
|
||||
brw_rnd_mode prev_mode = base_mode;
|
||||
|
||||
foreach_inst_in_block_safe (fs_inst, inst, block) {
|
||||
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
|
||||
assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
|
||||
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
|
||||
if (mode == prev_mode) {
|
||||
inst->remove(block);
|
||||
progress = true;
|
||||
} else {
|
||||
prev_mode = mode;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (progress)
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
|
|
@ -79,6 +79,7 @@ libintel_compiler_brw_files = files(
|
|||
'brw_fs_lower_pack.cpp',
|
||||
'brw_fs_lower_regioning.cpp',
|
||||
'brw_fs_nir.cpp',
|
||||
'brw_fs_opt.cpp',
|
||||
'brw_fs_reg_allocate.cpp',
|
||||
'brw_fs_register_coalesce.cpp',
|
||||
'brw_fs_saturate_propagation.cpp',
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue