/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_shader.h"
#include "brw_builder.h"

#include "dev/intel_debug.h"

void
brw_optimize(brw_shader &s)
{
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   brw_validate(s);

   /* Track how many registers are still non-SSA at this point. */
   {
      const brw_def_analysis &defs = s.def_analysis.require();
      s.shader_stats.non_ssa_registers_after_nir =
         defs.count() - defs.ssa_count();
   }

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

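/* Run an optimization pass: bump the pass counter, dump the IR via
 * debug_optimizer() when the pass reports progress, validate the result,
 * and fold the pass's progress into the overall flag. The macro evaluates
 * to the pass's own progress so it can be used in conditions.
 */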
#define OPT(pass, ...) ({                                     \
      pass_num++;                                             \
      bool this_progress = pass(s, ##__VA_ARGS__);            \
                                                              \
      if (this_progress)                                      \
         s.debug_optimizer(nir, #pass, iteration, pass_num);  \
                                                              \
      brw_validate(s);                                        \
                                                              \
      progress = progress || this_progress;                   \
      this_progress;                                          \
   })

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code. The results of some NIR
    * instructions may effectively be calculated twice. Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered. Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_opt_dead_code_eliminate);

   OPT(brw_opt_remove_extra_rounding_modes);

   OPT(brw_opt_eliminate_find_live_channel);

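   /* Core optimization loop: repeat the pass group until an iteration makes
    * no further progress.
    */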
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_opt_algebraic);
      OPT(brw_opt_cse_defs);
      if (!OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_copy_propagation);
      OPT(brw_opt_cmod_propagation);
      OPT(brw_opt_dead_code_eliminate);
      OPT(brw_opt_saturate_propagation);
      OPT(brw_opt_register_coalesce);

      OPT(brw_opt_compact_virtual_grfs);
   } while (progress);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_OPT_LOOP);

   progress = false;
   pass_num = 0;

   if (OPT(brw_opt_combine_convergent_txf))
      OPT(brw_opt_copy_propagation_defs);

   if (OPT(brw_lower_pack)) {
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_subgroup_ops);
   OPT(brw_lower_csel);
   OPT(brw_lower_simd_width);
   OPT(brw_lower_scalar_fp64_MAD);
   OPT(brw_lower_barycentrics);
   OPT(brw_lower_logical_sends);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);

   /* After logical SEND lowering. */

   if (!OPT(brw_opt_copy_propagation_defs))
      OPT(brw_opt_copy_propagation);

   /* Identify trailing zeros in the LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_opt_zero_samples)) {
      if (!OPT(brw_opt_copy_propagation_defs)) {
         OPT(brw_opt_copy_propagation);
      }
   }

   if (s.devinfo->ver >= 30)
      OPT(brw_opt_send_to_send_gather);

   OPT(brw_opt_split_sends);
   OPT(brw_workaround_nomask_control_flow);

   if (progress) {
      /* Do both forms of copy propagation because it is important to
       * eliminate as many cases of load_payload-of-load_payload as possible.
       */
      OPT(brw_opt_copy_propagation_defs);
      OPT(brw_opt_copy_propagation);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_opt_cse_defs);
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_opt_remove_redundant_halts);

   if (OPT(brw_lower_load_payload)) {
      OPT(brw_opt_split_virtual_grfs);

      OPT(brw_opt_register_coalesce);
      OPT(brw_lower_simd_width);
      OPT(brw_opt_dead_code_eliminate);
   }

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);

   OPT(brw_lower_alu_restrictions);

   OPT(brw_opt_combine_constants);
   if (OPT(brw_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_lower_integer_multiplication);
   }
   OPT(brw_lower_sub_sat);

   progress = false;
   OPT(brw_lower_derivatives);
   OPT(brw_lower_regioning);

   /* Try both copy propagation passes. The defs one will likely not be
    * able to handle everything at this point.
    */
   const bool cp1 = OPT(brw_opt_copy_propagation_defs);
   const bool cp2 = OPT(brw_opt_copy_propagation);
   if (cp1 || cp2)
      OPT(brw_opt_combine_constants);

   OPT(brw_opt_dead_code_eliminate);
   OPT(brw_opt_register_coalesce);

   if (progress)
      OPT(brw_lower_simd_width);

   if (s.devinfo->ver >= 30)
      OPT(brw_opt_send_gather_to_send);

   OPT(brw_lower_uniform_pull_constant_loads);

   if (OPT(brw_lower_send_descriptors)) {
      /* No need for standard copy_propagation since
       * brw_opt_address_reg_load will only optimize defs.
       */
      if (OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_algebraic);
      OPT(brw_opt_address_reg_load);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_sends_overlapping_payload);

   OPT(brw_lower_indirect_mov);

   OPT(brw_lower_find_live_channel);

   OPT(brw_lower_load_subgroup_invocation);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_LATE_LOWERING);
}

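/**
 * Return the number of LOAD_PAYLOAD sources (including the header sources)
 * needed to cover size_read bytes of the payload.
 */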
static unsigned
load_payload_sources_read_for_size(brw_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);

   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);

   /* Size read must cover exactly a subset of sources. */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these
 * instructions instead of reserving a register for it. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 */
bool
brw_opt_zero_samples(brw_shader &s)
{
   bool progress = false;

   foreach_block_and_inst(block, brw_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      brw_inst *lp = (brw_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *    "Parameter 0 is required except for the sampleinfo message, which
       *     has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
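      /* Walk the trailing payload sources backwards, accumulating the size
       * of sources that are unused or known to be zero.
       */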
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
      }

      /* Round down to ensure to only consider full registers. */
      const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
      if (zero_len > 0) {
         /* Note mlen is in REG_SIZE units. */
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated. If we find a SEND message with a single payload,
 * we can split that payload in two. This results in smaller contiguous
 * register blocks for us to allocate. But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere. In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF. So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
bool
brw_opt_split_sends(brw_shader &s)
{
   bool progress = false;

   foreach_block_and_inst(block, brw_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
          send->src[2].file != VGRF)
         continue;

      /* Currently don't split sends that reuse a previously used payload. */
      brw_inst *lp = (brw_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload it really needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

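      /* Re-emit the payload as two smaller LOAD_PAYLOADs into freshly
       * allocated VGRFs and turn the SEND into a split send referencing
       * both halves.
       */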
      const brw_builder ibld(&s, block, lp);
      brw_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      brw_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = retype(brw_allocate_vgrf_units(s, lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = retype(brw_allocate_vgrf_units(s, lp2->size_written / REG_SIZE), lp2->dst.type);

      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
                            BRW_DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 *    halt        (redundant with the next halt)
 *    halt        (useless; jumps to the next instruction)
 *    halt-target
 */
bool
brw_opt_remove_redundant_halts(brw_shader &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   brw_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (brw_inst *prev = (brw_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (brw_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

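   /* If no HALTs remain, the halt target itself serves no purpose and can
    * be removed as well.
    */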
   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow. We could probably do better here with some form of divergence
 * analysis.
 */
bool
brw_opt_eliminate_find_live_channel(brw_shader &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
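            /* Outside of control flow, channel 0 is known to be live, so the
             * instruction reduces to a MOV of immediate 0.
             */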
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->force_writemask_all = true;

            /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
             * size_written set by hand to a smaller value. In this case,
             * munge the exec_size to match.
             */
            if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
               inst->exec_size = 8 * reg_unit(s.devinfo);

            inst->resize_sources(1);
            progress = true;

            /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
             * with a BROADCAST. Save some work for opt_copy_propagation
             * and opt_algebraic by trivially cleaning up both together.
             */
            assert(!inst->next->is_tail_sentinel());
            brw_inst *bcast = (brw_inst *) inst->next;

            /* Ignore stride when comparing */
            if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                inst->dst.file == VGRF &&
                inst->dst.file == bcast->src[1].file &&
                inst->dst.nr == bcast->src[1].nr &&
                inst->dst.offset == bcast->src[1].offset) {
               bcast->opcode = BRW_OPCODE_MOV;
               if (!is_uniform(bcast->src[0]))
                  bcast->src[0] = component(bcast->src[0], 0);

               bcast->force_writemask_all = true;
               bcast->exec_size = 8 * reg_unit(s.devinfo);
               assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
               bcast->resize_sources(1);
            }
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Rounding modes for conversion instructions are specified per instruction,
 * but the hardware rounding mode is a piece of state. Once it has been set,
 * we don't need to set it again for subsequent conversions that use the
 * same mode.
 *
 * This is useful for vector/matrix conversions, as setting the mode once
 * is enough for the full vector/matrix.
 */
bool
brw_opt_remove_extra_rounding_modes(brw_shader &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

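   /* Within each block, track the rounding mode currently in effect; a
    * RND_MODE instruction that sets the mode already in effect is redundant
    * and can be removed.
    */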
   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (brw_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == IMM);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);

   return progress;
}

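/**
 * Convert SENDs with payloads larger than two physical registers into
 * SEND_GATHERs, passing each payload register as an individual source.
 * Source 2 is left unset here and is filled in after register allocation.
 */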
bool
brw_opt_send_to_send_gather(brw_shader &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   unsigned count = 0;

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND)
         continue;

      /* For 1-2 registers, send-gather offers no benefits over split-send. */
      if (inst->mlen + inst->ex_mlen <= 2 * unit)
         continue;

      assert(inst->mlen % unit == 0);
      assert(inst->ex_mlen % unit == 0);

      struct {
         brw_reg src;
         unsigned phys_len;
      } payload[2] = {
         { inst->src[2], inst->mlen / unit },
         { inst->src[3], inst->ex_mlen / unit },
      };

      const unsigned num_payload_sources = payload[0].phys_len + payload[1].phys_len;

      /* Limited by Src0.Length in the SEND instruction. */
      if (num_payload_sources > 15)
         continue;

      if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
         count++;
         continue;
      }

      inst->resize_sources(3 + num_payload_sources);
      /* Sources 0 and 1 remain the same. Source 2 will be filled
       * after register allocation.
       */
      inst->src[2] = {};

      int idx = 3;
      for (unsigned p = 0; p < ARRAY_SIZE(payload); p++) {
         for (unsigned i = 0; i < payload[p].phys_len; i++) {
            inst->src[idx++] = byte_offset(payload[p].src,
                                           i * reg_unit(devinfo) * REG_SIZE);
         }
      }
      assert(idx == inst->sources);

      inst->opcode = SHADER_OPCODE_SEND_GATHER;
      inst->mlen = 0;
      inst->ex_mlen = 0;

      progress = true;
   }

   if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
      fprintf(stderr, "Ignored %u opportunities to try SEND_GATHER in %s shader.\n",
              count, _mesa_shader_stage_to_string(s.stage));
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
                            BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW);

   return progress;
}

/* If, after optimizations, the sources are *still* contiguous in a
 * SEND_GATHER, prefer to use the regular SEND, which saves having to
 * write the ARF scalar register.
 */
bool
brw_opt_send_gather_to_send(brw_shader &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
         continue;

      assert(inst->sources > 2);
      assert(inst->src[2].file == BAD_FILE);

      const int num_payload_sources = inst->sources - 3;
      assert(num_payload_sources > 0);

      /* Limited by Src0.Length in the SEND instruction. */
      assert(num_payload_sources < 16);

      /* Determine whether the sources still form one or two contiguous
       * spans. In those cases the regular SEND instruction can be used,
       * and there's no need to use SEND_GATHER (which would set the ARF
       * scalar register, adding an extra instruction).
       */
      const brw_reg *payload = &inst->src[3];
      brw_reg payload1 = payload[0];
      brw_reg payload2 = {};
      int payload1_len = 0;
      int payload2_len = 0;

      for (int i = 0; i < num_payload_sources; i++) {
         if (payload[i].file == VGRF &&
             payload[i].nr == payload1.nr &&
             payload[i].offset == payload1_len * REG_SIZE * unit)
            payload1_len++;
         else {
            payload2 = payload[i];
            break;
         }
      }

      if (payload2.file == VGRF) {
         for (int i = payload1_len; i < num_payload_sources; i++) {
            if (payload[i].file == VGRF &&
                payload[i].nr == payload2.nr &&
                payload[i].offset == payload2_len * REG_SIZE * unit)
               payload2_len++;
            else
               break;
         }
      } else {
         payload2 = brw_null_reg();
      }

      if (payload1_len + payload2_len != num_payload_sources)
         continue;

      /* Bspec 57058 (r64705) says:
       *
       *    When a source data payload is used in dataport message, that
       *    payload must be specified as Source 1 portion of a Split Send
       *    message.
       *
       * But at this point the split point is not guaranteed to respect that.
       *
       * TODO: Pass LSC address length or infer it so valid splits can work.
       */
      if (payload2_len && (inst->sfid == GFX12_SFID_UGM ||
                           inst->sfid == GFX12_SFID_TGM ||
                           inst->sfid == GFX12_SFID_SLM ||
                           inst->sfid == BRW_SFID_URB)) {
         enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
         if (lsc_op_num_data_values(lsc_op) > 0)
            continue;
      }

      inst->resize_sources(4);
      inst->opcode = SHADER_OPCODE_SEND;
      inst->src[2] = payload1;
      inst->src[3] = payload2;
      inst->mlen = payload1_len * unit;
      inst->ex_mlen = payload2_len * unit;

      progress = true;
   }

   if (progress) {
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
                            BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW);
   }

   return progress;
}