intel/brw: Use SHADER_OPCODE_SEND_GATHER in Xe3

Add an optimization pass to turn regular SENDs into SEND_GATHERs.
This allows the payload to be "broken" into smaller pieces that
can be further optimized, which _may_ result in

- less register pressure (no need for contiguous space), and
- fewer instructions (no need to MOV to such space).

For debugging, the INTEL_DEBUG=no-send-gather option skips this
optimization and reports how many opportunities were missed.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Lionel Landwerlin <None>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32410>
This commit is contained in:
Caio Oliveira 2024-11-20 08:12:52 -08:00 committed by Marge Bot
parent 26d4d04d63
commit b6b32933ad
4 changed files with 87 additions and 1 deletions

View file

@ -559,6 +559,7 @@ bool brw_opt_register_coalesce(fs_visitor &s);
bool brw_opt_remove_extra_rounding_modes(fs_visitor &s);
bool brw_opt_remove_redundant_halts(fs_visitor &s);
bool brw_opt_saturate_propagation(fs_visitor &s);
/* Turns SHADER_OPCODE_SEND into SHADER_OPCODE_SEND_GATHER; asserts ver >= 30. */
bool brw_opt_send_to_send_gather(fs_visitor &s);
bool brw_opt_split_sends(fs_visitor &s);
bool brw_opt_split_virtual_grfs(fs_visitor &s);
bool brw_opt_zero_samples(fs_visitor &s);

View file

@ -7,6 +7,8 @@
#include "brw_fs.h"
#include "brw_builder.h"
#include "dev/intel_debug.h"
using namespace brw;
void
@ -113,6 +115,9 @@ brw_optimize(fs_visitor &s)
}
}
if (s.devinfo->ver >= 30)
OPT(brw_opt_send_to_send_gather);
OPT(brw_opt_split_sends);
OPT(brw_workaround_nomask_control_flow);
@ -563,3 +568,80 @@ brw_opt_remove_extra_rounding_modes(fs_visitor &s)
return progress;
}
/**
 * Rewrite eligible SHADER_OPCODE_SEND instructions as
 * SHADER_OPCODE_SEND_GATHER.
 *
 * The two payloads of a SEND (src[2] sized by mlen, src[3] sized by ex_mlen)
 * are broken up into one source per physical register, stored from src[3]
 * onwards.  A SEND is left untouched when its combined payload is at most
 * 2 * unit registers long or when it would need more than 15 payload
 * sources.
 *
 * When INTEL_DEBUG(DEBUG_NO_SEND_GATHER) is set nothing is rewritten: the
 * candidates are only counted and the total is printed to stderr.
 *
 * Returns true if at least one instruction was converted, in which case the
 * instruction detail and data-flow analyses have been invalidated.
 */
bool
brw_opt_send_to_send_gather(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   bool changed = false;
   unsigned skipped = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND)
         continue;

      /* For 1-2 registers, send-gather offers no benefits over split-send. */
      if (inst->mlen + inst->ex_mlen <= 2 * unit)
         continue;

      assert(inst->mlen % unit == 0);
      assert(inst->ex_mlen % unit == 0);

      /* Payload lengths expressed in physical registers. */
      const unsigned first_len = inst->mlen / unit;
      const unsigned second_len = inst->ex_mlen / unit;
      const unsigned total_len = first_len + second_len;

      /* Limited by Src0.Length in the SEND instruction. */
      if (total_len > 15)
         continue;

      if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
         skipped++;
         continue;
      }

      /* Take copies of both payload registers now: the source array is
       * resized and slots 2 and 3 are overwritten below.
       */
      const brw_reg first_payload = inst->src[2];
      const brw_reg second_payload = inst->src[3];

      inst->resize_sources(3 + total_len);

      /* Sources 0 and 1 remain the same.  Source 2 will be filled
       * after register allocation.
       */
      inst->src[2] = {};

      /* One source per physical register: first payload, then second. */
      unsigned slot = 3;
      for (unsigned r = 0; r < first_len; r++, slot++)
         inst->src[slot] = byte_offset(first_payload, r * unit * REG_SIZE);
      for (unsigned r = 0; r < second_len; r++, slot++)
         inst->src[slot] = byte_offset(second_payload, r * unit * REG_SIZE);
      assert(slot == inst->sources);

      inst->opcode = SHADER_OPCODE_SEND_GATHER;
      inst->mlen = 0;
      inst->ex_mlen = 0;
      changed = true;
   }

   if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
      fprintf(stderr, "Ignored %u opportunities to try SEND_GATHER in %s shader.\n",
              skipped, _mesa_shader_stage_to_string(s.stage));
   }

   if (!changed)
      return false;

   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                         DEPENDENCY_INSTRUCTION_DATA_FLOW);
   return true;
}

View file

@ -115,6 +115,7 @@ static const struct debug_control debug_control[] = {
{ "reg-pressure", DEBUG_REG_PRESSURE },
{ "shader-print", DEBUG_SHADER_PRINT },
{ "cl-quiet", DEBUG_CL_QUIET },
{ "no-send-gather", DEBUG_NO_SEND_GATHER },
{ NULL, 0 }
};

View file

@ -106,6 +106,7 @@ extern uint64_t intel_debug;
#define DEBUG_BVH_BLAS_IR_AS (1ull << 58)
#define DEBUG_BVH_TLAS_IR_AS (1ull << 59)
#define DEBUG_BVH_NO_BUILD (1ull << 60)
/* "no-send-gather": brw_opt_send_to_send_gather() leaves SENDs unconverted
 * and instead prints how many candidates it ignored.
 */
#define DEBUG_NO_SEND_GATHER (1ull << 61)
#define DEBUG_ANY (~0ull)
@ -115,7 +116,8 @@ extern uint64_t intel_debug;
/* These flags may affect program generation */
#define DEBUG_DISK_CACHE_MASK \
(DEBUG_NO_DUAL_OBJECT_GS | DEBUG_SPILL_FS | \
DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64)
DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64 | \
DEBUG_NO_SEND_GATHER)
/* Flags to determine what bvh to dump out */
#define DEBUG_BVH_ANV (DEBUG_BVH_BLAS | DEBUG_BVH_TLAS)