intel/brw: Fallback to SEND from SEND_GATHER if possible

After optimizations happen, if the sources still form one or two
contiguous spans for some reason (e.g. some data read from memory
now being written), it is beneficial to just use a regular SEND
and avoid having to set up the ARF scalar register with an extra instruction.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Lionel Landwerlin <None>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32410>
This commit is contained in:
Caio Oliveira 2025-01-14 10:49:51 -08:00 committed by Marge Bot
parent b6b32933ad
commit f18dee3618
2 changed files with 106 additions and 0 deletions

View file

@ -559,6 +559,7 @@ bool brw_opt_register_coalesce(fs_visitor &s);
bool brw_opt_remove_extra_rounding_modes(fs_visitor &s);
bool brw_opt_remove_redundant_halts(fs_visitor &s);
bool brw_opt_saturate_propagation(fs_visitor &s);
bool brw_opt_send_gather_to_send(fs_visitor &s);
bool brw_opt_send_to_send_gather(fs_visitor &s);
bool brw_opt_split_sends(fs_visitor &s);
bool brw_opt_split_virtual_grfs(fs_visitor &s);

View file

@ -180,6 +180,9 @@ brw_optimize(fs_visitor &s)
if (progress)
OPT(brw_lower_simd_width);
if (s.devinfo->ver >= 30)
OPT(brw_opt_send_gather_to_send);
OPT(brw_lower_uniform_pull_constant_loads);
if (OPT(brw_lower_send_descriptors)) {
@ -645,3 +648,105 @@ brw_opt_send_to_send_gather(fs_visitor &s)
return progress;
}
/* Convert SEND_GATHER instructions back into regular SENDs when their
 * payload sources turn out to form at most two contiguous VGRF spans.
 *
 * A regular SEND does not need the ARF scalar register to be written, so
 * it is preferred whenever the payload layout allows it.
 *
 * Returns true if any instruction was rewritten; in that case the
 * instruction detail and data flow analyses are invalidated.
 */
bool
brw_opt_send_gather_to_send(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
         continue;

      /* The payload starts at src[3]; src[2] is expected to be unused. */
      assert(inst->sources > 2);
      assert(inst->src[2].file == BAD_FILE);

      const int count = inst->sources - 3;
      assert(count > 0);

      /* Limited by Src0.Length in the SEND instruction. */
      assert(count < 16);

      /* Check whether all the payload sources can be described by one or
       * two spans, each being consecutive registers of a single VGRF
       * starting at offset zero.  If so, a regular SEND can carry them as
       * its two payloads.
       */
      const brw_reg *srcs = &inst->src[3];
      const brw_reg first = srcs[0];

      /* First span: leading sources that live consecutively in the same
       * VGRF as the very first source.
       */
      int first_len = 0;
      while (first_len < count &&
             srcs[first_len].file == VGRF &&
             srcs[first_len].nr == first.nr &&
             srcs[first_len].offset == first_len * REG_SIZE * unit) {
         first_len++;
      }

      /* Whatever source stopped the first span (if any) seeds the second. */
      brw_reg second = {};
      if (first_len < count)
         second = srcs[first_len];

      /* Second span: same rule, applied to the remaining sources. */
      int second_len = 0;
      if (second.file == VGRF) {
         while (first_len + second_len < count) {
            const brw_reg &cur = srcs[first_len + second_len];

            if (cur.file != VGRF ||
                cur.nr != second.nr ||
                cur.offset != second_len * REG_SIZE * unit)
               break;

            second_len++;
         }
      } else {
         second = brw_null_reg();
      }

      /* Some source is left over, so two spans are not enough. */
      if (first_len + second_len != count)
         continue;

      /* Bspec 57058 (r64705) says
       *
       *    When a source data payload is used in dataport message, that payload
       *    must be specified as Source 1 portion of a Split Send message.
       *
       * But at this point the split point is not guaranteed to respect that.
       *
       * TODO: Pass LSC address length or infer it so valid splits can work.
       */
      const bool lsc_or_urb = inst->sfid == GFX12_SFID_UGM ||
                              inst->sfid == GFX12_SFID_TGM ||
                              inst->sfid == GFX12_SFID_SLM ||
                              inst->sfid == BRW_SFID_URB;
      if (second_len && lsc_or_urb) {
         enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
         if (lsc_op_num_data_values(lsc_op) > 0)
            continue;
      }

      /* Rewrite in place: the two spans become src[2] and src[3], with
       * the message lengths expressed in units of `unit` per source.
       */
      inst->resize_sources(4);
      inst->src[2] = first;
      inst->src[3] = second;
      inst->mlen = first_len * unit;
      inst->ex_mlen = second_len * unit;
      inst->opcode = SHADER_OPCODE_SEND;

      progress = true;
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_INSTRUCTION_DATA_FLOW);
   }

   return progress;
}