From f18dee36184ee7cea14c6772ee7ce49218bf030e Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Tue, 14 Jan 2025 10:49:51 -0800 Subject: [PATCH] intel/brw: Fallback to SEND from SEND_GATHER if possible After optimizations happen, if the sources are still in one or two contiguous spans for some reason (e.g. some data read from memory now being written), it is beneficial to just use regular SEND and avoid having to set the ARF scalar register. Reviewed-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/compiler/brw_fs.h | 1 + src/intel/compiler/brw_opt.cpp | 105 +++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index d89df2ff9f8..5bcc534a11d 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -559,6 +559,7 @@ bool brw_opt_register_coalesce(fs_visitor &s); bool brw_opt_remove_extra_rounding_modes(fs_visitor &s); bool brw_opt_remove_redundant_halts(fs_visitor &s); bool brw_opt_saturate_propagation(fs_visitor &s); +bool brw_opt_send_gather_to_send(fs_visitor &s); bool brw_opt_send_to_send_gather(fs_visitor &s); bool brw_opt_split_sends(fs_visitor &s); bool brw_opt_split_virtual_grfs(fs_visitor &s); diff --git a/src/intel/compiler/brw_opt.cpp b/src/intel/compiler/brw_opt.cpp index deb632bbc14..f3953335e6e 100644 --- a/src/intel/compiler/brw_opt.cpp +++ b/src/intel/compiler/brw_opt.cpp @@ -180,6 +180,9 @@ brw_optimize(fs_visitor &s) if (progress) OPT(brw_lower_simd_width); + if (s.devinfo->ver >= 30) + OPT(brw_opt_send_gather_to_send); + OPT(brw_lower_uniform_pull_constant_loads); if (OPT(brw_lower_send_descriptors)) { @@ -645,3 +648,105 @@ brw_opt_send_to_send_gather(fs_visitor &s) return progress; } + +/* If after optimizations, the sources are *still* contiguous in a + * SEND_GATHER, prefer to use the regular SEND, which would save + * having to write the ARF scalar register. 
+ */
+bool
+brw_opt_send_gather_to_send(fs_visitor &s)
+{
+   const intel_device_info *devinfo = s.devinfo;
+   bool progress = false;
+
+   assert(devinfo->ver >= 30);
+
+   const unsigned unit = reg_unit(devinfo);
+   assert(unit == 2);
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
+      if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
+         continue;
+
+      assert(inst->sources > 2);
+      assert(inst->src[2].file == BAD_FILE); /* src[2] is expected to be empty */
+
+      const int num_payload_sources = inst->sources - 3; /* payload is src[3..] */
+      assert(num_payload_sources > 0);
+
+      /* Limited by Src0.Length in the SEND instruction. */
+      assert(num_payload_sources < 16);
+
+      /* Check whether the sources still form one or two contiguous spans, each
+       * starting at offset 0 of a VGRF and advancing REG_SIZE * unit per source.
+       * If so, the regular SEND can be used and SEND_GATHER (which would set the
+       * ARF scalar register, adding an extra instruction) is not needed.
+       */
+      const brw_reg *payload = &inst->src[3];
+      brw_reg payload1 = payload[0];   /* first register of span 1 */
+      brw_reg payload2 = {};           /* first register of span 2, if found */
+      int payload1_len = 0;            /* sources matched into span 1 */
+      int payload2_len = 0;            /* sources matched into span 2 */
+
+      for (int i = 0; i < num_payload_sources; i++) { /* grow span 1 */
+         if (payload[i].file == VGRF &&
+             payload[i].nr == payload1.nr &&
+             payload[i].offset == payload1_len * REG_SIZE * unit)
+            payload1_len++;
+         else { /* first mismatch: candidate start of span 2 */
+            payload2 = payload[i];
+            break;
+         }
+      }
+
+      if (payload2.file == VGRF) { /* grow span 2 from where span 1 ended */
+         for (int i = payload1_len; i < num_payload_sources; i++) {
+            if (payload[i].file == VGRF &&
+                payload[i].nr == payload2.nr &&
+                payload[i].offset == payload2_len * REG_SIZE * unit)
+               payload2_len++;
+            else
+               break;
+         }
+      } else {
+         payload2 = brw_null_reg(); /* not a VGRF: no second span */
+      }
+
+      if (payload1_len + payload2_len != num_payload_sources)
+         continue; /* some source is in neither span: keep SEND_GATHER */
+
+      /* Bspec 57058 (r64705) says
+       *
+       *    When a source data payload is used in dataport message, that payload
+       *    must be specified as Source 1 portion of a Split Send message.
+       *
+       * The split found above is not guaranteed to respect that, so two-span
+       * payloads are left as SEND_GATHER when the LSC opcode has data values.
+       * TODO: Pass LSC address length or infer it so valid splits can work.
+       */
+      if (payload2_len && (inst->sfid == GFX12_SFID_UGM ||
+                           inst->sfid == GFX12_SFID_TGM ||
+                           inst->sfid == GFX12_SFID_SLM ||
+                           inst->sfid == BRW_SFID_URB)) {
+         enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
+         if (lsc_op_num_data_values(lsc_op) > 0)
+            continue;
+      }
+
+      inst->resize_sources(4);
+      inst->opcode = SHADER_OPCODE_SEND;
+      inst->src[2] = payload1; /* span 1 */
+      inst->src[3] = payload2; /* span 2, or the null register */
+      inst->mlen = payload1_len * unit;    /* span lengths scaled by reg_unit */
+      inst->ex_mlen = payload2_len * unit;
+
+      progress = true;
+   }
+
+   if (progress) {
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
+                            DEPENDENCY_INSTRUCTION_DATA_FLOW);
+   }
+
+   return progress; /* true if any SEND_GATHER was converted to SEND */
+}