diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index d89df2ff9f8..5bcc534a11d 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -559,6 +559,7 @@ bool brw_opt_register_coalesce(fs_visitor &s);
 bool brw_opt_remove_extra_rounding_modes(fs_visitor &s);
 bool brw_opt_remove_redundant_halts(fs_visitor &s);
 bool brw_opt_saturate_propagation(fs_visitor &s);
+bool brw_opt_send_gather_to_send(fs_visitor &s);
 bool brw_opt_send_to_send_gather(fs_visitor &s);
 bool brw_opt_split_sends(fs_visitor &s);
 bool brw_opt_split_virtual_grfs(fs_visitor &s);
diff --git a/src/intel/compiler/brw_opt.cpp b/src/intel/compiler/brw_opt.cpp
index deb632bbc14..f3953335e6e 100644
--- a/src/intel/compiler/brw_opt.cpp
+++ b/src/intel/compiler/brw_opt.cpp
@@ -180,6 +180,9 @@ brw_optimize(fs_visitor &s)
    if (progress)
       OPT(brw_lower_simd_width);
 
+   if (s.devinfo->ver >= 30)
+      OPT(brw_opt_send_gather_to_send);
+
    OPT(brw_lower_uniform_pull_constant_loads);
 
    if (OPT(brw_lower_send_descriptors)) {
@@ -645,3 +648,105 @@ brw_opt_send_to_send_gather(fs_visitor &s)
 
    return progress;
 }
+
+/* If after optimizations, the sources of a SEND_GATHER are *still*
+ * contiguous (one or two VGRF spans), rewrite it as a regular SEND, which
+ * saves having to write the ARF scalar register.  Returns true on progress.
+ */
+bool
+brw_opt_send_gather_to_send(fs_visitor &s)
+{
+   const intel_device_info *devinfo = s.devinfo;
+   bool progress = false;
+
+   assert(devinfo->ver >= 30);
+
+   const unsigned unit = reg_unit(devinfo);
+   assert(unit == 2);
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
+      if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
+         continue;
+
+      assert(inst->sources > 2);
+      assert(inst->src[2].file == BAD_FILE);
+
+      const int num_payload_sources = inst->sources - 3;
+      assert(num_payload_sources > 0);
+
+      /* Limited by Src0.Length in the SEND instruction. */
+      assert(num_payload_sources < 16);
+
+      /* Check whether the sources still form one or two contiguous spans, each
+       * starting at offset 0 of a single VGRF.  If so, the regular SEND can be
+       * used and there's no need for SEND_GATHER (which would set the ARF
+       * scalar register, adding an extra instruction).
+       */
+      const brw_reg *payload = &inst->src[3];
+      brw_reg payload1 = payload[0];
+      brw_reg payload2 = {};
+      int payload1_len = 0;
+      int payload2_len = 0;
+
+      for (int i = 0; i < num_payload_sources; i++) {
+         if (payload[i].file == VGRF &&
+             payload[i].nr == payload1.nr &&
+             payload[i].offset == payload1_len * REG_SIZE * unit)
+            payload1_len++;
+         else {
+            payload2 = payload[i];
+            break;
+         }
+      }
+
+      if (payload2.file == VGRF) {
+         for (int i = payload1_len; i < num_payload_sources; i++) {
+            if (payload[i].file == VGRF &&
+                payload[i].nr == payload2.nr &&
+                payload[i].offset == payload2_len * REG_SIZE * unit)
+               payload2_len++;
+            else
+               break;
+         }
+      } else {
+         payload2 = brw_null_reg(); /* no usable second span */
+      }
+
+      if (payload1_len + payload2_len != num_payload_sources)
+         continue;
+
+      /* Bspec 57058 (r64705) says
+       *
+       *    When a source data payload is used in dataport message, that payload
+       *    must be specified as Source 1 portion of a Split Send message.
+       *
+       * But at this point the split point is not guaranteed to respect that,
+       * so skip two-span messages when lsc_op_num_data_values() is nonzero.
+       * TODO: Pass LSC address length or infer it so valid splits can work.
+       */
+      if (payload2_len && (inst->sfid == GFX12_SFID_UGM ||
+                           inst->sfid == GFX12_SFID_TGM ||
+                           inst->sfid == GFX12_SFID_SLM ||
+                           inst->sfid == BRW_SFID_URB)) {
+         enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
+         if (lsc_op_num_data_values(lsc_op) > 0)
+            continue;
+      }
+
+      inst->resize_sources(4);
+      inst->opcode = SHADER_OPCODE_SEND;
+      inst->src[2] = payload1;
+      inst->src[3] = payload2;
+      inst->mlen = payload1_len * unit;
+      inst->ex_mlen = payload2_len * unit;
+
+      progress = true;
+   }
+
+   if (progress) {
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
+                            DEPENDENCY_INSTRUCTION_DATA_FLOW);
+   }
+
+   return progress;
+}