diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 692368bdded..d0400fc7eb8 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1466,6 +1466,11 @@ brw_allocate_registers(fs_visitor &s, bool allow_spilling)
 
    s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);
 
+   if (s.devinfo->ver >= 30) {
+      brw_lower_send_gather(s);
+      s.debug_optimizer(nir, "lower_send_gather", 96, pass_num++);
+   }
+
    brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);
 
    if (s.last_scratch > 0) {
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 497c314a281..4c7d55629d4 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -533,6 +533,7 @@ bool brw_lower_regioning(fs_visitor &s);
 bool brw_lower_scalar_fp64_MAD(fs_visitor &s);
 bool brw_lower_scoreboard(fs_visitor &s);
 bool brw_lower_send_descriptors(fs_visitor &s);
+bool brw_lower_send_gather(fs_visitor &s);
 bool brw_lower_sends_overlapping_payload(fs_visitor &s);
 bool brw_lower_simd_width(fs_visitor &s);
 bool brw_lower_src_modifiers(fs_visitor &s, bblock_t *block, fs_inst *inst, unsigned i);
diff --git a/src/intel/compiler/brw_lower.cpp b/src/intel/compiler/brw_lower.cpp
index e2db2b01206..e8ce42de325 100644
--- a/src/intel/compiler/brw_lower.cpp
+++ b/src/intel/compiler/brw_lower.cpp
@@ -757,6 +757,120 @@ brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
                          DEPENDENCY_VARIABLES);
 }
 
+/**
+ * Return a scalar-region (<0;1,0>) reference to the ARF scalar register
+ * with the given type and sub-register number, using the identity swizzle
+ * and a full writemask.
+ */
+static brw_reg
+brw_s0(enum brw_reg_type type, unsigned subnr)
+{
+   return brw_make_reg(ARF,
+                       BRW_ARF_SCALAR,
+                       subnr,
+                       0,
+                       0,
+                       type,
+                       BRW_VERTICAL_STRIDE_0,
+                       BRW_WIDTH_1,
+                       BRW_HORIZONTAL_STRIDE_0,
+                       BRW_SWIZZLE_XYZW,
+                       WRITEMASK_XYZW);
+}
+
+/**
+ * Lower a single SHADER_OPCODE_SEND_GATHER instruction.
+ *
+ * Sources 3 and up are the payload registers and must already be
+ * FIXED_GRFs.  Their physical register numbers are packed, one byte each,
+ * into 64-bit immediates that are MOVed into the ARF scalar register by a
+ * builder positioned at \p inst.  Source 2, which must be BAD_FILE on
+ * entry, is then pointed at the scalar register and mlen is recomputed
+ * from the number of payload registers.
+ *
+ * Always returns true: the instruction is modified unconditionally.
+ */
+static bool
+brw_lower_send_gather_inst(fs_visitor &s, bblock_t *block, fs_inst *inst)
+{
+   const intel_device_info *devinfo = s.devinfo;
+   assert(devinfo->ver >= 30);
+
+   const unsigned unit = reg_unit(devinfo);
+   assert(unit == 2);
+
+   assert(inst->opcode == SHADER_OPCODE_SEND_GATHER);
+   assert(inst->sources > 2);
+   assert(inst->src[2].file == BAD_FILE);
+
+   unsigned count = 0;
+   uint8_t regs[16] = {};
+
+   const unsigned num_payload_sources = inst->sources - 3;
+   assert(num_payload_sources > 0);
+
+   /* Limited by Src0.Length in the SEND instruction. */
+   assert(num_payload_sources < 16);
+
+   for (unsigned i = 3; i < inst->sources; i++) {
+      assert(inst->src[i].file == FIXED_GRF);
+      assert(inst->src[i].nr % unit == 0);
+
+      unsigned nr = phys_nr(devinfo, inst->src[i]);
+      assert(nr <= UINT8_MAX);
+      regs[count++] = nr;
+   }
+
+   /* Fill out ARF scalar register with the physical register numbers
+    * and use SEND_GATHER.  Each 64-bit MOV carries eight one-byte register
+    * numbers; unused trailing bytes are zero because regs[] is
+    * zero-initialized.
+    */
+   brw_builder ubld = brw_builder(&s, block, inst).group(1, 0).exec_all();
+   for (unsigned q = 0; q < DIV_ROUND_UP(count, 8); q++) {
+      uint64_t v = 0;
+      for (unsigned i = 0; i < 8; i++) {
+         const uint64_t reg = regs[(q * 8) + i];
+         v |= reg << (8 * i);
+      }
+      ubld.MOV(brw_s0(BRW_TYPE_UQ, q), brw_imm_uq(v));
+   }
+
+   inst->src[2] = brw_s0(BRW_TYPE_UD, 0);
+   inst->mlen = count * unit;
+
+   return true;
+}
+
+/**
+ * Lower every SHADER_OPCODE_SEND_GATHER instruction in the shader.
+ *
+ * Requires devinfo->ver >= 30 and must be called after register
+ * allocation.  Returns true if any instruction was lowered.
+ */
+bool
+brw_lower_send_gather(fs_visitor &s)
+{
+   assert(s.devinfo->ver >= 30);
+   assert(s.grf_used || !"Must be called after register allocation");
+
+   bool progress = false;
+
+   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
+      if (inst->opcode == SHADER_OPCODE_SEND_GATHER)
+         progress |= brw_lower_send_gather_inst(s, block, inst);
+   }
+
+   /* Lowering inserts new MOV instructions, so instruction identity is
+    * invalidated along with data flow.
+    */
+   if (progress)
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
+                            DEPENDENCY_VARIABLES);
+
+   return progress;
+}
+
 bool
 brw_lower_load_subgroup_invocation(fs_visitor &s)
 {