diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 4c7d55629d4..d89df2ff9f8 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -559,6 +559,7 @@ bool brw_opt_register_coalesce(fs_visitor &s); bool brw_opt_remove_extra_rounding_modes(fs_visitor &s); bool brw_opt_remove_redundant_halts(fs_visitor &s); bool brw_opt_saturate_propagation(fs_visitor &s); +bool brw_opt_send_to_send_gather(fs_visitor &s); bool brw_opt_split_sends(fs_visitor &s); bool brw_opt_split_virtual_grfs(fs_visitor &s); bool brw_opt_zero_samples(fs_visitor &s); diff --git a/src/intel/compiler/brw_opt.cpp b/src/intel/compiler/brw_opt.cpp index 9027b458639..deb632bbc14 100644 --- a/src/intel/compiler/brw_opt.cpp +++ b/src/intel/compiler/brw_opt.cpp @@ -7,6 +7,8 @@ #include "brw_fs.h" #include "brw_builder.h" +#include "dev/intel_debug.h" + using namespace brw; void @@ -113,6 +115,9 @@ brw_optimize(fs_visitor &s) } } + if (s.devinfo->ver >= 30) + OPT(brw_opt_send_to_send_gather); /* Gated on ver >= 30 and run ahead of brw_opt_split_sends. */ + OPT(brw_opt_split_sends); OPT(brw_workaround_nomask_control_flow); @@ -563,3 +568,80 @@ brw_opt_remove_extra_rounding_modes(fs_visitor &s) return progress; } + +bool +brw_opt_send_to_send_gather(fs_visitor &s) +{ /* Rewrites each eligible SHADER_OPCODE_SEND in s.cfg into SHADER_OPCODE_SEND_GATHER, giving every unit-sized slice of the payload its own source starting at src[3]; returns true when at least one instruction was rewritten. */ + const intel_device_info *devinfo = s.devinfo; + bool progress = false; + + assert(devinfo->ver >= 30); + + const unsigned unit = reg_unit(devinfo); /* mlen and ex_mlen are divided by this to get the number of payload sources. */ + assert(unit == 2); + + unsigned count = 0; /* Candidates left untouched because DEBUG_NO_SEND_GATHER is set. */ + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + if (inst->opcode != SHADER_OPCODE_SEND) + continue; + + /* For 1-2 registers, send-gather offers no benefits over split-send. 
*/ + if (inst->mlen + inst->ex_mlen <= 2 * unit) + continue; + + assert(inst->mlen % unit == 0); + assert(inst->ex_mlen % unit == 0); + + struct { + brw_reg src; + unsigned phys_len; + } payload[2] = { + { inst->src[2], inst->mlen / unit }, + { inst->src[3], inst->ex_mlen / unit }, + }; /* Copied by value here because src[2] and src[3] are overwritten further down. */ + + const unsigned num_payload_sources = payload[0].phys_len + payload[1].phys_len; + + /* Limited by Src0.Length in the SEND instruction. */ + if (num_payload_sources > 15) + continue; + + if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) { /* Debug switch: tally the candidate and keep the original SEND. */ + count++; + continue; + } + + inst->resize_sources(3 + num_payload_sources); + /* Sources 0 and 1 remain the same. Source 2 will be filled + * after register allocation. + */ + inst->src[2] = {}; + + int idx = 3; /* First payload slot; src[0], src[1] and src[2] are accounted for above. */ + for (unsigned p = 0; p < ARRAY_SIZE(payload); p++) { + for (unsigned i = 0; i < payload[p].phys_len; i++) { + inst->src[idx++] = byte_offset(payload[p].src, + i * reg_unit(devinfo) * REG_SIZE); /* reg_unit(devinfo) is the same value already cached in unit. */ + } + } + assert(idx == inst->sources); + + inst->opcode = SHADER_OPCODE_SEND_GATHER; + inst->mlen = 0; /* Both message lengths are zeroed now that the payload is listed in src[3] onwards. */ + inst->ex_mlen = 0; + + progress = true; + } + + if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) { /* Printed once per call, even when count is zero. */ + fprintf(stderr, "Ignored %u opportunities to try SEND_GATHER in %s shader.\n", + count, _mesa_shader_stage_to_string(s.stage)); + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | + DEPENDENCY_INSTRUCTION_DATA_FLOW); + + return progress; +} diff --git a/src/intel/dev/intel_debug.c b/src/intel/dev/intel_debug.c index 84e4a4adcf4..b9ba9bb2fdc 100644 --- a/src/intel/dev/intel_debug.c +++ b/src/intel/dev/intel_debug.c @@ -115,6 +115,7 @@ static const struct debug_control debug_control[] = { { "reg-pressure", DEBUG_REG_PRESSURE }, { "shader-print", DEBUG_SHADER_PRINT }, { "cl-quiet", DEBUG_CL_QUIET }, + { "no-send-gather", DEBUG_NO_SEND_GATHER }, /* Makes brw_opt_send_to_send_gather() count candidates instead of converting them. */ { NULL, 0 } }; diff --git a/src/intel/dev/intel_debug.h b/src/intel/dev/intel_debug.h index 808d32e9556..42b325e735a 100644 --- a/src/intel/dev/intel_debug.h +++ b/src/intel/dev/intel_debug.h @@ 
-106,6 +106,7 @@ extern uint64_t intel_debug; #define DEBUG_BVH_BLAS_IR_AS (1ull << 58) #define DEBUG_BVH_TLAS_IR_AS (1ull << 59) #define DEBUG_BVH_NO_BUILD (1ull << 60) +#define DEBUG_NO_SEND_GATHER (1ull << 61) /* Keep SEND as-is; brw_opt_send_to_send_gather() only counts candidates. */ #define DEBUG_ANY (~0ull) @@ -115,7 +116,8 @@ extern uint64_t intel_debug; /* These flags may affect program generation */ #define DEBUG_DISK_CACHE_MASK \ (DEBUG_NO_DUAL_OBJECT_GS | DEBUG_SPILL_FS | \ - DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64) + DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64 | \ + DEBUG_NO_SEND_GATHER) /* Included because the flag changes which send opcode the pass leaves in the program. */ /* Flags to determine what bvh to dump out */ #define DEBUG_BVH_ANV (DEBUG_BVH_BLAS | DEBUG_BVH_TLAS)