From 49ee3ae9e8be4fd2a4a9f658c06e0bf01e08d13c Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Mon, 6 Jun 2022 02:35:09 -0700 Subject: [PATCH] intel/compiler: Lower FIND_[LAST_]LIVE_CHANNEL in IR on Gfx8+ This allows the software scoreboarding pass, scheduler, and so on to see the individual instructions and handle them, rather than trusting in the generator to do scoreboarding correctly when expanding the virtual instruction to multiple actual instructions. By using SHADER_OPCODE_READ_SR_REG, we also correctly handle the software scoreboarding workaround when reading DMask/VMask. Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/compiler/brw_fs.cpp | 73 +++++++++++++++++++++++++++++++++++ src/intel/compiler/brw_fs.h | 1 + 2 files changed, 74 insertions(+) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index e0455b36d59..427b05e5f46 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5529,6 +5529,77 @@ fs_visitor::lower_derivatives() return progress; } +bool +fs_visitor::lower_find_live_channel() +{ + bool progress = false; + + if (devinfo->ver < 8) + return false; + + bool packed_dispatch = + brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data); + bool vmask = + stage == MESA_SHADER_FRAGMENT && + brw_wm_prog_data(stage_prog_data)->uses_vmask; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL && + inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL) + continue; + + bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL; + + /* Getting the first active channel index is easy on Gfx8: Just find + * the first bit set in the execution mask. The register exists on + * HSW already but it reads back as all ones when the current + * instruction has execution masking disabled, so it's kind of + * useless there. 
+ */ + fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); + + const fs_builder ubld = bld.at(block, inst).exec_all().group(1, 0); + + /* ce0 doesn't consider the thread dispatch mask (DMask or VMask), + * so combine the execution and dispatch masks to obtain the true mask. + * + * If we're looking for the first live channel, and we have packed + * dispatch, we can skip this step, as we know all dispatched channels + * will appear at the front of the mask. + */ + if (!(first && packed_dispatch)) { + fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2)); + + /* Quarter control has the effect of magically shifting the value of + * ce0 so you'll get the first/last active channel relative to the + * specified quarter control as result. + */ + if (inst->group > 0) + ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8))); + + ubld.AND(mask, exec_mask, mask); + exec_mask = mask; + } + + if (first) { + ubld.FBL(inst->dst, exec_mask); + } else { + fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1); + ubld.LZD(tmp, exec_mask); + ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31)); + } + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + void fs_visitor::dump_instructions() const { @@ -6121,6 +6192,8 @@ fs_visitor::optimize() lower_uniform_pull_constant_loads(); + lower_find_live_channel(); + validate(); } diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index e31c23bf0f5..23588cf6c36 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -199,6 +199,7 @@ public: bool lower_simd_width(); bool lower_barycentrics(); bool lower_derivatives(); + bool lower_find_live_channel(); bool lower_scoreboard(); bool lower_sub_sat(); bool opt_combine_constants();