intel/brw: Make gl_SubgroupInvocation lane index loading SSA

Our code to initialize gl_SubgroupInvocation uses multiple instructions,
some of which are partial writes.  This makes it difficult to analyze
expressions involving gl_SubgroupInvocation, which appear very
frequently in compute shaders.

To make this easier, we add a new virtual opcode which initializes
a full VGRF to the value of gl_SubgroupInvocation.  (We also expand
it to UD for SIMD8 so there are no partial-write issues.)  We then
lower it to the original code later on in compilation, after we've
done the bulk of our optimizations.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28666>
This commit is contained in:
Kenneth Graunke 2024-02-24 01:24:03 -08:00 committed by Marge Bot
parent 344d4ee9f0
commit 84219892ad
6 changed files with 53 additions and 12 deletions

View file

@ -536,6 +536,8 @@ enum opcode {
SHADER_OPCODE_READ_ARCH_REG,
SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION,
RT_OPCODE_TRACE_RAY_LOGICAL,
};

View file

@ -2464,6 +2464,8 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
return "btd_retire_logical";
case SHADER_OPCODE_READ_ARCH_REG:
return "read_arch_reg";
case SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION:
return "load_subgroup_invocation";
}
unreachable("not reached");

View file

@ -599,6 +599,7 @@ bool brw_fs_lower_derivatives(fs_visitor &s);
bool brw_fs_lower_dpas(fs_visitor &s);
bool brw_fs_lower_find_live_channel(fs_visitor &s);
bool brw_fs_lower_integer_multiplication(fs_visitor &s);
bool brw_fs_lower_load_subgroup_invocation(fs_visitor &s);
bool brw_fs_lower_logical_sends(fs_visitor &s);
bool brw_fs_lower_pack(fs_visitor &s);
bool brw_fs_lower_load_payload(fs_visitor &s);

View file

@ -703,3 +703,47 @@ brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_VARIABLES);
}
/**
 * Expand SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION into concrete MOV/ADD
 * instructions that fill its destination.
 *
 * Two layouts are handled:
 *  - exec_size == 8: the destination must be UD.  The packed vector
 *    immediate is written through a UW view of the destination and then
 *    moved into the UD destination.
 *  - any other exec_size: the destination must be UW.  The first group of
 *    eight channels is loaded from the immediate, and the remaining groups
 *    are derived from it by adding 8 (and 16 when exec_size > 16).
 *
 * The virtual instruction is removed once its replacement has been emitted.
 *
 * \param s  Shader whose CFG is scanned and rewritten in place.
 * \return   true if an instruction was lowered; in that case the
 *           instruction and variable analyses are invalidated.
 */
bool
brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
{
   bool lowered = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
         continue;

      /* Alias for readability; valid until inst is removed below. */
      const fs_reg &dst = inst->dst;

      /* Builders that insert before the instruction being lowered. */
      const fs_builder annotated =
         fs_builder(&s, block, inst).annotate("SubgroupInvocation", NULL);
      const fs_builder all8 = annotated.group(8, 0).exec_all();

      if (inst->exec_size != 8) {
         assert(dst.type == BRW_TYPE_UW);

         annotated.UNDEF(dst);

         /* NOTE(review): 0x76543210 presumably packs the values 0..7, one
          * per channel, via the vector immediate -- confirm against
          * brw_imm_v.
          */
         all8.MOV(dst, brw_imm_v(0x76543210));

         /* Second group of eight: first group's values plus 8, written
          * 16 bytes past the start of the destination.
          */
         all8.ADD(byte_offset(dst, 16), dst, brw_imm_uw(8u));

         if (inst->exec_size > 16) {
            /* Upper sixteen channels: lower sixteen plus 16, written
             * 32 bytes past the start of the destination.
             */
            const fs_builder all16 = annotated.group(16, 0).exec_all();
            all16.ADD(byte_offset(dst, 32), dst, brw_imm_uw(16u));
         }
      } else {
         assert(dst.type == BRW_TYPE_UD);

         /* Load the immediate through a UW view, then move it into the UD
          * destination.
          */
         fs_reg dst_uw = retype(dst, BRW_TYPE_UW);
         all8.MOV(dst_uw, brw_imm_v(0x76543210));
         all8.MOV(dst, dst_uw);
      }

      inst->remove(block);
      lowered = true;

      /* This opcode is currently emitted only once per shader, so there is
       * nothing further to find.  Remove this break if it is ever emitted
       * more than once.
       */
      break;
   }

   if (!lowered)
      return false;

   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
   return true;
}

View file

@ -379,19 +379,9 @@ fs_nir_emit_system_values(nir_to_brw_state &ntb)
* never end up using it.
*/
{
const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
fs_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
reg = abld.vgrf(BRW_TYPE_UW);
abld.UNDEF(reg);
const fs_builder allbld8 = abld.group(8, 0).exec_all();
allbld8.MOV(reg, brw_imm_v(0x76543210));
if (s.dispatch_width > 8)
allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
if (s.dispatch_width > 16) {
const fs_builder allbld16 = abld.group(16, 0).exec_all();
allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
}
reg = bld.vgrf(s.dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
bld.emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
}
nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);

View file

@ -155,6 +155,8 @@ brw_fs_optimize(fs_visitor &s)
OPT(brw_fs_lower_uniform_pull_constant_loads);
OPT(brw_fs_lower_find_live_channel);
OPT(brw_fs_lower_load_subgroup_invocation);
}
static unsigned