mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-02 07:20:16 +01:00
intel/brw: Use CSE for LOAD_SUBGROUP_INVOCATION
Instead of emitting a single one at the top, and making reference to it,
emit the virtual instruction as needed and let CSE do its job.
Since load_subgroup_invocation can now appear somewhere other than the
start of the shader, use UNDEF in all cases to ensure that the liveness
of the destination doesn't extend to the first partial write done here
(previously UNDEF was used only for SIMD > 8).
Note that this option was considered in the past (commit
6132992cdb) but was dismissed at the time. The
difference now is that the lowering of the virtual instruction happens
earlier than the scheduling.
The motivation for this change is to allow passes other than the NIR
conversion to use this value. The alternative of storing a `brw_reg` in
the shader (instead of NIR state) is complicated by passes like
compact_vgrfs, which move VGRFs around (and update the instructions).
That pass, and maybe others, would have to care about the stored brw_reg.
Fossil-db numbers, TGL
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/c683ea5067ee157d/fs.32/0, steam-native/shadow_of_the_tomb_raider/f4df450c3cef40b4/fs.32/0, steam-native/shadow_of_the_tomb_raider/94b708fb8e3d9597/fs.32/0, steam-native/shadow_of_the_tomb_raider/19d44c328edabd30/fs.32/0, steam-native/shadow_of_the_tomb_raider/8a7dcbd5a74a19bf/fs.32/0, and 366 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-dxvk/octopath_traveler/aaa3d10acb726906/fs.32/0, steam-dxvk/batman_arkham_origins/e6872ae23569c35f/fs.32/0, steam-dxvk/octopath_traveler/fd33a99fa5c271a8/fs.32/0, steam-dxvk/octopath_traveler/9a077cdc16f24520/fs.32/0, steam-dxvk/batman_arkham_city_goty/fac7b438ad52f622/fs.32/0, and 12 more
from 4 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-dxvk/octopath_traveler, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 149752381 -> 149751337 (-0.00%); split: -0.00%, +0.00%
Cycle count: 11553609349 -> 11549970294 (-0.03%); split: -0.06%, +0.03%
Spill count: 42763 -> 42764 (+0.00%); split: -0.01%, +0.01%
Fill count: 75650 -> 75651 (+0.00%); split: -0.00%, +0.01%
Max live registers: 31725096 -> 31671792 (-0.17%)
Max dispatch width: 5546008 -> 5551672 (+0.10%); split: +0.11%, -0.00%
Totals from 52574 (8.34% of 630441) affected shaders:
Instrs: 9535159 -> 9534115 (-0.01%); split: -0.03%, +0.02%
Cycle count: 1006627109 -> 1002988054 (-0.36%); split: -0.65%, +0.29%
Spill count: 11588 -> 11589 (+0.01%); split: -0.03%, +0.03%
Fill count: 21057 -> 21058 (+0.00%); split: -0.01%, +0.02%
Max live registers: 1992493 -> 1939189 (-2.68%)
Max dispatch width: 559696 -> 565360 (+1.01%); split: +1.06%, -0.05%
```
and DG2
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/1f95a9d3db21df85/fs.32/0, steam-native/shadow_of_the_tomb_raider/56b87c4a46613a2a/fs.32/0, steam-native/shadow_of_the_tomb_raider/a74b4137f85dbbd3/fs.32/0, steam-native/shadow_of_the_tomb_raider/e07e38d3f48e8402/fs.32/0, steam-native/shadow_of_the_tomb_raider/206336789c48996c/fs.32/0, and 268 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-native/shadow_of_the_tomb_raider/0420d7c3a2ea99ec/fs.32/0, steam-native/shadow_of_the_tomb_raider/2ff39f8bf7d24abb/fs.32/0, steam-native/shadow_of_the_tomb_raider/92d7be2824bd9659/fs.32/0, steam-native/shadow_of_the_tomb_raider/f09ca6d2ecf18015/fs.32/0, steam-native/shadow_of_the_tomb_raider/490f8ffd59e52949/fs.32/0, and 205 more
from 3 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 151597619 -> 151599914 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7699776 -> 7699784 (+0.00%)
Cycle count: 12738501989 -> 12739841170 (+0.01%); split: -0.01%, +0.02%
Spill count: 61283 -> 61274 (-0.01%)
Fill count: 119886 -> 119849 (-0.03%)
Max live registers: 31810432 -> 31758920 (-0.16%)
Max dispatch width: 5540128 -> 5541136 (+0.02%); split: +0.08%, -0.06%
Totals from 49286 (7.81% of 631231) affected shaders:
Instrs: 8607753 -> 8610048 (+0.03%); split: -0.01%, +0.04%
Subgroup size: 857752 -> 857760 (+0.00%)
Cycle count: 305939495 -> 307278676 (+0.44%); split: -0.28%, +0.72%
Spill count: 6339 -> 6330 (-0.14%)
Fill count: 12571 -> 12534 (-0.29%)
Max live registers: 1788346 -> 1736834 (-2.88%)
Max dispatch width: 510920 -> 511928 (+0.20%); split: +0.85%, -0.66%
```
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30489>
This commit is contained in:
parent
df16f47036
commit
2e2b83f72d
4 changed files with 20 additions and 33 deletions
|
|
@ -892,6 +892,14 @@ namespace brw {
|
|||
shuffle_from_32bit_read(*this, dst, vec4_result, 0, components);
|
||||
}
|
||||
|
||||
/**
 * Emit a SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION virtual instruction and
 * return the register it writes.
 *
 * The destination comes from vgrf() and is typed UD when the shader's
 * dispatch width is below 16, UW otherwise.  The instruction is emitted
 * through exec_all().
 *
 * Meant to be called wherever the value is needed; duplicate instructions
 * are expected to be merged by CSE.
 */
brw_reg
|
||||
LOAD_SUBGROUP_INVOCATION() const
|
||||
{
|
||||
brw_reg reg = vgrf(shader->dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
|
||||
exec_all().emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
|
||||
return reg;
|
||||
}
|
||||
|
||||
fs_visitor *shader;
|
||||
|
||||
fs_inst *BREAK() { return emit(BRW_OPCODE_BREAK); }
|
||||
|
|
|
|||
|
|
@ -119,6 +119,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
|
|||
case SHADER_OPCODE_INT_REMAINDER:
|
||||
case SHADER_OPCODE_SIN:
|
||||
case SHADER_OPCODE_COS:
|
||||
case SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION:
|
||||
return true;
|
||||
case SHADER_OPCODE_LOAD_PAYLOAD:
|
||||
return !is_coalescing_payload(v->alloc, inst);
|
||||
|
|
|
|||
|
|
@ -798,6 +798,7 @@ brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
|
|||
const fs_builder abld =
|
||||
fs_builder(&s, block, inst).annotate("SubgroupInvocation", NULL);
|
||||
const fs_builder ubld8 = abld.group(8, 0).exec_all();
|
||||
ubld8.UNDEF(inst->dst);
|
||||
|
||||
if (inst->exec_size == 8) {
|
||||
assert(inst->dst.type == BRW_TYPE_UD);
|
||||
|
|
@ -806,7 +807,6 @@ brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
|
|||
ubld8.MOV(inst->dst, uw);
|
||||
} else {
|
||||
assert(inst->dst.type == BRW_TYPE_UW);
|
||||
abld.UNDEF(inst->dst);
|
||||
ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
|
||||
ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
|
||||
if (inst->exec_size > 16) {
|
||||
|
|
@ -817,11 +817,6 @@ brw_fs_lower_load_subgroup_invocation(fs_visitor &s)
|
|||
|
||||
inst->remove(block);
|
||||
progress = true;
|
||||
|
||||
/* Currently this is only ever emitted once, so there's no point in
|
||||
* continuing to look for more cases. Drop if we ever re-emit it.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
|
|
|
|||
|
|
@ -367,7 +367,6 @@ emit_system_values_block(nir_to_brw_state &ntb, nir_block *block)
|
|||
static void
|
||||
fs_nir_emit_system_values(nir_to_brw_state &ntb)
|
||||
{
|
||||
const fs_builder &bld = ntb.bld;
|
||||
fs_visitor &s = ntb.s;
|
||||
|
||||
ntb.system_values = ralloc_array(ntb.mem_ctx, brw_reg, SYSTEM_VALUE_MAX);
|
||||
|
|
@ -375,15 +374,6 @@ fs_nir_emit_system_values(nir_to_brw_state &ntb)
|
|||
ntb.system_values[i] = brw_reg();
|
||||
}
|
||||
|
||||
/* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we
|
||||
* never end up using it.
|
||||
*/
|
||||
{
|
||||
brw_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
|
||||
reg = bld.vgrf(s.dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
|
||||
bld.emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
|
||||
}
|
||||
|
||||
nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
|
||||
nir_foreach_block(block, impl)
|
||||
emit_system_values_block(ntb, block);
|
||||
|
|
@ -2650,8 +2640,7 @@ emit_gs_input_load(nir_to_brw_state &ntb, const brw_reg &dst,
|
|||
* by 32 (shifting by 5), and add the two together. This is
|
||||
* the final indirect byte offset.
|
||||
*/
|
||||
brw_reg sequence =
|
||||
ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
|
||||
brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
|
||||
|
||||
/* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
|
||||
brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
|
||||
|
|
@ -2899,7 +2888,7 @@ get_tcs_multi_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld,
|
|||
* by the GRF size (by shifting), and add the two together. This is
|
||||
* the final indirect byte offset.
|
||||
*/
|
||||
brw_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
|
||||
brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
|
||||
|
||||
/* Offsets will be 0, 4, 8, ... */
|
||||
brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
|
||||
|
|
@ -5254,8 +5243,7 @@ swizzle_nir_scratch_addr(nir_to_brw_state &ntb,
|
|||
{
|
||||
fs_visitor &s = ntb.s;
|
||||
|
||||
const brw_reg &chan_index =
|
||||
ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
|
||||
const brw_reg chan_index = bld.LOAD_SUBGROUP_INVOCATION();
|
||||
const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
|
||||
|
||||
if (nir_src_is_const(nir_addr_src)) {
|
||||
|
|
@ -7357,8 +7345,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
|||
break;
|
||||
|
||||
case nir_intrinsic_load_subgroup_invocation:
|
||||
bld.MOV(retype(dest, BRW_TYPE_UD),
|
||||
ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
|
||||
bld.MOV(retype(dest, BRW_TYPE_UD), bld.LOAD_SUBGROUP_INVOCATION());
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_subgroup_eq_mask:
|
||||
|
|
@ -7415,7 +7402,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
|||
* 0b...1111, invocations 4-7 will have 0b...11110000 and so on.
|
||||
*/
|
||||
brw_reg invoc_ud = bld.vgrf(BRW_TYPE_UD);
|
||||
bld.MOV(invoc_ud, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
|
||||
bld.MOV(invoc_ud, bld.LOAD_SUBGROUP_INVOCATION());
|
||||
brw_reg quad_mask =
|
||||
bld.SHL(brw_imm_ud(0xF), bld.AND(invoc_ud, brw_imm_ud(0xFFFFFFFC)));
|
||||
|
||||
|
|
@ -7679,8 +7666,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
|||
* MOVs or else fall back to doing indirects.
|
||||
*/
|
||||
brw_reg idx = bld.vgrf(BRW_TYPE_W);
|
||||
bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
|
||||
brw_imm_w(0x2));
|
||||
bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(0x2));
|
||||
bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
|
||||
}
|
||||
break;
|
||||
|
|
@ -7700,8 +7686,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
|||
* MOVs or else fall back to doing indirects.
|
||||
*/
|
||||
brw_reg idx = bld.vgrf(BRW_TYPE_W);
|
||||
bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
|
||||
brw_imm_w(0x3));
|
||||
bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(0x3));
|
||||
bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
|
||||
}
|
||||
break;
|
||||
|
|
@ -7783,8 +7768,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
|||
*/
|
||||
brw_reg shifted = bld.vgrf(src.type);
|
||||
brw_reg idx = bld.vgrf(BRW_TYPE_W);
|
||||
allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
|
||||
brw_imm_w(-1));
|
||||
allbld.ADD(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(-1));
|
||||
allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
|
||||
allbld.group(1, 0).MOV(horiz_offset(shifted, 0), identity);
|
||||
scan = shifted;
|
||||
|
|
@ -8079,10 +8063,9 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
|||
bld.SHL(bld.AND(raw_id, brw_imm_ud(INTEL_MASK(2, 0))),
|
||||
brw_imm_ud(4));
|
||||
|
||||
/* LaneID[0:3] << 0 (Use nir SYSTEM_VALUE_SUBGROUP_INVOCATION) */
|
||||
/* LaneID[0:3] << 0 (Use subgroup invocation) */
|
||||
assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
|
||||
bld.ADD(dst, bld.OR(eu, tid),
|
||||
ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
|
||||
bld.ADD(dst, bld.OR(eu, tid), bld.LOAD_SUBGROUP_INVOCATION());
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue