brw/nir: Treat some ballots as convergent

v2: Fix for Xe2.

v3: Add a comment explaining the use of bld instead of xbld. Suggested
by Ken. Fix a bug in handling an is_scalar source. Noticed by me while
applying Ken's review feedback.

shader-db:

Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18228657 -> 18228689 (<.01%)
instructions in affected programs: 9333 -> 9365 (0.34%)
helped: 2 / HURT: 26

total cycles in shared programs: 932511560 -> 932542994 (<.01%)
cycles in affected programs: 2263040 -> 2294474 (1.39%)
helped: 7 / HURT: 27

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20700370 -> 20700392 (<.01%)
instructions in affected programs: 18579 -> 18601 (0.12%)
helped: 1 / HURT: 28

total cycles in shared programs: 888385851 -> 888386325 (<.01%)
cycles in affected programs: 2571368 -> 2571842 (0.02%)
helped: 14 / HURT: 6

total spills in shared programs: 4373 -> 4371 (-0.05%)
spills in affected programs: 71 -> 69 (-2.82%)
helped: 1 / HURT: 0

total fills in shared programs: 4657 -> 4653 (-0.09%)
fills in affected programs: 196 -> 192 (-2.04%)
helped: 1 / HURT: 0

fossil-db:

Lunar Lake
Totals:
Instrs: 142887258 -> 142890605 (+0.00%); split: -0.00%, +0.00%
Cycle count: 21653599282 -> 21655049536 (+0.01%); split: -0.00%, +0.01%
Max live registers: 47942973 -> 47942837 (-0.00%)

Totals from 22209 (4.01% of 553251) affected shaders:
Instrs: 4337679 -> 4341026 (+0.08%); split: -0.00%, +0.08%
Cycle count: 261852040 -> 263302294 (+0.55%); split: -0.38%, +0.93%
Max live registers: 1299670 -> 1299534 (-0.01%)

Meteor Lake, DG2, Tiger Lake, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 156599915 -> 156590882 (-0.01%); split: -0.01%, +0.00%
Cycle count: 16940072009 -> 16940902317 (+0.00%); split: -0.01%, +0.01%
Max live registers: 32610801 -> 32610488 (-0.00%)
Max dispatch width: 5730736 -> 5731744 (+0.02%); split: +0.12%, -0.11%

Totals from 35528 (5.52% of 643617) affected shaders:
Instrs: 6175409 -> 6166376 (-0.15%); split: -0.21%, +0.06%
Cycle count: 230679923 -> 231510231 (+0.36%); split: -0.46%, +0.82%
Max live registers: 1354716 -> 1354403 (-0.02%)
Max dispatch width: 167648 -> 168656 (+0.60%); split: +4.26%, -3.66%

Ice Lake
Totals:
Instrs: 155330276 -> 155318037 (-0.01%); split: -0.01%, +0.00%
Cycle count: 15019092327 -> 15019637026 (+0.00%); split: -0.00%, +0.01%
Max live registers: 32640341 -> 32637305 (-0.01%)
Max dispatch width: 5780720 -> 5780688 (-0.00%); split: +0.02%, -0.02%

Totals from 37773 (5.85% of 645641) affected shaders:
Instrs: 6643030 -> 6630791 (-0.18%); split: -0.24%, +0.05%
Cycle count: 223589025 -> 224133724 (+0.24%); split: -0.29%, +0.53%
Max live registers: 1491781 -> 1488745 (-0.20%)
Max dispatch width: 167600 -> 167568 (-0.02%); split: +0.75%, -0.77%

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
This commit is contained in:
Ian Romanick 2024-02-28 13:05:08 -08:00
parent f2d2014636
commit 1a7593ed36
2 changed files with 33 additions and 5 deletions

View file

@ -1979,6 +1979,7 @@ get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform)
is_scalar = get_nir_src(ntb, instr->src[0]).is_scalar;
break;
case nir_intrinsic_ballot:
case nir_intrinsic_resource_intel:
is_scalar = !def.divergent;
break;
@ -6403,8 +6404,33 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
dest.type = BRW_TYPE_UD;
}
const brw_reg value = get_nir_src(ntb, instr->src[0]);
bld.emit(SHADER_OPCODE_BALLOT, dest, value);
brw_reg value = get_nir_src(ntb, instr->src[0]);
/* A ballot will always be at the full dispatch width even if the
* use of the ballot result is smaller. If the source is_scalar,
* it may be allocated at less than the full dispatch width (e.g.,
* allocated at SIMD8 with SIMD32 dispatch). The input may or may
* not be stride=0. If it is not, the generated ballot
*
* ballot(32) dst, value<1>
*
* is invalid because it will read out of bounds from value.
*
* To account for this, modify the stride of an is_scalar input to be
* zero.
*/
if (value.is_scalar)
value = component(value, 0);
/* Note the use of bld here instead of xbld. As mentioned above, the
* ballot must execute on all SIMD lanes regardless of the amount of
* data (i.e., scalar or not scalar) generated.
*/
fs_inst *inst = bld.emit(SHADER_OPCODE_BALLOT, dest, value);
if (dest.is_scalar)
inst->size_written = dest.component_size(xbld.dispatch_width());
break;
}

View file

@ -519,20 +519,22 @@ brw_lower_ballot(fs_visitor &s, bblock_t *block, fs_inst *inst)
brw_reg value = retype(inst->src[0], BRW_TYPE_UD);
brw_reg dst = inst->dst;
const fs_builder xbld = dst.is_scalar ? bld.scalar_group() : bld;
if (value.file == IMM) {
/* Implement a fast-path for ballot(true). */
if (!value.is_zero()) {
brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
bld.exec_all().emit(SHADER_OPCODE_LOAD_LIVE_CHANNELS, tmp);
bld.MOV(dst, brw_reg(component(tmp, 0)));
xbld.MOV(dst, brw_reg(component(tmp, 0)));
} else {
brw_reg zero = retype(brw_imm_uq(0), dst.type);
bld.MOV(dst, zero);
xbld.MOV(dst, zero);
}
} else {
brw_reg flag = brw_fill_flag(bld, 0);
bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
bld.MOV(dst, flag);
xbld.MOV(dst, flag);
}
inst->remove(block);