brw/nir: Treat some ALU results as convergent

v2: Fix for Xe2.

v3: Fix handling of 64-bit CMP results.

v4: Scalarize the 16-bit comparison temporary destination when it is
used as a source (as was already done for 64-bit). Suggested by Ken.
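
In outline: an ALU result is treated as convergent when every source is
uniform (or already scalar) and NIR did not mark the def divergent; such
results are allocated and emitted at SIMD1 instead of once per channel.
A condensed sketch of the diff below, not the verbatim patch:

    bool all_sources_uniform = true;
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       /* is_scalar sources won't be is_uniform because get_nir_src was
        * passed -1 as the channel.
        */
       if (!is_uniform(op[i]) && !op[i].is_scalar)
          all_sources_uniform = false;
    }

    /* Emit convergent results with a SIMD1 builder. */
    const fs_builder &bld =
       result.is_scalar ? ntb.bld.scalar_group() : ntb.bld;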

shader-db:

Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21

total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135

Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18

total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134

total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0

LOST:   1
GAINED: 1

Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19

total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100

total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0

total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0

LOST:   1
GAINED: 1

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16

total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39

total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2

total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4

LOST:   1
GAINED: 1

fossil-db:

Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%

Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%

Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)

Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
Author: Ian Romanick
Date:   2024-01-30 18:53:05 -08:00
Parent: 7eab2cb67e
Commit: c48570d2b2

@@ -65,7 +65,7 @@ struct nir_to_brw_state {
 };
 
 static brw_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel = 0);
-static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def);
+static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform = false);
 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
 
 static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
@@ -508,11 +508,10 @@ fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
  * match instr.
  */
 static bool
-optimize_extract_to_float(nir_to_brw_state &ntb, nir_alu_instr *instr,
-                          const brw_reg &result)
+optimize_extract_to_float(nir_to_brw_state &ntb, const fs_builder &bld,
+                          nir_alu_instr *instr, const brw_reg &result)
 {
    const intel_device_info *devinfo = ntb.devinfo;
-   const fs_builder &bld = ntb.bld;
 
    /* No fast path for f16 (yet) or f64. */
    assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
@@ -736,20 +735,27 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
 {
    const intel_device_info *devinfo = ntb.devinfo;
 
-   brw_reg result =
-      need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
-
-   result.type = brw_type_for_nir_type(devinfo,
-      (nir_alu_type)(nir_op_infos[instr->op].output_type |
-                     instr->def.bit_size));
+   bool all_sources_uniform = true;
 
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       op[i] = get_nir_src(ntb, instr->src[i].src, -1);
       op[i].type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
                        nir_src_bit_size(instr->src[i].src)));
+      /* is_scalar sources won't be is_uniform because get_nir_src was passed
+       * -1 as the channel.
+       */
+      if (!is_uniform(op[i]) && !op[i].is_scalar)
+         all_sources_uniform = false;
    }
 
+   brw_reg result =
+      need_dest ? get_nir_def(ntb, instr->def, all_sources_uniform) : bld.null_reg_ud();
+
+   result.type = brw_type_for_nir_type(devinfo,
+      (nir_alu_type)(nir_op_infos[instr->op].output_type |
+                     instr->def.bit_size));
+
    /* Move and vecN instrutions may still be vectored. Return the raw,
     * vectored source and destination so that fs_visitor::nir_emit_alu can
     * handle it. Other callers should not have to handle these kinds of
@@ -767,6 +773,9 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
       break;
    }
 
+   const bool is_scalar = result.is_scalar || (!need_dest && all_sources_uniform);
+   const fs_builder xbld = is_scalar ? bld.scalar_group() : bld;
+
    /* At this point, we have dealt with any instruction that operates on
    * more than a single channel. Therefore, we can just adjust the source
    * and destination registers for that channel and emit the instruction.
@@ -780,12 +789,18 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
       assert(util_bitcount(write_mask) == 1);
       channel = ffs(write_mask) - 1;
 
-      result = offset(result, bld, channel);
+      result = offset(result, xbld, channel);
    }
 
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
-      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
+      op[i] = offset(op[i], xbld, instr->src[i].swizzle[channel]);
+
+      /* If the dispatch width matches the scalar allocation width, offset()
+       * won't set the stride to zero. Force that here.
+       */
+      if (op[i].is_scalar)
+         op[i] = component(op[i], 0);
    }
 
    return result;
@@ -867,14 +882,13 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
                 bool need_dest)
 {
    const intel_device_info *devinfo = ntb.devinfo;
-   const fs_builder &bld = ntb.bld;
    fs_inst *inst;
 
    unsigned execution_mode =
-      bld.shader->nir->info.float_controls_execution_mode;
+      ntb.bld.shader->nir->info.float_controls_execution_mode;
 
    brw_reg op[NIR_MAX_VEC_COMPONENTS];
-   brw_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
+   brw_reg result = prepare_alu_destination_and_sources(ntb, ntb.bld, instr, op, need_dest);
 
 #ifndef NDEBUG
    /* Everything except raw moves, some type conversions, iabs, and ineg
@@ -909,6 +923,8 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
    }
 #endif
 
+   const fs_builder &bld = result.is_scalar ? ntb.bld.scalar_group() : ntb.bld;
+
    switch (instr->op) {
    case nir_op_mov:
    case nir_op_vec2:
@@ -976,7 +992,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
 
    case nir_op_i2f32:
    case nir_op_u2f32:
-      if (optimize_extract_to_float(ntb, instr, result))
+      if (optimize_extract_to_float(ntb, bld, instr, result))
         return;
      bld.MOV(result, op[0]);
      break;
@@ -1056,7 +1072,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
      if (extract_instr != NULL) {
         if (extract_instr->op == nir_op_extract_u8 ||
             extract_instr->op == nir_op_extract_i8) {
-            prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
+            prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
            const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
 
            const brw_reg_type type =
@@ -1065,7 +1081,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
            op[0] = subscript(op[0], type, byte);
         } else if (extract_instr->op == nir_op_extract_u16 ||
                    extract_instr->op == nir_op_extract_i16) {
-            prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
+            prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
            const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
 
            const brw_reg_type type =
@@ -1302,6 +1318,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
 
       bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
 
+      /* The destination will now be used as a source, so select component 0
+       * if it's is_scalar (as is done in get_nir_src).
+       */
+      if (bit_size != 32 && result.is_scalar)
+         dest = component(dest, 0);
+
      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
      } else if (bit_size < 32) {
@@ -1333,6 +1355,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
         bld.CMP(dest, op[0], op[1],
                 brw_cmod_for_nir_comparison(instr->op));
 
+         /* The destination will now be used as a source, so select component 0
+          * if it's is_scalar (as is done in get_nir_src).
+          */
+         if (bit_size != 32 && result.is_scalar)
+            dest = component(dest, 0);
+
         if (bit_size > 32) {
            bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
         } else if (bit_size < 32) {
@@ -1357,7 +1385,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
         /* The sources of the source logical instruction are now the
          * sources of the instruction that will be generated.
          */
-         prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
+         prepare_alu_destination_and_sources(ntb, ntb.bld, inot_src_instr, op, false);
         resolve_inot_sources(ntb, bld, inot_src_instr, op);
 
         /* Smash all of the sources and destination to be signed. This
@@ -1938,7 +1966,7 @@ get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src)
 }
 
 static brw_reg
-get_nir_def(nir_to_brw_state &ntb, const nir_def &def)
+get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform)
 {
    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
    bool is_scalar = false;
@@ -1963,6 +1991,8 @@ get_nir_def(nir_to_brw_state &ntb, const nir_def &def)
 
       /* This cannot be is_scalar if NIR thought it was divergent. */
       assert(!(is_scalar && def.divergent));
+   } else if (def.parent_instr->type == nir_instr_type_alu) {
+      is_scalar = store_reg == NULL && all_sources_uniform && !def.divergent;
    }
 
    const fs_builder &bld = is_scalar ? ntb.bld.scalar_group() : ntb.bld;
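
A note on the v3/v4 fixes: for comparisons with non-32-bit results, CMP
writes a 32-bit temporary that is immediately re-read as a source to
produce the final bit size. When that temporary is is_scalar, it must be
addressed as a single component, just as get_nir_src does for scalar
registers. A minimal sketch, mirroring the two CMP hunks above:

    if (bit_size != 32 && result.is_scalar)
       dest = component(dest, 0);   /* component 0 with stride 0 */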