mirror of https://gitlab.freedesktop.org/mesa/mesa.git
brw/nir: Treat some ALU results as convergent

v2: Fix for Xe2.

v3: Fix handling of 64-bit CMP results.

v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.

shader-db:

Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21

total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135

Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18

total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134

total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0

LOST:   1
GAINED: 1

Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19

total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100

total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0

total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0

LOST:   1
GAINED: 1

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16

total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39

total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2

total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4

LOST:   1
GAINED: 1

fossil-db:

Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%

Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%

Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)

Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
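The rule the patch applies can be summarized as: an ALU destination may be given a scalar (convergent) allocation when NIR has not marked the def divergent and every source is either uniform or already scalar. The following standalone C++ sketch restates that check for illustration only; the names (Source, Def, alu_result_is_convergent) are invented here and are not part of the brw backend.

// Standalone illustration of the convergence rule; names are invented
// for this sketch and do not appear in the brw backend.
#include <vector>

struct Source {
   bool is_uniform;  // same value in every SIMD channel
   bool is_scalar;   // already allocated as a single component
};

struct Def {
   bool divergent;   // NIR divergence analysis marked the def divergent
};

// An ALU result may be treated as convergent (scalar allocation) only
// when NIR did not mark it divergent and every source is channel-invariant.
static bool
alu_result_is_convergent(const Def &def, const std::vector<Source> &srcs)
{
   if (def.divergent)
      return false;

   for (const Source &s : srcs) {
      if (!s.is_uniform && !s.is_scalar)
         return false;
   }

   return true;
}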
commit c48570d2b2
parent 7eab2cb67e
1 changed file with 51 additions and 21 deletions
@@ -65,7 +65,7 @@ struct nir_to_brw_state {
 };
 
 static brw_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel = 0);
-static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def);
+static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform = false);
 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
 
 static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
@@ -508,11 +508,10 @@ fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
  * match instr.
  */
 static bool
-optimize_extract_to_float(nir_to_brw_state &ntb, nir_alu_instr *instr,
-                          const brw_reg &result)
+optimize_extract_to_float(nir_to_brw_state &ntb, const fs_builder &bld,
+                          nir_alu_instr *instr, const brw_reg &result)
 {
    const intel_device_info *devinfo = ntb.devinfo;
-   const fs_builder &bld = ntb.bld;
 
    /* No fast path for f16 (yet) or f64. */
    assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
@@ -736,20 +735,27 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
 {
    const intel_device_info *devinfo = ntb.devinfo;
 
-   brw_reg result =
-      need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
-
-   result.type = brw_type_for_nir_type(devinfo,
-      (nir_alu_type)(nir_op_infos[instr->op].output_type |
-                     instr->def.bit_size));
-
+   bool all_sources_uniform = true;
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       op[i] = get_nir_src(ntb, instr->src[i].src, -1);
       op[i].type = brw_type_for_nir_type(devinfo,
          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
                         nir_src_bit_size(instr->src[i].src)));
+
+      /* is_scalar sources won't be is_uniform because get_nir_src was passed
+       * -1 as the channel.
+       */
+      if (!is_uniform(op[i]) && !op[i].is_scalar)
+         all_sources_uniform = false;
    }
 
+   brw_reg result =
+      need_dest ? get_nir_def(ntb, instr->def, all_sources_uniform) : bld.null_reg_ud();
+
+   result.type = brw_type_for_nir_type(devinfo,
+      (nir_alu_type)(nir_op_infos[instr->op].output_type |
+                     instr->def.bit_size));
+
    /* Move and vecN instrutions may still be vectored. Return the raw,
     * vectored source and destination so that fs_visitor::nir_emit_alu can
     * handle it. Other callers should not have to handle these kinds of
@@ -767,6 +773,9 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
       break;
    }
 
+   const bool is_scalar = result.is_scalar || (!need_dest && all_sources_uniform);
+   const fs_builder xbld = is_scalar ? bld.scalar_group() : bld;
+
    /* At this point, we have dealt with any instruction that operates on
    * more than a single channel. Therefore, we can just adjust the source
    * and destination registers for that channel and emit the instruction.
@@ -780,12 +789,18 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
       assert(util_bitcount(write_mask) == 1);
       channel = ffs(write_mask) - 1;
 
-      result = offset(result, bld, channel);
+      result = offset(result, xbld, channel);
    }
 
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
-      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
+      op[i] = offset(op[i], xbld, instr->src[i].swizzle[channel]);
+
+      /* If the dispatch width matches the scalar allocation width, offset()
+       * won't set the stride to zero. Force that here.
+       */
+      if (op[i].is_scalar)
+         op[i] = component(op[i], 0);
    }
 
    return result;
@@ -867,14 +882,13 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
                 bool need_dest)
 {
    const intel_device_info *devinfo = ntb.devinfo;
-   const fs_builder &bld = ntb.bld;
 
    fs_inst *inst;
    unsigned execution_mode =
-      bld.shader->nir->info.float_controls_execution_mode;
+      ntb.bld.shader->nir->info.float_controls_execution_mode;
 
    brw_reg op[NIR_MAX_VEC_COMPONENTS];
-   brw_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
+   brw_reg result = prepare_alu_destination_and_sources(ntb, ntb.bld, instr, op, need_dest);
 
 #ifndef NDEBUG
    /* Everything except raw moves, some type conversions, iabs, and ineg
@@ -909,6 +923,8 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
    }
 #endif
 
+   const fs_builder &bld = result.is_scalar ? ntb.bld.scalar_group() : ntb.bld;
+
    switch (instr->op) {
    case nir_op_mov:
    case nir_op_vec2:
@@ -976,7 +992,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
 
    case nir_op_i2f32:
    case nir_op_u2f32:
-      if (optimize_extract_to_float(ntb, instr, result))
+      if (optimize_extract_to_float(ntb, bld, instr, result))
          return;
       bld.MOV(result, op[0]);
       break;
@@ -1056,7 +1072,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
       if (extract_instr != NULL) {
          if (extract_instr->op == nir_op_extract_u8 ||
              extract_instr->op == nir_op_extract_i8) {
-            prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
+            prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
 
             const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
             const brw_reg_type type =
@@ -1065,7 +1081,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
             op[0] = subscript(op[0], type, byte);
          } else if (extract_instr->op == nir_op_extract_u16 ||
                     extract_instr->op == nir_op_extract_i16) {
-            prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
+            prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
 
             const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
             const brw_reg_type type =
@@ -1302,6 +1318,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
 
       bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
 
+      /* The destination will now be used as a source, so select component 0
+       * if it's is_scalar (as is done in get_nir_src).
+       */
+      if (bit_size != 32 && result.is_scalar)
+         dest = component(dest, 0);
+
      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
      } else if(bit_size < 32) {
@@ -1333,6 +1355,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
      bld.CMP(dest, op[0], op[1],
              brw_cmod_for_nir_comparison(instr->op));
 
+      /* The destination will now be used as a source, so select component 0
+       * if it's is_scalar (as is done in get_nir_src).
+       */
+      if (bit_size != 32 && result.is_scalar)
+         dest = component(dest, 0);
+
      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
      } else if (bit_size < 32) {
@@ -1357,7 +1385,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
          /* The sources of the source logical instruction are now the
           * sources of the instruction that will be generated.
           */
-         prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
+         prepare_alu_destination_and_sources(ntb, ntb.bld, inot_src_instr, op, false);
          resolve_inot_sources(ntb, bld, inot_src_instr, op);
 
          /* Smash all of the sources and destination to be signed. This
@@ -1938,7 +1966,7 @@ get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src)
 }
 
 static brw_reg
-get_nir_def(nir_to_brw_state &ntb, const nir_def &def)
+get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform)
 {
    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
    bool is_scalar = false;
@@ -1963,6 +1991,8 @@ get_nir_def(nir_to_brw_state &ntb, const nir_def &def)
 
       /* This cannot be is_scalar if NIR thought it was divergent. */
       assert(!(is_scalar && def.divergent));
+   } else if (def.parent_instr->type == nir_instr_type_alu) {
+      is_scalar = store_reg == NULL && all_sources_uniform && !def.divergent;
    }
 
    const fs_builder &bld = is_scalar ? ntb.bld.scalar_group() : ntb.bld;