From c48570d2b266970e15960e8dd0c5ba1b94d606a0 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Tue, 30 Jan 2024 18:53:05 -0800 Subject: [PATCH] brw/nir: Treat some ALU results as convergent v2: Fix for Xe2. v3: Fix handling of 64-bit CMP results. v4: Scalarize 16-bit comparison temporary destination when used as a source (as was already done for 64-bit). Suggested by Ken. shader-db: Lunar Lake total instructions in shared programs: 18096500 -> 18096549 (<.01%) instructions in affected programs: 15919 -> 15968 (0.31%) helped: 8 / HURT: 21 total cycles in shared programs: 921841300 -> 922073090 (0.03%) cycles in affected programs: 115946336 -> 116178126 (0.20%) helped: 386 / HURT: 135 Meteor Lake and DG2 (Meteor Lake shown) total instructions in shared programs: 19836053 -> 19836016 (<.01%) instructions in affected programs: 19547 -> 19510 (-0.19%) helped: 21 / HURT: 18 total cycles in shared programs: 906713777 -> 906588541 (-0.01%) cycles in affected programs: 96914584 -> 96789348 (-0.13%) helped: 335 / HURT: 134 total fills in shared programs: 6712 -> 6710 (-0.03%) fills in affected programs: 52 -> 50 (-3.85%) helped: 1 / HURT: 0 LOST: 1 GAINED: 1 Tiger Lake total instructions in shared programs: 19641284 -> 19641278 (<.01%) instructions in affected programs: 12358 -> 12352 (-0.05%) helped: 10 / HURT: 19 total cycles in shared programs: 865413131 -> 865460513 (<.01%) cycles in affected programs: 74641489 -> 74688871 (0.06%) helped: 388 / HURT: 100 total spills in shared programs: 3899 -> 3898 (-0.03%) spills in affected programs: 17 -> 16 (-5.88%) helped: 1 / HURT: 0 total fills in shared programs: 3249 -> 3245 (-0.12%) fills in affected programs: 51 -> 47 (-7.84%) helped: 1 / HURT: 0 LOST: 1 GAINED: 1 Ice Lake and Skylake had similar results. (Ice Lake shown) total instructions in shared programs: 20495826 -> 20496111 (<.01%) instructions in affected programs: 53220 -> 53505 (0.54%) helped: 28 / HURT: 16 total cycles in shared programs: 875173550 -> 875243910 (<.01%) cycles in affected programs: 51700652 -> 51771012 (0.14%) helped: 400 / HURT: 39 total spills in shared programs: 4546 -> 4546 (0.00%) spills in affected programs: 288 -> 288 (0.00%) helped: 1 / HURT: 2 total fills in shared programs: 5224 -> 5280 (1.07%) fills in affected programs: 795 -> 851 (7.04%) helped: 0 / HURT: 4 LOST: 1 GAINED: 1 fossil-db: Lunar Lake Totals: Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00% Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05% Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04% Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00% Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00% Totals from 6817 (1.24% of 551443) affected shaders: Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05% Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07% Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07% Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01% Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04% Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown) Totals: Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00% Subgroup size: 7733536 -> 7733616 (+0.00%) Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02% Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00% Max dispatch width: 5561696 -> 5561712 (+0.00%) Totals from 5672 (0.90% of 633314) affected shaders: Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07% Subgroup size: 81128 -> 81208 (+0.10%) Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03% Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01% Max dispatch width: 74136 -> 74152 (+0.02%) Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 72 ++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 257dfd12011..5ef888f0040 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -65,7 +65,7 @@ struct nir_to_brw_state { }; static brw_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel = 0); -static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def); +static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform = false); static nir_component_mask_t get_nir_write_mask(const nir_def &def); static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr); @@ -508,11 +508,10 @@ fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block) * match instr. */ static bool -optimize_extract_to_float(nir_to_brw_state &ntb, nir_alu_instr *instr, - const brw_reg &result) +optimize_extract_to_float(nir_to_brw_state &ntb, const fs_builder &bld, + nir_alu_instr *instr, const brw_reg &result) { const intel_device_info *devinfo = ntb.devinfo; - const fs_builder &bld = ntb.bld; /* No fast path for f16 (yet) or f64. */ assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32); @@ -736,20 +735,27 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb, { const intel_device_info *devinfo = ntb.devinfo; - brw_reg result = - need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud(); - - result.type = brw_type_for_nir_type(devinfo, - (nir_alu_type)(nir_op_infos[instr->op].output_type | - instr->def.bit_size)); - + bool all_sources_uniform = true; for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { op[i] = get_nir_src(ntb, instr->src[i].src, -1); op[i].type = brw_type_for_nir_type(devinfo, (nir_alu_type)(nir_op_infos[instr->op].input_types[i] | nir_src_bit_size(instr->src[i].src))); + + /* is_scalar sources won't be is_uniform because get_nir_src was passed + * -1 as the channel. + */ + if (!is_uniform(op[i]) && !op[i].is_scalar) + all_sources_uniform = false; } + brw_reg result = + need_dest ? get_nir_def(ntb, instr->def, all_sources_uniform) : bld.null_reg_ud(); + + result.type = brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_op_infos[instr->op].output_type | + instr->def.bit_size)); + /* Move and vecN instrutions may still be vectored. Return the raw, * vectored source and destination so that fs_visitor::nir_emit_alu can * handle it. Other callers should not have to handle these kinds of @@ -767,6 +773,9 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb, break; } + const bool is_scalar = result.is_scalar || (!need_dest && all_sources_uniform); + const fs_builder xbld = is_scalar ? bld.scalar_group() : bld; + /* At this point, we have dealt with any instruction that operates on * more than a single channel. Therefore, we can just adjust the source * and destination registers for that channel and emit the instruction. @@ -780,12 +789,18 @@ prepare_alu_destination_and_sources(nir_to_brw_state &ntb, assert(util_bitcount(write_mask) == 1); channel = ffs(write_mask) - 1; - result = offset(result, bld, channel); + result = offset(result, xbld, channel); } for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { assert(nir_op_infos[instr->op].input_sizes[i] < 2); - op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]); + op[i] = offset(op[i], xbld, instr->src[i].swizzle[channel]); + + /* If the dispatch width matches the scalar allocation width, offset() + * won't set the stride to zero. Force that here. + */ + if (op[i].is_scalar) + op[i] = component(op[i], 0); } return result; @@ -867,14 +882,13 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, bool need_dest) { const intel_device_info *devinfo = ntb.devinfo; - const fs_builder &bld = ntb.bld; fs_inst *inst; unsigned execution_mode = - bld.shader->nir->info.float_controls_execution_mode; + ntb.bld.shader->nir->info.float_controls_execution_mode; brw_reg op[NIR_MAX_VEC_COMPONENTS]; - brw_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest); + brw_reg result = prepare_alu_destination_and_sources(ntb, ntb.bld, instr, op, need_dest); #ifndef NDEBUG /* Everything except raw moves, some type conversions, iabs, and ineg @@ -909,6 +923,8 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, } #endif + const fs_builder &bld = result.is_scalar ? ntb.bld.scalar_group() : ntb.bld; + switch (instr->op) { case nir_op_mov: case nir_op_vec2: @@ -976,7 +992,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, case nir_op_i2f32: case nir_op_u2f32: - if (optimize_extract_to_float(ntb, instr, result)) + if (optimize_extract_to_float(ntb, bld, instr, result)) return; bld.MOV(result, op[0]); break; @@ -1056,7 +1072,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, if (extract_instr != NULL) { if (extract_instr->op == nir_op_extract_u8 || extract_instr->op == nir_op_extract_i8) { - prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false); + prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false); const unsigned byte = nir_src_as_uint(extract_instr->src[1].src); const brw_reg_type type = @@ -1065,7 +1081,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, op[0] = subscript(op[0], type, byte); } else if (extract_instr->op == nir_op_extract_u16 || extract_instr->op == nir_op_extract_i16) { - prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false); + prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false); const unsigned word = nir_src_as_uint(extract_instr->src[1].src); const brw_reg_type type = @@ -1302,6 +1318,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op)); + /* The destination will now be used as a source, so select component 0 + * if it's is_scalar (as is done in get_nir_src). + */ + if (bit_size != 32 && result.is_scalar) + dest = component(dest, 0); + if (bit_size > 32) { bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0)); } else if(bit_size < 32) { @@ -1333,6 +1355,12 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op)); + /* The destination will now be used as a source, so select component 0 + * if it's is_scalar (as is done in get_nir_src). + */ + if (bit_size != 32 && result.is_scalar) + dest = component(dest, 0); + if (bit_size > 32) { bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0)); } else if (bit_size < 32) { @@ -1357,7 +1385,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, /* The sources of the source logical instruction are now the * sources of the instruction that will be generated. */ - prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false); + prepare_alu_destination_and_sources(ntb, ntb.bld, inot_src_instr, op, false); resolve_inot_sources(ntb, bld, inot_src_instr, op); /* Smash all of the sources and destination to be signed. This @@ -1938,7 +1966,7 @@ get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src) } static brw_reg -get_nir_def(nir_to_brw_state &ntb, const nir_def &def) +get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform) { nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def); bool is_scalar = false; @@ -1963,6 +1991,8 @@ get_nir_def(nir_to_brw_state &ntb, const nir_def &def) /* This cannot be is_scalar if NIR thought it was divergent. */ assert(!(is_scalar && def.divergent)); + } else if (def.parent_instr->type == nir_instr_type_alu) { + is_scalar = store_reg == NULL && all_sources_uniform && !def.divergent; } const fs_builder &bld = is_scalar ? ntb.bld.scalar_group() : ntb.bld;