From 662339a2ff4db1b8180778a1449f317da3898f8c Mon Sep 17 00:00:00 2001
From: Ian Romanick
Date: Tue, 15 Oct 2024 15:51:22 -0700
Subject: [PATCH] brw/build: Use SIMD8 temporaries in emit_uniformize

The fossil-db results are very different from v1. This is now mostly
helpful on older platforms.

v2: When optimizing BROADCAST or FIND_LIVE_CHANNEL to a simple MOV,
adjust the exec_size to match the size allocated for the destination
register. Fixes EU validation failures in some piglit OpenCL tests
(e.g., atomic_add-global-return.cl).

v3: Use component_size() in emit_uniformize and BROADCAST to properly
account for UQ vs UD destination. This doesn't matter for
emit_uniformize because the type is always UD, but it is technically
more correct.

v4: Update trace checksums. Now amly expects the same checksum as
several other platforms.

v5: Use xbld.dispatch_width() in the builder for when scalar_group()
eventually becomes SIMD1. Suggested by Lionel.

shader-db:

Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18091701 -> 18091586 (<.01%)
instructions in affected programs: 29616 -> 29501 (-0.39%)
helped: 28 / HURT: 18

total cycles in shared programs: 919250494 -> 919123828 (-0.01%)
cycles in affected programs: 12201102 -> 12074436 (-1.04%)
helped: 124 / HURT: 108

LOST: 0
GAINED: 1

Ice Lake and Skylake had similar results.
(Ice Lake shown)
total instructions in shared programs: 20480808 -> 20480624 (<.01%)
instructions in affected programs: 58465 -> 58281 (-0.31%)
helped: 61 / HURT: 20

total cycles in shared programs: 874860168 -> 874960312 (0.01%)
cycles in affected programs: 18240986 -> 18341130 (0.55%)
helped: 113 / HURT: 158

total spills in shared programs: 4557 -> 4555 (-0.04%)
spills in affected programs: 93 -> 91 (-2.15%)
helped: 1 / HURT: 0

total fills in shared programs: 5247 -> 5243 (-0.08%)
fills in affected programs: 224 -> 220 (-1.79%)
helped: 1 / HURT: 0

fossil-db:

Lunar Lake
Totals:
Instrs: 220486064 -> 220486959 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 14102592 -> 14102624 (+0.00%)
Cycle count: 31602733838 -> 31604733270 (+0.01%); split: -0.01%, +0.02%
Max live registers: 65371025 -> 65355084 (-0.02%)

Totals from 12130 (1.73% of 702392) affected shaders:
Instrs: 5162700 -> 5163595 (+0.02%); split: -0.06%, +0.08%
Subgroup size: 388128 -> 388160 (+0.01%)
Cycle count: 751721956 -> 753721388 (+0.27%); split: -0.54%, +0.81%
Max live registers: 1538550 -> 1522609 (-1.04%)

Meteor Lake and DG2 had similar results.
(Meteor Lake shown)
Totals:
Instrs: 241601142 -> 241599114 (-0.00%); split: -0.00%, +0.00%
Subgroup size: 9631168 -> 9631216 (+0.00%)
Cycle count: 25101781573 -> 25097909570 (-0.02%); split: -0.03%, +0.01%
Max live registers: 41540611 -> 41514296 (-0.06%)
Max dispatch width: 6993456 -> 7000928 (+0.11%); split: +0.15%, -0.05%

Totals from 16852 (2.11% of 796880) affected shaders:
Instrs: 6303937 -> 6301909 (-0.03%); split: -0.11%, +0.07%
Subgroup size: 323592 -> 323640 (+0.01%)
Cycle count: 625455880 -> 621583877 (-0.62%); split: -1.20%, +0.58%
Max live registers: 1072491 -> 1046176 (-2.45%)
Max dispatch width: 76672 -> 84144 (+9.75%); split: +14.04%, -4.30%

Tiger Lake
Totals:
Instrs: 235190395 -> 235193286 (+0.00%); split: -0.00%, +0.00%
Cycle count: 23130855720 -> 23128936334 (-0.01%); split: -0.02%, +0.01%
Max live registers: 41644106 -> 41620052 (-0.06%)
Max dispatch width: 6959160 -> 6981512 (+0.32%); split: +0.34%, -0.02%

Totals from 15102 (1.90% of 793371) affected shaders:
Instrs: 5771042 -> 5773933 (+0.05%); split: -0.06%, +0.11%
Cycle count: 371062226 -> 369142840 (-0.52%); split: -1.04%, +0.52%
Max live registers: 989858 -> 965804 (-2.43%)
Max dispatch width: 61344 -> 83696 (+36.44%); split: +38.42%, -1.98%

Ice Lake and Skylake had similar results.
(Ice Lake shown) Totals: Instrs: 236063150 -> 236063242 (+0.00%); split: -0.00%, +0.00% Cycle count: 24516187174 -> 24516027518 (-0.00%); split: -0.00%, +0.00% Spill count: 567071 -> 567049 (-0.00%) Fill count: 701323 -> 701273 (-0.01%) Max live registers: 41914047 -> 41913281 (-0.00%) Max dispatch width: 7042608 -> 7042736 (+0.00%); split: +0.00%, -0.00% Totals from 3904 (0.49% of 798473) affected shaders: Instrs: 2809690 -> 2809782 (+0.00%); split: -0.02%, +0.03% Cycle count: 182114259 -> 181954603 (-0.09%); split: -0.34%, +0.25% Spill count: 1696 -> 1674 (-1.30%) Fill count: 2523 -> 2473 (-1.98%) Max live registers: 341695 -> 340929 (-0.22%) Max dispatch width: 32752 -> 32880 (+0.39%); split: +0.44%, -0.05% Reviewed-by: Kenneth Graunke Part-of: --- src/intel/ci/traces-iris.yml | 4 ++-- src/intel/compiler/brw_fs_builder.h | 26 ++++++++++++++++----- src/intel/compiler/brw_fs_opt.cpp | 11 +++++++++ src/intel/compiler/brw_fs_opt_algebraic.cpp | 4 ++++ 4 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/intel/ci/traces-iris.yml b/src/intel/ci/traces-iris.yml index 43556e2fe8d..a0f814bdb7b 100644 --- a/src/intel/ci/traces-iris.yml +++ b/src/intel/ci/traces-iris.yml @@ -88,7 +88,7 @@ traces: gl-intel-glk: checksum: 06f4222f7f5737f93ed1d191cbdc0798 gl-intel-amly: - checksum: 06d587a2b934295da6ad874b750b9c9d + checksum: 06f4222f7f5737f93ed1d191cbdc0798 gl-intel-kbl: checksum: 06f4222f7f5737f93ed1d191cbdc0798 gl-intel-whl: @@ -191,7 +191,7 @@ traces: gl-intel-glk: checksum: f9309a25e696938c0372c1afc781d01b gl-intel-amly: - checksum: 10e49cd5a5e12d4a01f504c14b4335cc + checksum: f9309a25e696938c0372c1afc781d01b gl-intel-kbl: checksum: f9309a25e696938c0372c1afc781d01b gl-intel-whl: diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h index 807b566531b..c4896f427af 100644 --- a/src/intel/compiler/brw_fs_builder.h +++ b/src/intel/compiler/brw_fs_builder.h @@ -383,13 +383,19 @@ namespace brw { /* FIXME: We use a vector chan_index 
and dst to allow constant and * copy propagration to move result all the way into the consuming * instruction (typically a surface index or sampler index for a - * send). This uses 1 or 3 extra hw registers in 16 or 32 wide - * dispatch. Once we teach const/copy propagation about scalars we + * send). Once we teach const/copy propagation about scalars we * should go back to scalar destinations here. */ - const brw_reg chan_index = vgrf(BRW_TYPE_UD); + const fs_builder xbld = scalar_group(); + const brw_reg chan_index = xbld.vgrf(BRW_TYPE_UD); + + /* FIND_LIVE_CHANNEL will only write a single component after + * lowering. Munge size_written here to match the allocated size of + * chan_index. + */ + exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index) + ->size_written = chan_index.component_size(xbld.dispatch_width()); - exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); return BROADCAST(src, component(chan_index, 0)); } @@ -792,7 +798,10 @@ namespace brw { brw_reg BROADCAST(brw_reg value, brw_reg index) const { - const brw_reg dst = vgrf(value.type); + const fs_builder xbld = scalar_group(); + const brw_reg dst = xbld.vgrf(value.type); + + assert(is_uniform(index)); /* Ensure that the source of a broadcast is always register aligned. * See brw_broadcast() non-scalar case for more details. @@ -800,7 +809,12 @@ namespace brw { if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0) value = MOV(value); - exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index); + /* BROADCAST will only write a single component after lowering. Munge + * size_written here to match the allocated size of dst. 
+ */ + exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index) + ->size_written = dst.component_size(xbld.dispatch_width()); + return component(dst, 0); } diff --git a/src/intel/compiler/brw_fs_opt.cpp b/src/intel/compiler/brw_fs_opt.cpp index 3ec97916e26..3d04f018d89 100644 --- a/src/intel/compiler/brw_fs_opt.cpp +++ b/src/intel/compiler/brw_fs_opt.cpp @@ -456,6 +456,14 @@ brw_fs_opt_eliminate_find_live_channel(fs_visitor &s) inst->opcode = BRW_OPCODE_MOV; inst->src[0] = brw_imm_ud(0u); inst->force_writemask_all = true; + + /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have + * size_written set by hand to a smaller value. In this case, + * munge the exec_size to match. + */ + if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo))) + inst->exec_size = 8 * reg_unit(s.devinfo); + inst->resize_sources(1); progress = true; @@ -475,7 +483,10 @@ brw_fs_opt_eliminate_find_live_channel(fs_visitor &s) bcast->opcode = BRW_OPCODE_MOV; if (!is_uniform(bcast->src[0])) bcast->src[0] = component(bcast->src[0], 0); + bcast->force_writemask_all = true; + bcast->exec_size = 8 * reg_unit(s.devinfo); + assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size)); bcast->resize_sources(1); } } diff --git a/src/intel/compiler/brw_fs_opt_algebraic.cpp b/src/intel/compiler/brw_fs_opt_algebraic.cpp index 94f52d574f4..b34ea251261 100644 --- a/src/intel/compiler/brw_fs_opt_algebraic.cpp +++ b/src/intel/compiler/brw_fs_opt_algebraic.cpp @@ -464,6 +464,8 @@ brw_fs_opt_algebraic(fs_visitor &s) if (is_uniform(inst->src[0])) { inst->opcode = BRW_OPCODE_MOV; inst->force_writemask_all = true; + inst->exec_size = 8 * reg_unit(devinfo); + assert(inst->size_written == inst->dst.component_size(inst->exec_size)); inst->resize_sources(1); progress = true; } else if (inst->src[1].file == IMM) { @@ -480,6 +482,8 @@ brw_fs_opt_algebraic(fs_visitor &s) const unsigned comp = inst->src[1].ud & (inst->exec_size - 1); inst->src[0] = component(inst->src[0], comp); 
inst->force_writemask_all = true; + inst->exec_size = 8 * reg_unit(devinfo); + assert(inst->size_written == inst->dst.component_size(inst->exec_size)); inst->resize_sources(1); progress = true; }