brw/build: Use SIMD8 temporaries in emit_uniformize

The fossil-db results are very different from v1. This is now mostly
helpful on older platforms.

v2: When optimizing BROADCAST or FIND_LIVE_CHANNEL to a simple MOV,
adjust the exec_size to match the size allocated for the destination
register. Fixes EU validation failures in some piglit OpenCL tests
(e.g., atomic_add-global-return.cl).

v3: Use component_size() in emit_uniformize and BROADCAST to properly
account for UQ vs UD destination. This doesn't matter for
emit_uniformize because the type is always UD, but it is technically
more correct.

v4: Update trace checksums. Now amly expects the same checksum as
several other platforms.

v5: Use xbld.dispatch_width() in the builder for when scalar_group()
eventually becomes SIMD1. Suggested by Lionel.

shader-db:

Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18091701 -> 18091586 (<.01%)
instructions in affected programs: 29616 -> 29501 (-0.39%)
helped: 28 / HURT: 18

total cycles in shared programs: 919250494 -> 919123828 (-0.01%)
cycles in affected programs: 12201102 -> 12074436 (-1.04%)
helped: 124 / HURT: 108

LOST:   0
GAINED: 1

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20480808 -> 20480624 (<.01%)
instructions in affected programs: 58465 -> 58281 (-0.31%)
helped: 61 / HURT: 20

total cycles in shared programs: 874860168 -> 874960312 (0.01%)
cycles in affected programs: 18240986 -> 18341130 (0.55%)
helped: 113 / HURT: 158

total spills in shared programs: 4557 -> 4555 (-0.04%)
spills in affected programs: 93 -> 91 (-2.15%)
helped: 1 / HURT: 0

total fills in shared programs: 5247 -> 5243 (-0.08%)
fills in affected programs: 224 -> 220 (-1.79%)
helped: 1 / HURT: 0

fossil-db:

Lunar Lake
Totals:
Instrs: 220486064 -> 220486959 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 14102592 -> 14102624 (+0.00%)
Cycle count: 31602733838 -> 31604733270 (+0.01%); split: -0.01%, +0.02%
Max live registers: 65371025 -> 65355084 (-0.02%)

Totals from 12130 (1.73% of 702392) affected shaders:
Instrs: 5162700 -> 5163595 (+0.02%); split: -0.06%, +0.08%
Subgroup size: 388128 -> 388160 (+0.01%)
Cycle count: 751721956 -> 753721388 (+0.27%); split: -0.54%, +0.81%
Max live registers: 1538550 -> 1522609 (-1.04%)

Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 241601142 -> 241599114 (-0.00%); split: -0.00%, +0.00%
Subgroup size: 9631168 -> 9631216 (+0.00%)
Cycle count: 25101781573 -> 25097909570 (-0.02%); split: -0.03%, +0.01%
Max live registers: 41540611 -> 41514296 (-0.06%)
Max dispatch width: 6993456 -> 7000928 (+0.11%); split: +0.15%, -0.05%

Totals from 16852 (2.11% of 796880) affected shaders:
Instrs: 6303937 -> 6301909 (-0.03%); split: -0.11%, +0.07%
Subgroup size: 323592 -> 323640 (+0.01%)
Cycle count: 625455880 -> 621583877 (-0.62%); split: -1.20%, +0.58%
Max live registers: 1072491 -> 1046176 (-2.45%)
Max dispatch width: 76672 -> 84144 (+9.75%); split: +14.04%, -4.30%

Tiger Lake
Totals:
Instrs: 235190395 -> 235193286 (+0.00%); split: -0.00%, +0.00%
Cycle count: 23130855720 -> 23128936334 (-0.01%); split: -0.02%, +0.01%
Max live registers: 41644106 -> 41620052 (-0.06%)
Max dispatch width: 6959160 -> 6981512 (+0.32%); split: +0.34%, -0.02%

Totals from 15102 (1.90% of 793371) affected shaders:
Instrs: 5771042 -> 5773933 (+0.05%); split: -0.06%, +0.11%
Cycle count: 371062226 -> 369142840 (-0.52%); split: -1.04%, +0.52%
Max live registers: 989858 -> 965804 (-2.43%)
Max dispatch width: 61344 -> 83696 (+36.44%); split: +38.42%, -1.98%

Ice Lake and Skylake had similar results. (Ice Lake shown)
Totals:
Instrs: 236063150 -> 236063242 (+0.00%); split: -0.00%, +0.00%
Cycle count: 24516187174 -> 24516027518 (-0.00%); split: -0.00%, +0.00%
Spill count: 567071 -> 567049 (-0.00%)
Fill count: 701323 -> 701273 (-0.01%)
Max live registers: 41914047 -> 41913281 (-0.00%)
Max dispatch width: 7042608 -> 7042736 (+0.00%); split: +0.00%, -0.00%

Totals from 3904 (0.49% of 798473) affected shaders:
Instrs: 2809690 -> 2809782 (+0.00%); split: -0.02%, +0.03%
Cycle count: 182114259 -> 181954603 (-0.09%); split: -0.34%, +0.25%
Spill count: 1696 -> 1674 (-1.30%)
Fill count: 2523 -> 2473 (-1.98%)
Max live registers: 341695 -> 340929 (-0.22%)
Max dispatch width: 32752 -> 32880 (+0.39%); split: +0.44%, -0.05%

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32097>
This commit is contained in:
Ian Romanick 2024-10-15 15:51:22 -07:00 committed by Marge Bot
parent d2b266187d
commit 662339a2ff
4 changed files with 37 additions and 8 deletions

View file

@ -88,7 +88,7 @@ traces:
gl-intel-glk: gl-intel-glk:
checksum: 06f4222f7f5737f93ed1d191cbdc0798 checksum: 06f4222f7f5737f93ed1d191cbdc0798
gl-intel-amly: gl-intel-amly:
checksum: 06d587a2b934295da6ad874b750b9c9d checksum: 06f4222f7f5737f93ed1d191cbdc0798
gl-intel-kbl: gl-intel-kbl:
checksum: 06f4222f7f5737f93ed1d191cbdc0798 checksum: 06f4222f7f5737f93ed1d191cbdc0798
gl-intel-whl: gl-intel-whl:
@ -191,7 +191,7 @@ traces:
gl-intel-glk: gl-intel-glk:
checksum: f9309a25e696938c0372c1afc781d01b checksum: f9309a25e696938c0372c1afc781d01b
gl-intel-amly: gl-intel-amly:
checksum: 10e49cd5a5e12d4a01f504c14b4335cc checksum: f9309a25e696938c0372c1afc781d01b
gl-intel-kbl: gl-intel-kbl:
checksum: f9309a25e696938c0372c1afc781d01b checksum: f9309a25e696938c0372c1afc781d01b
gl-intel-whl: gl-intel-whl:

View file

@ -383,13 +383,19 @@ namespace brw {
/* FIXME: We use a vector chan_index and dst to allow constant and /* FIXME: We use a vector chan_index and dst to allow constant and
* copy propagation to move result all the way into the consuming * copy propagation to move result all the way into the consuming
* instruction (typically a surface index or sampler index for a * instruction (typically a surface index or sampler index for a
* send). This uses 1 or 3 extra hw registers in 16 or 32 wide * send). Once we teach const/copy propagation about scalars we
* dispatch. Once we teach const/copy propagation about scalars we
* should go back to scalar destinations here. * should go back to scalar destinations here.
*/ */
const brw_reg chan_index = vgrf(BRW_TYPE_UD); const fs_builder xbld = scalar_group();
const brw_reg chan_index = xbld.vgrf(BRW_TYPE_UD);
/* FIND_LIVE_CHANNEL will only write a single component after
* lowering. Munge size_written here to match the allocated size of
* chan_index.
*/
exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)
->size_written = chan_index.component_size(xbld.dispatch_width());
exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
return BROADCAST(src, component(chan_index, 0)); return BROADCAST(src, component(chan_index, 0));
} }
@ -792,7 +798,10 @@ namespace brw {
brw_reg brw_reg
BROADCAST(brw_reg value, brw_reg index) const BROADCAST(brw_reg value, brw_reg index) const
{ {
const brw_reg dst = vgrf(value.type); const fs_builder xbld = scalar_group();
const brw_reg dst = xbld.vgrf(value.type);
assert(is_uniform(index));
/* Ensure that the source of a broadcast is always register aligned. /* Ensure that the source of a broadcast is always register aligned.
* See brw_broadcast() non-scalar case for more details. * See brw_broadcast() non-scalar case for more details.
@ -800,7 +809,12 @@ namespace brw {
if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0) if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0)
value = MOV(value); value = MOV(value);
exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index); /* BROADCAST will only write a single component after lowering. Munge
* size_written here to match the allocated size of dst.
*/
exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index)
->size_written = dst.component_size(xbld.dispatch_width());
return component(dst, 0); return component(dst, 0);
} }

View file

@ -456,6 +456,14 @@ brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
inst->opcode = BRW_OPCODE_MOV; inst->opcode = BRW_OPCODE_MOV;
inst->src[0] = brw_imm_ud(0u); inst->src[0] = brw_imm_ud(0u);
inst->force_writemask_all = true; inst->force_writemask_all = true;
/* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
* size_written set by hand to a smaller value. In this case,
* munge the exec_size to match.
*/
if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
inst->exec_size = 8 * reg_unit(s.devinfo);
inst->resize_sources(1); inst->resize_sources(1);
progress = true; progress = true;
@ -475,7 +483,10 @@ brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
bcast->opcode = BRW_OPCODE_MOV; bcast->opcode = BRW_OPCODE_MOV;
if (!is_uniform(bcast->src[0])) if (!is_uniform(bcast->src[0]))
bcast->src[0] = component(bcast->src[0], 0); bcast->src[0] = component(bcast->src[0], 0);
bcast->force_writemask_all = true; bcast->force_writemask_all = true;
bcast->exec_size = 8 * reg_unit(s.devinfo);
assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
bcast->resize_sources(1); bcast->resize_sources(1);
} }
} }

View file

@ -464,6 +464,8 @@ brw_fs_opt_algebraic(fs_visitor &s)
if (is_uniform(inst->src[0])) { if (is_uniform(inst->src[0])) {
inst->opcode = BRW_OPCODE_MOV; inst->opcode = BRW_OPCODE_MOV;
inst->force_writemask_all = true; inst->force_writemask_all = true;
inst->exec_size = 8 * reg_unit(devinfo);
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
inst->resize_sources(1); inst->resize_sources(1);
progress = true; progress = true;
} else if (inst->src[1].file == IMM) { } else if (inst->src[1].file == IMM) {
@ -480,6 +482,8 @@ brw_fs_opt_algebraic(fs_visitor &s)
const unsigned comp = inst->src[1].ud & (inst->exec_size - 1); const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
inst->src[0] = component(inst->src[0], comp); inst->src[0] = component(inst->src[0], comp);
inst->force_writemask_all = true; inst->force_writemask_all = true;
inst->exec_size = 8 * reg_unit(devinfo);
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
inst->resize_sources(1); inst->resize_sources(1);
progress = true; progress = true;
} }