mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 16:00:08 +01:00
brw/build: Use SIMD8 temporaries in emit_uniformize
The fossil-db results are very different from v1. This is now mostly helpful on older platforms. v2: When optimizing BROADCAST or FIND_LIVE_CHANNEL to a simple MOV, adjust the exec_size to match the size allocated for the destination register. Fixes EU validation failures in some piglit OpenCL tests (e.g., atomic_add-global-return.cl). v3: Use component_size() in emit_uniformize and BROADCAST to properly account for UQ vs UD destination. This doesn't matter for emit_uniformize because the type is always UD, but it is technically more correct. v4: Update trace checksums. Now amly expects the same checksum as several other platforms. v5: Use xbld.dispatch_width() in the builder for when scalar_group() eventually becomes SIMD1. Suggested by Lionel. shader-db: Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown) total instructions in shared programs: 18091701 -> 18091586 (<.01%) instructions in affected programs: 29616 -> 29501 (-0.39%) helped: 28 / HURT: 18 total cycles in shared programs: 919250494 -> 919123828 (-0.01%) cycles in affected programs: 12201102 -> 12074436 (-1.04%) helped: 124 / HURT: 108 LOST: 0 GAINED: 1 Ice Lake and Skylake had similar results. 
(Ice Lake shown) total instructions in shared programs: 20480808 -> 20480624 (<.01%) instructions in affected programs: 58465 -> 58281 (-0.31%) helped: 61 / HURT: 20 total cycles in shared programs: 874860168 -> 874960312 (0.01%) cycles in affected programs: 18240986 -> 18341130 (0.55%) helped: 113 / HURT: 158 total spills in shared programs: 4557 -> 4555 (-0.04%) spills in affected programs: 93 -> 91 (-2.15%) helped: 1 / HURT: 0 total fills in shared programs: 5247 -> 5243 (-0.08%) fills in affected programs: 224 -> 220 (-1.79%) helped: 1 / HURT: 0 fossil-db: Lunar Lake Totals: Instrs: 220486064 -> 220486959 (+0.00%); split: -0.00%, +0.00% Subgroup size: 14102592 -> 14102624 (+0.00%) Cycle count: 31602733838 -> 31604733270 (+0.01%); split: -0.01%, +0.02% Max live registers: 65371025 -> 65355084 (-0.02%) Totals from 12130 (1.73% of 702392) affected shaders: Instrs: 5162700 -> 5163595 (+0.02%); split: -0.06%, +0.08% Subgroup size: 388128 -> 388160 (+0.01%) Cycle count: 751721956 -> 753721388 (+0.27%); split: -0.54%, +0.81% Max live registers: 1538550 -> 1522609 (-1.04%) Meteor Lake and DG2 had similar results. 
(Meteor Lake shown) Totals: Instrs: 241601142 -> 241599114 (-0.00%); split: -0.00%, +0.00% Subgroup size: 9631168 -> 9631216 (+0.00%) Cycle count: 25101781573 -> 25097909570 (-0.02%); split: -0.03%, +0.01% Max live registers: 41540611 -> 41514296 (-0.06%) Max dispatch width: 6993456 -> 7000928 (+0.11%); split: +0.15%, -0.05% Totals from 16852 (2.11% of 796880) affected shaders: Instrs: 6303937 -> 6301909 (-0.03%); split: -0.11%, +0.07% Subgroup size: 323592 -> 323640 (+0.01%) Cycle count: 625455880 -> 621583877 (-0.62%); split: -1.20%, +0.58% Max live registers: 1072491 -> 1046176 (-2.45%) Max dispatch width: 76672 -> 84144 (+9.75%); split: +14.04%, -4.30% Tiger Lake Totals: Instrs: 235190395 -> 235193286 (+0.00%); split: -0.00%, +0.00% Cycle count: 23130855720 -> 23128936334 (-0.01%); split: -0.02%, +0.01% Max live registers: 41644106 -> 41620052 (-0.06%) Max dispatch width: 6959160 -> 6981512 (+0.32%); split: +0.34%, -0.02% Totals from 15102 (1.90% of 793371) affected shaders: Instrs: 5771042 -> 5773933 (+0.05%); split: -0.06%, +0.11% Cycle count: 371062226 -> 369142840 (-0.52%); split: -1.04%, +0.52% Max live registers: 989858 -> 965804 (-2.43%) Max dispatch width: 61344 -> 83696 (+36.44%); split: +38.42%, -1.98% Ice Lake and Skylake had similar results. 
(Ice Lake shown) Totals: Instrs: 236063150 -> 236063242 (+0.00%); split: -0.00%, +0.00% Cycle count: 24516187174 -> 24516027518 (-0.00%); split: -0.00%, +0.00% Spill count: 567071 -> 567049 (-0.00%) Fill count: 701323 -> 701273 (-0.01%) Max live registers: 41914047 -> 41913281 (-0.00%) Max dispatch width: 7042608 -> 7042736 (+0.00%); split: +0.00%, -0.00% Totals from 3904 (0.49% of 798473) affected shaders: Instrs: 2809690 -> 2809782 (+0.00%); split: -0.02%, +0.03% Cycle count: 182114259 -> 181954603 (-0.09%); split: -0.34%, +0.25% Spill count: 1696 -> 1674 (-1.30%) Fill count: 2523 -> 2473 (-1.98%) Max live registers: 341695 -> 340929 (-0.22%) Max dispatch width: 32752 -> 32880 (+0.39%); split: +0.44%, -0.05% Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32097>
This commit is contained in:
parent
d2b266187d
commit
662339a2ff
4 changed files with 37 additions and 8 deletions
|
|
@ -88,7 +88,7 @@ traces:
|
||||||
gl-intel-glk:
|
gl-intel-glk:
|
||||||
checksum: 06f4222f7f5737f93ed1d191cbdc0798
|
checksum: 06f4222f7f5737f93ed1d191cbdc0798
|
||||||
gl-intel-amly:
|
gl-intel-amly:
|
||||||
checksum: 06d587a2b934295da6ad874b750b9c9d
|
checksum: 06f4222f7f5737f93ed1d191cbdc0798
|
||||||
gl-intel-kbl:
|
gl-intel-kbl:
|
||||||
checksum: 06f4222f7f5737f93ed1d191cbdc0798
|
checksum: 06f4222f7f5737f93ed1d191cbdc0798
|
||||||
gl-intel-whl:
|
gl-intel-whl:
|
||||||
|
|
@ -191,7 +191,7 @@ traces:
|
||||||
gl-intel-glk:
|
gl-intel-glk:
|
||||||
checksum: f9309a25e696938c0372c1afc781d01b
|
checksum: f9309a25e696938c0372c1afc781d01b
|
||||||
gl-intel-amly:
|
gl-intel-amly:
|
||||||
checksum: 10e49cd5a5e12d4a01f504c14b4335cc
|
checksum: f9309a25e696938c0372c1afc781d01b
|
||||||
gl-intel-kbl:
|
gl-intel-kbl:
|
||||||
checksum: f9309a25e696938c0372c1afc781d01b
|
checksum: f9309a25e696938c0372c1afc781d01b
|
||||||
gl-intel-whl:
|
gl-intel-whl:
|
||||||
|
|
|
||||||
|
|
@ -383,13 +383,19 @@ namespace brw {
|
||||||
/* FIXME: We use a vector chan_index and dst to allow constant and
|
/* FIXME: We use a vector chan_index and dst to allow constant and
|
||||||
* copy propagation to move result all the way into the consuming
|
* copy propagation to move result all the way into the consuming
|
||||||
* instruction (typically a surface index or sampler index for a
|
* instruction (typically a surface index or sampler index for a
|
||||||
* send). This uses 1 or 3 extra hw registers in 16 or 32 wide
|
* send). Once we teach const/copy propagation about scalars we
|
||||||
* dispatch. Once we teach const/copy propagation about scalars we
|
|
||||||
* should go back to scalar destinations here.
|
* should go back to scalar destinations here.
|
||||||
*/
|
*/
|
||||||
const brw_reg chan_index = vgrf(BRW_TYPE_UD);
|
const fs_builder xbld = scalar_group();
|
||||||
|
const brw_reg chan_index = xbld.vgrf(BRW_TYPE_UD);
|
||||||
|
|
||||||
|
/* FIND_LIVE_CHANNEL will only write a single component after
|
||||||
|
* lowering. Munge size_written here to match the allocated size of
|
||||||
|
* chan_index.
|
||||||
|
*/
|
||||||
|
exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)
|
||||||
|
->size_written = chan_index.component_size(xbld.dispatch_width());
|
||||||
|
|
||||||
exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
|
|
||||||
return BROADCAST(src, component(chan_index, 0));
|
return BROADCAST(src, component(chan_index, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -792,7 +798,10 @@ namespace brw {
|
||||||
brw_reg
|
brw_reg
|
||||||
BROADCAST(brw_reg value, brw_reg index) const
|
BROADCAST(brw_reg value, brw_reg index) const
|
||||||
{
|
{
|
||||||
const brw_reg dst = vgrf(value.type);
|
const fs_builder xbld = scalar_group();
|
||||||
|
const brw_reg dst = xbld.vgrf(value.type);
|
||||||
|
|
||||||
|
assert(is_uniform(index));
|
||||||
|
|
||||||
/* Ensure that the source of a broadcast is always register aligned.
|
/* Ensure that the source of a broadcast is always register aligned.
|
||||||
* See brw_broadcast() non-scalar case for more details.
|
* See brw_broadcast() non-scalar case for more details.
|
||||||
|
|
@ -800,7 +809,12 @@ namespace brw {
|
||||||
if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0)
|
if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0)
|
||||||
value = MOV(value);
|
value = MOV(value);
|
||||||
|
|
||||||
exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index);
|
/* BROADCAST will only write a single component after lowering. Munge
|
||||||
|
* size_written here to match the allocated size of dst.
|
||||||
|
*/
|
||||||
|
exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index)
|
||||||
|
->size_written = dst.component_size(xbld.dispatch_width());
|
||||||
|
|
||||||
return component(dst, 0);
|
return component(dst, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -456,6 +456,14 @@ brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
|
||||||
inst->opcode = BRW_OPCODE_MOV;
|
inst->opcode = BRW_OPCODE_MOV;
|
||||||
inst->src[0] = brw_imm_ud(0u);
|
inst->src[0] = brw_imm_ud(0u);
|
||||||
inst->force_writemask_all = true;
|
inst->force_writemask_all = true;
|
||||||
|
|
||||||
|
/* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
|
||||||
|
* size_written set by hand to a smaller value. In this case,
|
||||||
|
* munge the exec_size to match.
|
||||||
|
*/
|
||||||
|
if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
|
||||||
|
inst->exec_size = 8 * reg_unit(s.devinfo);
|
||||||
|
|
||||||
inst->resize_sources(1);
|
inst->resize_sources(1);
|
||||||
progress = true;
|
progress = true;
|
||||||
|
|
||||||
|
|
@ -475,7 +483,10 @@ brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
|
||||||
bcast->opcode = BRW_OPCODE_MOV;
|
bcast->opcode = BRW_OPCODE_MOV;
|
||||||
if (!is_uniform(bcast->src[0]))
|
if (!is_uniform(bcast->src[0]))
|
||||||
bcast->src[0] = component(bcast->src[0], 0);
|
bcast->src[0] = component(bcast->src[0], 0);
|
||||||
|
|
||||||
bcast->force_writemask_all = true;
|
bcast->force_writemask_all = true;
|
||||||
|
bcast->exec_size = 8 * reg_unit(s.devinfo);
|
||||||
|
assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
|
||||||
bcast->resize_sources(1);
|
bcast->resize_sources(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -464,6 +464,8 @@ brw_fs_opt_algebraic(fs_visitor &s)
|
||||||
if (is_uniform(inst->src[0])) {
|
if (is_uniform(inst->src[0])) {
|
||||||
inst->opcode = BRW_OPCODE_MOV;
|
inst->opcode = BRW_OPCODE_MOV;
|
||||||
inst->force_writemask_all = true;
|
inst->force_writemask_all = true;
|
||||||
|
inst->exec_size = 8 * reg_unit(devinfo);
|
||||||
|
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
|
||||||
inst->resize_sources(1);
|
inst->resize_sources(1);
|
||||||
progress = true;
|
progress = true;
|
||||||
} else if (inst->src[1].file == IMM) {
|
} else if (inst->src[1].file == IMM) {
|
||||||
|
|
@ -480,6 +482,8 @@ brw_fs_opt_algebraic(fs_visitor &s)
|
||||||
const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
|
const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
|
||||||
inst->src[0] = component(inst->src[0], comp);
|
inst->src[0] = component(inst->src[0], comp);
|
||||||
inst->force_writemask_all = true;
|
inst->force_writemask_all = true;
|
||||||
|
inst->exec_size = 8 * reg_unit(devinfo);
|
||||||
|
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
|
||||||
inst->resize_sources(1);
|
inst->resize_sources(1);
|
||||||
progress = true;
|
progress = true;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue