From 7aad19ccd252e259edf1fce7eef175abd6619fbd Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Thu, 24 Oct 2024 16:35:38 -0700 Subject: [PATCH] brw/lower: Lower invalid source conversion to better code There are two fragment shaders from RDR2 that is hurt for spills and fills on Lunar Lake. Totals from 2 (0.00% of 551413) affected shaders: Spill count: 1252 -> 1317 (+5.19%) Fill count: 2518 -> 2642 (+4.92%) Those shaders... have a lot of room for improvement. There are some patterns in those shaders that we handle very, very poorly. Improving those patterns would likely improve the spills and fills in these shaders quite dramatically. Given how much other platforms are helped, I don't this should block this commit. No shader-db or fossil-db changes on any pre-Gfx12.5 Intel platforms. v2: Add some comments and an additional assertion. Suggested by Ken. shader-db: Lunar Lake total instructions in shared programs: 18094517 -> 18094511 (<.01%) instructions in affected programs: 809 -> 803 (-0.74%) helped: 6 / HURT: 0 total cycles in shared programs: 921532158 -> 921532168 (<.01%) cycles in affected programs: 2266 -> 2276 (0.44%) helped: 0 / HURT: 3 Meteor Lake and DG2 had similar results. (Meteor Lake shown) total instructions in shared programs: 19820845 -> 19820839 (<.01%) instructions in affected programs: 803 -> 797 (-0.75%) helped: 6 / HURT: 0 total cycles in shared programs: 906372999 -> 906372949 (<.01%) cycles in affected programs: 3216 -> 3166 (-1.55%) helped: 6 / HURT: 0 fossil-db: Lunar Lake Totals: Instrs: 141887377 -> 141884465 (-0.00%); split: -0.00%, +0.00% Cycle count: 21990301498 -> 21990267232 (-0.00%); split: -0.00%, +0.00% Spill count: 69732 -> 69797 (+0.09%) Fill count: 128521 -> 128645 (+0.10%) Totals from 349 (0.06% of 551413) affected shaders: Instrs: 506117 -> 503205 (-0.58%); split: -0.79%, +0.21% Cycle count: 32362996 -> 32328730 (-0.11%); split: -0.52%, +0.41% Spill count: 1951 -> 2016 (+3.33%) Fill count: 4899 -> 5023 (+2.53%) Meteor Lake and DG2 had similar results. (Meteor Lake shown) Totals: Instrs: 152773732 -> 152761383 (-0.01%); split: -0.01%, +0.00% Cycle count: 17187529968 -> 17187450663 (-0.00%); split: -0.00%, +0.00% Spill count: 79279 -> 79003 (-0.35%) Fill count: 148803 -> 147942 (-0.58%) Scratch Memory Size: 3949568 -> 3946496 (-0.08%) Max live registers: 31879325 -> 31879230 (-0.00%) Totals from 366 (0.06% of 633185) affected shaders: Instrs: 557377 -> 545028 (-2.22%); split: -2.22%, +0.01% Cycle count: 26171205 -> 26091900 (-0.30%); split: -0.54%, +0.24% Spill count: 3238 -> 2962 (-8.52%) Fill count: 10018 -> 9157 (-8.59%) Scratch Memory Size: 257024 -> 253952 (-1.20%) Max live registers: 28187 -> 28092 (-0.34%) Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_fs_lower_regioning.cpp | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp index 86c72bb53be..89f3d20171a 100644 --- a/src/intel/compiler/brw_fs_lower_regioning.cpp +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp @@ -720,6 +720,38 @@ namespace { return true; } + /** + * Fast-path for very specific kinds of invalid regions. + * + * Gfx12.5+ does not allow moves of B or UB sources to floating-point + * destinations. This restriction can be resolved more efficiently than by + * the general lowering in lower_src_modifiers or lower_src_region. + */ + void + lower_src_conversion(fs_visitor *v, bblock_t *block, fs_inst *inst) + { + const intel_device_info *devinfo = v->devinfo; + const fs_builder ibld = fs_builder(v, block, inst).scalar_group(); + + /* We only handle scalar conversions from small types for now. */ + assert(is_uniform(inst->src[0])); + + brw_reg tmp = ibld.vgrf(brw_type_with_size(inst->src[0].type, 32)); + fs_inst *mov = ibld.MOV(tmp, inst->src[0]); + + inst->src[0] = component(tmp, 0); + + /* Assert that neither the added MOV nor the original instruction will need + * any additional lowering. + */ + assert(!has_invalid_src_region(devinfo, mov, 0)); + assert(!has_invalid_src_modifiers(devinfo, mov, 0)); + assert(!has_invalid_dst_region(devinfo, mov)); + + assert(!has_invalid_src_region(devinfo, inst, 0)); + assert(!has_invalid_src_modifiers(devinfo, inst, 0)); + } + /** * Legalize the source and destination regioning controls of the specified * instruction. @@ -736,6 +768,11 @@ namespace { if (has_invalid_dst_region(devinfo, inst)) progress |= lower_dst_region(v, block, inst); + if (has_invalid_src_conversion(devinfo, inst)) { + lower_src_conversion(v, block, inst); + progress = true; + } + for (unsigned i = 0; i < inst->sources; i++) { if (has_invalid_src_modifiers(devinfo, inst, i)) progress |= lower_src_modifiers(v, block, inst, i);