From e103afe7bec5eb509bf11ea62148a3db61a9eeaa Mon Sep 17 00:00:00 2001 From: Rohan Garg Date: Wed, 22 May 2024 12:22:47 +0200 Subject: [PATCH] brw: run the nir_opt_offsets pass and set the maximum offset size Perf A/B testing on DG2: no changes Perf A/B testing on BMG: +2.1% Blackops3, +1.5% Cyberpunk DG2 stats (mostly insignificant): Assassins Creed Valhalla: Totals from 1169 (55.67% of 2100) affected shaders: Instrs: 509237 -> 509215 (-0.00%) Cycle count: 30614325 -> 30607419 (-0.02%); split: -0.03%, +0.00% Non SSA regs after NIR: 83434 -> 85909 (+2.97%) Blackops 3: Totals from 1045 (64.63% of 1617) affected shaders: Instrs: 527312 -> 527310 (-0.00%) Cycle count: 496912222 -> 496902846 (-0.00%); split: -0.00%, +0.00% Non SSA regs after NIR: 106883 -> 109095 (+2.07%) Cyberpunk: Totals from 706 (56.03% of 1260) affected shaders: Instrs: 345976 -> 345974 (-0.00%); split: -0.00%, +0.00% Cycle count: 9775138 -> 9775472 (+0.00%); split: -0.00%, +0.00% Max live registers: 40295 -> 40297 (+0.00%) Non SSA regs after NIR: 93245 -> 94718 (+1.58%) Fortnite: Totals from 4210 (55.98% of 7521) affected shaders: Instrs: 2205471 -> 2205469 (-0.00%) Cycle count: 91451040 -> 91450956 (-0.00%); split: -0.00%, +0.00% Non SSA regs after NIR: 952354 -> 961664 (+0.98%) LNL stats (notable changes): Assassins Creed Valhalla: Totals from 1684 (83.57% of 2015) affected shaders: Instrs: 774305 -> 764501 (-1.27%); split: -1.27%, +0.01% Cycle count: 58845842 -> 58699250 (-0.25%); split: -0.98%, +0.73% Spill count: 625 -> 638 (+2.08%) Fill count: 1490 -> 1503 (+0.87%) Scratch Memory Size: 41984 -> 44032 (+4.88%) Max live registers: 196424 -> 197561 (+0.58%); split: -0.10%, +0.68% Blackops 3: Totals from 1125 (76.53% of 1470) affected shaders: Instrs: 781749 -> 773275 (-1.08%); split: -1.08%, +0.00% Subgroup size: 22896 -> 22912 (+0.07%) Cycle count: 659864454 -> 654641032 (-0.79%); split: -1.10%, +0.31% Max live registers: 116772 -> 116854 (+0.07%); split: -0.01%, +0.08% Non SSA regs after NIR: 
172648 -> 168260 (-2.54%); split: -2.55%, +0.01% Control: Totals from 378 (51.50% of 734) affected shaders: Instrs: 148184 -> 147544 (-0.43%) Cycle count: 6905200 -> 6913366 (+0.12%); split: -0.30%, +0.42% Max live registers: 41271 -> 41281 (+0.02%) Non SSA regs after NIR: 44964 -> 43868 (-2.44%); split: -2.45%, +0.01% Cyberpunk: Totals from 1141 (92.46% of 1234) affected shaders: Instrs: 636744 -> 629333 (-1.16%) Subgroup size: 24256 -> 24272 (+0.07%) Cycle count: 24952258 -> 24801298 (-0.60%); split: -1.39%, +0.78% Max live registers: 125848 -> 126855 (+0.80%); split: -0.00%, +0.80% Non SSA regs after NIR: 127399 -> 119837 (-5.94%); split: -5.95%, +0.02% Fortnite: Totals from 5497 (83.52% of 6582) affected shaders: Instrs: 4072831 -> 4041852 (-0.76%); split: -0.77%, +0.01% Subgroup size: 103296 -> 103312 (+0.02%) Cycle count: 133046874 -> 132789242 (-0.19%); split: -0.67%, +0.48% Spill count: 7218 -> 7254 (+0.50%); split: -0.33%, +0.83% Fill count: 11724 -> 11749 (+0.21%); split: -0.34%, +0.55% Scratch Memory Size: 591872 -> 599040 (+1.21%) Max live registers: 816530 -> 818522 (+0.24%); split: -0.01%, +0.26% Non SSA regs after NIR: 1610296 -> 1560284 (-3.11%); split: -3.11%, +0.00% Hitman3: Totals from 4713 (92.39% of 5101) affected shaders: Instrs: 2731598 -> 2698224 (-1.22%) Cycle count: 186422098 -> 185472640 (-0.51%); split: -1.12%, +0.61% Spill count: 3244 -> 3242 (-0.06%) Fill count: 9937 -> 9933 (-0.04%) Max live registers: 585035 -> 589801 (+0.81%); split: -0.00%, +0.82% Non SSA regs after NIR: 347681 -> 324314 (-6.72%); split: -6.73%, +0.01% Hogwarts Legacy: Totals from 930 (59.81% of 1555) affected shaders: Instrs: 464146 -> 459526 (-1.00%); split: -1.00%, +0.01% Subgroup size: 19104 -> 19120 (+0.08%) Cycle count: 24062460 -> 24078964 (+0.07%); split: -0.49%, +0.56% Spill count: 2068 -> 1964 (-5.03%); split: -5.22%, +0.19% Fill count: 2342 -> 2205 (-5.85%); split: -6.40%, +0.56% Scratch Memory Size: 147456 -> 141312 (-4.17%) Max live registers: 112384 
-> 112787 (+0.36%); split: -0.08%, +0.44% Non SSA regs after NIR: 80293 -> 79161 (-1.41%); split: -1.72%, +0.32% Metro Exodus: Totals from 29755 (78.62% of 37846) affected shaders: Instrs: 11495578 -> 11492951 (-0.02%); split: -0.02%, +0.00% Subgroup size: 644688 -> 644704 (+0.00%) Cycle count: 301572068 -> 301548054 (-0.01%); split: -0.03%, +0.02% Max live registers: 3369504 -> 3370454 (+0.03%); split: -0.00%, +0.03% Non SSA regs after NIR: 2476561 -> 2396090 (-3.25%); split: -3.27%, +0.02% Red Dead Redemption 2: Totals from 4161 (78.61% of 5293) affected shaders: Instrs: 2428782 -> 2409032 (-0.81%); split: -0.82%, +0.00% Subgroup size: 85344 -> 85360 (+0.02%) Cycle count: 8514984142 -> 8533415324 (+0.22%); split: -0.02%, +0.23% Spill count: 4659 -> 4674 (+0.32%); split: -0.02%, +0.34% Fill count: 11236 -> 11231 (-0.04%); split: -0.19%, +0.14% Scratch Memory Size: 398336 -> 397312 (-0.26%) Max live registers: 473946 -> 475798 (+0.39%); split: -0.08%, +0.47% Non SSA regs after NIR: 616820 -> 567706 (-7.96%); split: -8.09%, +0.12% Rise Of The Tomb Raider: Totals from 68 (46.58% of 146) affected shaders: Instrs: 28209 -> 27801 (-1.45%) Subgroup size: 1584 -> 1600 (+1.01%) Cycle count: 16182992 -> 16249364 (+0.41%); split: -0.97%, +1.38% Max live registers: 7320 -> 7296 (-0.33%); split: -0.38%, +0.05% Non SSA regs after NIR: 8438 -> 8207 (-2.74%); split: -2.82%, +0.08% Spiderman Remastered: Totals from 6403 (93.87% of 6821) affected shaders: Instrs: 5662713 -> 5597949 (-1.14%); split: -1.28%, +0.14% Cycle count: 282861519016 -> 279806958122 (-1.08%); split: -1.26%, +0.18% Spill count: 61150 -> 60754 (-0.65%); split: -1.13%, +0.48% Fill count: 162597 -> 163190 (+0.36%); split: -0.84%, +1.21% Scratch Memory Size: 5834752 -> 5804032 (-0.53%); split: -0.70%, +0.18% Max live registers: 901926 -> 903820 (+0.21%); split: -0.01%, +0.22% Non SSA regs after NIR: 555053 -> 521016 (-6.13%); split: -6.14%, +0.01% Signed-off-by: Rohan Garg Reviewed-by: Kenneth Graunke Reviewed-by: 
Lionel Landwerlin Part-of: --- src/intel/compiler/brw_nir.c | 70 ++++++++++++++++- src/intel/compiler/brw_nir.h | 2 + .../brw_nir_lower_immediate_offsets.c | 78 +++++++++++++++++++ src/intel/compiler/meson.build | 1 + 4 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 src/intel/compiler/brw_nir_lower_immediate_offsets.c diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index e90888cc8d6..97c4e7b839e 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -1756,6 +1756,58 @@ get_mem_access_size_align(nir_intrinsic_op intrin, uint8_t bytes, } } +static bool +brw_nir_ssbo_intel_instr(nir_builder *b, + nir_intrinsic_instr *intrin, + void *cb_data) +{ /* Per-intrinsic callback: replaces load_ssbo and store_ssbo with their _ssbo_intel counterparts, always with .base = 0. Returns true only when the instruction was replaced; cb_data is not read. */ + switch (intrin->intrinsic) { + case nir_intrinsic_load_ssbo: { /* the new load is built before the original, reusing its two sources, component count, bit size, access and alignment */ + b->cursor = nir_before_instr(&intrin->instr); + nir_def *value = nir_load_ssbo_intel( + b, + intrin->def.num_components, + intrin->def.bit_size, + intrin->src[0].ssa, + intrin->src[1].ssa, + .access = nir_intrinsic_access(intrin), + .align_mul = nir_intrinsic_align_mul(intrin), + .align_offset = nir_intrinsic_align_offset(intrin), + .base = 0); + value->loop_invariant = intrin->def.loop_invariant; + value->divergent = intrin->def.divergent; /* loop_invariant and divergent are copied from the original def onto the new one */ + nir_def_replace(&intrin->def, value); + return true; + } + + case nir_intrinsic_store_ssbo: { /* the original store is removed first; the replacement is emitted at the cursor nir_instr_remove returns, with the same three sources */ + b->cursor = nir_instr_remove(&intrin->instr); + nir_store_ssbo_intel( + b, + intrin->src[0].ssa, + intrin->src[1].ssa, + intrin->src[2].ssa, + .access = nir_intrinsic_access(intrin), + .align_mul = nir_intrinsic_align_mul(intrin), + .align_offset = nir_intrinsic_align_offset(intrin), + .base = 0); + return true; + } + + default: /* every other intrinsic is left untouched */ + return false; + } +} + +static bool +brw_nir_ssbo_intel(nir_shader *shader) +{ /* Shader-level wrapper: runs brw_nir_ssbo_intel_instr through nir_shader_intrinsics_pass with no callback data and returns that call's result. */ + return nir_shader_intrinsics_pass(shader, + brw_nir_ssbo_intel_instr, + nir_metadata_control_flow, + NULL); +} + static void brw_vectorize_lower_mem_access(nir_shader *nir, const struct brw_compiler *compiler, @@ -1808,7 +1860,6 @@ 
brw_vectorize_lower_mem_access(nir_shader *nir, } } - struct brw_mem_access_cb_data cb_data = { .devinfo = compiler->devinfo, }; @@ -1836,6 +1887,23 @@ brw_vectorize_lower_mem_access(nir_shader *nir, OPT(nir_opt_algebraic); OPT(nir_opt_constant_folding); } + + /* Do this after the vectorization & brw_nir_rebase_const_offset_ubo_loads + * so that we maximize the offset put into the messages. + */ + if (compiler->devinfo->ver >= 20) { + OPT(brw_nir_ssbo_intel); + + const nir_opt_offsets_options offset_options = { + .buffer_max = UINT32_MAX, + .shared_max = UINT32_MAX, + .shared_atomic_max = UINT32_MAX, + .uniform_max = UINT32_MAX, + }; + OPT(nir_opt_offsets, &offset_options); + + OPT(brw_nir_lower_immediate_offsets); + } } static bool diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 878074f4b81..2a2f7548655 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -215,6 +215,8 @@ bool brw_nir_lower_texture(nir_shader *nir, bool brw_nir_lower_sample_index_in_coord(nir_shader *nir); +bool brw_nir_lower_immediate_offsets(nir_shader *shader); + bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader, const struct intel_device_info *devinfo); diff --git a/src/intel/compiler/brw_nir_lower_immediate_offsets.c b/src/intel/compiler/brw_nir_lower_immediate_offsets.c new file mode 100644 index 00000000000..6b0ea8b9a4a --- /dev/null +++ b/src/intel/compiler/brw_nir_lower_immediate_offsets.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2025 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/nir/nir_builder.h" +#include "brw_eu.h" +#include "brw_nir.h" + +static bool +lower_immediate_offsets(nir_builder *b, nir_intrinsic_instr *intrin, void *data) +{ /* Per-intrinsic callback: for the intrinsics listed in the switch, forces the base index to be a multiple of 4 inside [min, max] and adds whatever was taken out of base to the offset source. Returns true when the instruction was modified; data is not read. */ + unsigned max_bits = 0; /* bit width handed to u_intN_min and u_intN_max below, selected per intrinsic */ + + switch (intrin->intrinsic) { + case nir_intrinsic_load_shared: + case nir_intrinsic_store_shared: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_load_shared_block_intel: + case 
nir_intrinsic_store_shared_block_intel: + case nir_intrinsic_load_shared_uniform_block_intel: + max_bits = LSC_ADDRESS_OFFSET_FLAT_BITS; /* all shared-memory intrinsics above use the FLAT width */ + break; + case nir_intrinsic_load_ssbo_intel: + case nir_intrinsic_load_ubo_uniform_block_intel: + case nir_intrinsic_load_ssbo_uniform_block_intel: + case nir_intrinsic_store_ssbo_intel: + case nir_intrinsic_store_ssbo_block_intel: { /* buffer intrinsics: the width depends on how the binding index source was produced */ + nir_src *binding = nir_get_io_index_src(intrin); + const bool has_resource = + binding->ssa->parent_instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(binding->ssa->parent_instr)->intrinsic == + nir_intrinsic_resource_intel; + bool ss_binding = false; /* becomes true only when the binding comes from a resource_intel whose access has the bindless flag */ + if (has_resource) { + nir_intrinsic_instr *resource = + nir_instr_as_intrinsic(binding->ssa->parent_instr); + ss_binding = (nir_intrinsic_resource_access_intel(resource) & + nir_resource_intel_bindless) != 0; + } + max_bits = ss_binding ? + LSC_ADDRESS_OFFSET_SS_BITS : LSC_ADDRESS_OFFSET_BTI_BITS; + break; + } + default: /* not an intrinsic handled here */ + return false; + } + + assert(nir_intrinsic_has_base(intrin)); + + b->cursor = nir_before_instr(&intrin->instr); + + const int32_t min = u_intN_min(max_bits); /* NOTE(review): presumably the smallest and largest values of a signed max_bits-wide integer; confirm against the util helpers */ + const int32_t max = u_intN_max(max_bits); + + const int32_t base = nir_intrinsic_base(intrin); + if ((base % 4) == 0 && base >= min && base <= max) /* already a multiple of 4 and in range: report no change */ + return false; + + int32_t new_base = CLAMP(base, min, max); + new_base -= new_base % 4; /* C remainder has the sign of the dividend, so this rounds new_base toward zero to a multiple of 4 */ + + assert(new_base >= min && new_base <= max); + + nir_src *offset_src = nir_get_io_offset_src(intrin); + nir_src_rewrite(offset_src, nir_iadd_imm(b, offset_src->ssa, base - new_base)); /* the offset absorbs base - new_base, so offset plus base is the same before and after */ + nir_intrinsic_set_base(intrin, new_base); + + return true; +} + +bool +brw_nir_lower_immediate_offsets(nir_shader *shader) +{ /* Entry point declared in brw_nir.h: runs lower_immediate_offsets through nir_shader_intrinsics_pass with no callback data and returns that call's result. */ + return nir_shader_intrinsics_pass(shader, lower_immediate_offsets, + nir_metadata_control_flow, NULL); +} diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index ca99c5b7b50..03c69c98bbd 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -73,6 +73,7 @@ 
libintel_compiler_brw_files = files( 'brw_nir_lower_alpha_to_coverage.c', 'brw_nir_lower_fs_barycentrics.c', 'brw_nir_lower_fs_msaa.c', + 'brw_nir_lower_immediate_offsets.c', 'brw_nir_lower_intersection_shader.c', 'brw_nir_lower_ray_queries.c', 'brw_nir_lower_rt_intrinsics.c',