From 6b9f838d621931b688d7f4e69f23958c67b17f34 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Thu, 8 Jun 2023 21:53:02 +0300 Subject: [PATCH] intel/fs: handle load_global_constant_uniform_block_intel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Again, load the data just once in GRF, share it across lanes. Shader-db on dg2: total instructions in shared programs: 23214555 -> 23215400 (<.01%) instructions in affected programs: 199977 -> 200822 (0.42%) helped: 3 HURT: 38 helped stats (abs) min: 5 max: 670 x̄: 283.67 x̃: 176 helped stats (rel) min: 1.34% max: 49.41% x̄: 22.15% x̃: 15.70% HURT stats (abs) min: 1 max: 185 x̄: 44.63 x̃: 32 HURT stats (rel) min: 0.13% max: 42.86% x̄: 10.25% x̃: 9.30% 95% mean confidence interval for instructions value: -18.65 59.87 95% mean confidence interval for instructions %-change: 3.29% 12.47% Inconclusive result (value mean confidence interval includes 0). total loops in shared programs: 5928 -> 5928 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 851137495 -> 851152449 (<.01%) cycles in affected programs: 16406137 -> 16421091 (0.09%) helped: 9 HURT: 32 helped stats (abs) min: 10 max: 13498 x̄: 6443.22 x̃: 5581 helped stats (rel) min: 0.11% max: 4.75% x̄: 1.45% x̃: 0.34% HURT stats (abs) min: 3 max: 15056 x̄: 2279.47 x̃: 735 HURT stats (rel) min: 0.10% max: 23.71% x̄: 4.58% x̃: 4.65% 95% mean confidence interval for cycles value: -1315.40 2044.87 95% mean confidence interval for cycles %-change: 1.71% 4.80% Inconclusive result (value mean confidence interval includes 0). 
total spills in shared programs: 11856 -> 11825 (-0.26%) spills in affected programs: 2368 -> 2337 (-1.31%) helped: 4 HURT: 0 total fills in shared programs: 16258 -> 16207 (-0.31%) fills in affected programs: 2930 -> 2879 (-1.74%) helped: 4 HURT: 0 total sends in shared programs: 1038194 -> 1038185 (<.01%) sends in affected programs: 40 -> 31 (-22.50%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 4 x̄: 2.25 x̃: 2 helped stats (rel) min: 10.00% max: 33.33% x̄: 21.46% x̃: 21.25% 95% mean confidence interval for sends value: -4.64 0.14 95% mean confidence interval for sends %-change: -40.41% -2.51% Inconclusive result (value mean confidence interval includes 0). LOST: 0 GAINED: 0 Results for some VK/DX titles (on DG2 only) follow. The change mostly shows up as additional instructions, except for the unity spaceship demo, where a CS shader is now compiled at a wider SIMD width. The additional instructions are needed because we are doing block loads: in control flow, we need to find the live channels in order to select a single lane whose value is valid. 
aztec_ruins_high: Totals from 3 (1.12% of 269) affected shaders: Instrs: 17732 -> 17896 (+0.92%) Cycles: 796518 -> 819302 (+2.86%) cyberpunk_2077: Totals from 17 (0.17% of 10301) affected shaders: Instrs: 10848 -> 11658 (+7.47%) Cycles: 248243 -> 259168 (+4.40%); split: -0.57%, +4.97% fallout_4_dxvk_g2: Totals from 2 (0.12% of 1638) affected shaders: Instrs: 3157 -> 3368 (+6.68%) Cycles: 487807 -> 490426 (+0.54%); split: -0.26%, +0.79% Max live registers: 139 -> 141 (+1.44%) red_dead_redemption2: Totals from 68 (1.14% of 5970) affected shaders: Instrs: 34871 -> 36486 (+4.63%) Cycles: 551430 -> 565211 (+2.50%) Send messages: 2074 -> 2072 (-0.10%) Max live registers: 5078 -> 5077 (-0.02%) total_war_warhammer2: Totals from 5 (1.05% of 478) affected shaders: Instrs: 6905 -> 6971 (+0.96%); split: -0.16%, +1.12% Cycles: 97035 -> 97989 (+0.98%); split: -0.07%, +1.05% unity spaceship demo (instruction count going up due to a CS shader bump from SIMD8->16): Totals from 53 (9.71% of 546) affected shaders: Instrs: 223748 -> 233223 (+4.23%); split: -0.01%, +4.25% Cycles: 23134697 -> 25207080 (+8.96%); split: -0.17%, +9.13% Subgroup size: 480 -> 488 (+1.67%) Spill count: 2156 -> 2242 (+3.99%); split: -0.19%, +4.17% Fill count: 4617 -> 4845 (+4.94%); split: -0.09%, +5.02% Max live registers: 5991 -> 6050 (+0.98%); split: -0.40%, +1.39% Max dispatch width: 480 -> 488 (+1.67%) witcher_3_dxvk_g2: Totals from 27 (2.51% of 1074) affected shaders: Instrs: 57067 -> 57677 (+1.07%); split: -0.03%, +1.10% Cycles: 1397871 -> 1436704 (+2.78%); split: -0.35%, +3.13% Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 40 +++++++++++++++++++ src/intel/compiler/brw_nir.c | 3 +- .../compiler/brw_nir_blockify_uniform_loads.c | 16 ++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 1b41b0bd206..073a1ad391c 100644 --- 
a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -5127,6 +5127,46 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_load_global_constant_uniform_block_intel: { + const unsigned total_dwords = ALIGN(instr->num_components, REG_SIZE / 4); + unsigned loaded_dwords = 0; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const fs_reg packed_consts = + ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords); + fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0])); + + while (loaded_dwords < total_dwords) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, + total_dwords - loaded_dwords); + const unsigned block_bytes = block * 4; + + const fs_builder &ubld = block <= 8 ? ubld8 : ubld16; + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = address; + srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ + srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); + ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD), + srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes; + + increment_a64_address(ubld1, address, block_bytes); + loaded_dwords += block; + } + + for (unsigned c = 0; c < instr->num_components; c++) + bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD), + component(packed_consts, c)); + + break; + } + case nir_intrinsic_load_ssbo: { assert(devinfo->ver >= 7); diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 5803dbdbbc1..eea8c374fd2 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -1310,7 +1310,8 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset, if (low->intrinsic == 
nir_intrinsic_load_global_const_block_intel || low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel || low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel || - low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel) { + low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel || + low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) { if (num_components > 4) { if (!util_is_power_of_two_nonzero(num_components)) return false; diff --git a/src/intel/compiler/brw_nir_blockify_uniform_loads.c b/src/intel/compiler/brw_nir_blockify_uniform_loads.c index e78b582753b..d28d6a4adf6 100644 --- a/src/intel/compiler/brw_nir_blockify_uniform_loads.c +++ b/src/intel/compiler/brw_nir_blockify_uniform_loads.c @@ -87,6 +87,22 @@ brw_nir_blockify_uniform_loads_instr(nir_builder *b, intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel; return true; + case nir_intrinsic_load_global_constant: + if (nir_src_is_divergent(intrin->src[0])) + return false; + + if (nir_dest_bit_size(intrin->dest) != 32) + return false; + + /* Without the LSC, we can only do block loads of at least 4dwords (1 + * oword). + */ + if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4) + return false; + + intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel; + return true; + default: return false; }