From 8ac7802ac835eb8e01e88a477586bae7dc547034 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Thu, 29 Feb 2024 20:51:50 +0200 Subject: [PATCH] brw: move final send lowering up into the IR Because we do emit the final send message form in code generation, a lot of emissions look like this : add(8) vgrf0, u0, 0x100 mov(1) a0.1, vgrf0 # emitted by the generator send(8) ..., a0.1 By moving address register manipulation into the IR, we can get this down to : add(1) a0.1, u0, 0x100 send(8) ..., a0.1 This reduces register pressure around some send messages by 1 vgrf. All lost shaders in the below results are fragment SIMD32, due to the throughput estimator. If turned off, we lose no SIMD32 shaders with this change. DG2 results: Assassin's Creed Valhalla: Totals from 2044 (96.87% of 2110) affected shaders: Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00% Subgroup size: 23832 -> 23824 (-0.03%) Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82% Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39% Fill count: 2005 -> 1256 (-37.36%) Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00% Max live registers: 116765 -> 115058 (-1.46%) Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67% Cyberpunk 2077: Totals from 1181 (93.43% of 1264) affected shaders: Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01% Subgroup size: 13016 -> 13032 (+0.12%) Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39% Spill count: 12 -> 8 (-33.33%) Fill count: 9 -> 6 (-33.33%) Dota2: Totals from 173 (11.59% of 1493) affected shaders: Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34% Max live registers: 5787 -> 5779 (-0.14%) Max dispatch width: 1344 -> 1152 (-14.29%) Hitman3: Totals from 5072 (95.39% of 5317) affected shaders: Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00% Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48% Spill count: 3942 -> 3200 (-18.82%) Fill count: 10158 -> 8846 
(-12.92%) Scratch Memory Size: 257024 -> 223232 (-13.15%) Max live registers: 328467 -> 324631 (-1.17%) Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73% Fortnite: Totals from 360 (4.82% of 7472) affected shaders: Instrs: 778068 -> 777925 (-0.02%) Subgroup size: 3128 -> 3136 (+0.26%) Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19% Max live registers: 50689 -> 50658 (-0.06%) Hogwarts Legacy: Totals from 1376 (84.00% of 1638) affected shaders: Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03% Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12% Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36% Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23% Scratch Memory Size: 99328 -> 89088 (-10.31%) Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23% Max dispatch width: 11848 -> 11920 (+0.61%) Metro Exodus: Totals from 92 (0.21% of 43072) affected shaders: Instrs: 262995 -> 262968 (-0.01%) Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25% Max live registers: 11152 -> 11140 (-0.11%) Red Dead Redemption 2 : Totals from 451 (7.71% of 5847) affected shaders: Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00% Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00% Max live registers: 42294 -> 42185 (-0.26%) Spiderman Remastered: Totals from 6820 (98.02% of 6958) affected shaders: Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65% Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25% Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61% Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58% Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74% Max live registers: 493149 -> 487458 (-1.15%) Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20% Strange Brigade: Totals from 3769 (91.21% of 4132) affected shaders: Instrs: 1354476 -> 1321474 (-2.44%) Cycle count: 25351530 -> 25339190 (-0.05%); split: 
-1.64%, +1.59% Max live registers: 199057 -> 193656 (-2.71%) Max dispatch width: 30272 -> 30240 (-0.11%) Witcher 3: Totals from 25 (2.40% of 1041) affected shaders: Instrs: 24621 -> 24606 (-0.06%) Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05% Max live registers: 1963 -> 1955 (-0.41%) LNL results: Assassin's Creed Valhalla: Totals from 1928 (98.02% of 1967) affected shaders: Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11% Subgroup size: 41264 -> 41280 (+0.04%) Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11% Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90% Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60% Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56% Max live registers: 205483 -> 202192 (-1.60%) Cyberpunk 2077: Totals from 1177 (96.40% of 1221) affected shaders: Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03% Subgroup size: 24912 -> 24944 (+0.13%) Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81% Spill count: 8 -> 3 (-62.50%) Fill count: 6 -> 3 (-50.00%) Max live registers: 126922 -> 125472 (-1.14%) Dota2: Totals from 428 (32.47% of 1318) affected shaders: Instrs: 89355 -> 89740 (+0.43%) Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55% Max live registers: 32863 -> 32847 (-0.05%) Fortnite: Totals from 5354 (81.72% of 6552) affected shaders: Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53% Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65% Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72% Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35% Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71% Hitman3: Totals from 4912 (97.09% of 5059) affected shaders: Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00% Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55% Spill count: 3739 -> 3136 (-16.13%) Fill count: 10657 -> 9564 (-10.26%) Scratch Memory Size: 373760 -> 318464 
(-14.79%) Max live registers: 597566 -> 589460 (-1.36%) Hogwarts Legacy: Totals from 1471 (96.33% of 1527) affected shaders: Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05% Cycle count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68% Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95% Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83% Scratch Memory Size: 251904 -> 217088 (-13.82%) Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12% Metro Exodus: Totals from 18356 (49.81% of 36854) affected shaders: Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83% Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84% Spill count: 595 -> 546 (-8.24%) Fill count: 1604 -> 1408 (-12.22%) Max live registers: 2086937 -> 2086933 (-0.00%) Red Dead Redemption 2: Totals from 4171 (79.31% of 5259) affected shaders: Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83% Subgroup size: 86416 -> 86432 (+0.02%) Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53% Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59% Scratch Memory Size: 401408 -> 385024 (-4.08%) Spiderman Remastered: Totals from 6639 (98.94% of 6710) affected shaders: Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98% Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59% Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82% Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76% Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17% Max live registers: 918240 -> 906604 (-1.27%) Strange Brigade: Totals from 3675 (92.24% of 3984) affected shaders: Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00% Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09% Max live registers: 361849 -> 351265 (-2.92%) Witcher 3: Totals from 13 (46.43% of 28) affected shaders: Instrs: 593 -> 660 (+11.30%) Cycle count: 28302 -> 28714 (+1.46%) Signed-off-by: Lionel Landwerlin 
Reviewed-by: Alyssa Rosenzweig Reviewed-by: Caio Oliveira Part-of: --- src/intel/compiler/brw_compile_fs.cpp | 16 +- src/intel/compiler/brw_eu.h | 5 +- src/intel/compiler/brw_eu_defines.h | 7 + src/intel/compiler/brw_eu_emit.c | 137 +--------------- src/intel/compiler/brw_fs.cpp | 20 ++- src/intel/compiler/brw_fs.h | 2 + src/intel/compiler/brw_fs_generator.cpp | 23 +-- src/intel/compiler/brw_fs_reg_allocate.cpp | 150 ++++++++++++------ src/intel/compiler/brw_ir_fs.h | 4 - .../compiler/brw_lower_logical_sends.cpp | 80 ++++++++++ src/intel/compiler/brw_opt.cpp | 14 +- .../compiler/brw_opt_address_reg_load.cpp | 75 +++++++++ src/intel/compiler/meson.build | 1 + 13 files changed, 320 insertions(+), 214 deletions(-) create mode 100644 src/intel/compiler/brw_opt_address_reg_load.cpp diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp index df3b2362442..f84587639a6 100644 --- a/src/intel/compiler/brw_compile_fs.cpp +++ b/src/intel/compiler/brw_compile_fs.cpp @@ -627,15 +627,23 @@ brw_emit_repclear_shader(fs_visitor &s) write = bld.emit(SHADER_OPCODE_SEND); write->resize_sources(3); + + /* We can use a headerless message for the first render target */ + write->header_size = i == 0 ? 0 : 2; + write->mlen = 1 + write->header_size; + write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE; - write->src[0] = brw_imm_ud(0); + write->src[0] = brw_imm_ud( + brw_fb_write_desc( + s.devinfo, i, + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED, + i == key->nr_color_regions - 1, false) | + brw_message_desc(s.devinfo, write->mlen, + 0 /* rlen */, write->header_size)); write->src[1] = brw_imm_ud(0); write->src[2] = i == 0 ? 
color_output : header; write->check_tdr = true; write->send_has_side_effects = true; - write->desc = brw_fb_write_desc(s.devinfo, i, - BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED, - i == key->nr_color_regions - 1, false); /* We can use a headerless message for the first render target */ write->header_size = i == 0 ? 0 : 2; diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index ba06b716384..0f3bcac9e18 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -1430,7 +1430,6 @@ brw_send_indirect_message(struct brw_codegen *p, struct brw_reg dst, struct brw_reg payload, struct brw_reg desc, - unsigned desc_imm, bool eot); void @@ -1440,10 +1439,8 @@ brw_send_indirect_split_message(struct brw_codegen *p, struct brw_reg payload0, struct brw_reg payload1, struct brw_reg desc, - unsigned desc_imm, struct brw_reg ex_desc, - unsigned ex_desc_imm, - bool ex_desc_scratch, + unsigned ex_mlen, bool ex_bso, bool eot); diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 41adb0db669..7108e0146c1 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -816,6 +816,13 @@ enum ENUM_PACKED gfx10_align1_3src_exec_type { #define BRW_THREAD_ATOMIC 1 #define BRW_THREAD_SWITCH 2 +/* Subregister of the address register used for particular purposes */ +enum brw_address_subreg { + BRW_ADDRESS_SUBREG_INDIRECT_DESC = 0, + BRW_ADDRESS_SUBREG_INDIRECT_EX_DESC = 2, + BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC = 4, +}; + enum ENUM_PACKED brw_vertical_stride { BRW_VERTICAL_STRIDE_0 = 0, BRW_VERTICAL_STRIDE_1 = 1, diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 743202cd955..02afc6bf241 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -1438,7 +1438,6 @@ brw_send_indirect_message(struct brw_codegen *p, struct brw_reg dst, struct brw_reg payload, struct brw_reg desc, - unsigned desc_imm, 
bool eot) { const struct intel_device_info *devinfo = p->devinfo; @@ -1451,35 +1450,16 @@ brw_send_indirect_message(struct brw_codegen *p, if (desc.file == IMM) { send = next_insn(p, BRW_OPCODE_SEND); brw_set_src0(p, send, retype(payload, BRW_TYPE_UD)); - brw_set_desc(p, send, desc.ud | desc_imm); + brw_set_desc(p, send, desc.ud); } else { - const struct tgl_swsb swsb = brw_get_default_swsb(p); - struct brw_reg addr = retype(brw_address_reg(0), BRW_TYPE_UD); - - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_exec_size(p, BRW_EXECUTE_1); - brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); - brw_set_default_flag_reg(p, 0, 0); - brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); - - /* Load the indirect descriptor to an address register using OR so the - * caller can specify additional descriptor bits with the desc_imm - * immediate. - */ - brw_OR(p, addr, desc, brw_imm_ud(desc_imm)); - - brw_pop_insn_state(p); - - brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + assert(desc.file == ADDRESS); + assert(desc.subnr == 0); send = next_insn(p, BRW_OPCODE_SEND); brw_set_src0(p, send, retype(payload, BRW_TYPE_UD)); - if (devinfo->ver >= 12) brw_eu_inst_set_send_sel_reg32_desc(devinfo, send, true); else - brw_set_src1(p, send, addr); + brw_set_src1(p, send, desc); } brw_set_dest(p, send, dst); @@ -1494,10 +1474,8 @@ brw_send_indirect_split_message(struct brw_codegen *p, struct brw_reg payload0, struct brw_reg payload1, struct brw_reg desc, - unsigned desc_imm, struct brw_reg ex_desc, - unsigned ex_desc_imm, - bool ex_desc_scratch, + unsigned ex_mlen, bool ex_bso, bool eot) { @@ -1508,105 +1486,6 @@ brw_send_indirect_split_message(struct brw_codegen *p, assert(desc.type == BRW_TYPE_UD); - if (desc.file == IMM) { - desc.ud |= desc_imm; - } else { - const struct tgl_swsb swsb = brw_get_default_swsb(p); - struct brw_reg addr = retype(brw_address_reg(0), BRW_TYPE_UD); - - 
brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_exec_size(p, BRW_EXECUTE_1); - brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); - brw_set_default_flag_reg(p, 0, 0); - brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); - - /* Load the indirect descriptor to an address register using OR so the - * caller can specify additional descriptor bits with the desc_imm - * immediate. - */ - brw_OR(p, addr, desc, brw_imm_ud(desc_imm)); - - brw_pop_insn_state(p); - desc = addr; - - brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); - } - - if (ex_desc.file == IMM && - !ex_desc_scratch && - (devinfo->ver >= 12 || - ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) { - /* ATS-M PRMs, Volume 2d: Command Reference: Structures, - * EU_INSTRUCTION_SEND instruction - * - * "ExBSO: Exists If: ([ExDesc.IsReg]==true)" - */ - assert(!ex_bso); - ex_desc.ud |= ex_desc_imm; - } else { - const struct tgl_swsb swsb = brw_get_default_swsb(p); - struct brw_reg addr = retype(brw_address_reg(2), BRW_TYPE_UD); - - /* On Xe2+ ExBSO addressing is implicitly enabled for the UGM - * shared function. - */ - ex_bso |= (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM); - - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_exec_size(p, BRW_EXECUTE_1); - brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); - brw_set_default_flag_reg(p, 0, 0); - brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); - - /* Load the indirect extended descriptor to an address register using OR - * so the caller can specify additional descriptor bits with the - * desc_imm immediate. 
- * - * Even though the instruction dispatcher always pulls the SFID and EOT - * fields from the instruction itself, actual external unit which - * processes the message gets the SFID and EOT from the extended - * descriptor which comes from the address register. If we don't OR - * those two bits in, the external unit may get confused and hang. - */ - unsigned imm_part = ex_bso ? 0 : (ex_desc_imm | sfid | eot << 5); - - if (ex_desc_scratch) { - assert(devinfo->verx10 >= 125); - brw_AND(p, addr, - retype(brw_vec1_grf(0, 5), BRW_TYPE_UD), - brw_imm_ud(INTEL_MASK(31, 10))); - - if (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM) { - const unsigned ex_mlen = brw_message_ex_desc_ex_mlen(devinfo, ex_desc_imm); - assert(ex_desc_imm == brw_message_ex_desc(devinfo, ex_mlen)); - brw_SHR(p, addr, addr, brw_imm_ud(4)); - } else { - /* Or the scratch surface offset together with the immediate part - * of the extended descriptor. - */ - brw_OR(p, addr, addr, brw_imm_ud(imm_part)); - } - - } else if (ex_desc.file == IMM) { - /* ex_desc bits 15:12 don't exist in the instruction encoding prior - * to Gfx12, so we may have fallen back to an indirect extended - * descriptor. - */ - brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part)); - } else { - brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part)); - } - - brw_pop_insn_state(p); - ex_desc = addr; - - brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); - } - send = next_insn(p, devinfo->ver >= 12 ? 
BRW_OPCODE_SEND : BRW_OPCODE_SENDS); brw_set_dest(p, send, dst); brw_set_src0(p, send, retype(payload0, BRW_TYPE_UD)); @@ -1630,10 +1509,8 @@ brw_send_indirect_split_message(struct brw_codegen *p, brw_eu_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1); brw_eu_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2); - if (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM) { - const unsigned ex_mlen = brw_message_ex_desc_ex_mlen(devinfo, ex_desc_imm); + if (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM) brw_eu_inst_set_bits(send, 103, 99, ex_mlen / reg_unit(devinfo)); - } } if (ex_bso) { @@ -1644,7 +1521,7 @@ brw_send_indirect_split_message(struct brw_codegen *p, */ if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM) brw_eu_inst_set_send_ex_bso(devinfo, send, true); - brw_eu_inst_set_send_src1_len(devinfo, send, GET_BITS(ex_desc_imm, 10, 6)); + brw_eu_inst_set_send_src1_len(devinfo, send, ex_mlen / reg_unit(devinfo)); } brw_eu_inst_set_sfid(devinfo, send, sfid); brw_eu_inst_set_eot(devinfo, send, eot); diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index cb77c691cb9..00f22a4d1bd 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -958,19 +958,25 @@ fs_visitor::assign_curb_setup() fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4); send->sfid = GFX12_SFID_UGM; - send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, - LSC_ADDR_SURFTYPE_FLAT, - LSC_ADDR_SIZE_A32, - LSC_DATA_SIZE_D32, - num_regs * 8 /* num_channels */, - true /* transpose */, - LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); + uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, + LSC_ADDR_SURFTYPE_FLAT, + LSC_ADDR_SIZE_A32, + LSC_DATA_SIZE_D32, + num_regs * 8 /* num_channels */, + true /* transpose */, + LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); send->header_size = 0; send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1); send->size_written = lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE; 
send->send_is_volatile = true; + send->src[0] = brw_imm_ud(desc | + brw_message_desc(devinfo, + send->mlen, + send->size_written / REG_SIZE, + send->header_size)); + i += num_regs; } diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 8d66055bc59..37ea0ca9687 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -635,6 +635,7 @@ bool brw_lower_pack(fs_visitor &s); bool brw_lower_regioning(fs_visitor &s); bool brw_lower_scalar_fp64_MAD(fs_visitor &s); bool brw_lower_scoreboard(fs_visitor &s); +bool brw_lower_send_descriptors(fs_visitor &s); bool brw_lower_sends_overlapping_payload(fs_visitor &s); bool brw_lower_simd_width(fs_visitor &s); bool brw_lower_sub_sat(fs_visitor &s); @@ -642,6 +643,7 @@ bool brw_lower_subgroup_ops(fs_visitor &s); bool brw_lower_uniform_pull_constant_loads(fs_visitor &s); void brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s); +bool brw_opt_address_reg_load(fs_visitor &s); bool brw_opt_algebraic(fs_visitor &s); bool brw_opt_bank_conflicts(fs_visitor &s); bool brw_opt_cmod_propagation(fs_visitor &s); diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index ebd1adeff43..5ec03dc24cf 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -167,31 +167,20 @@ fs_generator::generate_send(fs_inst *inst, struct brw_reg payload, struct brw_reg payload2) { - const unsigned rlen = inst->dst.is_null() ? 
0 : inst->size_written / REG_SIZE; - - uint32_t desc_imm = inst->desc | - brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size); - - uint32_t ex_desc_imm = inst->ex_desc | - brw_message_ex_desc(devinfo, inst->ex_mlen); - - if (ex_desc.file != IMM || ex_desc.ud || ex_desc_imm || - inst->send_ex_desc_scratch) { + if (ex_desc.file == IMM && ex_desc.ud == 0) { + brw_send_indirect_message(p, inst->sfid, dst, payload, desc, inst->eot); + if (inst->check_tdr) + brw_eu_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC); + } else { /* If we have any sort of extended descriptor, then we need SENDS. This * also covers the dual-payload case because ex_mlen goes in ex_desc. */ brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2, - desc, desc_imm, ex_desc, ex_desc_imm, - inst->send_ex_desc_scratch, + desc, ex_desc, inst->ex_mlen, inst->send_ex_bso, inst->eot); if (inst->check_tdr) brw_eu_inst_set_opcode(p->isa, brw_last_inst, devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC); - } else { - brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm, - inst->eot); - if (inst->check_tdr) - brw_eu_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC); } } diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index c2fdc9081f2..a5ec755251e 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -222,12 +222,6 @@ void fs_visitor::calculate_payload_ranges(bool allow_spilling, } } - /* The generator implicitly uses g0 to construct extended message - * descriptors for scratch send messages when this bit is set. 
- */ - if (inst->send_ex_desc_scratch) - payload_last_use_ip[0] = use_ip; - ip++; } @@ -294,6 +288,8 @@ private: void build_interference_graph(bool allow_spilling); + brw_reg build_ex_desc(const fs_builder &bld, unsigned reg_size, bool unspill); + brw_reg build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip); brw_reg build_single_offset(const fs_builder &bld, @@ -689,6 +685,44 @@ fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset, return offset; } +brw_reg +fs_reg_alloc::build_ex_desc(const fs_builder &bld, unsigned reg_size, bool unspill) +{ + /* Use a different area of the address register than what is used in + * brw_lower_logical_sends.c (brw_address_reg(2)) so we don't have + * interactions between the spill/fill instructions and the other send + * messages. + */ + brw_reg ex_desc = bld.vaddr(BRW_TYPE_UD, + BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC); + fs_inst *inst = bld.exec_all().group(1, 0).AND( + ex_desc, + retype(brw_vec1_grf(0, 5), BRW_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 10))); + _mesa_set_add(spill_insts, inst); + + const intel_device_info *devinfo = bld.shader->devinfo; + if (devinfo->verx10 >= 200) { + inst = bld.exec_all().group(1, 0).SHR( + ex_desc, ex_desc, brw_imm_ud(4)); + _mesa_set_add(spill_insts, inst); + } else { + if (unspill) { + inst = bld.exec_all().group(1, 0).OR( + ex_desc, ex_desc, brw_imm_ud(GFX12_SFID_UGM)); + _mesa_set_add(spill_insts, inst); + } else { + inst = bld.exec_all().group(1, 0).OR( + ex_desc, ex_desc, + brw_imm_ud(brw_message_ex_desc(devinfo, reg_size) | + GFX12_SFID_UGM)); + _mesa_set_add(spill_insts, inst); + } + } + + return ex_desc; +} + brw_reg fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip) { @@ -782,28 +816,26 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld, } else { offset = build_lane_offsets(ubld, spill_offset, ip); } - /* We leave the extended descriptor empty and flag the instruction to - * ask the generated to insert the 
extended descriptor in the address - * register. That way we don't need to burn an additional register - * for register allocation spill/fill. - */ + brw_reg srcs[] = { brw_imm_ud(0), /* desc */ - brw_imm_ud(0), /* ex_desc */ + build_ex_desc(bld, reg_size, true), offset, /* payload */ brw_reg(), /* payload2 */ }; + uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, + LSC_ADDR_SURFTYPE_SS, + LSC_ADDR_SIZE_A32, + LSC_DATA_SIZE_D32, + use_transpose ? reg_size * 8 : 1 /* num_channels */, + use_transpose, + LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); + + unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst, srcs, ARRAY_SIZE(srcs)); unspill_inst->sfid = GFX12_SFID_UGM; - unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, - LSC_ADDR_SURFTYPE_SS, - LSC_ADDR_SIZE_A32, - LSC_DATA_SIZE_D32, - use_transpose ? reg_size * 8 : 1 /* num_channels */, - use_transpose, - LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); unspill_inst->header_size = 0; unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, unspill_inst->exec_size); @@ -812,14 +844,23 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld, lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE; unspill_inst->send_has_side_effects = false; unspill_inst->send_is_volatile = true; - unspill_inst->send_ex_desc_scratch = true; + + unspill_inst->src[0] = brw_imm_ud( + desc | + brw_message_desc(devinfo, + unspill_inst->mlen, + unspill_inst->size_written / REG_SIZE, + unspill_inst->header_size)); } else { brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip); const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT; - const brw_reg ex_desc = brw_imm_ud(0); - brw_reg srcs[] = { brw_imm_ud(0), ex_desc, header }; + brw_reg srcs[] = { + brw_imm_ud(0), /* desc */ + brw_imm_ud(0), /* ex_desc */ + header + }; unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst, srcs, ARRAY_SIZE(srcs)); unspill_inst->mlen = 1; @@ -828,10 +869,15 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld, 
unspill_inst->send_has_side_effects = false; unspill_inst->send_is_volatile = true; unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; - unspill_inst->desc = + + unspill_inst->src[0] = brw_imm_ud( brw_dp_desc(devinfo, bti, BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, - BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)); + BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)) | + brw_message_desc(devinfo, + unspill_inst->mlen, + unspill_inst->size_written / REG_SIZE, + unspill_inst->header_size)); } _mesa_set_add(spill_insts, unspill_inst); assert(unspill_inst->force_writemask_all || count % reg_size == 0); @@ -857,27 +903,23 @@ fs_reg_alloc::emit_spill(const fs_builder &bld, fs_inst *spill_inst; if (devinfo->verx10 >= 125) { brw_reg offset = build_lane_offsets(bld, spill_offset, ip); - /* We leave the extended descriptor empty and flag the instruction - * relocate the extended descriptor. That way the surface offset is - * directly put into the instruction and we don't need to use a - * register to hold it. 
- */ + brw_reg srcs[] = { - brw_imm_ud(0), /* desc */ - brw_imm_ud(0), /* ex_desc */ - offset, /* payload */ - src, /* payload2 */ + brw_imm_ud(0), /* desc */ + build_ex_desc(bld, reg_size, false), + offset, /* payload */ + src, /* payload2 */ }; spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(), srcs, ARRAY_SIZE(srcs)); spill_inst->sfid = GFX12_SFID_UGM; - spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, - LSC_ADDR_SURFTYPE_SS, - LSC_ADDR_SIZE_A32, - LSC_DATA_SIZE_D32, - 1 /* num_channels */, - false /* transpose */, - LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); + uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_STORE, + LSC_ADDR_SURFTYPE_SS, + LSC_ADDR_SIZE_A32, + LSC_DATA_SIZE_D32, + 1 /* num_channels */, + false /* transpose */, + LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); spill_inst->header_size = 0; spill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, bld.dispatch_width()); @@ -885,14 +927,23 @@ fs_reg_alloc::emit_spill(const fs_builder &bld, spill_inst->size_written = 0; spill_inst->send_has_side_effects = true; spill_inst->send_is_volatile = false; - spill_inst->send_ex_desc_scratch = true; + + spill_inst->src[0] = brw_imm_ud( + desc | + brw_message_desc(devinfo, + spill_inst->mlen, + spill_inst->size_written / REG_SIZE, + spill_inst->header_size)); } else { brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip); const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT; - const brw_reg ex_desc = brw_imm_ud(0); - - brw_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src }; + brw_reg srcs[] = { + brw_imm_ud(0), /* desc */ + brw_imm_ud(0), /* ex_desc */ + header, + src + }; spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(), srcs, ARRAY_SIZE(srcs)); spill_inst->mlen = 1; @@ -902,10 +953,17 @@ fs_reg_alloc::emit_spill(const fs_builder &bld, spill_inst->send_has_side_effects = true; spill_inst->send_is_volatile = false; spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; - spill_inst->desc = + + spill_inst->src[0] = brw_imm_ud( 
brw_dp_desc(devinfo, bti, GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, - BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)); + BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)) | + brw_message_desc(devinfo, + spill_inst->mlen, + spill_inst->size_written / REG_SIZE, + spill_inst->header_size)); + spill_inst->src[1] = brw_imm_ud( + brw_message_ex_desc(devinfo, spill_inst->ex_mlen)); } _mesa_set_add(spill_insts, spill_inst); assert(spill_inst->force_writemask_all || count % reg_size == 0); diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index f9564886625..2cfb15e7167 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -193,10 +193,6 @@ public: bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */ bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */ bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */ - bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use - * the scratch surface offset to build - * extended descriptor - */ bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended * bindless surface offset (26bits instead of * 20bits) diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index 29e15907ec6..abd5c223466 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -1389,6 +1389,9 @@ setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst, * we can use the surface handle directly as the extended descriptor. 
*/ inst->src[1] = retype(surface, BRW_TYPE_UD); + /* Gfx20+ assumes ExBSO with UGM */ + if (devinfo->ver >= 20 && inst->sfid == GFX12_SFID_UGM) + inst->send_ex_bso = true; break; case LSC_ADDR_SURFTYPE_BTI:
@@ -2610,3 +2613,102 @@ brw_lower_uniform_pull_constant_loads(fs_visitor &s)
 
    return progress;
 }
+
+/**
+ * Lower the descriptors of every SHADER_OPCODE_SEND to their final form.
+ *
+ * The bits produced by brw_message_desc() / brw_message_ex_desc() are merged
+ * with inst->desc / inst->ex_desc and with the current value of src[0] /
+ * src[1].  Each source is left either as an immediate or as an address
+ * register written by an exec_all, group(1, 0) instruction emitted through
+ * a builder positioned at the SEND.
+ *
+ * \return true if at least one SEND instruction was rewritten.
+ */
+bool
+brw_lower_send_descriptors(fs_visitor &s)
+{
+   const intel_device_info *devinfo = s.devinfo;
+   bool progress = false;
+
+   foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
+      if (inst->opcode != SHADER_OPCODE_SEND)
+         continue;
+
+      const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);
+
+      /* Descriptor */
+      const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;
+      uint32_t desc_imm = inst->desc |
+         brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
+
+      assert(inst->src[0].file != BAD_FILE);
+      assert(inst->src[1].file != BAD_FILE);
+
+      brw_reg desc = inst->src[0];
+      if (desc.file == IMM) {
+         inst->src[0] = brw_imm_ud(desc.ud | desc_imm);
+      } else {
+         brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
+                                       BRW_ADDRESS_SUBREG_INDIRECT_DESC);
+         ubld.OR(addr_reg, desc, brw_imm_ud(desc_imm));
+         inst->src[0] = addr_reg;
+      }
+
+      /* Extended descriptor */
+      brw_reg ex_desc = inst->src[1];
+      uint32_t ex_desc_imm = inst->ex_desc |
+         brw_message_ex_desc(devinfo, inst->ex_mlen);
+
+      if (ex_desc.file == IMM)
+         ex_desc_imm |= ex_desc.ud;
+
+      /* An address register is needed for any non-immediate extended
+       * descriptor, and before Gfx12 also for an immediate one that has
+       * any bit of INTEL_MASK(15, 12) set.
+       */
+      bool needs_addr_reg = false;
+      if (ex_desc.file != IMM)
+         needs_addr_reg = true;
+      if (devinfo->ver < 12 && ex_desc.file == IMM &&
+          (ex_desc_imm & INTEL_MASK(15, 12)) != 0)
+         needs_addr_reg = true;
+
+      if (inst->send_ex_bso) {
+         needs_addr_reg = true;
+         /* When using the extended bindless offset, the whole extended
+          * descriptor is the surface handle.
+          */
+         ex_desc_imm = 0;
+      } else {
+         if (needs_addr_reg)
+            ex_desc_imm |= inst->sfid | inst->eot << 5;
+      }
+
+      /* NOTE(review): when send_ex_bso is set and src[1] is an immediate,
+       * ex_desc_imm was cleared above, so the MOV below writes 0.  Confirm
+       * that an immediate surface handle cannot reach this point.
+       */
+      if (needs_addr_reg) {
+         brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
+                                       BRW_ADDRESS_SUBREG_INDIRECT_EX_DESC);
+         if (ex_desc.file == IMM)
+            ubld.MOV(addr_reg, brw_imm_ud(ex_desc_imm));
+         else if (ex_desc_imm == 0)
+            ubld.MOV(addr_reg, ex_desc);
+         else
+            ubld.OR(addr_reg, ex_desc, brw_imm_ud(ex_desc_imm));
+         inst->src[1] = addr_reg;
+      } else {
+         inst->src[1] = brw_imm_ud(ex_desc_imm);
+      }
+
+      progress = true;
+   }
+
+   /* Invalidate analyses once after the walk instead of once per SEND. */
+   if (progress)
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_opt.cpp b/src/intel/compiler/brw_opt.cpp index 904aee155cb..4e2b56e16bf 100644 --- a/src/intel/compiler/brw_opt.cpp +++ b/src/intel/compiler/brw_opt.cpp @@ -175,10 +175,20 @@ brw_optimize(fs_visitor &s) if (progress) OPT(brw_lower_simd_width); - OPT(brw_lower_sends_overlapping_payload); - OPT(brw_lower_uniform_pull_constant_loads); + if (OPT(brw_lower_send_descriptors)) { + /* No need for standard copy_propagation since + * brw_opt_address_reg_load will only optimize defs. 
+ */ + if (OPT(brw_opt_copy_propagation_defs)) + OPT(brw_opt_algebraic); + OPT(brw_opt_address_reg_load); + OPT(brw_opt_dead_code_eliminate); + } + + OPT(brw_lower_sends_overlapping_payload); + OPT(brw_lower_indirect_mov); OPT(brw_lower_find_live_channel);
diff --git a/src/intel/compiler/brw_opt_address_reg_load.cpp b/src/intel/compiler/brw_opt_address_reg_load.cpp
new file mode 100644
index 00000000000..3eeaadfdeb1
--- /dev/null
+++ b/src/intel/compiler/brw_opt_address_reg_load.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_builder.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+/** @file brw_opt_address_reg_load.cpp
+ *
+ * Turn this sequence :
+ *
+ *    add(8) vgrf64:UD, vgrf63:UD, 192u
+ *    mov(1) a0.4:UD, vgrf64+0.0<0>:UD
+ *
+ * into :
+ *
+ *    add(1) a0.4:UD, vgrf63+0.0<0>:UD, 192u
+ */
+
+using namespace brw;
+
+/**
+ * Rewrite the address register loads of a single block.
+ *
+ * Every MOV whose destination is an address register and whose source has a
+ * defining instruction in \p defs is replaced by a new instruction with that
+ * defining instruction's opcode and sources, writing the address register.
+ * The defining instruction itself is left in place.
+ * \return true if at least one MOV was replaced.
+ */
+static bool
+opt_address_reg_load_local(fs_visitor &s, bblock_t *block, const brw::def_analysis &defs)
+{
+   bool progress = false;
+
+   foreach_inst_in_block_safe(fs_inst, inst, block) {
+      if (!inst->dst.is_address() || inst->opcode != BRW_OPCODE_MOV)
+         continue;
+
+      fs_inst *src_inst = defs.get(inst->src[0]);
+      if (src_inst == NULL)
+         continue;
+
+      /* sources[] below holds 3 entries; only instructions with at most 2
+       * sources are handled.
+       */
+      if (src_inst->uses_address_register_implicitly() ||
+          src_inst->sources > 2)
+         continue;
+
+      fs_builder ubld = fs_builder(&s).at(block, inst).exec_all().group(1, 0);
+      brw_reg sources[3];
+      for (unsigned i = 0; i < src_inst->sources; i++) {
+         /* Test the file of the source being copied.  inst is the MOV, so
+          * inst->src[i] is not the operand we are rewriting.
+          */
+         sources[i] = src_inst->src[i].file == VGRF ?
+            component(src_inst->src[i], 0) : src_inst->src[i];
+      }
+      ubld.emit(src_inst->opcode, inst->dst, sources, src_inst->sources);
+
+      inst->remove(block);
+
+      progress = true;
+   }
+
+   return progress;
+}
+
+/**
+ * Run the address register load optimization on every block of the shader.
+ *
+ * \return true if any block was changed.
+ */
+bool
+brw_opt_address_reg_load(fs_visitor &s)
+{
+   bool progress = false;
+   const brw::def_analysis &defs = s.def_analysis.require();
+
+   /* opt_address_reg_load_local() already walks the whole block, so call it
+    * once per block.
+    */
+   foreach_block(block, s.cfg) {
+      progress = opt_address_reg_load_local(s, block, defs) || progress;
+   }
+
+   if (progress) {
+      s.cfg->adjust_block_ips();
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   }
+
+   return progress;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 3c9cfd2821f..742baea4803 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -88,6 +88,7 @@ libintel_compiler_brw_files = files(
   'brw_nir_rt.c',
   'brw_nir_rt_builder.h',
   'brw_opt.cpp',
+  'brw_opt_address_reg_load.cpp',
   'brw_opt_algebraic.cpp',
   'brw_opt_bank_conflicts.cpp',
   'brw_opt_cmod_propagation.cpp',