brw: move final send lowering up into the IR

Because we do emit the final send message form in code generation, a
lot of emissions look like this:

  add(8)  vgrf0,    u0, 0x100
  mov(1)   a0.1, vgrf0          # emitted by the generator
  send(8)   ...,  a0.1

By moving address register manipulation in the IR, we can get this
down to:

  add(1)  a0.1,   u0, 0x100
  send(8)  ..., a0.1

This reduces register pressure around some send messages by 1 vgrf.

All lost shaders in the below results are fragment SIMD32, due to the
throughput estimator. If turned off, we lose no SIMD32 shaders with
this change.

DG2 results:

  Assassin's Creed Valhalla:
  Totals from 2044 (96.87% of 2110) affected shaders:
  Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00%
  Subgroup size: 23832 -> 23824 (-0.03%)
  Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82%
  Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39%
  Fill count: 2005 -> 1256 (-37.36%)
  Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00%
  Max live registers: 116765 -> 115058 (-1.46%)
  Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67%

  Cyberpunk 2077:
  Totals from 1181 (93.43% of 1264) affected shaders:
  Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01%
  Subgroup size: 13016 -> 13032 (+0.12%)
  Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39%
  Spill count: 12 -> 8 (-33.33%)
  Fill count: 9 -> 6 (-33.33%)

  Dota2:
  Totals from 173 (11.59% of 1493) affected shaders:
  Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34%
  Max live registers: 5787 -> 5779 (-0.14%)
  Max dispatch width: 1344 -> 1152 (-14.29%)

  Hitman3:
  Totals from 5072 (95.39% of 5317) affected shaders:
  Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00%
  Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48%
  Spill count: 3942 -> 3200 (-18.82%)
  Fill count: 10158 -> 8846 (-12.92%)
  Scratch Memory Size: 257024 -> 223232 (-13.15%)
  Max live registers: 328467 -> 324631 (-1.17%)
  Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73%

  Fortnite:
  Totals from 360 (4.82% of 7472) affected shaders:
  Instrs: 778068 -> 777925 (-0.02%)
  Subgroup size: 3128 -> 3136 (+0.26%)
  Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19%
  Max live registers: 50689 -> 50658 (-0.06%)

  Hogwarts Legacy:
  Totals from 1376 (84.00% of 1638) affected shaders:
  Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03%
  Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12%
  Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36%
  Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23%
  Scratch Memory Size: 99328 -> 89088 (-10.31%)
  Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23%
  Max dispatch width: 11848 -> 11920 (+0.61%)

  Metro Exodus:
  Totals from 92 (0.21% of 43072) affected shaders:
  Instrs: 262995 -> 262968 (-0.01%)
  Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25%
  Max live registers: 11152 -> 11140 (-0.11%)

  Red Dead Redemption 2:
  Totals from 451 (7.71% of 5847) affected shaders:
  Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00%
  Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00%
  Max live registers: 42294 -> 42185 (-0.26%)

  Spiderman Remastered:
  Totals from 6820 (98.02% of 6958) affected shaders:
  Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65%
  Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25%
  Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61%
  Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58%
  Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74%
  Max live registers: 493149 -> 487458 (-1.15%)
  Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20%

  Strange Brigade:
  Totals from 3769 (91.21% of 4132) affected shaders:
  Instrs: 1354476 -> 1321474 (-2.44%)
  Cycle count: 25351530 -> 25339190 (-0.05%); split: -1.64%, +1.59%
  Max live registers: 199057 -> 193656 (-2.71%)
  Max dispatch width: 30272 -> 30240 (-0.11%)

  Witcher 3:
  Totals from 25 (2.40% of 1041) affected shaders:
  Instrs: 24621 -> 24606 (-0.06%)
  Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05%
  Max live registers: 1963 -> 1955 (-0.41%)

LNL results:

  Assassin's Creed Valhalla:
  Totals from 1928 (98.02% of 1967) affected shaders:
  Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11%
  Subgroup size: 41264 -> 41280 (+0.04%)
  Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11%
  Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90%
  Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60%
  Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56%
  Max live registers: 205483 -> 202192 (-1.60%)

  Cyberpunk 2077:
  Totals from 1177 (96.40% of 1221) affected shaders:
  Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03%
  Subgroup size: 24912 -> 24944 (+0.13%)
  Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81%
  Spill count: 8 -> 3 (-62.50%)
  Fill count: 6 -> 3 (-50.00%)
  Max live registers: 126922 -> 125472 (-1.14%)

  Dota2:
  Totals from 428 (32.47% of 1318) affected shaders:
  Instrs: 89355 -> 89740 (+0.43%)
  Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55%
  Max live registers: 32863 -> 32847 (-0.05%)

  Fortnite:
  Totals from 5354 (81.72% of 6552) affected shaders:
  Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53%
  Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65%
  Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72%
  Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35%
  Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71%

  Hitman3:
  Totals from 4912 (97.09% of 5059) affected shaders:
  Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00%
  Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55%
  Spill count: 3739 -> 3136 (-16.13%)
  Fill count: 10657 -> 9564 (-10.26%)
  Scratch Memory Size: 373760 -> 318464 (-14.79%)
  Max live registers: 597566 -> 589460 (-1.36%)

  Hogwarts Legacy:
  Totals from 1471 (96.33% of 1527) affected shaders:
  Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05%
  Cycle count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68%
  Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95%
  Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83%
  Scratch Memory Size: 251904 -> 217088 (-13.82%)
  Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12%

  Metro Exodus:
  Totals from 18356 (49.81% of 36854) affected shaders:
  Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83%
  Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84%
  Spill count: 595 -> 546 (-8.24%)
  Fill count: 1604 -> 1408 (-12.22%)
  Max live registers: 2086937 -> 2086933 (-0.00%)

  Red Dead Redemption 2:
  Totals from 4171 (79.31% of 5259) affected shaders:
  Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83%
  Subgroup size: 86416 -> 86432 (+0.02%)
  Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53%
  Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59%
  Scratch Memory Size: 401408 -> 385024 (-4.08%)

  Spiderman Remastered:
  Totals from 6639 (98.94% of 6710) affected shaders:
  Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98%
  Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59%
  Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82%
  Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76%
  Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17%
  Max live registers: 918240 -> 906604 (-1.27%)

  Strange Brigade:
  Totals from 3675 (92.24% of 3984) affected shaders:
  Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00%
  Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09%
  Max live registers: 361849 -> 351265 (-2.92%)

  Witcher 3:
  Totals from 13 (46.43% of 28) affected shaders:
  Instrs: 593 -> 660 (+11.30%)
  Cycle count: 28302 -> 28714 (+1.46%)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
This commit is contained in:
Lionel Landwerlin 2024-02-29 20:51:50 +02:00 committed by Marge Bot
parent a27d98e933
commit 8ac7802ac8
13 changed files with 320 additions and 214 deletions

View file

@ -627,15 +627,23 @@ brw_emit_repclear_shader(fs_visitor &s)
write = bld.emit(SHADER_OPCODE_SEND); write = bld.emit(SHADER_OPCODE_SEND);
write->resize_sources(3); write->resize_sources(3);
/* We can use a headerless message for the first render target */
write->header_size = i == 0 ? 0 : 2;
write->mlen = 1 + write->header_size;
write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE; write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
write->src[0] = brw_imm_ud(0); write->src[0] = brw_imm_ud(
brw_fb_write_desc(
s.devinfo, i,
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
i == key->nr_color_regions - 1, false) |
brw_message_desc(s.devinfo, write->mlen,
0 /* rlen */, write->header_size));
write->src[1] = brw_imm_ud(0); write->src[1] = brw_imm_ud(0);
write->src[2] = i == 0 ? color_output : header; write->src[2] = i == 0 ? color_output : header;
write->check_tdr = true; write->check_tdr = true;
write->send_has_side_effects = true; write->send_has_side_effects = true;
write->desc = brw_fb_write_desc(s.devinfo, i,
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
i == key->nr_color_regions - 1, false);
/* We can use a headerless message for the first render target */ /* We can use a headerless message for the first render target */
write->header_size = i == 0 ? 0 : 2; write->header_size = i == 0 ? 0 : 2;

View file

@ -1430,7 +1430,6 @@ brw_send_indirect_message(struct brw_codegen *p,
struct brw_reg dst, struct brw_reg dst,
struct brw_reg payload, struct brw_reg payload,
struct brw_reg desc, struct brw_reg desc,
unsigned desc_imm,
bool eot); bool eot);
void void
@ -1440,10 +1439,8 @@ brw_send_indirect_split_message(struct brw_codegen *p,
struct brw_reg payload0, struct brw_reg payload0,
struct brw_reg payload1, struct brw_reg payload1,
struct brw_reg desc, struct brw_reg desc,
unsigned desc_imm,
struct brw_reg ex_desc, struct brw_reg ex_desc,
unsigned ex_desc_imm, unsigned ex_mlen,
bool ex_desc_scratch,
bool ex_bso, bool ex_bso,
bool eot); bool eot);

View file

@ -816,6 +816,13 @@ enum ENUM_PACKED gfx10_align1_3src_exec_type {
#define BRW_THREAD_ATOMIC 1 #define BRW_THREAD_ATOMIC 1
#define BRW_THREAD_SWITCH 2 #define BRW_THREAD_SWITCH 2
/* Subregister of the address register used for particular purposes */
enum brw_address_subreg {
BRW_ADDRESS_SUBREG_INDIRECT_DESC = 0,
BRW_ADDRESS_SUBREG_INDIRECT_EX_DESC = 2,
BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC = 4,
};
enum ENUM_PACKED brw_vertical_stride { enum ENUM_PACKED brw_vertical_stride {
BRW_VERTICAL_STRIDE_0 = 0, BRW_VERTICAL_STRIDE_0 = 0,
BRW_VERTICAL_STRIDE_1 = 1, BRW_VERTICAL_STRIDE_1 = 1,

View file

@ -1438,7 +1438,6 @@ brw_send_indirect_message(struct brw_codegen *p,
struct brw_reg dst, struct brw_reg dst,
struct brw_reg payload, struct brw_reg payload,
struct brw_reg desc, struct brw_reg desc,
unsigned desc_imm,
bool eot) bool eot)
{ {
const struct intel_device_info *devinfo = p->devinfo; const struct intel_device_info *devinfo = p->devinfo;
@ -1451,35 +1450,16 @@ brw_send_indirect_message(struct brw_codegen *p,
if (desc.file == IMM) { if (desc.file == IMM) {
send = next_insn(p, BRW_OPCODE_SEND); send = next_insn(p, BRW_OPCODE_SEND);
brw_set_src0(p, send, retype(payload, BRW_TYPE_UD)); brw_set_src0(p, send, retype(payload, BRW_TYPE_UD));
brw_set_desc(p, send, desc.ud | desc_imm); brw_set_desc(p, send, desc.ud);
} else { } else {
const struct tgl_swsb swsb = brw_get_default_swsb(p); assert(desc.file == ADDRESS);
struct brw_reg addr = retype(brw_address_reg(0), BRW_TYPE_UD); assert(desc.subnr == 0);
brw_push_insn_state(p);
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_set_default_exec_size(p, BRW_EXECUTE_1);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
brw_set_default_flag_reg(p, 0, 0);
brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
/* Load the indirect descriptor to an address register using OR so the
* caller can specify additional descriptor bits with the desc_imm
* immediate.
*/
brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
brw_pop_insn_state(p);
brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
send = next_insn(p, BRW_OPCODE_SEND); send = next_insn(p, BRW_OPCODE_SEND);
brw_set_src0(p, send, retype(payload, BRW_TYPE_UD)); brw_set_src0(p, send, retype(payload, BRW_TYPE_UD));
if (devinfo->ver >= 12) if (devinfo->ver >= 12)
brw_eu_inst_set_send_sel_reg32_desc(devinfo, send, true); brw_eu_inst_set_send_sel_reg32_desc(devinfo, send, true);
else else
brw_set_src1(p, send, addr); brw_set_src1(p, send, desc);
} }
brw_set_dest(p, send, dst); brw_set_dest(p, send, dst);
@ -1494,10 +1474,8 @@ brw_send_indirect_split_message(struct brw_codegen *p,
struct brw_reg payload0, struct brw_reg payload0,
struct brw_reg payload1, struct brw_reg payload1,
struct brw_reg desc, struct brw_reg desc,
unsigned desc_imm,
struct brw_reg ex_desc, struct brw_reg ex_desc,
unsigned ex_desc_imm, unsigned ex_mlen,
bool ex_desc_scratch,
bool ex_bso, bool ex_bso,
bool eot) bool eot)
{ {
@ -1508,105 +1486,6 @@ brw_send_indirect_split_message(struct brw_codegen *p,
assert(desc.type == BRW_TYPE_UD); assert(desc.type == BRW_TYPE_UD);
if (desc.file == IMM) {
desc.ud |= desc_imm;
} else {
const struct tgl_swsb swsb = brw_get_default_swsb(p);
struct brw_reg addr = retype(brw_address_reg(0), BRW_TYPE_UD);
brw_push_insn_state(p);
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_set_default_exec_size(p, BRW_EXECUTE_1);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
brw_set_default_flag_reg(p, 0, 0);
brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
/* Load the indirect descriptor to an address register using OR so the
* caller can specify additional descriptor bits with the desc_imm
* immediate.
*/
brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
brw_pop_insn_state(p);
desc = addr;
brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
}
if (ex_desc.file == IMM &&
!ex_desc_scratch &&
(devinfo->ver >= 12 ||
((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
/* ATS-M PRMs, Volume 2d: Command Reference: Structures,
* EU_INSTRUCTION_SEND instruction
*
* "ExBSO: Exists If: ([ExDesc.IsReg]==true)"
*/
assert(!ex_bso);
ex_desc.ud |= ex_desc_imm;
} else {
const struct tgl_swsb swsb = brw_get_default_swsb(p);
struct brw_reg addr = retype(brw_address_reg(2), BRW_TYPE_UD);
/* On Xe2+ ExBSO addressing is implicitly enabled for the UGM
* shared function.
*/
ex_bso |= (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM);
brw_push_insn_state(p);
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_set_default_exec_size(p, BRW_EXECUTE_1);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
brw_set_default_flag_reg(p, 0, 0);
brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
/* Load the indirect extended descriptor to an address register using OR
* so the caller can specify additional descriptor bits with the
* desc_imm immediate.
*
* Even though the instruction dispatcher always pulls the SFID and EOT
* fields from the instruction itself, actual external unit which
* processes the message gets the SFID and EOT from the extended
* descriptor which comes from the address register. If we don't OR
* those two bits in, the external unit may get confused and hang.
*/
unsigned imm_part = ex_bso ? 0 : (ex_desc_imm | sfid | eot << 5);
if (ex_desc_scratch) {
assert(devinfo->verx10 >= 125);
brw_AND(p, addr,
retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 10)));
if (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM) {
const unsigned ex_mlen = brw_message_ex_desc_ex_mlen(devinfo, ex_desc_imm);
assert(ex_desc_imm == brw_message_ex_desc(devinfo, ex_mlen));
brw_SHR(p, addr, addr, brw_imm_ud(4));
} else {
/* Or the scratch surface offset together with the immediate part
* of the extended descriptor.
*/
brw_OR(p, addr, addr, brw_imm_ud(imm_part));
}
} else if (ex_desc.file == IMM) {
/* ex_desc bits 15:12 don't exist in the instruction encoding prior
* to Gfx12, so we may have fallen back to an indirect extended
* descriptor.
*/
brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
} else {
brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
}
brw_pop_insn_state(p);
ex_desc = addr;
brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
}
send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS); send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
brw_set_dest(p, send, dst); brw_set_dest(p, send, dst);
brw_set_src0(p, send, retype(payload0, BRW_TYPE_UD)); brw_set_src0(p, send, retype(payload0, BRW_TYPE_UD));
@ -1630,11 +1509,9 @@ brw_send_indirect_split_message(struct brw_codegen *p,
brw_eu_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1); brw_eu_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
brw_eu_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2); brw_eu_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2);
if (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM) { if (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM)
const unsigned ex_mlen = brw_message_ex_desc_ex_mlen(devinfo, ex_desc_imm);
brw_eu_inst_set_bits(send, 103, 99, ex_mlen / reg_unit(devinfo)); brw_eu_inst_set_bits(send, 103, 99, ex_mlen / reg_unit(devinfo));
} }
}
if (ex_bso) { if (ex_bso) {
/* The send instruction ExBSO field does not exist with UGM on Gfx20+, /* The send instruction ExBSO field does not exist with UGM on Gfx20+,
@ -1644,7 +1521,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
*/ */
if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM) if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM)
brw_eu_inst_set_send_ex_bso(devinfo, send, true); brw_eu_inst_set_send_ex_bso(devinfo, send, true);
brw_eu_inst_set_send_src1_len(devinfo, send, GET_BITS(ex_desc_imm, 10, 6)); brw_eu_inst_set_send_src1_len(devinfo, send, ex_mlen / reg_unit(devinfo));
} }
brw_eu_inst_set_sfid(devinfo, send, sfid); brw_eu_inst_set_sfid(devinfo, send, sfid);
brw_eu_inst_set_eot(devinfo, send, eot); brw_eu_inst_set_eot(devinfo, send, eot);

View file

@ -958,7 +958,7 @@ fs_visitor::assign_curb_setup()
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4); fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
send->sfid = GFX12_SFID_UGM; send->sfid = GFX12_SFID_UGM;
send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SURFTYPE_FLAT,
LSC_ADDR_SIZE_A32, LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32, LSC_DATA_SIZE_D32,
@ -971,6 +971,12 @@ fs_visitor::assign_curb_setup()
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE; lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
send->send_is_volatile = true; send->send_is_volatile = true;
send->src[0] = brw_imm_ud(desc |
brw_message_desc(devinfo,
send->mlen,
send->size_written / REG_SIZE,
send->header_size));
i += num_regs; i += num_regs;
} }

View file

@ -635,6 +635,7 @@ bool brw_lower_pack(fs_visitor &s);
bool brw_lower_regioning(fs_visitor &s); bool brw_lower_regioning(fs_visitor &s);
bool brw_lower_scalar_fp64_MAD(fs_visitor &s); bool brw_lower_scalar_fp64_MAD(fs_visitor &s);
bool brw_lower_scoreboard(fs_visitor &s); bool brw_lower_scoreboard(fs_visitor &s);
bool brw_lower_send_descriptors(fs_visitor &s);
bool brw_lower_sends_overlapping_payload(fs_visitor &s); bool brw_lower_sends_overlapping_payload(fs_visitor &s);
bool brw_lower_simd_width(fs_visitor &s); bool brw_lower_simd_width(fs_visitor &s);
bool brw_lower_sub_sat(fs_visitor &s); bool brw_lower_sub_sat(fs_visitor &s);
@ -642,6 +643,7 @@ bool brw_lower_subgroup_ops(fs_visitor &s);
bool brw_lower_uniform_pull_constant_loads(fs_visitor &s); bool brw_lower_uniform_pull_constant_loads(fs_visitor &s);
void brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s); void brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s);
bool brw_opt_address_reg_load(fs_visitor &s);
bool brw_opt_algebraic(fs_visitor &s); bool brw_opt_algebraic(fs_visitor &s);
bool brw_opt_bank_conflicts(fs_visitor &s); bool brw_opt_bank_conflicts(fs_visitor &s);
bool brw_opt_cmod_propagation(fs_visitor &s); bool brw_opt_cmod_propagation(fs_visitor &s);

View file

@ -167,31 +167,20 @@ fs_generator::generate_send(fs_inst *inst,
struct brw_reg payload, struct brw_reg payload,
struct brw_reg payload2) struct brw_reg payload2)
{ {
const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE; if (ex_desc.file == IMM && ex_desc.ud == 0) {
brw_send_indirect_message(p, inst->sfid, dst, payload, desc, inst->eot);
uint32_t desc_imm = inst->desc | if (inst->check_tdr)
brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size); brw_eu_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC);
} else {
uint32_t ex_desc_imm = inst->ex_desc |
brw_message_ex_desc(devinfo, inst->ex_mlen);
if (ex_desc.file != IMM || ex_desc.ud || ex_desc_imm ||
inst->send_ex_desc_scratch) {
/* If we have any sort of extended descriptor, then we need SENDS. This /* If we have any sort of extended descriptor, then we need SENDS. This
* also covers the dual-payload case because ex_mlen goes in ex_desc. * also covers the dual-payload case because ex_mlen goes in ex_desc.
*/ */
brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2, brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
desc, desc_imm, ex_desc, ex_desc_imm, desc, ex_desc, inst->ex_mlen,
inst->send_ex_desc_scratch,
inst->send_ex_bso, inst->eot); inst->send_ex_bso, inst->eot);
if (inst->check_tdr) if (inst->check_tdr)
brw_eu_inst_set_opcode(p->isa, brw_last_inst, brw_eu_inst_set_opcode(p->isa, brw_last_inst,
devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC); devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
} else {
brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
inst->eot);
if (inst->check_tdr)
brw_eu_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC);
} }
} }

View file

@ -222,12 +222,6 @@ void fs_visitor::calculate_payload_ranges(bool allow_spilling,
} }
} }
/* The generator implicitly uses g0 to construct extended message
* descriptors for scratch send messages when this bit is set.
*/
if (inst->send_ex_desc_scratch)
payload_last_use_ip[0] = use_ip;
ip++; ip++;
} }
@ -294,6 +288,8 @@ private:
void build_interference_graph(bool allow_spilling); void build_interference_graph(bool allow_spilling);
brw_reg build_ex_desc(const fs_builder &bld, unsigned reg_size, bool unspill);
brw_reg build_lane_offsets(const fs_builder &bld, brw_reg build_lane_offsets(const fs_builder &bld,
uint32_t spill_offset, int ip); uint32_t spill_offset, int ip);
brw_reg build_single_offset(const fs_builder &bld, brw_reg build_single_offset(const fs_builder &bld,
@ -689,6 +685,44 @@ fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset,
return offset; return offset;
} }
brw_reg
fs_reg_alloc::build_ex_desc(const fs_builder &bld, unsigned reg_size, bool unspill)
{
/* Use a different area of the address register than what is used in
* brw_lower_logical_sends.c (brw_address_reg(2)) so we don't have
* interactions between the spill/fill instructions and the other send
* messages.
*/
brw_reg ex_desc = bld.vaddr(BRW_TYPE_UD,
BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC);
fs_inst *inst = bld.exec_all().group(1, 0).AND(
ex_desc,
retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 10)));
_mesa_set_add(spill_insts, inst);
const intel_device_info *devinfo = bld.shader->devinfo;
if (devinfo->verx10 >= 200) {
inst = bld.exec_all().group(1, 0).SHR(
ex_desc, ex_desc, brw_imm_ud(4));
_mesa_set_add(spill_insts, inst);
} else {
if (unspill) {
inst = bld.exec_all().group(1, 0).OR(
ex_desc, ex_desc, brw_imm_ud(GFX12_SFID_UGM));
_mesa_set_add(spill_insts, inst);
} else {
inst = bld.exec_all().group(1, 0).OR(
ex_desc, ex_desc,
brw_imm_ud(brw_message_ex_desc(devinfo, reg_size) |
GFX12_SFID_UGM));
_mesa_set_add(spill_insts, inst);
}
}
return ex_desc;
}
brw_reg brw_reg
fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip) fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip)
{ {
@ -782,28 +816,26 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
} else { } else {
offset = build_lane_offsets(ubld, spill_offset, ip); offset = build_lane_offsets(ubld, spill_offset, ip);
} }
/* We leave the extended descriptor empty and flag the instruction to
* ask the generated to insert the extended descriptor in the address
* register. That way we don't need to burn an additional register
* for register allocation spill/fill.
*/
brw_reg srcs[] = { brw_reg srcs[] = {
brw_imm_ud(0), /* desc */ brw_imm_ud(0), /* desc */
brw_imm_ud(0), /* ex_desc */ build_ex_desc(bld, reg_size, true),
offset, /* payload */ offset, /* payload */
brw_reg(), /* payload2 */ brw_reg(), /* payload2 */
}; };
unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst, uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
srcs, ARRAY_SIZE(srcs));
unspill_inst->sfid = GFX12_SFID_UGM;
unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
LSC_ADDR_SURFTYPE_SS, LSC_ADDR_SURFTYPE_SS,
LSC_ADDR_SIZE_A32, LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32, LSC_DATA_SIZE_D32,
use_transpose ? reg_size * 8 : 1 /* num_channels */, use_transpose ? reg_size * 8 : 1 /* num_channels */,
use_transpose, use_transpose,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
srcs, ARRAY_SIZE(srcs));
unspill_inst->sfid = GFX12_SFID_UGM;
unspill_inst->header_size = 0; unspill_inst->header_size = 0;
unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
unspill_inst->exec_size); unspill_inst->exec_size);
@ -812,14 +844,23 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE; lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
unspill_inst->send_has_side_effects = false; unspill_inst->send_has_side_effects = false;
unspill_inst->send_is_volatile = true; unspill_inst->send_is_volatile = true;
unspill_inst->send_ex_desc_scratch = true;
unspill_inst->src[0] = brw_imm_ud(
desc |
brw_message_desc(devinfo,
unspill_inst->mlen,
unspill_inst->size_written / REG_SIZE,
unspill_inst->header_size));
} else { } else {
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip); brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT; const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
const brw_reg ex_desc = brw_imm_ud(0);
brw_reg srcs[] = { brw_imm_ud(0), ex_desc, header }; brw_reg srcs[] = {
brw_imm_ud(0), /* desc */
brw_imm_ud(0), /* ex_desc */
header
};
unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst, unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
srcs, ARRAY_SIZE(srcs)); srcs, ARRAY_SIZE(srcs));
unspill_inst->mlen = 1; unspill_inst->mlen = 1;
@ -828,10 +869,15 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
unspill_inst->send_has_side_effects = false; unspill_inst->send_has_side_effects = false;
unspill_inst->send_is_volatile = true; unspill_inst->send_is_volatile = true;
unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
unspill_inst->desc =
unspill_inst->src[0] = brw_imm_ud(
brw_dp_desc(devinfo, bti, brw_dp_desc(devinfo, bti,
BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)); BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)) |
brw_message_desc(devinfo,
unspill_inst->mlen,
unspill_inst->size_written / REG_SIZE,
unspill_inst->header_size));
} }
_mesa_set_add(spill_insts, unspill_inst); _mesa_set_add(spill_insts, unspill_inst);
assert(unspill_inst->force_writemask_all || count % reg_size == 0); assert(unspill_inst->force_writemask_all || count % reg_size == 0);
@ -857,21 +903,17 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
fs_inst *spill_inst; fs_inst *spill_inst;
if (devinfo->verx10 >= 125) { if (devinfo->verx10 >= 125) {
brw_reg offset = build_lane_offsets(bld, spill_offset, ip); brw_reg offset = build_lane_offsets(bld, spill_offset, ip);
/* We leave the extended descriptor empty and flag the instruction
* relocate the extended descriptor. That way the surface offset is
* directly put into the instruction and we don't need to use a
* register to hold it.
*/
brw_reg srcs[] = { brw_reg srcs[] = {
brw_imm_ud(0), /* desc */ brw_imm_ud(0), /* desc */
brw_imm_ud(0), /* ex_desc */ build_ex_desc(bld, reg_size, false),
offset, /* payload */ offset, /* payload */
src, /* payload2 */ src, /* payload2 */
}; };
spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(), spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
srcs, ARRAY_SIZE(srcs)); srcs, ARRAY_SIZE(srcs));
spill_inst->sfid = GFX12_SFID_UGM; spill_inst->sfid = GFX12_SFID_UGM;
spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
LSC_ADDR_SURFTYPE_SS, LSC_ADDR_SURFTYPE_SS,
LSC_ADDR_SIZE_A32, LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32, LSC_DATA_SIZE_D32,
@ -885,14 +927,23 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
spill_inst->size_written = 0; spill_inst->size_written = 0;
spill_inst->send_has_side_effects = true; spill_inst->send_has_side_effects = true;
spill_inst->send_is_volatile = false; spill_inst->send_is_volatile = false;
spill_inst->send_ex_desc_scratch = true;
spill_inst->src[0] = brw_imm_ud(
desc |
brw_message_desc(devinfo,
spill_inst->mlen,
spill_inst->size_written / REG_SIZE,
spill_inst->header_size));
} else { } else {
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip); brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT; const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
const brw_reg ex_desc = brw_imm_ud(0); brw_reg srcs[] = {
brw_imm_ud(0), /* desc */
brw_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src }; brw_imm_ud(0), /* ex_desc */
header,
src
};
spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(), spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
srcs, ARRAY_SIZE(srcs)); srcs, ARRAY_SIZE(srcs));
spill_inst->mlen = 1; spill_inst->mlen = 1;
@ -902,10 +953,17 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
spill_inst->send_has_side_effects = true; spill_inst->send_has_side_effects = true;
spill_inst->send_is_volatile = false; spill_inst->send_is_volatile = false;
spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
spill_inst->desc =
spill_inst->src[0] = brw_imm_ud(
brw_dp_desc(devinfo, bti, brw_dp_desc(devinfo, bti,
GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)); BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)) |
brw_message_desc(devinfo,
spill_inst->mlen,
spill_inst->size_written / REG_SIZE,
spill_inst->header_size));
spill_inst->src[1] = brw_imm_ud(
brw_message_ex_desc(devinfo, spill_inst->ex_mlen));
} }
_mesa_set_add(spill_insts, spill_inst); _mesa_set_add(spill_insts, spill_inst);
assert(spill_inst->force_writemask_all || count % reg_size == 0); assert(spill_inst->force_writemask_all || count % reg_size == 0);

View file

@ -193,10 +193,6 @@ public:
bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */ bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */ bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */ bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use
* the scratch surface offset to build
* extended descriptor
*/
bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended
* bindless surface offset (26bits instead of * bindless surface offset (26bits instead of
* 20bits) * 20bits)

View file

@ -1389,6 +1389,9 @@ setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst,
* we can use the surface handle directly as the extended descriptor. * we can use the surface handle directly as the extended descriptor.
*/ */
inst->src[1] = retype(surface, BRW_TYPE_UD); inst->src[1] = retype(surface, BRW_TYPE_UD);
/* Gfx20+ assumes ExBSO with UGM */
if (devinfo->ver >= 20 && inst->sfid == GFX12_SFID_UGM)
inst->send_ex_bso = true;
break; break;
case LSC_ADDR_SURFTYPE_BTI: case LSC_ADDR_SURFTYPE_BTI:
@ -2610,3 +2613,80 @@ brw_lower_uniform_pull_constant_loads(fs_visitor &s)
return progress; return progress;
} }
/* Lower SHADER_OPCODE_SEND's logical desc/ex_desc sources into their final
 * HW form: fold all immediate descriptor bits (message length, response
 * length, header, SFID, EOT, ...) into the sources, and materialize an
 * address-register load (a0.N) whenever the descriptor cannot be encoded as
 * an immediate on the SEND itself.  Doing this in the IR (instead of at
 * code generation) lets later passes (copy propagation, address-reg load
 * folding, DCE) clean up the descriptor computation.
 *
 * Returns true if any SEND was rewritten.
 */
bool
brw_lower_send_descriptors(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND)
         continue;

      /* Descriptor setup is scalar: a single-channel, force_writemask_all
       * builder placed right before the SEND.
       */
      const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);

      /* Descriptor: fold the message-length/response-length/header bits
       * into the immediate part.
       */
      const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;
      uint32_t desc_imm = inst->desc |
         brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);

      assert(inst->src[0].file != BAD_FILE);
      assert(inst->src[1].file != BAD_FILE);

      brw_reg desc = inst->src[0];
      if (desc.file == IMM) {
         inst->src[0] = brw_imm_ud(desc.ud | desc_imm);
      } else {
         /* Dynamic descriptor: OR the immediate bits into an address
          * register the SEND will reference indirectly.
          */
         brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
                                       BRW_ADDRESS_SUBREG_INDIRECT_DESC);
         ubld.OR(addr_reg, desc, brw_imm_ud(desc_imm));
         inst->src[0] = addr_reg;
      }

      /* Extended descriptor */
      brw_reg ex_desc = inst->src[1];
      uint32_t ex_desc_imm = inst->ex_desc |
         brw_message_ex_desc(devinfo, inst->ex_mlen);
      if (ex_desc.file == IMM)
         ex_desc_imm |= ex_desc.ud;

      bool needs_addr_reg = false;
      if (ex_desc.file != IMM)
         needs_addr_reg = true;
      /* Prior to Gfx12, bits 15:12 of the extended descriptor cannot be
       * encoded as a SEND immediate, so they must go through an address
       * register.
       */
      if (devinfo->ver < 12 && ex_desc.file == IMM &&
          (ex_desc_imm & INTEL_MASK(15, 12)) != 0)
         needs_addr_reg = true;

      if (inst->send_ex_bso) {
         needs_addr_reg = true;
         /* When using the extended bindless offset, the whole extended
          * descriptor is the surface handle.
          */
         ex_desc_imm = 0;
      } else {
         /* When going through a0, SFID and EOT live in the register value
          * rather than in the instruction encoding.
          */
         if (needs_addr_reg)
            ex_desc_imm |= inst->sfid | inst->eot << 5;
      }

      if (needs_addr_reg) {
         brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
                                       BRW_ADDRESS_SUBREG_INDIRECT_EX_DESC);
         if (ex_desc.file == IMM)
            ubld.MOV(addr_reg, brw_imm_ud(ex_desc_imm));
         else if (ex_desc_imm == 0)
            ubld.MOV(addr_reg, ex_desc);
         else
            ubld.OR(addr_reg, ex_desc, brw_imm_ud(ex_desc_imm));
         inst->src[1] = addr_reg;
      } else {
         inst->src[1] = brw_imm_ud(ex_desc_imm);
      }

      progress = true;
   }

   /* Hoisted out of the loop: one invalidation after the pass is enough;
    * the original invalidated once per lowered SEND.
    */
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

View file

@ -175,10 +175,20 @@ brw_optimize(fs_visitor &s)
if (progress) if (progress)
OPT(brw_lower_simd_width); OPT(brw_lower_simd_width);
OPT(brw_lower_sends_overlapping_payload);
OPT(brw_lower_uniform_pull_constant_loads); OPT(brw_lower_uniform_pull_constant_loads);
if (OPT(brw_lower_send_descriptors)) {
/* No need for standard copy_propagation since
* brw_fs_opt_address_reg_load will only optimize defs.
*/
if (OPT(brw_opt_copy_propagation_defs))
OPT(brw_opt_algebraic);
OPT(brw_opt_address_reg_load);
OPT(brw_opt_dead_code_eliminate);
}
OPT(brw_lower_sends_overlapping_payload);
OPT(brw_lower_indirect_mov); OPT(brw_lower_indirect_mov);
OPT(brw_lower_find_live_channel); OPT(brw_lower_find_live_channel);

View file

@ -0,0 +1,75 @@
/*
* Copyright © 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
#include "brw_eu.h"
/** @file brw_fs_opt_address_reg_load.cpp
*
 * Turn this sequence:
*
* add(8) vgrf64:UD, vgrf63:UD, 192u
* mov(1) a0.4:UD, vgrf64+0.0<0>:UD
*
* into :
*
* add(1) a0.4:UD, vgrf63+0.0<0>:UD, 192u
*/
using namespace brw;
/* Fold a def-producing ALU instruction directly into the MOV that loads its
 * result into an address register, e.g.:
 *
 *    add(8) vgrf64:UD, vgrf63:UD, 192u
 *    mov(1) a0.4:UD, vgrf64+0.0<0>:UD
 * =>
 *    add(1) a0.4:UD, vgrf63+0.0<0>:UD, 192u
 *
 * Returns true if any MOV in the block was replaced.
 */
static bool
opt_address_reg_load_local(fs_visitor &s, bblock_t *block, const brw::def_analysis &defs)
{
   bool progress = false;

   foreach_inst_in_block_safe(fs_inst, inst, block) {
      /* Only MOVs whose destination is an address register are candidates. */
      if (!inst->dst.is_address() || inst->opcode != BRW_OPCODE_MOV)
         continue;

      /* The source must have a single (SSA-like) def we can sink. */
      fs_inst *src_inst = defs.get(inst->src[0]);
      if (src_inst == NULL)
         continue;

      /* Skip defs that already touch the address register themselves, and
       * limit to <= 2 sources so the scalar rebuild below stays simple.
       */
      if (src_inst->uses_address_register_implicitly() ||
          src_inst->sources > 2)
         continue;

      fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);

      brw_reg sources[3];
      for (unsigned i = 0; i < src_inst->sources; i++) {
         /* Bug fix: test the def's sources, not the MOV's.  The MOV only
          * has one source, so indexing inst->src[i] for i >= 1 read past
          * its source array and applied the VGRF check to the wrong
          * instruction.  VGRF sources are narrowed to component 0 since we
          * re-emit the op at execution size 1.
          */
         sources[i] = src_inst->src[i].file == VGRF ?
            component(src_inst->src[i], 0) : src_inst->src[i];
      }

      ubld.emit(src_inst->opcode, inst->dst, sources, src_inst->sources);
      inst->remove(block);

      progress = true;
   }

   return progress;
}
/* Run the address-register load folding over every block of the program.
 *
 * Returns true if anything changed; invalidates instruction-dependent
 * analyses and fixes up block instruction counts when it does.
 */
bool
brw_opt_address_reg_load(fs_visitor &s)
{
   bool progress = false;
   const brw::def_analysis &defs = s.def_analysis.require();

   /* Bug fix: the helper already walks every instruction of the block, so
    * call it once per block.  The original wrapped it in an extra
    * per-instruction loop, re-scanning the whole block for each
    * instruction (accidental O(n^2), with redundant rescans).
    */
   foreach_block(block, s.cfg)
      progress = opt_address_reg_load_local(s, block, defs) || progress;

   if (progress) {
      s.cfg->adjust_block_ips();
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   return progress;
}

View file

@ -88,6 +88,7 @@ libintel_compiler_brw_files = files(
'brw_nir_rt.c', 'brw_nir_rt.c',
'brw_nir_rt_builder.h', 'brw_nir_rt_builder.h',
'brw_opt.cpp', 'brw_opt.cpp',
'brw_opt_address_reg_load.cpp',
'brw_opt_algebraic.cpp', 'brw_opt_algebraic.cpp',
'brw_opt_bank_conflicts.cpp', 'brw_opt_bank_conflicts.cpp',
'brw_opt_cmod_propagation.cpp', 'brw_opt_cmod_propagation.cpp',