mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-23 11:10:10 +01:00
brw: move final send lowering up into the IR
Because we emit the final send message form in code generation, a lot of emissions look like this:

   add(8) vgrf0, u0, 0x100
   mov(1) a0.1, vgrf0 # emitted by the generator
   send(8) ..., a0.1

By moving address register manipulation into the IR, we can get this down to:

   add(1) a0.1, u0, 0x100
   send(8) ..., a0.1

This reduces register pressure around some send messages by 1 vgrf. All lost shaders in the below results are fragment SIMD32, due to the throughput estimator. If turned off, we lose no SIMD32 shaders with this change.

DG2 results: Assassin's Creed Valhalla: Totals from 2044 (96.87% of 2110) affected shaders: Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00% Subgroup size: 23832 -> 23824 (-0.03%) Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82% Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39% Fill count: 2005 -> 1256 (-37.36%) Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00% Max live registers: 116765 -> 115058 (-1.46%) Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67% Cyberpunk 2077: Totals from 1181 (93.43% of 1264) affected shaders: Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01% Subgroup size: 13016 -> 13032 (+0.12%) Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39% Spill count: 12 -> 8 (-33.33%) Fill count: 9 -> 6 (-33.33%) Dota2: Totals from 173 (11.59% of 1493) affected shaders: Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34% Max live registers: 5787 -> 5779 (-0.14%) Max dispatch width: 1344 -> 1152 (-14.29%) Hitman3: Totals from 5072 (95.39% of 5317) affected shaders: Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00% Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48% Spill count: 3942 -> 3200 (-18.82%) Fill count: 10158 -> 8846 (-12.92%) Scratch Memory Size: 257024 -> 223232 (-13.15%) Max live registers: 328467 -> 324631 (-1.17%) Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73% Fortnite: Totals from
360 (4.82% of 7472) affected shaders: Instrs: 778068 -> 777925 (-0.02%) Subgroup size: 3128 -> 3136 (+0.26%) Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19% Max live registers: 50689 -> 50658 (-0.06%) Hogwarts Legacy: Totals from 1376 (84.00% of 1638) affected shaders: Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03% Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12% Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36% Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23% Scratch Memory Size: 99328 -> 89088 (-10.31%) Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23% Max dispatch width: 11848 -> 11920 (+0.61%) Metro Exodus: Totals from 92 (0.21% of 43072) affected shaders: Instrs: 262995 -> 262968 (-0.01%) Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25% Max live registers: 11152 -> 11140 (-0.11%) Red Dead Redemption 2 : Totals from 451 (7.71% of 5847) affected shaders: Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00% Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00% Max live registers: 42294 -> 42185 (-0.26%) Spiderman Remastered: Totals from 6820 (98.02% of 6958) affected shaders: Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65% Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25% Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61% Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58% Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74% Max live registers: 493149 -> 487458 (-1.15%) Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20% Strange Brigade: Totals from 3769 (91.21% of 4132) affected shaders: Instrs: 1354476 -> 1321474 (-2.44%) Cycle count: 25351530 -> 25339190 (-0.05%); split: -1.64%, +1.59% Max live registers: 199057 -> 193656 (-2.71%) Max dispatch width: 30272 -> 30240 (-0.11%) Witcher 3: Totals from 25 (2.40% of 1041) affected shaders: Instrs: 24621 -> 24606 
(-0.06%) Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05% Max live registers: 1963 -> 1955 (-0.41%) LNL results: Assassin's Creed Valhalla: Totals from 1928 (98.02% of 1967) affected shaders: Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11% Subgroup size: 41264 -> 41280 (+0.04%) Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11% Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90% Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60% Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56% Max live registers: 205483 -> 202192 (-1.60%) Cyberpunk 2077: Totals from 1177 (96.40% of 1221) affected shaders: Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03% Subgroup size: 24912 -> 24944 (+0.13%) Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81% Spill count: 8 -> 3 (-62.50%) Fill count: 6 -> 3 (-50.00%) Max live registers: 126922 -> 125472 (-1.14%) Dota2: Totals from 428 (32.47% of 1318) affected shaders: Instrs: 89355 -> 89740 (+0.43%) Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55% Max live registers: 32863 -> 32847 (-0.05%) Fortnite: Totals from 5354 (81.72% of 6552) affected shaders: Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53% Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65% Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72% Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35% Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71% Hitman3: Totals from 4912 (97.09% of 5059) affected shaders: Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00% Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55% Spill count: 3739 -> 3136 (-16.13%) Fill count: 10657 -> 9564 (-10.26%) Scratch Memory Size: 373760 -> 318464 (-14.79%) Max live registers: 597566 -> 589460 (-1.36%) Hogwarts Legacy: Totals from 1471 (96.33% of 1527) affected shaders: Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05% Cycle 
count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68% Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95% Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83% Scratch Memory Size: 251904 -> 217088 (-13.82%) Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12% Metro Exodus: Totals from 18356 (49.81% of 36854) affected shaders: Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83% Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84% Spill count: 595 -> 546 (-8.24%) Fill count: 1604 -> 1408 (-12.22%) Max live registers: 2086937 -> 2086933 (-0.00%) Red Dead Redemption 2: Totals from 4171 (79.31% of 5259) affected shaders: Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83% Subgroup size: 86416 -> 86432 (+0.02%) Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53% Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59% Scratch Memory Size: 401408 -> 385024 (-4.08%) Spiderman Remastered: Totals from 6639 (98.94% of 6710) affected shaders: Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98% Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59% Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82% Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76% Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17% Max live registers: 918240 -> 906604 (-1.27%) Strange Brigade: Totals from 3675 (92.24% of 3984) affected shaders: Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00% Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09% Max live registers: 361849 -> 351265 (-2.92%) Witcher 3: Totals from 13 (46.43% of 28) affected shaders: Instrs: 593 -> 660 (+11.30%) Cycle count: 28302 -> 28714 (+1.46%) Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: 
<https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
This commit is contained in:
parent
a27d98e933
commit
8ac7802ac8
13 changed files with 320 additions and 214 deletions
|
|
@ -627,15 +627,23 @@ brw_emit_repclear_shader(fs_visitor &s)
|
||||||
|
|
||||||
write = bld.emit(SHADER_OPCODE_SEND);
|
write = bld.emit(SHADER_OPCODE_SEND);
|
||||||
write->resize_sources(3);
|
write->resize_sources(3);
|
||||||
|
|
||||||
|
/* We can use a headerless message for the first render target */
|
||||||
|
write->header_size = i == 0 ? 0 : 2;
|
||||||
|
write->mlen = 1 + write->header_size;
|
||||||
|
|
||||||
write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
|
write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
|
||||||
write->src[0] = brw_imm_ud(0);
|
write->src[0] = brw_imm_ud(
|
||||||
|
brw_fb_write_desc(
|
||||||
|
s.devinfo, i,
|
||||||
|
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
|
||||||
|
i == key->nr_color_regions - 1, false) |
|
||||||
|
brw_message_desc(s.devinfo, write->mlen,
|
||||||
|
0 /* rlen */, write->header_size));
|
||||||
write->src[1] = brw_imm_ud(0);
|
write->src[1] = brw_imm_ud(0);
|
||||||
write->src[2] = i == 0 ? color_output : header;
|
write->src[2] = i == 0 ? color_output : header;
|
||||||
write->check_tdr = true;
|
write->check_tdr = true;
|
||||||
write->send_has_side_effects = true;
|
write->send_has_side_effects = true;
|
||||||
write->desc = brw_fb_write_desc(s.devinfo, i,
|
|
||||||
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
|
|
||||||
i == key->nr_color_regions - 1, false);
|
|
||||||
|
|
||||||
/* We can use a headerless message for the first render target */
|
/* We can use a headerless message for the first render target */
|
||||||
write->header_size = i == 0 ? 0 : 2;
|
write->header_size = i == 0 ? 0 : 2;
|
||||||
|
|
|
||||||
|
|
@ -1430,7 +1430,6 @@ brw_send_indirect_message(struct brw_codegen *p,
|
||||||
struct brw_reg dst,
|
struct brw_reg dst,
|
||||||
struct brw_reg payload,
|
struct brw_reg payload,
|
||||||
struct brw_reg desc,
|
struct brw_reg desc,
|
||||||
unsigned desc_imm,
|
|
||||||
bool eot);
|
bool eot);
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
@ -1440,10 +1439,8 @@ brw_send_indirect_split_message(struct brw_codegen *p,
|
||||||
struct brw_reg payload0,
|
struct brw_reg payload0,
|
||||||
struct brw_reg payload1,
|
struct brw_reg payload1,
|
||||||
struct brw_reg desc,
|
struct brw_reg desc,
|
||||||
unsigned desc_imm,
|
|
||||||
struct brw_reg ex_desc,
|
struct brw_reg ex_desc,
|
||||||
unsigned ex_desc_imm,
|
unsigned ex_mlen,
|
||||||
bool ex_desc_scratch,
|
|
||||||
bool ex_bso,
|
bool ex_bso,
|
||||||
bool eot);
|
bool eot);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -816,6 +816,13 @@ enum ENUM_PACKED gfx10_align1_3src_exec_type {
|
||||||
#define BRW_THREAD_ATOMIC 1
|
#define BRW_THREAD_ATOMIC 1
|
||||||
#define BRW_THREAD_SWITCH 2
|
#define BRW_THREAD_SWITCH 2
|
||||||
|
|
||||||
|
/* Subregister of the address register used for particular purposes */
|
||||||
|
enum brw_address_subreg {
|
||||||
|
BRW_ADDRESS_SUBREG_INDIRECT_DESC = 0,
|
||||||
|
BRW_ADDRESS_SUBREG_INDIRECT_EX_DESC = 2,
|
||||||
|
BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC = 4,
|
||||||
|
};
|
||||||
|
|
||||||
enum ENUM_PACKED brw_vertical_stride {
|
enum ENUM_PACKED brw_vertical_stride {
|
||||||
BRW_VERTICAL_STRIDE_0 = 0,
|
BRW_VERTICAL_STRIDE_0 = 0,
|
||||||
BRW_VERTICAL_STRIDE_1 = 1,
|
BRW_VERTICAL_STRIDE_1 = 1,
|
||||||
|
|
|
||||||
|
|
@ -1438,7 +1438,6 @@ brw_send_indirect_message(struct brw_codegen *p,
|
||||||
struct brw_reg dst,
|
struct brw_reg dst,
|
||||||
struct brw_reg payload,
|
struct brw_reg payload,
|
||||||
struct brw_reg desc,
|
struct brw_reg desc,
|
||||||
unsigned desc_imm,
|
|
||||||
bool eot)
|
bool eot)
|
||||||
{
|
{
|
||||||
const struct intel_device_info *devinfo = p->devinfo;
|
const struct intel_device_info *devinfo = p->devinfo;
|
||||||
|
|
@ -1451,35 +1450,16 @@ brw_send_indirect_message(struct brw_codegen *p,
|
||||||
if (desc.file == IMM) {
|
if (desc.file == IMM) {
|
||||||
send = next_insn(p, BRW_OPCODE_SEND);
|
send = next_insn(p, BRW_OPCODE_SEND);
|
||||||
brw_set_src0(p, send, retype(payload, BRW_TYPE_UD));
|
brw_set_src0(p, send, retype(payload, BRW_TYPE_UD));
|
||||||
brw_set_desc(p, send, desc.ud | desc_imm);
|
brw_set_desc(p, send, desc.ud);
|
||||||
} else {
|
} else {
|
||||||
const struct tgl_swsb swsb = brw_get_default_swsb(p);
|
assert(desc.file == ADDRESS);
|
||||||
struct brw_reg addr = retype(brw_address_reg(0), BRW_TYPE_UD);
|
assert(desc.subnr == 0);
|
||||||
|
|
||||||
brw_push_insn_state(p);
|
|
||||||
brw_set_default_access_mode(p, BRW_ALIGN_1);
|
|
||||||
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
|
|
||||||
brw_set_default_exec_size(p, BRW_EXECUTE_1);
|
|
||||||
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
|
|
||||||
brw_set_default_flag_reg(p, 0, 0);
|
|
||||||
brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
|
|
||||||
|
|
||||||
/* Load the indirect descriptor to an address register using OR so the
|
|
||||||
* caller can specify additional descriptor bits with the desc_imm
|
|
||||||
* immediate.
|
|
||||||
*/
|
|
||||||
brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
|
|
||||||
|
|
||||||
brw_pop_insn_state(p);
|
|
||||||
|
|
||||||
brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
|
|
||||||
send = next_insn(p, BRW_OPCODE_SEND);
|
send = next_insn(p, BRW_OPCODE_SEND);
|
||||||
brw_set_src0(p, send, retype(payload, BRW_TYPE_UD));
|
brw_set_src0(p, send, retype(payload, BRW_TYPE_UD));
|
||||||
|
|
||||||
if (devinfo->ver >= 12)
|
if (devinfo->ver >= 12)
|
||||||
brw_eu_inst_set_send_sel_reg32_desc(devinfo, send, true);
|
brw_eu_inst_set_send_sel_reg32_desc(devinfo, send, true);
|
||||||
else
|
else
|
||||||
brw_set_src1(p, send, addr);
|
brw_set_src1(p, send, desc);
|
||||||
}
|
}
|
||||||
|
|
||||||
brw_set_dest(p, send, dst);
|
brw_set_dest(p, send, dst);
|
||||||
|
|
@ -1494,10 +1474,8 @@ brw_send_indirect_split_message(struct brw_codegen *p,
|
||||||
struct brw_reg payload0,
|
struct brw_reg payload0,
|
||||||
struct brw_reg payload1,
|
struct brw_reg payload1,
|
||||||
struct brw_reg desc,
|
struct brw_reg desc,
|
||||||
unsigned desc_imm,
|
|
||||||
struct brw_reg ex_desc,
|
struct brw_reg ex_desc,
|
||||||
unsigned ex_desc_imm,
|
unsigned ex_mlen,
|
||||||
bool ex_desc_scratch,
|
|
||||||
bool ex_bso,
|
bool ex_bso,
|
||||||
bool eot)
|
bool eot)
|
||||||
{
|
{
|
||||||
|
|
@ -1508,105 +1486,6 @@ brw_send_indirect_split_message(struct brw_codegen *p,
|
||||||
|
|
||||||
assert(desc.type == BRW_TYPE_UD);
|
assert(desc.type == BRW_TYPE_UD);
|
||||||
|
|
||||||
if (desc.file == IMM) {
|
|
||||||
desc.ud |= desc_imm;
|
|
||||||
} else {
|
|
||||||
const struct tgl_swsb swsb = brw_get_default_swsb(p);
|
|
||||||
struct brw_reg addr = retype(brw_address_reg(0), BRW_TYPE_UD);
|
|
||||||
|
|
||||||
brw_push_insn_state(p);
|
|
||||||
brw_set_default_access_mode(p, BRW_ALIGN_1);
|
|
||||||
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
|
|
||||||
brw_set_default_exec_size(p, BRW_EXECUTE_1);
|
|
||||||
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
|
|
||||||
brw_set_default_flag_reg(p, 0, 0);
|
|
||||||
brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
|
|
||||||
|
|
||||||
/* Load the indirect descriptor to an address register using OR so the
|
|
||||||
* caller can specify additional descriptor bits with the desc_imm
|
|
||||||
* immediate.
|
|
||||||
*/
|
|
||||||
brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
|
|
||||||
|
|
||||||
brw_pop_insn_state(p);
|
|
||||||
desc = addr;
|
|
||||||
|
|
||||||
brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ex_desc.file == IMM &&
|
|
||||||
!ex_desc_scratch &&
|
|
||||||
(devinfo->ver >= 12 ||
|
|
||||||
((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
|
|
||||||
/* ATS-M PRMs, Volume 2d: Command Reference: Structures,
|
|
||||||
* EU_INSTRUCTION_SEND instruction
|
|
||||||
*
|
|
||||||
* "ExBSO: Exists If: ([ExDesc.IsReg]==true)"
|
|
||||||
*/
|
|
||||||
assert(!ex_bso);
|
|
||||||
ex_desc.ud |= ex_desc_imm;
|
|
||||||
} else {
|
|
||||||
const struct tgl_swsb swsb = brw_get_default_swsb(p);
|
|
||||||
struct brw_reg addr = retype(brw_address_reg(2), BRW_TYPE_UD);
|
|
||||||
|
|
||||||
/* On Xe2+ ExBSO addressing is implicitly enabled for the UGM
|
|
||||||
* shared function.
|
|
||||||
*/
|
|
||||||
ex_bso |= (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM);
|
|
||||||
|
|
||||||
brw_push_insn_state(p);
|
|
||||||
brw_set_default_access_mode(p, BRW_ALIGN_1);
|
|
||||||
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
|
|
||||||
brw_set_default_exec_size(p, BRW_EXECUTE_1);
|
|
||||||
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
|
|
||||||
brw_set_default_flag_reg(p, 0, 0);
|
|
||||||
brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
|
|
||||||
|
|
||||||
/* Load the indirect extended descriptor to an address register using OR
|
|
||||||
* so the caller can specify additional descriptor bits with the
|
|
||||||
* desc_imm immediate.
|
|
||||||
*
|
|
||||||
* Even though the instruction dispatcher always pulls the SFID and EOT
|
|
||||||
* fields from the instruction itself, actual external unit which
|
|
||||||
* processes the message gets the SFID and EOT from the extended
|
|
||||||
* descriptor which comes from the address register. If we don't OR
|
|
||||||
* those two bits in, the external unit may get confused and hang.
|
|
||||||
*/
|
|
||||||
unsigned imm_part = ex_bso ? 0 : (ex_desc_imm | sfid | eot << 5);
|
|
||||||
|
|
||||||
if (ex_desc_scratch) {
|
|
||||||
assert(devinfo->verx10 >= 125);
|
|
||||||
brw_AND(p, addr,
|
|
||||||
retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
|
|
||||||
brw_imm_ud(INTEL_MASK(31, 10)));
|
|
||||||
|
|
||||||
if (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM) {
|
|
||||||
const unsigned ex_mlen = brw_message_ex_desc_ex_mlen(devinfo, ex_desc_imm);
|
|
||||||
assert(ex_desc_imm == brw_message_ex_desc(devinfo, ex_mlen));
|
|
||||||
brw_SHR(p, addr, addr, brw_imm_ud(4));
|
|
||||||
} else {
|
|
||||||
/* Or the scratch surface offset together with the immediate part
|
|
||||||
* of the extended descriptor.
|
|
||||||
*/
|
|
||||||
brw_OR(p, addr, addr, brw_imm_ud(imm_part));
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if (ex_desc.file == IMM) {
|
|
||||||
/* ex_desc bits 15:12 don't exist in the instruction encoding prior
|
|
||||||
* to Gfx12, so we may have fallen back to an indirect extended
|
|
||||||
* descriptor.
|
|
||||||
*/
|
|
||||||
brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
|
|
||||||
} else {
|
|
||||||
brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
|
|
||||||
}
|
|
||||||
|
|
||||||
brw_pop_insn_state(p);
|
|
||||||
ex_desc = addr;
|
|
||||||
|
|
||||||
brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
|
send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
|
||||||
brw_set_dest(p, send, dst);
|
brw_set_dest(p, send, dst);
|
||||||
brw_set_src0(p, send, retype(payload0, BRW_TYPE_UD));
|
brw_set_src0(p, send, retype(payload0, BRW_TYPE_UD));
|
||||||
|
|
@ -1630,11 +1509,9 @@ brw_send_indirect_split_message(struct brw_codegen *p,
|
||||||
brw_eu_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
|
brw_eu_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
|
||||||
brw_eu_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2);
|
brw_eu_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2);
|
||||||
|
|
||||||
if (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM) {
|
if (devinfo->ver >= 20 && sfid == GFX12_SFID_UGM)
|
||||||
const unsigned ex_mlen = brw_message_ex_desc_ex_mlen(devinfo, ex_desc_imm);
|
|
||||||
brw_eu_inst_set_bits(send, 103, 99, ex_mlen / reg_unit(devinfo));
|
brw_eu_inst_set_bits(send, 103, 99, ex_mlen / reg_unit(devinfo));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (ex_bso) {
|
if (ex_bso) {
|
||||||
/* The send instruction ExBSO field does not exist with UGM on Gfx20+,
|
/* The send instruction ExBSO field does not exist with UGM on Gfx20+,
|
||||||
|
|
@ -1644,7 +1521,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
|
||||||
*/
|
*/
|
||||||
if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM)
|
if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM)
|
||||||
brw_eu_inst_set_send_ex_bso(devinfo, send, true);
|
brw_eu_inst_set_send_ex_bso(devinfo, send, true);
|
||||||
brw_eu_inst_set_send_src1_len(devinfo, send, GET_BITS(ex_desc_imm, 10, 6));
|
brw_eu_inst_set_send_src1_len(devinfo, send, ex_mlen / reg_unit(devinfo));
|
||||||
}
|
}
|
||||||
brw_eu_inst_set_sfid(devinfo, send, sfid);
|
brw_eu_inst_set_sfid(devinfo, send, sfid);
|
||||||
brw_eu_inst_set_eot(devinfo, send, eot);
|
brw_eu_inst_set_eot(devinfo, send, eot);
|
||||||
|
|
|
||||||
|
|
@ -958,7 +958,7 @@ fs_visitor::assign_curb_setup()
|
||||||
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
|
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
|
||||||
|
|
||||||
send->sfid = GFX12_SFID_UGM;
|
send->sfid = GFX12_SFID_UGM;
|
||||||
send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
||||||
LSC_ADDR_SURFTYPE_FLAT,
|
LSC_ADDR_SURFTYPE_FLAT,
|
||||||
LSC_ADDR_SIZE_A32,
|
LSC_ADDR_SIZE_A32,
|
||||||
LSC_DATA_SIZE_D32,
|
LSC_DATA_SIZE_D32,
|
||||||
|
|
@ -971,6 +971,12 @@ fs_visitor::assign_curb_setup()
|
||||||
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
|
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
|
||||||
send->send_is_volatile = true;
|
send->send_is_volatile = true;
|
||||||
|
|
||||||
|
send->src[0] = brw_imm_ud(desc |
|
||||||
|
brw_message_desc(devinfo,
|
||||||
|
send->mlen,
|
||||||
|
send->size_written / REG_SIZE,
|
||||||
|
send->header_size));
|
||||||
|
|
||||||
i += num_regs;
|
i += num_regs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -635,6 +635,7 @@ bool brw_lower_pack(fs_visitor &s);
|
||||||
bool brw_lower_regioning(fs_visitor &s);
|
bool brw_lower_regioning(fs_visitor &s);
|
||||||
bool brw_lower_scalar_fp64_MAD(fs_visitor &s);
|
bool brw_lower_scalar_fp64_MAD(fs_visitor &s);
|
||||||
bool brw_lower_scoreboard(fs_visitor &s);
|
bool brw_lower_scoreboard(fs_visitor &s);
|
||||||
|
bool brw_lower_send_descriptors(fs_visitor &s);
|
||||||
bool brw_lower_sends_overlapping_payload(fs_visitor &s);
|
bool brw_lower_sends_overlapping_payload(fs_visitor &s);
|
||||||
bool brw_lower_simd_width(fs_visitor &s);
|
bool brw_lower_simd_width(fs_visitor &s);
|
||||||
bool brw_lower_sub_sat(fs_visitor &s);
|
bool brw_lower_sub_sat(fs_visitor &s);
|
||||||
|
|
@ -642,6 +643,7 @@ bool brw_lower_subgroup_ops(fs_visitor &s);
|
||||||
bool brw_lower_uniform_pull_constant_loads(fs_visitor &s);
|
bool brw_lower_uniform_pull_constant_loads(fs_visitor &s);
|
||||||
void brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s);
|
void brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s);
|
||||||
|
|
||||||
|
bool brw_opt_address_reg_load(fs_visitor &s);
|
||||||
bool brw_opt_algebraic(fs_visitor &s);
|
bool brw_opt_algebraic(fs_visitor &s);
|
||||||
bool brw_opt_bank_conflicts(fs_visitor &s);
|
bool brw_opt_bank_conflicts(fs_visitor &s);
|
||||||
bool brw_opt_cmod_propagation(fs_visitor &s);
|
bool brw_opt_cmod_propagation(fs_visitor &s);
|
||||||
|
|
|
||||||
|
|
@ -167,31 +167,20 @@ fs_generator::generate_send(fs_inst *inst,
|
||||||
struct brw_reg payload,
|
struct brw_reg payload,
|
||||||
struct brw_reg payload2)
|
struct brw_reg payload2)
|
||||||
{
|
{
|
||||||
const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;
|
if (ex_desc.file == IMM && ex_desc.ud == 0) {
|
||||||
|
brw_send_indirect_message(p, inst->sfid, dst, payload, desc, inst->eot);
|
||||||
uint32_t desc_imm = inst->desc |
|
if (inst->check_tdr)
|
||||||
brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
|
brw_eu_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC);
|
||||||
|
} else {
|
||||||
uint32_t ex_desc_imm = inst->ex_desc |
|
|
||||||
brw_message_ex_desc(devinfo, inst->ex_mlen);
|
|
||||||
|
|
||||||
if (ex_desc.file != IMM || ex_desc.ud || ex_desc_imm ||
|
|
||||||
inst->send_ex_desc_scratch) {
|
|
||||||
/* If we have any sort of extended descriptor, then we need SENDS. This
|
/* If we have any sort of extended descriptor, then we need SENDS. This
|
||||||
* also covers the dual-payload case because ex_mlen goes in ex_desc.
|
* also covers the dual-payload case because ex_mlen goes in ex_desc.
|
||||||
*/
|
*/
|
||||||
brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
|
brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
|
||||||
desc, desc_imm, ex_desc, ex_desc_imm,
|
desc, ex_desc, inst->ex_mlen,
|
||||||
inst->send_ex_desc_scratch,
|
|
||||||
inst->send_ex_bso, inst->eot);
|
inst->send_ex_bso, inst->eot);
|
||||||
if (inst->check_tdr)
|
if (inst->check_tdr)
|
||||||
brw_eu_inst_set_opcode(p->isa, brw_last_inst,
|
brw_eu_inst_set_opcode(p->isa, brw_last_inst,
|
||||||
devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
|
devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
|
||||||
} else {
|
|
||||||
brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
|
|
||||||
inst->eot);
|
|
||||||
if (inst->check_tdr)
|
|
||||||
brw_eu_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -222,12 +222,6 @@ void fs_visitor::calculate_payload_ranges(bool allow_spilling,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The generator implicitly uses g0 to construct extended message
|
|
||||||
* descriptors for scratch send messages when this bit is set.
|
|
||||||
*/
|
|
||||||
if (inst->send_ex_desc_scratch)
|
|
||||||
payload_last_use_ip[0] = use_ip;
|
|
||||||
|
|
||||||
ip++;
|
ip++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -294,6 +288,8 @@ private:
|
||||||
|
|
||||||
void build_interference_graph(bool allow_spilling);
|
void build_interference_graph(bool allow_spilling);
|
||||||
|
|
||||||
|
brw_reg build_ex_desc(const fs_builder &bld, unsigned reg_size, bool unspill);
|
||||||
|
|
||||||
brw_reg build_lane_offsets(const fs_builder &bld,
|
brw_reg build_lane_offsets(const fs_builder &bld,
|
||||||
uint32_t spill_offset, int ip);
|
uint32_t spill_offset, int ip);
|
||||||
brw_reg build_single_offset(const fs_builder &bld,
|
brw_reg build_single_offset(const fs_builder &bld,
|
||||||
|
|
@ -689,6 +685,44 @@ fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset,
|
||||||
return offset;
|
return offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
brw_reg
|
||||||
|
fs_reg_alloc::build_ex_desc(const fs_builder &bld, unsigned reg_size, bool unspill)
|
||||||
|
{
|
||||||
|
/* Use a different area of the address register than what is used in
|
||||||
|
* brw_lower_logical_sends.c (brw_address_reg(2)) so we don't have
|
||||||
|
* interactions between the spill/fill instructions and the other send
|
||||||
|
* messages.
|
||||||
|
*/
|
||||||
|
brw_reg ex_desc = bld.vaddr(BRW_TYPE_UD,
|
||||||
|
BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC);
|
||||||
|
fs_inst *inst = bld.exec_all().group(1, 0).AND(
|
||||||
|
ex_desc,
|
||||||
|
retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
|
||||||
|
brw_imm_ud(INTEL_MASK(31, 10)));
|
||||||
|
_mesa_set_add(spill_insts, inst);
|
||||||
|
|
||||||
|
const intel_device_info *devinfo = bld.shader->devinfo;
|
||||||
|
if (devinfo->verx10 >= 200) {
|
||||||
|
inst = bld.exec_all().group(1, 0).SHR(
|
||||||
|
ex_desc, ex_desc, brw_imm_ud(4));
|
||||||
|
_mesa_set_add(spill_insts, inst);
|
||||||
|
} else {
|
||||||
|
if (unspill) {
|
||||||
|
inst = bld.exec_all().group(1, 0).OR(
|
||||||
|
ex_desc, ex_desc, brw_imm_ud(GFX12_SFID_UGM));
|
||||||
|
_mesa_set_add(spill_insts, inst);
|
||||||
|
} else {
|
||||||
|
inst = bld.exec_all().group(1, 0).OR(
|
||||||
|
ex_desc, ex_desc,
|
||||||
|
brw_imm_ud(brw_message_ex_desc(devinfo, reg_size) |
|
||||||
|
GFX12_SFID_UGM));
|
||||||
|
_mesa_set_add(spill_insts, inst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ex_desc;
|
||||||
|
}
|
||||||
|
|
||||||
brw_reg
|
brw_reg
|
||||||
fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip)
|
fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip)
|
||||||
{
|
{
|
||||||
|
|
@ -782,28 +816,26 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
|
||||||
} else {
|
} else {
|
||||||
offset = build_lane_offsets(ubld, spill_offset, ip);
|
offset = build_lane_offsets(ubld, spill_offset, ip);
|
||||||
}
|
}
|
||||||
/* We leave the extended descriptor empty and flag the instruction to
|
|
||||||
* ask the generated to insert the extended descriptor in the address
|
|
||||||
* register. That way we don't need to burn an additional register
|
|
||||||
* for register allocation spill/fill.
|
|
||||||
*/
|
|
||||||
brw_reg srcs[] = {
|
brw_reg srcs[] = {
|
||||||
brw_imm_ud(0), /* desc */
|
brw_imm_ud(0), /* desc */
|
||||||
brw_imm_ud(0), /* ex_desc */
|
build_ex_desc(bld, reg_size, true),
|
||||||
offset, /* payload */
|
offset, /* payload */
|
||||||
brw_reg(), /* payload2 */
|
brw_reg(), /* payload2 */
|
||||||
};
|
};
|
||||||
|
|
||||||
unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
|
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
||||||
srcs, ARRAY_SIZE(srcs));
|
|
||||||
unspill_inst->sfid = GFX12_SFID_UGM;
|
|
||||||
unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
|
||||||
LSC_ADDR_SURFTYPE_SS,
|
LSC_ADDR_SURFTYPE_SS,
|
||||||
LSC_ADDR_SIZE_A32,
|
LSC_ADDR_SIZE_A32,
|
||||||
LSC_DATA_SIZE_D32,
|
LSC_DATA_SIZE_D32,
|
||||||
use_transpose ? reg_size * 8 : 1 /* num_channels */,
|
use_transpose ? reg_size * 8 : 1 /* num_channels */,
|
||||||
use_transpose,
|
use_transpose,
|
||||||
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
|
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
|
||||||
|
|
||||||
|
|
||||||
|
unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
|
||||||
|
srcs, ARRAY_SIZE(srcs));
|
||||||
|
unspill_inst->sfid = GFX12_SFID_UGM;
|
||||||
unspill_inst->header_size = 0;
|
unspill_inst->header_size = 0;
|
||||||
unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
|
unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
|
||||||
unspill_inst->exec_size);
|
unspill_inst->exec_size);
|
||||||
|
|
@ -812,14 +844,23 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
|
||||||
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
|
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
|
||||||
unspill_inst->send_has_side_effects = false;
|
unspill_inst->send_has_side_effects = false;
|
||||||
unspill_inst->send_is_volatile = true;
|
unspill_inst->send_is_volatile = true;
|
||||||
unspill_inst->send_ex_desc_scratch = true;
|
|
||||||
|
unspill_inst->src[0] = brw_imm_ud(
|
||||||
|
desc |
|
||||||
|
brw_message_desc(devinfo,
|
||||||
|
unspill_inst->mlen,
|
||||||
|
unspill_inst->size_written / REG_SIZE,
|
||||||
|
unspill_inst->header_size));
|
||||||
} else {
|
} else {
|
||||||
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
|
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
|
||||||
|
|
||||||
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
|
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
|
||||||
const brw_reg ex_desc = brw_imm_ud(0);
|
|
||||||
|
|
||||||
brw_reg srcs[] = { brw_imm_ud(0), ex_desc, header };
|
brw_reg srcs[] = {
|
||||||
|
brw_imm_ud(0), /* desc */
|
||||||
|
brw_imm_ud(0), /* ex_desc */
|
||||||
|
header
|
||||||
|
};
|
||||||
unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
|
unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
|
||||||
srcs, ARRAY_SIZE(srcs));
|
srcs, ARRAY_SIZE(srcs));
|
||||||
unspill_inst->mlen = 1;
|
unspill_inst->mlen = 1;
|
||||||
|
|
@ -828,10 +869,15 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
|
||||||
unspill_inst->send_has_side_effects = false;
|
unspill_inst->send_has_side_effects = false;
|
||||||
unspill_inst->send_is_volatile = true;
|
unspill_inst->send_is_volatile = true;
|
||||||
unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
|
unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
|
||||||
unspill_inst->desc =
|
|
||||||
|
unspill_inst->src[0] = brw_imm_ud(
|
||||||
brw_dp_desc(devinfo, bti,
|
brw_dp_desc(devinfo, bti,
|
||||||
BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
|
BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
|
||||||
BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
|
BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)) |
|
||||||
|
brw_message_desc(devinfo,
|
||||||
|
unspill_inst->mlen,
|
||||||
|
unspill_inst->size_written / REG_SIZE,
|
||||||
|
unspill_inst->header_size));
|
||||||
}
|
}
|
||||||
_mesa_set_add(spill_insts, unspill_inst);
|
_mesa_set_add(spill_insts, unspill_inst);
|
||||||
assert(unspill_inst->force_writemask_all || count % reg_size == 0);
|
assert(unspill_inst->force_writemask_all || count % reg_size == 0);
|
||||||
|
|
@ -857,21 +903,17 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
|
||||||
fs_inst *spill_inst;
|
fs_inst *spill_inst;
|
||||||
if (devinfo->verx10 >= 125) {
|
if (devinfo->verx10 >= 125) {
|
||||||
brw_reg offset = build_lane_offsets(bld, spill_offset, ip);
|
brw_reg offset = build_lane_offsets(bld, spill_offset, ip);
|
||||||
/* We leave the extended descriptor empty and flag the instruction
|
|
||||||
* relocate the extended descriptor. That way the surface offset is
|
|
||||||
* directly put into the instruction and we don't need to use a
|
|
||||||
* register to hold it.
|
|
||||||
*/
|
|
||||||
brw_reg srcs[] = {
|
brw_reg srcs[] = {
|
||||||
brw_imm_ud(0), /* desc */
|
brw_imm_ud(0), /* desc */
|
||||||
brw_imm_ud(0), /* ex_desc */
|
build_ex_desc(bld, reg_size, false),
|
||||||
offset, /* payload */
|
offset, /* payload */
|
||||||
src, /* payload2 */
|
src, /* payload2 */
|
||||||
};
|
};
|
||||||
spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
|
spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
|
||||||
srcs, ARRAY_SIZE(srcs));
|
srcs, ARRAY_SIZE(srcs));
|
||||||
spill_inst->sfid = GFX12_SFID_UGM;
|
spill_inst->sfid = GFX12_SFID_UGM;
|
||||||
spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
|
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
|
||||||
LSC_ADDR_SURFTYPE_SS,
|
LSC_ADDR_SURFTYPE_SS,
|
||||||
LSC_ADDR_SIZE_A32,
|
LSC_ADDR_SIZE_A32,
|
||||||
LSC_DATA_SIZE_D32,
|
LSC_DATA_SIZE_D32,
|
||||||
|
|
@ -885,14 +927,23 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
|
||||||
spill_inst->size_written = 0;
|
spill_inst->size_written = 0;
|
||||||
spill_inst->send_has_side_effects = true;
|
spill_inst->send_has_side_effects = true;
|
||||||
spill_inst->send_is_volatile = false;
|
spill_inst->send_is_volatile = false;
|
||||||
spill_inst->send_ex_desc_scratch = true;
|
|
||||||
|
spill_inst->src[0] = brw_imm_ud(
|
||||||
|
desc |
|
||||||
|
brw_message_desc(devinfo,
|
||||||
|
spill_inst->mlen,
|
||||||
|
spill_inst->size_written / REG_SIZE,
|
||||||
|
spill_inst->header_size));
|
||||||
} else {
|
} else {
|
||||||
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
|
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
|
||||||
|
|
||||||
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
|
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
|
||||||
const brw_reg ex_desc = brw_imm_ud(0);
|
brw_reg srcs[] = {
|
||||||
|
brw_imm_ud(0), /* desc */
|
||||||
brw_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src };
|
brw_imm_ud(0), /* ex_desc */
|
||||||
|
header,
|
||||||
|
src
|
||||||
|
};
|
||||||
spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
|
spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
|
||||||
srcs, ARRAY_SIZE(srcs));
|
srcs, ARRAY_SIZE(srcs));
|
||||||
spill_inst->mlen = 1;
|
spill_inst->mlen = 1;
|
||||||
|
|
@ -902,10 +953,17 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
|
||||||
spill_inst->send_has_side_effects = true;
|
spill_inst->send_has_side_effects = true;
|
||||||
spill_inst->send_is_volatile = false;
|
spill_inst->send_is_volatile = false;
|
||||||
spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
|
spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
|
||||||
spill_inst->desc =
|
|
||||||
|
spill_inst->src[0] = brw_imm_ud(
|
||||||
brw_dp_desc(devinfo, bti,
|
brw_dp_desc(devinfo, bti,
|
||||||
GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
|
GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
|
||||||
BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
|
BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8)) |
|
||||||
|
brw_message_desc(devinfo,
|
||||||
|
spill_inst->mlen,
|
||||||
|
spill_inst->size_written / REG_SIZE,
|
||||||
|
spill_inst->header_size));
|
||||||
|
spill_inst->src[1] = brw_imm_ud(
|
||||||
|
brw_message_ex_desc(devinfo, spill_inst->ex_mlen));
|
||||||
}
|
}
|
||||||
_mesa_set_add(spill_insts, spill_inst);
|
_mesa_set_add(spill_insts, spill_inst);
|
||||||
assert(spill_inst->force_writemask_all || count % reg_size == 0);
|
assert(spill_inst->force_writemask_all || count % reg_size == 0);
|
||||||
|
|
|
||||||
|
|
@ -193,10 +193,6 @@ public:
|
||||||
bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
|
bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
|
||||||
bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
|
bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
|
||||||
bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
|
bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
|
||||||
bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use
|
|
||||||
* the scratch surface offset to build
|
|
||||||
* extended descriptor
|
|
||||||
*/
|
|
||||||
bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended
|
bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended
|
||||||
* bindless surface offset (26bits instead of
|
* bindless surface offset (26bits instead of
|
||||||
* 20bits)
|
* 20bits)
|
||||||
|
|
|
||||||
|
|
@ -1389,6 +1389,9 @@ setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst,
|
||||||
* we can use the surface handle directly as the extended descriptor.
|
* we can use the surface handle directly as the extended descriptor.
|
||||||
*/
|
*/
|
||||||
inst->src[1] = retype(surface, BRW_TYPE_UD);
|
inst->src[1] = retype(surface, BRW_TYPE_UD);
|
||||||
|
/* Gfx20+ assumes ExBSO with UGM */
|
||||||
|
if (devinfo->ver >= 20 && inst->sfid == GFX12_SFID_UGM)
|
||||||
|
inst->send_ex_bso = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case LSC_ADDR_SURFTYPE_BTI:
|
case LSC_ADDR_SURFTYPE_BTI:
|
||||||
|
|
@ -2610,3 +2613,80 @@ brw_lower_uniform_pull_constant_loads(fs_visitor &s)
|
||||||
|
|
||||||
return progress;
|
return progress;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
brw_lower_send_descriptors(fs_visitor &s)
|
||||||
|
{
|
||||||
|
const intel_device_info *devinfo = s.devinfo;
|
||||||
|
bool progress = false;
|
||||||
|
|
||||||
|
foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
|
||||||
|
if (inst->opcode != SHADER_OPCODE_SEND)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);
|
||||||
|
|
||||||
|
/* Descriptor */
|
||||||
|
const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;
|
||||||
|
uint32_t desc_imm = inst->desc |
|
||||||
|
brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
|
||||||
|
|
||||||
|
assert(inst->src[0].file != BAD_FILE);
|
||||||
|
assert(inst->src[1].file != BAD_FILE);
|
||||||
|
|
||||||
|
brw_reg desc = inst->src[0];
|
||||||
|
if (desc.file == IMM) {
|
||||||
|
inst->src[0] = brw_imm_ud(desc.ud | desc_imm);
|
||||||
|
} else {
|
||||||
|
brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
|
||||||
|
BRW_ADDRESS_SUBREG_INDIRECT_DESC);
|
||||||
|
ubld.OR(addr_reg, desc, brw_imm_ud(desc_imm));
|
||||||
|
inst->src[0] = addr_reg;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Extended descriptor */
|
||||||
|
brw_reg ex_desc = inst->src[1];
|
||||||
|
uint32_t ex_desc_imm = inst->ex_desc |
|
||||||
|
brw_message_ex_desc(devinfo, inst->ex_mlen);
|
||||||
|
|
||||||
|
if (ex_desc.file == IMM)
|
||||||
|
ex_desc_imm |= ex_desc.ud;
|
||||||
|
|
||||||
|
bool needs_addr_reg = false;
|
||||||
|
if (ex_desc.file != IMM)
|
||||||
|
needs_addr_reg = true;
|
||||||
|
if (devinfo->ver < 12 && ex_desc.file == IMM &&
|
||||||
|
(ex_desc_imm & INTEL_MASK(15, 12)) != 0)
|
||||||
|
needs_addr_reg = true;
|
||||||
|
|
||||||
|
if (inst->send_ex_bso) {
|
||||||
|
needs_addr_reg = true;
|
||||||
|
/* When using the extended bindless offset, the whole extended
|
||||||
|
* descriptor is the surface handle.
|
||||||
|
*/
|
||||||
|
ex_desc_imm = 0;
|
||||||
|
} else {
|
||||||
|
if (needs_addr_reg)
|
||||||
|
ex_desc_imm |= inst->sfid | inst->eot << 5;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (needs_addr_reg) {
|
||||||
|
brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
|
||||||
|
BRW_ADDRESS_SUBREG_INDIRECT_EX_DESC);
|
||||||
|
if (ex_desc.file == IMM)
|
||||||
|
ubld.MOV(addr_reg, brw_imm_ud(ex_desc_imm));
|
||||||
|
else if (ex_desc_imm == 0)
|
||||||
|
ubld.MOV(addr_reg, ex_desc);
|
||||||
|
else
|
||||||
|
ubld.OR(addr_reg, ex_desc, brw_imm_ud(ex_desc_imm));
|
||||||
|
inst->src[1] = addr_reg;
|
||||||
|
} else {
|
||||||
|
inst->src[1] = brw_imm_ud(ex_desc_imm);
|
||||||
|
}
|
||||||
|
|
||||||
|
progress = true;
|
||||||
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||||||
|
}
|
||||||
|
|
||||||
|
return progress;
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -175,10 +175,20 @@ brw_optimize(fs_visitor &s)
|
||||||
if (progress)
|
if (progress)
|
||||||
OPT(brw_lower_simd_width);
|
OPT(brw_lower_simd_width);
|
||||||
|
|
||||||
OPT(brw_lower_sends_overlapping_payload);
|
|
||||||
|
|
||||||
OPT(brw_lower_uniform_pull_constant_loads);
|
OPT(brw_lower_uniform_pull_constant_loads);
|
||||||
|
|
||||||
|
if (OPT(brw_lower_send_descriptors)) {
|
||||||
|
/* No need for standard copy_propagation since
|
||||||
|
* brw_opt_address_reg_load will only optimize defs.
|
||||||
|
*/
|
||||||
|
if (OPT(brw_opt_copy_propagation_defs))
|
||||||
|
OPT(brw_opt_algebraic);
|
||||||
|
OPT(brw_opt_address_reg_load);
|
||||||
|
OPT(brw_opt_dead_code_eliminate);
|
||||||
|
}
|
||||||
|
|
||||||
|
OPT(brw_lower_sends_overlapping_payload);
|
||||||
|
|
||||||
OPT(brw_lower_indirect_mov);
|
OPT(brw_lower_indirect_mov);
|
||||||
|
|
||||||
OPT(brw_lower_find_live_channel);
|
OPT(brw_lower_find_live_channel);
|
||||||
|
|
|
||||||
75
src/intel/compiler/brw_opt_address_reg_load.cpp
Normal file
75
src/intel/compiler/brw_opt_address_reg_load.cpp
Normal file
|
|
@ -0,0 +1,75 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2024 Intel Corporation
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "brw_fs.h"
|
||||||
|
#include "brw_fs_builder.h"
|
||||||
|
#include "brw_cfg.h"
|
||||||
|
#include "brw_eu.h"
|
||||||
|
|
||||||
|
/** @file brw_opt_address_reg_load.cpp
|
||||||
|
*
|
||||||
|
* Turn this sequence :
|
||||||
|
*
|
||||||
|
* add(8) vgrf64:UD, vgrf63:UD, 192u
|
||||||
|
* mov(1) a0.4:UD, vgrf64+0.0<0>:UD
|
||||||
|
*
|
||||||
|
* into :
|
||||||
|
*
|
||||||
|
* add(1) a0.4:UD, vgrf63+0.0<0>:UD, 192u
|
||||||
|
*/
|
||||||
|
|
||||||
|
using namespace brw;
|
||||||
|
|
||||||
|
static bool
|
||||||
|
opt_address_reg_load_local(fs_visitor &s, bblock_t *block, const brw::def_analysis &defs)
|
||||||
|
{
|
||||||
|
bool progress = false;
|
||||||
|
|
||||||
|
foreach_inst_in_block_safe(fs_inst, inst, block) {
|
||||||
|
if (!inst->dst.is_address() || inst->opcode != BRW_OPCODE_MOV)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
fs_inst *src_inst = defs.get(inst->src[0]);
|
||||||
|
if (src_inst == NULL)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (src_inst->uses_address_register_implicitly() ||
|
||||||
|
src_inst->sources > 2)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
fs_builder ubld = fs_builder(&s).at(block, inst).exec_all().group(1, 0);
|
||||||
|
brw_reg sources[3];
|
||||||
|
for (unsigned i = 0; i < src_inst->sources; i++) {
|
||||||
|
sources[i] = inst->src[i].file == VGRF ? component(src_inst->src[i], 0) : src_inst->src[i];
|
||||||
|
}
|
||||||
|
ubld.emit(src_inst->opcode, inst->dst, sources, src_inst->sources);
|
||||||
|
|
||||||
|
inst->remove(block);
|
||||||
|
|
||||||
|
progress = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return progress;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
brw_opt_address_reg_load(fs_visitor &s)
|
||||||
|
{
|
||||||
|
bool progress = false;
|
||||||
|
const brw::def_analysis &defs = s.def_analysis.require();
|
||||||
|
|
||||||
|
foreach_block(block, s.cfg) {
|
||||||
|
foreach_inst_in_block_safe(fs_inst, inst, block) {
|
||||||
|
progress = opt_address_reg_load_local(s, block, defs) || progress;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (progress) {
|
||||||
|
s.cfg->adjust_block_ips();
|
||||||
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||||
|
}
|
||||||
|
|
||||||
|
return progress;
|
||||||
|
}
|
||||||
|
|
@ -88,6 +88,7 @@ libintel_compiler_brw_files = files(
|
||||||
'brw_nir_rt.c',
|
'brw_nir_rt.c',
|
||||||
'brw_nir_rt_builder.h',
|
'brw_nir_rt_builder.h',
|
||||||
'brw_opt.cpp',
|
'brw_opt.cpp',
|
||||||
|
'brw_opt_address_reg_load.cpp',
|
||||||
'brw_opt_algebraic.cpp',
|
'brw_opt_algebraic.cpp',
|
||||||
'brw_opt_bank_conflicts.cpp',
|
'brw_opt_bank_conflicts.cpp',
|
||||||
'brw_opt_cmod_propagation.cpp',
|
'brw_opt_cmod_propagation.cpp',
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue