diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 18683e15631..f0785046bb6 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1587,6 +1587,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
                                 unsigned desc_imm,
                                 struct brw_reg ex_desc,
                                 unsigned ex_desc_imm,
+                                bool ex_desc_scratch,
                                 bool eot);
 
 void brw_ff_sync(struct brw_codegen *p,
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index e647319613b..281094ed406 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -2746,6 +2746,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
                                 unsigned desc_imm,
                                 struct brw_reg ex_desc,
                                 unsigned ex_desc_imm,
+                                bool ex_desc_scratch,
                                 bool eot)
 {
    const struct intel_device_info *devinfo = p->devinfo;
@@ -2781,6 +2782,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
    }
 
    if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
+       !ex_desc_scratch &&
        (devinfo->ver >= 12 ||
         ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
       ex_desc.ud |= ex_desc_imm;
@@ -2807,7 +2809,16 @@ brw_send_indirect_split_message(struct brw_codegen *p,
        */
       unsigned imm_part = ex_desc_imm | sfid | eot << 5;
 
-      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
+      if (ex_desc_scratch) {
+         /* Or the scratch surface offset together with the immediate part of
+          * the extended descriptor.
+          */
+         assert(devinfo->verx10 >= 125);
+         brw_AND(p, addr,
+                 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(INTEL_MASK(31, 10)));
+         brw_OR(p, addr, addr, brw_imm_ud(imm_part));
+      } else if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
          /* ex_desc bits 15:12 don't exist in the instruction encoding prior
           * to Gfx12, so we may have fallen back to an indirect extended
           * descriptor.
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index e8c45b4e59c..62f5cb51fe1 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -335,13 +335,14 @@ fs_generator::generate_send(fs_inst *inst,
    uint32_t ex_desc_imm = inst->ex_desc |
       brw_message_ex_desc(devinfo, inst->ex_mlen);
 
-   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) {
+   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm ||
+       inst->send_ex_desc_scratch) {
       /* If we have any sort of extended descriptor, then we need SENDS. This
        * also covers the dual-payload case because ex_mlen goes in ex_desc.
        */
       brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
                                       desc, desc_imm, ex_desc, ex_desc_imm,
-                                      inst->eot);
+                                      inst->send_ex_desc_scratch, inst->eot);
       if (inst->check_tdr)
          brw_inst_set_opcode(p->isa, brw_last_inst, devinfo->ver >= 12 ?
                              BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
index 4f63f8ad50c..1b6082e5801 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -348,10 +348,15 @@ private:
    void build_interference_graph(bool allow_spilling);
    void discard_interference_graph();
 
+   fs_reg build_lane_offsets(const fs_builder &bld,
+                             uint32_t spill_offset, int ip);
+   fs_reg build_single_offset(const fs_builder &bld,
+                              uint32_t spill_offset, int ip);
+
    void emit_unspill(const fs_builder &bld, struct shader_stats *stats,
-                     fs_reg dst, uint32_t spill_offset, unsigned count);
+                     fs_reg dst, uint32_t spill_offset, unsigned count, int ip);
    void emit_spill(const fs_builder &bld, struct shader_stats *stats,
-                   fs_reg src, uint32_t spill_offset, unsigned count);
+                   fs_reg src, uint32_t spill_offset, unsigned count, int ip);
 
    void set_spill_costs();
    int choose_spill_reg();
@@ -448,6 +453,10 @@ namespace {
    unsigned
    spill_max_size(const backend_shader *s)
    {
+      /* LSC is limited to SIMD16 sends */
+      if (s->devinfo->has_lsc)
+         return 2;
+
       /* FINISHME - On Gfx7+ it should be possible to avoid this limit
        *            altogether by spilling directly from the temporary GRF
        *            allocated to hold the result of the instruction (and the
@@ -661,7 +670,7 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling)
    first_vgrf_node = node_count;
    node_count += fs->alloc.count;
    last_vgrf_node = node_count - 1;
-   if (devinfo->ver >= 9 && allow_spilling) {
+   if ((devinfo->ver >= 9 && devinfo->verx10 < 125) && allow_spilling) {
       scratch_header_node = node_count++;
    } else {
       scratch_header_node = -1;
@@ -742,11 +751,59 @@ fs_reg_alloc::discard_interference_graph()
    have_spill_costs = false;
 }
 
+fs_reg
+fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset, int ip)
+{
+   fs_reg offset = retype(alloc_spill_reg(1, ip), BRW_REGISTER_TYPE_UD);
+   fs_inst *inst = bld.MOV(offset, brw_imm_ud(spill_offset));
+   _mesa_set_add(spill_insts, inst);
+   return offset;
+}
+
+fs_reg
+fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip)
+{
+   /* LSC messages are limited to SIMD16 */
+   assert(bld.dispatch_width() <= 16);
+
+   const fs_builder ubld = bld.exec_all();
+   const unsigned reg_count = ubld.dispatch_width() / 8;
+
+   fs_reg offset = retype(alloc_spill_reg(reg_count, ip), BRW_REGISTER_TYPE_UD);
+   fs_inst *inst;
+
+   /* Build an offset per lane in SIMD8 */
+   inst = ubld.group(8, 0).MOV(retype(offset, BRW_REGISTER_TYPE_UW),
+                               brw_imm_uv(0x76543210));
+   _mesa_set_add(spill_insts, inst);
+   inst = ubld.group(8, 0).MOV(offset, retype(offset, BRW_REGISTER_TYPE_UW));
+   _mesa_set_add(spill_insts, inst);
+
+   /* Build offsets in the upper 8 lanes of SIMD16 */
+   if (ubld.dispatch_width() > 8) {
+      inst = ubld.group(8, 0).ADD(
+         byte_offset(offset, REG_SIZE),
+         byte_offset(offset, 0),
+         brw_imm_ud(8));
+      _mesa_set_add(spill_insts, inst);
+   }
+
+   /* Make the offset a dword */
+   inst = ubld.SHL(offset, offset, brw_imm_ud(2));
+   _mesa_set_add(spill_insts, inst);
+
+   /* Add the base offset */
+   inst = ubld.ADD(offset, offset, brw_imm_ud(spill_offset));
+   _mesa_set_add(spill_insts, inst);
+
+   return offset;
+}
+
 void
 fs_reg_alloc::emit_unspill(const fs_builder &bld,
                            struct shader_stats *stats,
                            fs_reg dst,
-                           uint32_t spill_offset, unsigned count)
+                           uint32_t spill_offset, unsigned count, int ip)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
    const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
@@ -757,7 +814,53 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
       ++stats->fill_count;
 
       fs_inst *unspill_inst;
-      if (devinfo->ver >= 9) {
+      if (devinfo->verx10 >= 125) {
+         /* LSC is limited to SIMD16 load/store but we can load more using
+          * transpose messages.
+          */
+         const bool use_transpose = bld.dispatch_width() > 16;
+         const fs_builder ubld = use_transpose ? bld.exec_all().group(1, 0) : bld;
+         fs_reg offset;
+         if (use_transpose) {
+            offset = build_single_offset(ubld, spill_offset, ip);
+         } else {
+            offset = build_lane_offsets(ubld, spill_offset, ip);
+         }
+         /* We leave the extended descriptor empty and flag the instruction to
+          * ask the generator to insert the extended descriptor in the address
+          * register. That way we don't need to burn an additional register
+          * for register allocation spill/fill.
+          */
+         fs_reg srcs[] = {
+            brw_imm_ud(0), /* desc */
+            brw_imm_ud(0), /* ex_desc */
+            offset,        /* payload */
+            fs_reg(),      /* payload2 */
+         };
+
+         unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
+                                  srcs, ARRAY_SIZE(srcs));
+         unspill_inst->sfid = GFX12_SFID_UGM;
+         unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
+                                           unspill_inst->exec_size,
+                                           LSC_ADDR_SURFTYPE_BSS,
+                                           LSC_ADDR_SIZE_A32,
+                                           1 /* num_coordinates */,
+                                           LSC_DATA_SIZE_D32,
+                                           use_transpose ? reg_size * 8 : 1 /* num_channels */,
+                                           use_transpose,
+                                           LSC_CACHE_LOAD_L1STATE_L3MOCS,
+                                           true /* has_dest */);
+         unspill_inst->header_size = 0;
+         unspill_inst->mlen =
+            lsc_msg_desc_src0_len(devinfo, unspill_inst->desc);
+         unspill_inst->ex_mlen = 0;
+         unspill_inst->size_written =
+            lsc_msg_desc_dest_len(devinfo, unspill_inst->desc) * REG_SIZE;
+         unspill_inst->send_has_side_effects = false;
+         unspill_inst->send_is_volatile = true;
+         unspill_inst->send_ex_desc_scratch = true;
+      } else if (devinfo->ver >= 9) {
          fs_reg header = this->scratch_header;
         fs_builder ubld = bld.exec_all().group(1, 0);
         assert(spill_offset % 16 == 0);
@@ -765,15 +868,8 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
                                  brw_imm_ud(spill_offset / 16));
          _mesa_set_add(spill_insts, unspill_inst);
 
-         unsigned bti;
-         fs_reg ex_desc;
-         if (devinfo->verx10 >= 125) {
-            bti = GFX9_BTI_BINDLESS;
-            ex_desc = component(this->scratch_header, 0);
-         } else {
-            bti = GFX8_BTI_STATELESS_NON_COHERENT;
-            ex_desc = brw_imm_ud(0);
-         }
+         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
+         const fs_reg ex_desc = brw_imm_ud(0);
 
          fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header };
          unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
@@ -815,7 +911,7 @@ void
 fs_reg_alloc::emit_spill(const fs_builder &bld,
                          struct shader_stats *stats,
                          fs_reg src,
-                         uint32_t spill_offset, unsigned count)
+                         uint32_t spill_offset, unsigned count, int ip)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
    const unsigned reg_size = src.component_size(bld.dispatch_width()) /
@@ -826,7 +922,40 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
       ++stats->spill_count;
 
       fs_inst *spill_inst;
-      if (devinfo->ver >= 9) {
+      if (devinfo->verx10 >= 125) {
+         fs_reg offset = build_lane_offsets(bld, spill_offset, ip);
+         /* We leave the extended descriptor empty and flag the instruction to
+          * relocate the extended descriptor. That way the surface offset is
+          * directly put into the instruction and we don't need to use a
+          * register to hold it.
+          */
+         fs_reg srcs[] = {
+            brw_imm_ud(0), /* desc */
+            brw_imm_ud(0), /* ex_desc */
+            offset,        /* payload */
+            src,           /* payload2 */
+         };
+         spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
+                               srcs, ARRAY_SIZE(srcs));
+         spill_inst->sfid = GFX12_SFID_UGM;
+         spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
+                                         bld.dispatch_width(),
+                                         LSC_ADDR_SURFTYPE_BSS,
+                                         LSC_ADDR_SIZE_A32,
+                                         1 /* num_coordinates */,
+                                         LSC_DATA_SIZE_D32,
+                                         1 /* num_channels */,
+                                         false /* transpose */,
+                                         LSC_CACHE_LOAD_L1STATE_L3MOCS,
+                                         false /* has_dest */);
+         spill_inst->header_size = 0;
+         spill_inst->mlen = lsc_msg_desc_src0_len(devinfo, spill_inst->desc);
+         spill_inst->ex_mlen = reg_size;
+         spill_inst->size_written = 0;
+         spill_inst->send_has_side_effects = true;
+         spill_inst->send_is_volatile = false;
+         spill_inst->send_ex_desc_scratch = true;
+      } else if (devinfo->ver >= 9) {
         fs_reg header = this->scratch_header;
         fs_builder ubld = bld.exec_all().group(1, 0);
         assert(spill_offset % 16 == 0);
@@ -834,15 +963,8 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
                               brw_imm_ud(spill_offset / 16));
          _mesa_set_add(spill_insts, spill_inst);
 
-         unsigned bti;
-         fs_reg ex_desc;
-         if (devinfo->verx10 >= 125) {
-            bti = GFX9_BTI_BINDLESS;
-            ex_desc = component(this->scratch_header, 0);
-         } else {
-            bti = GFX8_BTI_STATELESS_NON_COHERENT;
-            ex_desc = brw_imm_ud(0);
-         }
+         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
+         const fs_reg ex_desc = brw_imm_ud(0);
 
          fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src };
          spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
@@ -1033,25 +1155,16 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
     * SIMD16 mode, because we'd stomp the FB writes.
     */
    if (!fs->spilled_any_registers) {
-      if (devinfo->ver >= 9) {
+      if (devinfo->verx10 >= 125) {
+         /* We will allocate a register on the fly */
+      } else if (devinfo->ver >= 9) {
         this->scratch_header = alloc_scratch_header();
         fs_builder ubld = fs->bld.exec_all().group(8, 0).at(
            fs->cfg->first_block(), fs->cfg->first_block()->start());
 
-         fs_inst *inst;
-         if (devinfo->verx10 >= 125) {
-            inst = ubld.MOV(this->scratch_header, brw_imm_ud(0));
-            _mesa_set_add(spill_insts, inst);
-            inst = ubld.group(1, 0).AND(component(this->scratch_header, 0),
-                                        retype(brw_vec1_grf(0, 5),
-                                               BRW_REGISTER_TYPE_UD),
-                                        brw_imm_ud(INTEL_MASK(31, 10)));
-            _mesa_set_add(spill_insts, inst);
-         } else {
-            inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
-                             this->scratch_header);
-            _mesa_set_add(spill_insts, inst);
-         }
+         fs_inst *inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
+                                   this->scratch_header);
+         _mesa_set_add(spill_insts, inst);
      } else {
         bool mrf_used[BRW_MAX_MRF(devinfo->ver)];
         get_used_mrfs(fs, mrf_used);
@@ -1112,7 +1225,7 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
             * unspill destination is a block-local temporary.
             */
            emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
-                        unspill_dst, subset_spill_offset, count);
+                        unspill_dst, subset_spill_offset, count, ip);
         }
      }
 
@@ -1167,10 +1280,10 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
         if (inst->is_partial_write() ||
             (!inst->force_writemask_all && !per_channel))
            emit_unspill(ubld, &fs->shader_stats, spill_src,
-                        subset_spill_offset, regs_written(inst));
+                        subset_spill_offset, regs_written(inst), ip);
 
         emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src,
-                   subset_spill_offset, regs_written(inst));
+                   subset_spill_offset, regs_written(inst), ip);
      }
 
    for (fs_inst *inst = (fs_inst *)before->next;
diff --git a/src/intel/compiler/brw_ir.h b/src/intel/compiler/brw_ir.h
index 8db0c988ceb..33011f7299d 100644
--- a/src/intel/compiler/brw_ir.h
+++ b/src/intel/compiler/brw_ir.h
@@ -174,6 +174,10 @@ struct backend_instruction {
    bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
    bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
    bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
+   bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use
+                                 *   the scratch surface offset to build
+                                 *   the extended descriptor
+                                 */
    bool eot:1;
 
    /* Chooses which flag subregister (f0.0 to f1.1) is used for conditional
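
Note (editor's annotation, not part of the patch): a minimal scalar sketch of
the per-lane scratch address the new LSC spill/fill path builds, assuming a
SIMD16 dispatch. The helper name below is hypothetical; the real code emits
MOV/ADD/SHL instructions in build_lane_offsets(), and the surface base is
handled separately: with send_ex_desc_scratch set, the generator ANDs r0.5
with INTEL_MASK(31, 10) and ORs the immediate part into the extended
descriptor, so no extra GRF is needed to hold it.

   #include <stdint.h>

   /* What build_lane_offsets() computes for one lane: the lane index
    * (the 0x76543210 immediate, plus 8 for the upper half of SIMD16),
    * shifted left by 2 to turn a dword index into a byte offset, plus
    * the spill base offset.
    */
   static uint32_t
   lane_scratch_offset(uint32_t spill_offset, unsigned lane)
   {
      uint32_t dword_index = lane;            /* 0..15 */
      uint32_t byte_off = dword_index << 2;   /* SHL by 2 */
      return byte_off + spill_offset;         /* ADD of the base offset */
   }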