diff --git a/src/intel/compiler/brw/brw_builder.h b/src/intel/compiler/brw/brw_builder.h index b9c973f6dee..13540785288 100644 --- a/src/intel/compiler/brw/brw_builder.h +++ b/src/intel/compiler/brw/brw_builder.h @@ -638,6 +638,18 @@ public: return emit(SHADER_OPCODE_SEND, SEND_NUM_SRCS)->as_send(); } + brw_scratch_inst * + FILL() const + { + return emit(SHADER_OPCODE_LSC_FILL, FILL_NUM_SRCS)->as_scratch(); + } + + brw_scratch_inst * + SPILL() const + { + return emit(SHADER_OPCODE_LSC_SPILL, SPILL_NUM_SRCS)->as_scratch(); + } + brw_urb_inst * URB_WRITE(const brw_reg srcs[], unsigned num_srcs) const { diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h index da7c3812562..74d3b62d39a 100644 --- a/src/intel/compiler/brw/brw_eu_defines.h +++ b/src/intel/compiler/brw/brw_eu_defines.h @@ -549,6 +549,9 @@ enum ENUM_PACKED opcode { * Acts as a scheduling barrier. */ SHADER_OPCODE_LOAD_REG, + + SHADER_OPCODE_LSC_FILL, + SHADER_OPCODE_LSC_SPILL, }; enum send_srcs { @@ -714,6 +717,23 @@ enum interpolator_logical_srcs { INTERP_NUM_SRCS }; +enum spill_srcs { + /** Register used for the address in scratch space. */ + SPILL_SRC_PAYLOAD1, + + /** Register to be spilled. */ + SPILL_SRC_PAYLOAD2, + + SPILL_NUM_SRCS +}; + +enum fill_srcs { + /** Register used for the address in scratch space. 
*/ + FILL_SRC_PAYLOAD1, + + FILL_NUM_SRCS +}; + enum brw_reduce_op { BRW_REDUCE_OP_ADD, BRW_REDUCE_OP_MUL, diff --git a/src/intel/compiler/brw/brw_inst.cpp b/src/intel/compiler/brw/brw_inst.cpp index ac30b1936f0..0829d3565d2 100644 --- a/src/intel/compiler/brw/brw_inst.cpp +++ b/src/intel/compiler/brw/brw_inst.cpp @@ -232,6 +232,10 @@ brw_inst_kind_for_opcode(enum opcode opcode) case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: return BRW_KIND_LOGICAL; + case SHADER_OPCODE_LSC_FILL: + case SHADER_OPCODE_LSC_SPILL: + return BRW_KIND_SCRATCH; + default: return BRW_KIND_BASE; } @@ -302,6 +306,12 @@ brw_inst::is_payload(unsigned arg) const case SHADER_OPCODE_SEND: return arg >= SEND_SRC_PAYLOAD1; + case SHADER_OPCODE_LSC_FILL: + return arg == FILL_SRC_PAYLOAD1; + + case SHADER_OPCODE_LSC_SPILL: + return arg == SPILL_SRC_PAYLOAD1 || arg == SPILL_SRC_PAYLOAD2; + case SHADER_OPCODE_SEND_GATHER: return arg >= SEND_GATHER_SRC_SCALAR; @@ -551,6 +561,25 @@ brw_inst::size_read(const struct intel_device_info *devinfo, int arg) const } break; + case SHADER_OPCODE_LSC_FILL: + if (arg == FILL_SRC_PAYLOAD1) { + return lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, + as_scratch()->use_transpose ? 1 : exec_size) * + REG_SIZE; + } + break; + + case SHADER_OPCODE_LSC_SPILL: + if (arg == SPILL_SRC_PAYLOAD1) { + assert(!as_scratch()->use_transpose); + + return lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, exec_size) * + REG_SIZE; + } else if (arg == SPILL_SRC_PAYLOAD2) { + return src[arg].component_size(exec_size); + } + break; + case SHADER_OPCODE_SEND_GATHER: if (arg >= SEND_GATHER_SRC_PAYLOAD) { /* SEND_GATHER is Xe3+, so no need to pass devinfo around. 
*/ @@ -940,6 +969,7 @@ brw_inst::has_side_effects() const return as_send()->has_side_effects; case BRW_OPCODE_SYNC: + case SHADER_OPCODE_LSC_SPILL: case SHADER_OPCODE_MEMORY_STORE_LOGICAL: case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL: case SHADER_OPCODE_MEMORY_FENCE: @@ -965,6 +995,7 @@ brw_inst::is_volatile() const switch (opcode) { case SHADER_OPCODE_MEMORY_LOAD_LOGICAL: case SHADER_OPCODE_LOAD_REG: + case SHADER_OPCODE_LSC_FILL: return true; case SHADER_OPCODE_MEMORY_STORE_LOGICAL: return as_mem()->flags & MEMORY_FLAG_VOLATILE_ACCESS; diff --git a/src/intel/compiler/brw/brw_inst.h b/src/intel/compiler/brw/brw_inst.h index 4e9d7a12cd8..c38f95489fe 100644 --- a/src/intel/compiler/brw/brw_inst.h +++ b/src/intel/compiler/brw/brw_inst.h @@ -50,6 +50,7 @@ enum ENUM_PACKED brw_inst_kind { BRW_KIND_LOAD_PAYLOAD, BRW_KIND_URB, BRW_KIND_FB_WRITE, + BRW_KIND_SCRATCH, }; brw_inst_kind brw_inst_kind_for_opcode(enum opcode opcode); @@ -82,6 +83,7 @@ struct brw_inst : brw_exec_node { KIND_HELPERS(as_load_payload, brw_load_payload_inst, BRW_KIND_LOAD_PAYLOAD); KIND_HELPERS(as_urb, brw_urb_inst, BRW_KIND_URB); KIND_HELPERS(as_fb_write, brw_fb_write_inst, BRW_KIND_FB_WRITE); + KIND_HELPERS(as_scratch, brw_scratch_inst, BRW_KIND_SCRATCH); #undef KIND_HELPERS @@ -370,6 +372,18 @@ struct brw_fb_write_inst : brw_inst { bool last_rt; }; +struct brw_scratch_inst : brw_inst { + /** Offset in scratch space for the load or store. */ + unsigned offset; + + /** + * Should a LSC transpose message be used for the fill? + * + * Currently this must be false for spills. + */ + bool use_transpose; +}; + /** * Make the execution of \p inst dependent on the evaluation of a possibly * inverted predicate. 
*/
diff --git a/src/intel/compiler/brw/brw_lower_fill_spill.cpp b/src/intel/compiler/brw/brw_lower_fill_spill.cpp
new file mode 100644
index 00000000000..994a5199ebd
--- /dev/null
+++ b/src/intel/compiler/brw/brw_lower_fill_spill.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright 2025 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "brw_shader.h"
+#include "brw_builder.h"
+
+/**
+ * Build the extended message descriptor for an LSC scratch message.
+ *
+ * Emits uniform instructions that AND r0.5 with INTEL_MASK(31, 10) into the
+ * address register and then either shift the result right by 4
+ * (verx10 >= 200) or OR in BRW_SFID_UGM, together with the extended message
+ * descriptor for \p reg_size when spilling.
+ *
+ * \param bld       Builder used to emit the setup instructions.
+ * \param reg_size  Number of registers of spilled data; only read when
+ *                  \p unspill is false and verx10 < 200.
+ * \param unspill   True for a fill (load), false for a spill (store).
+ *
+ * \return The address register holding the extended descriptor.
+ */
+static brw_reg
+build_ex_desc(const brw_builder &bld, unsigned reg_size, bool unspill)
+{
+   /* Use a different area of the address register than what is used in
+    * brw_lower_logical_sends.cpp (brw_address_reg(2)) so we don't have
+    * interactions between the spill/fill instructions and the other send
+    * messages.
+    */
+   brw_reg ex_desc = bld.vaddr(BRW_TYPE_UD,
+                               BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC);
+
+   brw_builder ubld = bld.uniform();
+
+   ubld.AND(ex_desc,
+            retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
+            brw_imm_ud(INTEL_MASK(31, 10)));
+
+   const intel_device_info *devinfo = bld.shader->devinfo;
+   if (devinfo->verx10 >= 200) {
+      ubld.SHR(ex_desc, ex_desc, brw_imm_ud(4));
+   } else {
+      if (unspill) {
+         ubld.OR(ex_desc, ex_desc, brw_imm_ud(BRW_SFID_UGM));
+      } else {
+         ubld.OR(ex_desc,
+                 ex_desc,
+                 brw_imm_ud(brw_message_ex_desc(devinfo, reg_size) | BRW_SFID_UGM));
+      }
+   }
+
+   return ex_desc;
+}
+
+/**
+ * Replace a SHADER_OPCODE_LSC_FILL with the UGM load SEND it stands for.
+ *
+ * When the fill uses a transpose message the SEND is emitted with a uniform
+ * builder; otherwise it is emitted with the builder constructed from
+ * \p inst.  \p inst is removed afterwards.
+ */
+static void
+brw_lower_lsc_fill(const intel_device_info *devinfo, brw_inst *inst)
+{
+   assert(devinfo->verx10 >= 125);
+
+   const brw_builder bld(inst);
+   brw_reg dst = inst->dst;
+   brw_reg offset = inst->src[FILL_SRC_PAYLOAD1];
+
+   const unsigned reg_size = inst->dst.component_size(inst->exec_size) /
+                             REG_SIZE;
+   brw_reg ex_desc = build_ex_desc(bld, reg_size, true);
+
+   /* LSC is limited to SIMD16 (SIMD32 on Xe2) load/store but we can
+    * load more using transpose messages.
+    */
+   const bool use_transpose = inst->as_scratch()->use_transpose;
+   const brw_builder ubld = use_transpose ? bld.uniform() : bld;
+
+   uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
+                                LSC_ADDR_SURFTYPE_SS,
+                                LSC_ADDR_SIZE_A32,
+                                LSC_DATA_SIZE_D32,
+                                use_transpose ? reg_size * 8 : 1 /* num_channels */,
+                                use_transpose,
+                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
+
+   brw_send_inst *unspill_inst = ubld.SEND();
+   unspill_inst->dst = dst;
+
+   unspill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0);
+   unspill_inst->src[SEND_SRC_EX_DESC] = ex_desc;
+   unspill_inst->src[SEND_SRC_PAYLOAD1] = offset;
+   unspill_inst->src[SEND_SRC_PAYLOAD2] = brw_reg();
+
+   unspill_inst->sfid = BRW_SFID_UGM;
+   unspill_inst->header_size = 0;
+   unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
+                                         unspill_inst->exec_size);
+   unspill_inst->ex_mlen = 0;
+   unspill_inst->size_written =
+      lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
+   unspill_inst->has_side_effects = false;
+   unspill_inst->is_volatile = true;
+
+   /* Final message descriptor; it depends on mlen and size_written above. */
+   unspill_inst->src[0] = brw_imm_ud(
+      desc |
+      brw_message_desc(devinfo,
+                       unspill_inst->mlen,
+                       unspill_inst->size_written / REG_SIZE,
+                       unspill_inst->header_size));
+
+   /* The SEND must write and read exactly what the FILL reported. */
+   assert(unspill_inst->size_written == inst->size_written);
+   assert(unspill_inst->size_read(devinfo, SEND_SRC_PAYLOAD1) == inst->size_read(devinfo, FILL_SRC_PAYLOAD1));
+
+   inst->remove();
+}
+
+/**
+ * Replace a SHADER_OPCODE_LSC_SPILL with the UGM store SEND it stands for.
+ *
+ * Transposed spills are not supported, so \p inst must have use_transpose
+ * cleared.  \p inst is removed afterwards.
+ */
+static void
+brw_lower_lsc_spill(const intel_device_info *devinfo, brw_inst *inst)
+{
+   assert(devinfo->verx10 >= 125);
+
+   const brw_builder bld(inst);
+   brw_reg offset = inst->src[SPILL_SRC_PAYLOAD1];
+   brw_reg src = inst->src[SPILL_SRC_PAYLOAD2];
+
+   const unsigned reg_size = src.component_size(bld.dispatch_width()) /
+                             REG_SIZE;
+
+   assert(!inst->as_scratch()->use_transpose);
+
+   const brw_reg ex_desc = build_ex_desc(bld, reg_size, false);
+
+   brw_send_inst *spill_inst = bld.SEND();
+
+   spill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0);
+   spill_inst->src[SEND_SRC_EX_DESC] = ex_desc;
+   spill_inst->src[SEND_SRC_PAYLOAD1] = offset;
+   spill_inst->src[SEND_SRC_PAYLOAD2] = src;
+
+   spill_inst->sfid = BRW_SFID_UGM;
+   uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
+                                LSC_ADDR_SURFTYPE_SS,
+                                LSC_ADDR_SIZE_A32,
+                                LSC_DATA_SIZE_D32,
+                                1 /* num_channels */,
+                                false /* transpose */,
+                                LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS));
+   spill_inst->header_size = 0;
+   spill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
+                                       bld.dispatch_width());
+   spill_inst->ex_mlen = reg_size;
+   spill_inst->size_written = 0;
+   spill_inst->has_side_effects = true;
+   spill_inst->is_volatile = false;
+
+   /* Final message descriptor; it depends on mlen and size_written above. */
+   spill_inst->src[0] = brw_imm_ud(
+      desc |
+      brw_message_desc(devinfo,
+                       spill_inst->mlen,
+                       spill_inst->size_written / REG_SIZE,
+                       spill_inst->header_size));
+
+   /* The SEND must write and read exactly what the SPILL reported. */
+   assert(spill_inst->size_written == inst->size_written);
+   assert(spill_inst->size_read(devinfo, SEND_SRC_PAYLOAD1) == inst->size_read(devinfo, SPILL_SRC_PAYLOAD1));
+   assert(spill_inst->size_read(devinfo, SEND_SRC_PAYLOAD2) == inst->size_read(devinfo, SPILL_SRC_PAYLOAD2));
+
+   inst->remove();
+}
+
+/**
+ * Lower every LSC fill and spill pseudo-instruction in \p s to a SEND.
+ *
+ * \return True if any instruction was lowered.
+ */
+bool
+brw_lower_fill_and_spill(brw_shader &s)
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
+      switch (inst->opcode) {
+      case SHADER_OPCODE_LSC_FILL:
+         brw_lower_lsc_fill(s.devinfo, inst);
+         progress = true;
+         break;
+
+      case SHADER_OPCODE_LSC_SPILL:
+         brw_lower_lsc_spill(s.devinfo, inst);
+         progress = true;
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   if (progress)
+      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
+                            BRW_DEPENDENCY_VARIABLES);
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw/brw_opt_cse.cpp b/src/intel/compiler/brw/brw_opt_cse.cpp
index 3aa30446074..b0d8e768b0a 100644
--- a/src/intel/compiler/brw/brw_opt_cse.cpp
+++ b/src/intel/compiler/brw/brw_opt_cse.cpp
@@ -473,6 +473,9 @@ hash_inst(const void *v)
    case BRW_KIND_BASE:
       /* Nothing else to do. 
*/ break; + + case BRW_KIND_SCRATCH: + UNREACHABLE("Spill and fills should not exist yet."); } if (inst->opcode == BRW_OPCODE_MAD) { diff --git a/src/intel/compiler/brw/brw_print.cpp b/src/intel/compiler/brw/brw_print.cpp index 737bd0886ca..1da137306bd 100644 --- a/src/intel/compiler/brw/brw_print.cpp +++ b/src/intel/compiler/brw/brw_print.cpp @@ -269,6 +269,11 @@ brw_instruction_name(const struct brw_isa_info *isa, const brw_inst *inst) case SHADER_OPCODE_FLOW: return "flow"; + + case SHADER_OPCODE_LSC_FILL: + return "fill_lsc"; + case SHADER_OPCODE_LSC_SPILL: + return "spill_lsc"; } UNREACHABLE("not reached"); diff --git a/src/intel/compiler/brw/brw_reg_allocate.cpp b/src/intel/compiler/brw/brw_reg_allocate.cpp index fdb31af96d9..bd3d3bd9b7e 100644 --- a/src/intel/compiler/brw/brw_reg_allocate.cpp +++ b/src/intel/compiler/brw/brw_reg_allocate.cpp @@ -296,8 +296,6 @@ private: bool build_interference_graph(bool allow_spilling); - brw_reg build_ex_desc(const brw_builder &bld, unsigned reg_size, bool unspill); - brw_reg build_lane_offsets(const brw_builder &bld, uint32_t spill_offset, int ip); brw_reg build_single_offset(const brw_builder &bld, @@ -505,6 +503,10 @@ brw_inst_has_source_and_destination_hazard(const struct intel_device_info *devin * be overly conservative. 
*/ return inst->as_dpas()->rcount > 1; + + case SHADER_OPCODE_LSC_FILL: + return false; + default: /* The SIMD16 compressed instruction * @@ -631,6 +633,13 @@ brw_reg_alloc::setup_inst_interference(const brw_inst *inst) ra_add_node_interference(g, first_vgrf_node + inst->src[SEND_SRC_PAYLOAD1].nr, first_vgrf_node + inst->src[SEND_SRC_PAYLOAD2].nr); + } else if (inst->opcode == SHADER_OPCODE_LSC_SPILL && + inst->src[SPILL_SRC_PAYLOAD1].file == VGRF && + inst->src[SPILL_SRC_PAYLOAD2].file == VGRF && + inst->src[SPILL_SRC_PAYLOAD1].nr != inst->src[SPILL_SRC_PAYLOAD2].nr) { + ra_add_node_interference(g, + first_vgrf_node + inst->src[SPILL_SRC_PAYLOAD1].nr, + first_vgrf_node + inst->src[SPILL_SRC_PAYLOAD2].nr); } /* When we do send-from-GRF for FB writes, we need to ensure that the last @@ -774,43 +783,6 @@ brw_reg_alloc::build_single_offset(const brw_builder &bld, uint32_t spill_offset return offset; } -brw_reg -brw_reg_alloc::build_ex_desc(const brw_builder &bld, unsigned reg_size, bool unspill) -{ - /* Use a different area of the address register than what is used in - * brw_lower_logical_sends.c (brw_address_reg(2)) so we don't have - * interactions between the spill/fill instructions and the other send - * messages. 
- */ - brw_reg ex_desc = bld.vaddr(BRW_TYPE_UD, - BRW_ADDRESS_SUBREG_INDIRECT_SPILL_DESC); - - brw_builder ubld = bld.uniform(); - - brw_inst *inst = ubld.AND(ex_desc, - retype(brw_vec1_grf(0, 5), BRW_TYPE_UD), - brw_imm_ud(INTEL_MASK(31, 10))); - _mesa_set_add(spill_insts, inst); - - const intel_device_info *devinfo = bld.shader->devinfo; - if (devinfo->verx10 >= 200) { - inst = ubld.SHR(ex_desc, ex_desc, brw_imm_ud(4)); - _mesa_set_add(spill_insts, inst); - } else { - if (unspill) { - inst = ubld.OR(ex_desc, ex_desc, brw_imm_ud(BRW_SFID_UGM)); - _mesa_set_add(spill_insts, inst); - } else { - inst = ubld.OR(ex_desc, - ex_desc, - brw_imm_ud(brw_message_ex_desc(devinfo, reg_size) | BRW_SFID_UGM)); - _mesa_set_add(spill_insts, inst); - } - } - - return ex_desc; -} - brw_reg brw_reg_alloc::build_lane_offsets(const brw_builder &bld, uint32_t spill_offset, int ip) { @@ -905,7 +877,6 @@ brw_reg_alloc::emit_unspill(const brw_builder &bld, for (unsigned i = 0; i < DIV_ROUND_UP(count, reg_size); i++) { ++stats->fill_count; - brw_send_inst *unspill_inst; if (devinfo->verx10 >= 125) { /* LSC is limited to SIMD16 (SIMD32 on Xe2) load/store but we can * load more using transpose messages. @@ -921,46 +892,26 @@ brw_reg_alloc::emit_unspill(const brw_builder &bld, offset = build_lane_offsets(ubld, spill_offset, ip); } - uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, - LSC_ADDR_SURFTYPE_SS, - LSC_ADDR_SIZE_A32, - LSC_DATA_SIZE_D32, - use_transpose ? 
reg_size * 8 : 1 /* num_channels */, - use_transpose, - LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); - - const brw_reg ex_desc_reg = build_ex_desc(bld, reg_size, true); - - unspill_inst = ubld.SEND(); + const bool exec_all = use_transpose || bld.has_writemask_all(); + brw_scratch_inst *unspill_inst = bld.exec_all(exec_all).FILL(); unspill_inst->dst = dst; - unspill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0); - unspill_inst->src[SEND_SRC_EX_DESC] = ex_desc_reg; - unspill_inst->src[SEND_SRC_PAYLOAD1] = offset; - unspill_inst->src[SEND_SRC_PAYLOAD2] = brw_reg(); + unspill_inst->src[FILL_SRC_PAYLOAD1] = offset; - unspill_inst->sfid = BRW_SFID_UGM; - unspill_inst->header_size = 0; - unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, - unspill_inst->exec_size); - unspill_inst->ex_mlen = 0; + unspill_inst->offset = spill_offset; + unspill_inst->use_transpose = use_transpose; unspill_inst->size_written = lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE; - unspill_inst->has_side_effects = false; - unspill_inst->is_volatile = true; + assert(unspill_inst->size_written == (reg_size * REG_SIZE)); - unspill_inst->src[0] = brw_imm_ud( - desc | - brw_message_desc(devinfo, - unspill_inst->mlen, - unspill_inst->size_written / REG_SIZE, - unspill_inst->header_size)); + _mesa_set_add(spill_insts, unspill_inst); + assert(unspill_inst->force_writemask_all || count % reg_size == 0); } else { brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip); const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT; - unspill_inst = bld.SEND(); + brw_send_inst *unspill_inst = bld.SEND(); unspill_inst->dst = dst; unspill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0); @@ -983,9 +934,10 @@ brw_reg_alloc::emit_unspill(const brw_builder &bld, unspill_inst->mlen, unspill_inst->size_written / REG_SIZE, unspill_inst->header_size)); + + _mesa_set_add(spill_insts, unspill_inst); + assert(unspill_inst->force_writemask_all || count % reg_size == 0); } - 
_mesa_set_add(spill_insts, unspill_inst); - assert(unspill_inst->force_writemask_all || count % reg_size == 0); dst.offset += reg_size * REG_SIZE; spill_offset += reg_size * REG_SIZE; @@ -1005,48 +957,26 @@ brw_reg_alloc::emit_spill(const brw_builder &bld, for (unsigned i = 0; i < DIV_ROUND_UP(count, reg_size); i++) { ++stats->spill_count; - brw_send_inst *spill_inst; if (devinfo->verx10 >= 125) { brw_reg offset = build_lane_offsets(bld, spill_offset, ip); - const brw_reg ex_desc_reg = build_ex_desc(bld, reg_size, false); - - spill_inst = bld.SEND(); + brw_scratch_inst *spill_inst = bld.SPILL(); spill_inst->dst = bld.null_reg_f(); - spill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0); - spill_inst->src[SEND_SRC_EX_DESC] = ex_desc_reg; - spill_inst->src[SEND_SRC_PAYLOAD1] = offset; - spill_inst->src[SEND_SRC_PAYLOAD2] = src; + spill_inst->src[SPILL_SRC_PAYLOAD1] = offset; + spill_inst->src[SPILL_SRC_PAYLOAD2] = src; - spill_inst->sfid = BRW_SFID_UGM; - uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_STORE, - LSC_ADDR_SURFTYPE_SS, - LSC_ADDR_SIZE_A32, - LSC_DATA_SIZE_D32, - 1 /* num_channels */, - false /* transpose */, - LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); - spill_inst->header_size = 0; - spill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, - bld.dispatch_width()); - spill_inst->ex_mlen = reg_size; - spill_inst->size_written = 0; - spill_inst->has_side_effects = true; - spill_inst->is_volatile = false; + spill_inst->offset = spill_offset; + spill_inst->use_transpose = false; - spill_inst->src[0] = brw_imm_ud( - desc | - brw_message_desc(devinfo, - spill_inst->mlen, - spill_inst->size_written / REG_SIZE, - spill_inst->header_size)); + _mesa_set_add(spill_insts, spill_inst); + assert(spill_inst->force_writemask_all || count % reg_size == 0); } else { brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip); const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT; - spill_inst = bld.SEND(); + brw_send_inst *spill_inst = bld.SEND(); spill_inst->dst = 
bld.null_reg_f(); spill_inst->src[SEND_SRC_DESC] = brw_imm_ud(0); @@ -1072,9 +1002,10 @@ brw_reg_alloc::emit_spill(const brw_builder &bld, spill_inst->header_size)); spill_inst->src[1] = brw_imm_ud( brw_message_ex_desc(devinfo, spill_inst->ex_mlen)); + + _mesa_set_add(spill_insts, spill_inst); + assert(spill_inst->force_writemask_all || count % reg_size == 0); } - _mesa_set_add(spill_insts, spill_inst); - assert(spill_inst->force_writemask_all || count % reg_size == 0); src.offset += reg_size * REG_SIZE; spill_offset += reg_size * REG_SIZE; diff --git a/src/intel/compiler/brw/brw_shader.cpp b/src/intel/compiler/brw/brw_shader.cpp index 4a5a8965222..d5f091566ac 100644 --- a/src/intel/compiler/brw/brw_shader.cpp +++ b/src/intel/compiler/brw/brw_shader.cpp @@ -1277,6 +1277,10 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) s.debug_optimizer(nir, "post_ra_alloc", iteration, pass_num); + if (s.spilled_any_registers) { + OPT(brw_lower_fill_and_spill); + } + OPT(brw_opt_bank_conflicts); OPT_V(brw_schedule_instructions_post_ra); diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h index e03a38b1cee..b53b4f86bb4 100644 --- a/src/intel/compiler/brw/brw_shader.h +++ b/src/intel/compiler/brw/brw_shader.h @@ -328,6 +328,7 @@ bool brw_lower_constant_loads(brw_shader &s); bool brw_lower_csel(brw_shader &s); bool brw_lower_derivatives(brw_shader &s); bool brw_lower_dpas(brw_shader &s); +bool brw_lower_fill_and_spill(brw_shader &s); bool brw_lower_find_live_channel(brw_shader &s); bool brw_lower_indirect_mov(brw_shader &s); bool brw_lower_integer_multiplication(brw_shader &s); diff --git a/src/intel/compiler/brw/meson.build b/src/intel/compiler/brw/meson.build index 0d2e8f28dde..cb44c62083e 100644 --- a/src/intel/compiler/brw/meson.build +++ b/src/intel/compiler/brw/meson.build @@ -43,6 +43,7 @@ libintel_compiler_brw_files = files( 'brw_load_reg.cpp', 'brw_lower.cpp', 'brw_lower_dpas.cpp', + 'brw_lower_fill_spill.cpp', 
'brw_lower_integer_multiplication.cpp', 'brw_lower_logical_sends.cpp', 'brw_lower_pack.cpp',