diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index adf0467ff98..978f9087071 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -440,6 +440,7 @@ public: node_count = 0; first_payload_node = 0; first_mrf_hack_node = 0; + scratch_header_node = 0; grf127_send_hack_node = 0; first_vgrf_node = 0; last_vgrf_node = 0; @@ -472,6 +473,7 @@ private: void set_spill_costs(); int choose_spill_reg(); + fs_reg alloc_scratch_header(); fs_reg alloc_spill_reg(unsigned size, int ip); void spill_reg(unsigned spill_reg); @@ -496,6 +498,7 @@ private: int node_count; int first_payload_node; int first_mrf_hack_node; + int scratch_header_node; int grf127_send_hack_node; int first_vgrf_node; int last_vgrf_node; @@ -504,6 +507,8 @@ private: int *spill_vgrf_ip; int spill_vgrf_ip_alloc; int spill_node_count; + + fs_reg scratch_header; }; /** @@ -579,6 +584,8 @@ namespace { unsigned spill_base_mrf(const backend_shader *s) { + /* We don't use the MRF hack on Gen9+ */ + assert(s->devinfo->gen < 9); return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1; } } @@ -610,6 +617,10 @@ fs_reg_alloc::setup_live_interference(unsigned node, ra_add_node_interference(g, node, first_mrf_hack_node + i); } + /* Everything interferes with the scratch header */ + if (scratch_header_node >= 0) + ra_add_node_interference(g, node, scratch_header_node); + /* Add interference with every vgrf whose live range intersects this * node's. We only need to look at nodes below this one as the reflexivity * of interference will take care of the rest. @@ -748,7 +759,7 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling) node_count = 0; first_payload_node = node_count; node_count += payload_node_count; - if (devinfo->gen >= 7 && allow_spilling) { + if (devinfo->gen >= 7 && devinfo->gen < 9 && allow_spilling) { first_mrf_hack_node = node_count; node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START; } else { @@ -763,6 +774,11 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling) first_vgrf_node = node_count; node_count += fs->alloc.count; last_vgrf_node = node_count - 1; + if (devinfo->gen >= 9 && allow_spilling) { + scratch_header_node = node_count++; + } else { + scratch_header_node = -1; + } first_spill_node = node_count; fs->calculate_payload_ranges(payload_node_count, @@ -865,8 +881,29 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld, fs_reg dst, for (unsigned i = 0; i < count / reg_size; i++) { fs_inst *unspill_inst; - if (devinfo->gen >= 7 && devinfo->gen < 9 && - spill_offset < (1 << 12) * REG_SIZE) { + if (devinfo->gen >= 9) { + fs_reg header = this->scratch_header; + fs_builder ubld = bld.exec_all().group(1, 0); + assert(spill_offset % 16 == 0); + unspill_inst = ubld.MOV(component(header, 2), + brw_imm_ud(spill_offset / 16)); + _mesa_set_add(spill_insts, unspill_inst); + + fs_reg srcs[] = { brw_imm_ud(0), brw_imm_ud(0), header }; + unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst, + srcs, ARRAY_SIZE(srcs)); + unspill_inst->mlen = 1; + unspill_inst->header_size = 1; + unspill_inst->size_written = reg_size * REG_SIZE; + unspill_inst->send_has_side_effects = false; + unspill_inst->send_is_volatile = true; + unspill_inst->sfid = GEN7_SFID_DATAPORT_DATA_CACHE; + unspill_inst->desc = + brw_dp_read_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT, + BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8), + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, + BRW_DATAPORT_READ_TARGET_RENDER_CACHE); + } else if (devinfo->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE) { /* The Gen7 descriptor-based offset is 12 bits of HWORD units. * Because the Gen7-style scratch block read is hardwired to BTI 255, * on Gen9+ it would cause the DC to do an IA-coherent read, what @@ -893,16 +930,44 @@ void fs_reg_alloc::emit_spill(const fs_builder &bld, fs_reg src, uint32_t spill_offset, unsigned count) { + const gen_device_info *devinfo = bld.shader->devinfo; const unsigned reg_size = src.component_size(bld.dispatch_width()) / REG_SIZE; assert(count % reg_size == 0); for (unsigned i = 0; i < count / reg_size; i++) { - fs_inst *spill_inst = - bld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, bld.null_reg_f(), src); - spill_inst->offset = spill_offset; - spill_inst->mlen = 1 + reg_size; /* header, value */ - spill_inst->base_mrf = spill_base_mrf(bld.shader); + fs_inst *spill_inst; + if (devinfo->gen >= 9) { + fs_reg header = this->scratch_header; + fs_builder ubld = bld.exec_all().group(1, 0); + assert(spill_offset % 16 == 0); + spill_inst = ubld.MOV(component(header, 2), + brw_imm_ud(spill_offset / 16)); + _mesa_set_add(spill_insts, spill_inst); + + fs_reg srcs[] = { brw_imm_ud(0), brw_imm_ud(0), header, src }; + spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(), + srcs, ARRAY_SIZE(srcs)); + spill_inst->mlen = 1; + spill_inst->ex_mlen = reg_size; + spill_inst->size_written = 0; + spill_inst->header_size = 1; + spill_inst->send_has_side_effects = true; + spill_inst->send_is_volatile = false; + spill_inst->sfid = GEN7_SFID_DATAPORT_DATA_CACHE; + spill_inst->desc = + brw_dp_write_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT, + BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8), + GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, + 0 /* not a render target */, + false /* send_commit_msg */); + } else { + spill_inst = bld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, + bld.null_reg_f(), src); + spill_inst->offset = spill_offset; + spill_inst->mlen = 1 + reg_size; /* header, value */ + spill_inst->base_mrf = spill_base_mrf(bld.shader); + } _mesa_set_add(spill_insts, spill_inst); src.offset += reg_size * REG_SIZE; @@ -1011,6 +1076,19 @@ fs_reg_alloc::choose_spill_reg() return node - first_vgrf_node; } +fs_reg +fs_reg_alloc::alloc_scratch_header() +{ + int vgrf = fs->alloc.allocate(1); + assert(first_vgrf_node + vgrf == scratch_header_node); + ra_set_node_class(g, scratch_header_node, + compiler->fs_reg_sets[rsi].classes[0]); + + setup_live_interference(scratch_header_node, 0, INT_MAX); + + return fs_reg(VGRF, vgrf, BRW_REGISTER_TYPE_UD); +} + fs_reg fs_reg_alloc::alloc_spill_reg(unsigned size, int ip) { @@ -1058,13 +1136,22 @@ fs_reg_alloc::spill_reg(unsigned spill_reg) * SIMD16 mode, because we'd stomp the FB writes. */ if (!fs->spilled_any_registers) { - bool mrf_used[BRW_MAX_MRF(devinfo->gen)]; - get_used_mrfs(fs, mrf_used); + if (devinfo->gen >= 9) { + this->scratch_header = alloc_scratch_header(); + fs_builder ubld = fs->bld.exec_all().group(8, 0).at( + fs->cfg->first_block(), fs->cfg->first_block()->start()); + fs_inst *header_inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, + this->scratch_header); + _mesa_set_add(spill_insts, header_inst); + } else { + bool mrf_used[BRW_MAX_MRF(devinfo->gen)]; + get_used_mrfs(fs, mrf_used); - for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) { - if (mrf_used[i]) { - fs->fail("Register spilling not supported with m%d used", i); - return; + for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) { + if (mrf_used[i]) { + fs->fail("Register spilling not supported with m%d used", i); + return; + } } } @@ -1115,8 +1202,8 @@ fs_reg_alloc::spill_reg(unsigned spill_reg) * 32 bit channels. It shouldn't hurt in any case because the * unspill destination is a block-local temporary. */ - emit_unspill(ibld.exec_all().group(width, 0), - unspill_dst, subset_spill_offset, count); + emit_unspill(ibld.exec_all().group(width, 0), unspill_dst, + subset_spill_offset, count); } } diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index e0f8f4fa7c3..380081c1b71 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -424,6 +424,14 @@ schedule_node::set_latency_gen7(bool is_haswell) case GEN7_SFID_DATAPORT_DATA_CACHE: switch ((inst->desc >> 14) & 0x1f) { + case BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ: + case GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE: + /* We have no data for this but assume it's a little faster than + * untyped surface read/write. + */ + latency = 200; + break; + case GEN7_DATAPORT_DC_DWORD_SCATTERED_READ: case GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE: case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ: