diff --git a/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp b/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp
index 297a26be488..a265914bf01 100644
--- a/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp
@@ -268,7 +268,7 @@ AluInstr::do_print(std::ostream& os) const
       }
       os << " : ";
    } else {
-      os << "__." << swzchar[dest_chan()] << " : ";
+      os << " __." << swzchar[dest_chan()] << " : ";
    }
 }
diff --git a/src/gallium/drivers/r600/sfn/sfn_instr_lds.cpp b/src/gallium/drivers/r600/sfn/sfn_instr_lds.cpp
index dd8f71fda63..a70fcd1e35a 100644
--- a/src/gallium/drivers/r600/sfn/sfn_instr_lds.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_instr_lds.cpp
@@ -383,18 +383,6 @@ LDSAtomicInstr::replace_source(PRegister old_src, PVirtualValue new_src)
    if (old_src->pin() == pin_array || new_src->pin() == pin_array)
       return false;
 
-<<<<<<< HEAD
-   if (new_src->get_addr()) {
-      for (auto& s : m_srcs) {
-         auto addr = s->get_addr();
-         /* can't have two different indirect addresses in the same instr */
-         if (addr && !addr->equal_to(*new_src->get_addr()))
-            return false;
-      }
-   }
-
-=======
->>>>>>> 74c0ddf158e (r600/sfn: Don't copy-propagate indirect access into LDS instr)
    for (unsigned i = 0; i < m_srcs.size(); ++i) {
       if (old_src->equal_to(*m_srcs[i])) {
          m_srcs[i] = new_src;
diff --git a/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp b/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp
index 7eb2dbef0fe..9ebeb1619f0 100644
--- a/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp
@@ -26,6 +26,9 @@
 
 #include "sfn_scheduler.h"
 
+#include "amd_family.h"
+#include "r600_isa.h"
+#include "sfn_alu_defines.h"
 #include "sfn_debug.h"
 #include "sfn_instr_alugroup.h"
 #include "sfn_instr_controlflow.h"
@@ -135,9 +138,21 @@ public:
    AluInstr *m_last_lds_instr{nullptr};
 };
 
+struct ArrayChanHash
+{
+   std::size_t operator()(std::pair<int, int> const& s) const noexcept
+   {
+      return std::hash<size_t>{}((size_t(s.first) << 3) | s.second);
+   }
+};
+
+using ArrayCheckSet = std::unordered_set<std::pair<int, int>, ArrayChanHash>;
+
 class BlockScheduler {
 public:
-   BlockScheduler(r600_chip_class chip_class);
+   BlockScheduler(r600_chip_class chip_class,
+                  radeon_family family);
+
    void run(Shader *shader);
    void finalize();
 
@@ -176,6 +191,10 @@ private:
    template <typename I>
    bool schedule_block(std::list<I *>& ready_list);
 
+   void update_array_writes(const AluGroup& group);
+   bool check_array_reads(const AluInstr& instr);
+   bool check_array_reads(const AluGroup& group);
+
    std::list<AluInstr *> alu_vec_ready;
    std::list<AluInstr *> alu_trans_ready;
    std::list<AluGroup *> alu_groups_ready;
@@ -208,10 +227,17 @@ private:
    int m_lds_addr_count{0};
    int m_alu_groups_scheduled{0};
    r600_chip_class m_chip_class;
+   radeon_family m_chip_family;
    bool m_idx0_loading{false};
    bool m_idx1_loading{false};
    bool m_idx0_pending{false};
    bool m_idx1_pending{false};
+
+   bool m_nop_after_rel_dest{false};
+   bool m_nop_befor_rel_src{false};
+
+   ArrayCheckSet m_last_indirect_array_write;
+   ArrayCheckSet m_last_direct_array_write;
 };
 
 Shader *
@@ -231,7 +257,9 @@ schedule(Shader *original)
 
    // to be able to re-start scheduling
    auto scheduled_shader = original;
-   BlockScheduler s(original->chip_class());
+
+   BlockScheduler s(original->chip_class(), original->chip_family());
+
    s.run(scheduled_shader);
 
    s.finalize();
@@ -245,7 +273,8 @@ schedule(Shader *original)
    return scheduled_shader;
 }
 
-BlockScheduler::BlockScheduler(r600_chip_class chip_class):
+BlockScheduler::BlockScheduler(r600_chip_class chip_class,
+                               radeon_family chip_family):
    current_shed(sched_alu),
    m_last_pos(nullptr),
    m_last_pixel(nullptr),
@@ -253,6 +282,12 @@ BlockScheduler::BlockScheduler(r600_chip_class chip_class):
    m_current_block(nullptr),
    m_chip_class(chip_class)
 {
+   m_nop_after_rel_dest = chip_family == CHIP_RV770;
+
+   m_nop_befor_rel_src = m_chip_class == ISA_CC_R600 &&
+                         chip_family != CHIP_RV670 &&
+                         chip_family != CHIP_RS780 &&
+                         chip_family != CHIP_RS880;
 }
 
 void
@@ -503,26 +538,31 @@ BlockScheduler::schedule_alu(Shader::ShaderBlocks& out_blocks)
 
    if (!alu_groups_ready.empty() && !has_lds_ready) {
      group = *alu_groups_ready.begin();
-      sfn_log << SfnLog::schedule << "try schedule " <<
-         *group << "\n";
+      if (!check_array_reads(*group)) {
 
-      /* Only start a new CF if we have no pending AR reads */
-      if (m_current_block->try_reserve_kcache(*group)) {
-         alu_groups_ready.erase(alu_groups_ready.begin());
-         success = true;
-      } else {
-         if (m_current_block->expected_ar_uses() == 0) {
-            start_new_block(out_blocks, Block::alu);
-            if (!m_current_block->try_reserve_kcache(*group))
-               unreachable("Scheduling a group in a new block should always succeed");
+         sfn_log << SfnLog::schedule << "try schedule " <<
+            *group << "\n";
+
+         /* Only start a new CF if we have no pending AR reads */
+         if (m_current_block->try_reserve_kcache(*group)) {
             alu_groups_ready.erase(alu_groups_ready.begin());
-            sfn_log << SfnLog::schedule << "Schedule ALU group\n";
             success = true;
          } else {
-            sfn_log << SfnLog::schedule << "Don't add group because of " <<
-               m_current_block->expected_ar_uses()
-               << "pending AR loads\n";
+            if (m_current_block->expected_ar_uses() == 0) {
+               start_new_block(out_blocks, Block::alu);
+
+               if (!m_current_block->try_reserve_kcache(*group))
+                  unreachable("Scheduling a group in a new block should always succeed");
+               alu_groups_ready.erase(alu_groups_ready.begin());
+               sfn_log << SfnLog::schedule << "Schedule ALU group\n";
+               success = true;
+            } else {
+               sfn_log << SfnLog::schedule << "Don't add group because of " <<
+                  m_current_block->expected_ar_uses()
+                  << "pending AR loads\n";
+               group = nullptr;
+            }
          }
       }
    }
 
@@ -570,10 +610,20 @@ BlockScheduler::schedule_alu(Shader::ShaderBlocks& out_blocks)
         // kcache reservation failed, so we have to start a new CF
         start_new_block(out_blocks, Block::alu);
      } else {
-        return false;
+        // Ready is not empty, but we didn't schedule anything, this
+        // means we had an indirect array read or write conflict that we
+        // can resolve with an extra group that has a NOP instruction
+        if (!alu_trans_ready.empty() || !alu_vec_ready.empty()) {
+           group->add_vec_instructions(new AluInstr(op0_nop, 0));
+           break;
+        } else {
+           return false;
+        }
      }
   }
 
+
+   sfn_log << SfnLog::schedule << "Finalize ALU group\n";
   group->set_scheduled();
   group->fix_last_flag();
 
@@ -595,6 +645,8 @@ BlockScheduler::schedule_alu(Shader::ShaderBlocks& out_blocks)
 
    m_current_block->push_back(group);
 
+   update_array_writes(*group);
+
    m_idx0_pending |= m_idx0_loading;
    m_idx0_loading = false;
 
@@ -705,6 +757,11 @@ BlockScheduler::schedule_alu_to_group_vec(AluGroup *group)
    while (i != e) {
       sfn_log << SfnLog::schedule << "Try schedule to vec " << **i;
 
+      if (check_array_reads(**i)) {
+         ++i;
+         continue;
+      }
+
       if (!m_current_block->try_reserve_kcache(**i)) {
          sfn_log << SfnLog::schedule << " failed (kcache)\n";
         ++i;
@@ -770,6 +827,12 @@ BlockScheduler::schedule_alu_to_group_trans(AluGroup *group,
    auto i = readylist.begin();
    auto e = readylist.end();
    while (i != e) {
+
+      if (check_array_reads(**i)) {
+         ++i;
+         continue;
+      }
+
       sfn_log << SfnLog::schedule << "Try schedule to trans " << **i;
       if (!m_current_block->try_reserve_kcache(**i)) {
          sfn_log << SfnLog::schedule << " failed (kcache)\n";
@@ -1020,4 +1083,118 @@ BlockScheduler::collect_ready_type(std::list<I *>& ready, std::list<I *>& availa
    return !ready.empty();
 }
 
+class CheckArrayAccessVisitor : public ConstRegisterVisitor {
+public:
+   void visit(const Register& value) override {(void)value;}
+   void visit(const LocalArray& value) override {(void)value;}
+   void visit(const UniformValue& value) override {(void)value;}
+   void visit(const LiteralConstant& value) override {(void)value;}
+   void visit(const InlineConstant& value) override {(void)value;}
+};
+
+class UpdateArrayWrite : public CheckArrayAccessVisitor {
+public:
+   UpdateArrayWrite(ArrayCheckSet& indirect_arrays,
+                    ArrayCheckSet& direct_arrays,
+                    bool tdw):
+       last_indirect_array_write(indirect_arrays),
+       last_direct_array_write(direct_arrays),
+       track_direct_writes(tdw)
+   {
+   }
+
+   void visit(const LocalArrayValue& value) override {
+      int array_base = value.array().base_sel();
+      auto entry = std::make_pair(array_base, value.chan());
+      if (value.addr())
+         last_indirect_array_write.insert(entry);
+      else if (track_direct_writes)
+         last_direct_array_write.insert(entry);
+   }
+private:
+   ArrayCheckSet& last_indirect_array_write;
+   ArrayCheckSet& last_direct_array_write;
+   bool track_direct_writes {false};
+};
+
+
+void BlockScheduler::update_array_writes(const AluGroup& group)
+{
+   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {
+      m_last_direct_array_write.clear();
+      m_last_indirect_array_write.clear();
+
+      UpdateArrayWrite visitor(m_last_indirect_array_write,
+                               m_last_direct_array_write,
+                               m_nop_befor_rel_src);
+
+      for (auto alu : group) {
+         if (alu && alu->dest())
+            alu->dest()->accept(visitor);
+      }
+   }
+}
+
+class CheckArrayRead : public CheckArrayAccessVisitor {
+public:
+   CheckArrayRead(const ArrayCheckSet& indirect_arrays,
+                  const ArrayCheckSet& direct_arrays):
+       last_indirect_array_write(indirect_arrays),
+       last_direct_array_write(direct_arrays)
+   {
+   }
+
+   void visit(const LocalArrayValue& value) override {
+      int array_base = value.array().base_sel();
+      auto entry = std::make_pair(array_base, value.chan());
+
+      if (last_indirect_array_write.find(entry) !=
+          last_indirect_array_write.end())
+         need_extra_group = true;
+
+      if (value.addr() && last_direct_array_write.find(entry) !=
+          last_indirect_array_write.end()) {
+         need_extra_group = true;
+      }
+   }
+
+   const ArrayCheckSet& last_indirect_array_write;
+   const ArrayCheckSet& last_direct_array_write;
+   bool need_extra_group {false};
+};
+
+
+bool BlockScheduler::check_array_reads(const AluInstr& instr)
+{
+   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {
+
+      CheckArrayRead visitor(m_last_indirect_array_write,
+                             m_last_direct_array_write);
+
+      for (auto& s : instr.sources()) {
+         s->accept(visitor);
+      }
+      return visitor.need_extra_group;
+   }
+   return false;
+}
+
+bool BlockScheduler::check_array_reads(const AluGroup& group)
+{
+   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {
+
+      CheckArrayRead visitor(m_last_indirect_array_write,
+                             m_last_direct_array_write);
+
+      for (auto alu : group) {
+         for (auto& s : alu->sources()) {
+            s->accept(visitor);
+         }
+      }
+      return visitor.need_extra_group;
+   }
+   return false;
+}
+
+
 } // namespace r600
diff --git a/src/gallium/drivers/r600/sfn/sfn_virtualvalues.h b/src/gallium/drivers/r600/sfn/sfn_virtualvalues.h
index 009d6418dd8..ccf3c565b33 100644
--- a/src/gallium/drivers/r600/sfn/sfn_virtualvalues.h
+++ b/src/gallium/drivers/r600/sfn/sfn_virtualvalues.h
@@ -453,6 +453,8 @@ public:
    Values::const_iterator begin() const { return m_values.begin(); }
    Values::const_iterator end() const { return m_values.end(); }
 
+   uint32_t base_sel() const { return m_base_sel;}
+
 private:
    uint32_t m_base_sel;
    uint32_t m_nchannels;
diff --git a/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp b/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp
index daa5aaec98c..6ad5dd9491f 100644
--- a/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp
+++ b/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp
@@ -471,3 +471,285 @@ BLOCK_END
    auto sh = from_string(input);
    check(schedule(sh), expect);
 }
+
+TEST_F(TestShaderFromNir, ScheduleSplitLoadAddrAndNOPAfterIndirectDest)
+{
+   const char *input =
+R"(FS
+CHIPCLASS R600
+FAMILY R600
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+  ALU MOV S1.x : KC0[0].x {WL}
+  ALU MOV A1[S1.x].x : KC0[0].y {WL}
+  ALU ADD S2.x : A1[1].x KC0[1].x {WL}
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+
+   const char *expect =
+R"(FS
+CHIPCLASS R600
+FAMILY R600
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+ALU_GROUP_BEGIN
+  ALU MOVA_INT AR : KC0[0].x {L}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU MOV A1[AR].x : KC0[0].y {WL}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU NOP __.x : {L}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU ADD S2.x@chgr : A1[1].x KC0[1].x {WL}
+ALU_GROUP_END
+BLOCK_END
+BLOCK_START
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+   auto sh = from_string(input);
+   split_address_loads(*sh);
+   optimize(*sh);
+   check(schedule(sh), expect);
+
+}
+
+TEST_F(TestShaderFromNir, ScheduleSplitLoadAddrAndNOPBeforIndirectSrc)
+{
+   const char *input =
+R"(FS
+CHIPCLASS R600
+FAMILY R600
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+  ALU MOV S1.x : KC0[0].x {WL}
+  ALU MOV A1[0].x : KC0[0].y {WL}
+  ALU ADD S2.x : A1[S1.x].x KC0[1].x {WL}
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+
+   const char *expect =
+R"(FS
+CHIPCLASS R600
+FAMILY R600
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+ALU_GROUP_BEGIN
+  ALU MOVA_INT AR : KC0[0].x {}
+  ALU MOV A1[0].x : KC0[0].y {WL}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU NOP __.x : {L}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU ADD S2.x@chgr : A1[AR].x KC0[1].x {WL}
+ALU_GROUP_END
+BLOCK_END
+BLOCK_START
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+   auto sh = from_string(input);
+   split_address_loads(*sh);
+   optimize(*sh);
+   check(schedule(sh), expect);
+
+}
+
+
+TEST_F(TestShaderFromNir, ScheduleSplitLoadAddrAndNOPAfterIndirectDestRV670)
+{
+   const char *input =
+R"(FS
+CHIPCLASS R600
+FAMILY RV670
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+  ALU MOV S1.x : KC0[0].x {WL}
+  ALU MOV A1[S1.x].x : KC0[0].y {WL}
+  ALU ADD S2.x : A1[1].x KC0[1].x {WL}
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+
+   const char *expect =
+R"(FS
+CHIPCLASS R600
+FAMILY RV670
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+ALU_GROUP_BEGIN
+  ALU MOVA_INT AR : KC0[0].x {L}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU MOV A1[AR].x : KC0[0].y {WL}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU ADD S2.x@chgr : A1[1].x KC0[1].x {WL}
+ALU_GROUP_END
+BLOCK_END
+BLOCK_START
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+   auto sh = from_string(input);
+   split_address_loads(*sh);
+   optimize(*sh);
+   check(schedule(sh), expect);
+
+}
+
+TEST_F(TestShaderFromNir, ScheduleSplitLoadAddrAndNOPAfterIndirectDestEG)
+{
+   const char *input =
+R"(FS
+CHIPCLASS EVERGREEN
+FAMILY BARTS
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+  ALU MOV S1.x : KC0[0].x {WL}
+  ALU MOV A1[S1.x].x : KC0[0].y {WL}
+  ALU ADD S2.x : A1[1].x KC0[1].x {WL}
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+
+   const char *expect =
+R"(FS
+CHIPCLASS EVERGREEN
+FAMILY BARTS
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+ALU_GROUP_BEGIN
+  ALU MOVA_INT AR : KC0[0].x {L}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU MOV A1[AR].x : KC0[0].y {WL}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU ADD S2.x@chgr : A1[1].x KC0[1].x {WL}
+ALU_GROUP_END
+BLOCK_END
+BLOCK_START
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+   auto sh = from_string(input);
+   split_address_loads(*sh);
+   optimize(*sh);
+   check(schedule(sh), expect);
+
+}
+
+TEST_F(TestShaderFromNir, ScheduleSplitLoadAddrAndNOPAfterIndirectDestRV770)
+{
+   const char *input =
+R"(FS
+CHIPCLASS R700
+FAMILY RV770
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+  ALU MOV S1.x : KC0[0].x {WL}
+  ALU MOV A1[S1.x].x : KC0[0].y {WL}
+  ALU ADD S2.x : A1[1].x KC0[1].x {WL}
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+
+   const char *expect =
+R"(FS
+CHIPCLASS R700
+FAMILY RV770
+PROP MAX_COLOR_EXPORTS:1
+PROP COLOR_EXPORTS:1
+PROP COLOR_EXPORT_MASK:15
+PROP WRITE_ALL_COLORS:0
+OUTPUT LOC:0 NAME:1 MASK:15
+ARRAYS A1[2].x
+SHADER
+BLOCK_START
+ALU_GROUP_BEGIN
+  ALU MOVA_INT AR : KC0[0].x {L}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU MOV A1[AR].x : KC0[0].y {WL}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU NOP __.x : {L}
+ALU_GROUP_END
+ALU_GROUP_BEGIN
+  ALU ADD S2.x@chgr : A1[1].x KC0[1].x {WL}
+ALU_GROUP_END
+BLOCK_END
+BLOCK_START
+  EXPORT_DONE PIXEL 0 S2.xxxx
+BLOCK_END)";
+
+   auto sh = from_string(input);
+   split_address_loads(*sh);
+   optimize(*sh);
+   check(schedule(sh), expect);
+
+}
+
+
+
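Below is a minimal standalone sketch (not part of the patch) of the bookkeeping the scheduler relies on: remember which (array base, channel) pairs were last written through the address register, then check later reads against that set and emit a NOP group on a hit. It reuses the ArrayChanHash from sfn_scheduler.cpp; main(), the literal base_sel/chan values, and the need_nop flag are illustrative assumptions only.

// Sketch of the (array base, channel) hazard bookkeeping used by the patch.
// main(), the literal values and `need_nop` are illustrative only.
#include <cstddef>
#include <functional>
#include <unordered_set>
#include <utility>

struct ArrayChanHash {
   std::size_t operator()(std::pair<int, int> const& s) const noexcept
   {
      // Same mixing as in sfn_scheduler.cpp: the array base select is shifted
      // past the three channel bits, then hashed as one integer.
      return std::hash<size_t>{}((size_t(s.first) << 3) | s.second);
   }
};

using ArrayCheckSet = std::unordered_set<std::pair<int, int>, ArrayChanHash>;

int main()
{
   ArrayCheckSet last_indirect_array_write;

   // An indirect write such as "ALU MOV A1[AR].x" records (base_sel, chan).
   last_indirect_array_write.insert({1, 0});

   // A later read of A1[...].x hits the set, so the scheduler would emit an
   // extra group containing a NOP before scheduling the reading instruction.
   bool need_nop = last_indirect_array_write.count({1, 0}) != 0;
   return need_nop ? 0 : 1;
}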