diff --git a/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp b/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp
index d11ca91816d..e040ef5e6c2 100644
--- a/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp
@@ -468,6 +468,40 @@ bool AluInstr::do_replace_source(PRegister old_src, PVirtualValue new_src)
    return process;
 }
 
+/* Replace source i by new_src and update the modifiers of that source:
+ * the bits given in to_set are added first, then the bits given in
+ * to_clear are removed. Source i must currently be a register (asserted).
+ * The instruction is dropped from the uses of the old register and, if
+ * new_src is a register as well, added to the uses of the new one.
+ * Returns false, before source or modifiers are changed, if
+ * can_replace_source() rejects the replacement, and true otherwise. */
+bool AluInstr::replace_src(int i, PVirtualValue new_src, uint32_t to_set,
+                           SourceMod to_clear)
+{
+   auto old_src = m_src[i]->as_register();
+   assert(old_src);
+
+   if (!can_replace_source(old_src, new_src)) {
+      std::cerr << "Can't replace src " << *old_src << " with " << *new_src << "\n";
+      return false;
+   }
+
+   old_src->del_use(this);
+
+   m_src[i] = new_src;
+
+   auto r = new_src->as_register();
+   if (r)
+      r->add_use(this);
+
+   // The modifiers of source i are stored shifted by 2 * i bits
+   m_source_modifiers |= to_set << (2 * i);
+   m_source_modifiers &= ~(to_clear << (2 * i));
+
+   return true;
+}
+
+
 bool AluInstr::can_replace_source(PRegister old_src, PVirtualValue new_src)
 {
    if (!check_readport_validation(old_src, new_src))
diff --git a/src/gallium/drivers/r600/sfn/sfn_instr_alu.h b/src/gallium/drivers/r600/sfn/sfn_instr_alu.h
index 8d6bca32b8b..a3310b5afb7 100644
--- a/src/gallium/drivers/r600/sfn/sfn_instr_alu.h
+++ b/src/gallium/drivers/r600/sfn/sfn_instr_alu.h
@@ -199,6 +199,11 @@ public:
    void inc_ar_uses() { ++m_num_ar_uses;}
    auto num_ar_uses() const {return m_num_ar_uses;}
 
+   /* Replace source i by new_src; to_set is OR-ed into and to_clear
+    * removed from the modifiers of that source. */
+   bool replace_src(int i, PVirtualValue new_src, uint32_t to_set,
+                    SourceMod to_clear);
+
    void set_source_mod(int src, SourceMod mod) {
       m_source_modifiers |= mod << (2 * src);
    }
diff --git a/src/gallium/drivers/r600/sfn/sfn_peephole.cpp b/src/gallium/drivers/r600/sfn/sfn_peephole.cpp
index dc4ef59e64a..a8cae1d5bc9 100644
--- a/src/gallium/drivers/r600/sfn/sfn_peephole.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_peephole.cpp
@@ -50,6 +50,9 @@ public:
 
    void convert_to_mov(AluInstr *alu, int src_idx);
 
+   void apply_source_mods(AluInstr *alu);
+   void apply_dest_clamp(AluInstr *alu);
+
    bool progress{false};
 };
 
@@ -81,6 +84,10 @@ void
 PeepholeVisitor::visit(AluInstr *instr)
 {
    switch (instr->opcode()) {
+   case op1_mov:
+      if (instr->has_alu_flag(alu_dst_clamp))
+         apply_dest_clamp(instr);
+      break;
    case op2_add:
    case op2_add_int:
       if (value_is_const_uint(instr->src(0), 0))
@@ -110,8 +117,13 @@ PeepholeVisitor::visit(AluInstr *instr)
             progress |= visitor.success;
          }
       }
+      break;
    default:;
    }
+
+   auto opinfo = alu_ops.at(instr->opcode());
+   if (opinfo.can_srcmod && !opinfo.is_fp64)
+      apply_source_mods(instr);
 }
 
 void
@@ -154,6 +166,126 @@ PeepholeVisitor::visit(IfInstr *instr)
    }
 }
 
+/* Fold abs/neg modifiers of MOV instructions into alu: each source of alu
+ * that is an SSA register written by exactly one MOV whose source carries
+ * abs and/or neg (and that does not clamp) is replaced by the source of
+ * that MOV, and the modifiers of alu are updated to give the same value.
+ * Sets progress if a source was replaced. */
+void PeepholeVisitor::apply_source_mods(AluInstr *alu)
+{
+   // A MOV with abs is only folded if alu has less than three sources per
+   // slot (presumably because three-source ops cannot encode abs).
+   bool has_abs = alu->n_sources() / alu->alu_slots() < 3;
+
+   for (unsigned i = 0; i < alu->n_sources(); ++i) {
+
+      auto reg = alu->psrc(i)->as_register();
+      if (!reg)
+         continue;
+      if (!reg->has_flag(Register::ssa))
+         continue;
+      if (reg->parents().size() != 1)
+         continue;
+
+      auto p = (*reg->parents().begin())->as_alu();
+      if (!p)
+         continue;
+
+      if (p->opcode() != op1_mov)
+         continue;
+
+      if (!has_abs && p->has_source_mod(0, AluInstr::mod_abs))
+         continue;
+
+      if (!p->has_source_mod(0, AluInstr::mod_abs) &&
+          !p->has_source_mod(0, AluInstr::mod_neg))
+         continue;
+
+      if (p->has_alu_flag(alu_dst_clamp))
+         continue;
+
+      auto new_src = p->psrc(0);
+      bool new_src_not_pinned = new_src->pin() == pin_free ||
+                                new_src->pin() == pin_none;
+
+      bool old_src_not_pinned = reg->pin() == pin_free ||
+                                reg->pin() == pin_none;
+
+      bool sources_equal_channel = reg->pin() == pin_chan &&
+                                   new_src->pin() == pin_chan &&
+                                   new_src->chan() == reg->chan();
+
+      if (!new_src_not_pinned &&
+          !old_src_not_pinned &&
+          !sources_equal_channel)
+         continue;
+
+      uint32_t to_set = 0;
+      AluInstr::SourceMod to_clear = AluInstr::mod_none;
+
+      if (p->has_source_mod(0, AluInstr::mod_abs))
+         to_set |= AluInstr::mod_abs;
+
+      /* The modifiers of alu act on the result of the MOV. If alu takes
+       * abs of this source, a neg inside the MOV has no effect on the
+       * value (|-x| == |x|), so the neg bit of alu must stay untouched.
+       * Otherwise the two neg combine: they cancel if alu negates the
+       * source already, else the neg of the MOV is taken over. */
+      if (p->has_source_mod(0, AluInstr::mod_neg) &&
+          !alu->has_source_mod(i, AluInstr::mod_abs)) {
+         if (!alu->has_source_mod(i, AluInstr::mod_neg))
+            to_set |= AluInstr::mod_neg;
+         else
+            to_clear = AluInstr::mod_neg;
+      }
+
+      progress |= alu->replace_src(i, new_src, to_set, to_clear);
+   }
+}
+
+/* Called for a MOV with destination clamp: if it has no abs/neg on its
+ * source, writes an SSA register, and its source register has exactly one
+ * parent and one use, with the parent being an ALU instruction whose
+ * opcode supports clamping, move the clamp flag to that parent. */
+void PeepholeVisitor::apply_dest_clamp(AluInstr *alu)
+{
+   if (alu->has_source_mod(0, AluInstr::mod_abs) ||
+       alu->has_source_mod(0, AluInstr::mod_neg))
+      return;
+
+   auto dest = alu->dest();
+
+   assert(dest);
+
+   if (!dest->has_flag(Register::ssa))
+      return;
+
+   auto src = alu->psrc(0)->as_register();
+   if (!src)
+      return;
+
+   if (src->parents().size() != 1)
+      return;
+
+   if (src->uses().size() != 1)
+      return;
+
+   auto new_parent = (*src->parents().begin())->as_alu();
+   if (!new_parent)
+      return;
+
+   auto opinfo = alu_ops.at(new_parent->opcode());
+   if (!opinfo.can_clamp)
+      return;
+
+   // Move clamp flag to the parent, and let copy propagation do the rest
+   new_parent->set_alu_flag(alu_dst_clamp);
+   alu->reset_alu_flag(alu_dst_clamp);
+
+   progress = true;
+}
+
+
 static EAluOp
 pred_from_op(EAluOp pred_op, EAluOp op)
 {
diff --git a/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp b/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp
index 309d715faf8..ce6203c1cf5 100644
--- a/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp
+++ b/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp
@@ -369,6 +369,206 @@ BLOCK_END
    check(sh, expect);
 };
 
+TEST_F(TestShaderFromNir, PeeholeSoureModsSimple)
+{
+   const char *input =
+R"(VS
+CHIPCLASS CAYMAN
+INPUT LOC:0 NAME:15
+OUTPUT LOC:0 NAME:0 MASK:15
+OUTPUT LOC:1 NAME:5 MASK:15 SID:9 SPI_SID:10
+SHADER
+BLOCK_START
+   ALU MOV S2.x@free{s} : I[0] {WL}
+   ALU MOV S3.y@free{s} : L[0x40c00000] {WL}
+   ALU MOV S4.z@free{s} : L[0xc1140000] {WL}
+   ALU MOV S5.w@free{s} : L[0xbfe00000] {WL}
+   ALU MOV S6.x@free{s} : L[0x3fa00000] {WL}
+   ALU MOV S7.x{s} : |KC0[0].x| {W}
+   ALU MOV S7.y{s} : -KC0[0].y {W}
+   ALU MOV S7.z{s} : -|KC0[0].z| {W}
+   ALU MOV S7.w{s} : KC0[0].w {WL}
+   ALU ADD S8.y@free{s} : S3.y@free{s} S7.x{s} {WL}
+   ALU ADD S9.z@free{s} : S4.z@free{s} S7.y{s} {WL}
+   ALU ADD S10.w@free{s} : S5.w@free{s} S7.z{s} {WL}
+   ALU ADD S11.x@free{s} : S6.x@free{s} S7.w{s} {WL}
+   ALU EXP_IEEE S12.y@free{s} : S8.y@free{s} + S8.y@free{s} + S8.y@free{s} {WL}
+   ALU EXP_IEEE S13.z@free{s} : S9.z@free{s} + S9.z@free{s} + S9.z@free{s} {WL}
+   ALU EXP_IEEE S14.x@free{s} : S10.w@free{s} + S10.w@free{s} + S10.w@free{s} {WL}
+   ALU EXP_IEEE S15.y@free{s} : S11.x@free{s} + S11.x@free{s} + S11.x@free{s} {WL}
+   ALU MOV S17.x{s} : S12.y@free{s} {W}
+   ALU MOV S17.y{s} : S13.z@free{s} {W}
+   ALU MOV S17.z{s} : S14.x@free{s} {W}
+   ALU MOV S17.w{s} : S15.y@free{s} {WL}
+   ALU MOV S18.x@group{s} : S17.x{s} {W}
+   ALU MOV S18.y@group{s} : S17.y{s} {W}
+   ALU MOV S18.z@group{s} : S17.z{s} {W}
+   ALU MOV S18.w@group{s} : S17.w{s} {WL}
+   EXPORT_DONE PARAM 0 S18.xyzw
+BLOCK_END)";
+
+   const char *expect =
+R"(VS
+CHIPCLASS CAYMAN
+INPUT LOC:0 NAME:15
+OUTPUT LOC:0 NAME:0 MASK:15
+OUTPUT LOC:1 NAME:5 MASK:15 SID:9 SPI_SID:10
+SHADER
+BLOCK_START
+   ALU ADD S8.y@free{s} : L[0x40c00000] |KC0[0].x| {WL}
+   ALU ADD S9.z@free{s} : L[0xc1140000] -KC0[0].y {WL}
+   ALU ADD S10.w@free{s} : L[0xbfe00000] -|KC0[0].z| {WL}
+   ALU ADD S11.x@free{s} : L[0x3fa00000] KC0[0].w {WL}
+   ALU EXP_IEEE S18.x@group{s} : S8.y@free{s} + S8.y@free{s} + S8.y@free{s} {W}
+   ALU EXP_IEEE S18.y@group{s} : S9.z@free{s} + S9.z@free{s} + S9.z@free{s} {W}
+   ALU EXP_IEEE S18.z@group{s} : S10.w@free{s} + S10.w@free{s} + S10.w@free{s} {W}
+   ALU EXP_IEEE S18.w@group{s} : S11.x@free{s} + S11.x@free{s} + S11.x@free{s} + S11.x@free{s} {WL}
+   EXPORT_DONE PARAM 0 S18.xyzw
+BLOCK_END
+)";
+   auto sh = from_string(input);
+   optimize(*sh);
+   check(sh, expect);
+};
+
+TEST_F(TestShaderFromNir, PeeholeSoureModsAbsNegTwice)
+{
+   const char *input =
+R"(VS
+CHIPCLASS CAYMAN
+INPUT LOC:0 NAME:15
+OUTPUT LOC:0 NAME:0 MASK:15
+OUTPUT LOC:1 NAME:5 MASK:15 SID:9 SPI_SID:10
+SHADER
+BLOCK_START
+   ALU MOV S2.x@free{s} : I[0] {WL}
+   ALU MOV S3.y@free{s} : L[0x40c00000] {WL}
+   ALU MOV S4.z@free{s} : L[0xc1140000] {WL}
+   ALU MOV S5.w@free{s} : L[0xbfe00000] {WL}
+   ALU MOV S6.x@free{s} : L[0x3fa00000] {WL}
+   ALU MOV S7.x{s} : |KC0[0].x| {W}
+   ALU MOV S7.y{s} : -KC0[0].y {W}
+   ALU MOV S7.z{s} : -|KC0[0].z| {W}
+   ALU MOV S7.w{s} : KC0[0].w {WL}
+   ALU MOV S8.x : |S7.x| {W}
+   ALU MOV S8.y : -S7.y {W}
+   ALU MOV S8.z : -|S7.z| {W}
+   ALU MOV S8.w : -|S7.x| {WL}
+   ALU ADD S19.y@free{s} : S3.y@free{s} S8.x {WL}
+   ALU ADD S9.z@free{s} : S4.z@free{s} S8.y {WL}
+   ALU ADD S10.w@free{s} : S5.w@free{s} S8.z {WL}
+   ALU ADD S11.x@free{s} : S6.x@free{s} S8.w {WL}
+   ALU EXP_IEEE S12.y@free{s} : S19.y@free{s} + S19.y@free{s} + S19.y@free{s} {WL}
+   ALU EXP_IEEE S13.z@free{s} : S9.z@free{s} + S9.z@free{s} + S9.z@free{s} {WL}
+   ALU EXP_IEEE S14.x@free{s} : S10.w@free{s} + S10.w@free{s} + S10.w@free{s} {WL}
+   ALU EXP_IEEE S15.y@free{s} : S11.x@free{s} + S11.x@free{s} + S11.x@free{s} {WL}
+   ALU MOV S17.x{s} : S12.y@free{s} {W}
+   ALU MOV S17.y{s} : S13.z@free{s} {W}
+   ALU MOV S17.z{s} : S14.x@free{s} {W}
+   ALU MOV S17.w{s} : S15.y@free{s} {WL}
+   ALU MOV S18.x@group{s} : S17.x{s} {W}
+   ALU MOV S18.y@group{s} : S17.y{s} {W}
+   ALU MOV S18.z@group{s} : S17.z{s} {W}
+   ALU MOV S18.w@group{s} : S17.w{s} {WL}
+   EXPORT_DONE PARAM 0 S18.xyzw
+BLOCK_END)";
+
+   const char *expect =
+R"(VS
+CHIPCLASS CAYMAN
+INPUT LOC:0 NAME:15
+OUTPUT LOC:0 NAME:0 MASK:15
+OUTPUT LOC:1 NAME:5 MASK:15 SID:9 SPI_SID:10
+SHADER
+BLOCK_START
+   ALU ADD S19.y@free{s} : L[0x40c00000] |KC0[0].x| {WL}
+   ALU ADD S9.z@free{s} : L[0xc1140000] KC0[0].y {WL}
+   ALU ADD S10.w@free{s} : L[0xbfe00000] -|KC0[0].z| {WL}
+   ALU ADD S11.x@free{s} : L[0x3fa00000] -|KC0[0].x| {WL}
+   ALU EXP_IEEE S18.x@group{s} : S19.y@free{s} + S19.y@free{s} + S19.y@free{s} {W}
+   ALU EXP_IEEE S18.y@group{s} : S9.z@free{s} + S9.z@free{s} + S9.z@free{s} {W}
+   ALU EXP_IEEE S18.z@group{s} : S10.w@free{s} + S10.w@free{s} + S10.w@free{s} {W}
+   ALU EXP_IEEE S18.w@group{s} : S11.x@free{s} + S11.x@free{s} + S11.x@free{s} + S11.x@free{s} {WL}
+   EXPORT_DONE PARAM 0 S18.xyzw
+BLOCK_END
+)";
+   auto sh = from_string(input);
+   optimize(*sh);
+   check(sh, expect);
+};
+
+TEST_F(TestShaderFromNir, PeeholeSoureModsClamp)
+{
+   const char *input =
+R"(VS
+CHIPCLASS CAYMAN
+INPUT LOC:0 NAME:15
+OUTPUT LOC:0 NAME:0 MASK:15
+SHADER
+BLOCK_START
+   ALU MOV S1.x{s} : |KC0[0].x| {W}
+   ALU MOV S2.y{s} : -KC0[0].y {W}
+   ALU ADD S3.x : S1.x S2.y {W}
+   ALU MOV CLAMP S4.x : S3.x {W}
+   EXPORT_DONE PARAM 0 S4.xxxx
+BLOCK_END)";
+
+   const char *expect =
+R"(VS
+CHIPCLASS CAYMAN
+INPUT LOC:0 NAME:15
+OUTPUT LOC:0 NAME:0 MASK:15
+SHADER
+BLOCK_START
+   ALU ADD CLAMP S3.x : |KC0[0].x| -KC0[0].y {W}
+   EXPORT_DONE PARAM 0 S3.xxxx
+BLOCK_END
+)";
+   auto sh = from_string(input);
+   optimize(*sh);
+   check(sh, expect);
+};
+
+TEST_F(TestShaderFromNir, PeeholeSoureModsMuliSlot)
+{
+   const char *input =
+R"(VS
+CHIPCLASS CAYMAN
+INPUT LOC:0 NAME:15
+OUTPUT LOC:0 NAME:0 MASK:15
+REGISTERS R1.xyzw
+SHADER
+BLOCK_START
+   ALU MOV S1.x{s} : |KC0[0].x| {W}
+   ALU MOV S1.y{s} : -KC0[0].y {W}
+   ALU MOV S1.z{s} : |KC0[0].z| {W}
+   ALU MOV S1.w{s} : KC0[0].w {W}
+   ALU MOV S2.x{s} : |R1.x| {W}
+   ALU MOV S2.y{s} : R1.y {W}
+   ALU MOV S2.z{s} : -R1.z {W}
+   ALU MOV S2.w{s} : -R1.w {W}
+   ALU DOT4 S5.x : S1.x S2.x + S1.y S2.y + S1.z S2.z + S1.w S2.w {W}
+   EXPORT_DONE PARAM 0 S5.xxxx
+BLOCK_END)";
+
+   const char *expect =
+R"(VS
+CHIPCLASS CAYMAN
+INPUT LOC:0 NAME:15
+OUTPUT LOC:0 NAME:0 MASK:15
+REGISTERS R1.xyzw
+SHADER
+BLOCK_START
+   ALU DOT4 S5.x : |KC0[0].x| |R1.x| + -KC0[0].y R1.y + |KC0[0].z| -R1.z + KC0[0].w -R1.w {W}
+   EXPORT_DONE PARAM 0 S5.xxxx
+BLOCK_END
+)";
+   auto sh = from_string(input);
+   optimize(*sh);
+   check(sh, expect);
+};
+
+
 TEST_F(TestShaderFromNir, OptimizeIntoGroup)
 {
    const char *input =