From 0e793a6cef3a653a3a40dcf8ff98e7d19dc74c5c Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Mon, 8 Sep 2025 17:05:32 +0200 Subject: [PATCH] r600/sfn: Wire up some omod optimizations Signed-off-by: Gert Wollny Part-of: --- src/gallium/drivers/r600/sfn/sfn_peephole.cpp | 99 ++++++++++++++++++- .../r600/sfn/tests/sfn_optimizer_test.cpp | 31 ++++++ 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/sfn/sfn_peephole.cpp b/src/gallium/drivers/r600/sfn/sfn_peephole.cpp index 3156078b448..e6bfb5e307a 100644 --- a/src/gallium/drivers/r600/sfn/sfn_peephole.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_peephole.cpp @@ -34,6 +34,11 @@ public: void apply_source_mods(AluInstr *alu); void apply_dest_clamp(AluInstr *alu); void try_fuse_with_prev(AluInstr *alu); + bool try_omod(AluInstr *alu); /* Tries to replace a multiply by a constant factor with an output modifier set on the instruction that produces the other operand; returns true when the instruction was rewritten. */ + + using OModInfo = std::pair; /* NOTE(review): the template arguments of std::pair appear to be missing in this copy of the patch; from the uses below the pair holds an AluInstr output modifier value and an int source index -- confirm against the upstream commit. */ + + OModInfo get_omod(AluInstr *alu) const; /* Scans sources 0 and 1 for a constant 2.0f, 4.0f or 0.5f and returns the matching modifier plus the index of the other source, or (AluInstr::omod_none, -1) when none is found. */ bool progress{false}; }; @@ -65,6 +70,8 @@ public: void PeepholeVisitor::visit(AluInstr *instr) { + const auto opinfo = alu_ops.at(instr->opcode()); /* Now fetched before the switch, so can_srcmod below is taken from the opcode the instruction had on entry. NOTE(review): helpers called from the switch such as convert_to_mov may alter the instruction afterwards -- confirm this ordering is intended. */ + switch (instr->opcode()) { case op1_mov: if (instr->has_alu_flag(alu_dst_clamp)) @@ -86,6 +93,10 @@ PeepholeVisitor::visit(AluInstr *instr) convert_to_mov(instr, 1); else if (value_is_const_float(instr->src(1), 1.0f)) convert_to_mov(instr, 0); + else { + if (try_omod(instr)) + return; /* folded into the producer: leave before the source modifier pass at the end of this function */ + } break; case op3_muladd: case op3_muladd_ieee: @@ -106,9 +117,93 @@ PeepholeVisitor::visit(AluInstr *instr) default:; } - auto opinfo = alu_ops.at(instr->opcode()); if (opinfo.can_srcmod) - apply_source_mods(instr); + apply_source_mods(instr); +} + +bool +PeepholeVisitor::try_omod(AluInstr *alu) /* Returns true only if the producer of the non-constant operand took over the scaling and alu was handed to convert_to_mov. */ +{ + auto [omod, src_idx] = get_omod(alu); /* src_idx is the index of the operand that is not the constant factor, -1 if there is no such factor */ + + if (src_idx == -1) + return false; + + auto reg = alu->src(src_idx).as_register(); + if (!reg || !reg->has_flag(Register::ssa) || reg->uses().size() != 1 || + reg->parents().empty()) + return false; /* requires an SSA register with exactly one use and at least one parent instruction */ + + assert(reg->parents().size() == 1); + + auto p = *reg->parents().begin(); + + auto parent = p->as_alu(); + 
if (!parent) + return false; + + if (parent->has_alu_flag(alu_dst_clamp)) + return false; /* NOTE(review): presumably a clamp on the producer would have to be applied before the scaling, which a modifier on that same instruction cannot express -- confirm against the ISA documentation. */ + + if (parent->output_modifier() != AluInstr::omod_none) + return false; /* the producer already has an output modifier set */ + + switch (parent->opcode()) { /* list of producer opcodes for which the fold is attempted; anything else is rejected by the default case */ + case op1_cos: + case op1_exp_ieee: + case op1_log_clamped: + case op1_log_ieee: + case op1_max4: + case op1_recip_clamped: + case op1_recip_ieee: + case op1_recipsqrt_clamped: + case op1_recipsqrt_ieee1: + case op1_sin: + case op1_sqrt_ieee: + case op2_add: + case op2_dot4: + case op2_dot4_ieee: + case op2_dot: + case op2_dot_ieee: + case op2_max: + case op2_min: + case op2_mul: + case op2_mul_ieee: + break; + default: + return false; + } + + parent->set_output_modifier(omod); /* the producer now applies the scaling */ + if (alu->has_alu_flag(alu_dst_clamp)) { + parent->set_alu_flag(alu_dst_clamp); + alu->reset_alu_flag(alu_dst_clamp); /* a destination clamp on the multiply is moved to the producer */ + } + + convert_to_mov(alu, src_idx); /* NOTE(review): presumably rewrites alu into a move of source src_idx -- helper is defined outside this patch */ + progress = true; + + return true; +} + +PeepholeVisitor::OModInfo +PeepholeVisitor::get_omod(AluInstr *alu) const +{ + const std::array, 3> mod_table = { /* NOTE(review): template arguments of std::array appear stripped here too; each entry is used as (constant factor, output modifier) */ + {{2.0f, AluInstr::omod_mul2}, + {4.0f, AluInstr::omod_mul4}, + {0.5f, AluInstr::omod_div2}} + }; + + for (int i = 0; i < 2; ++i) { /* only the first two sources are inspected */ + for (int k = 0; k < 3; ++k) { + if (value_is_const_float(alu->src(i), mod_table[k].first)) + return std::make_pair(mod_table[k].second, 1 - i); /* 1 - i is the index of the other operand */ + } + } + + return std::make_pair(AluInstr::omod_none, -1); } void diff --git a/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp b/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp index 1718217a132..5cfcc32d447 100644 --- a/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp +++ b/src/gallium/drivers/r600/sfn/tests/sfn_optimizer_test.cpp @@ -772,6 +772,37 @@ BLOCK_END check(sh, expect); }; +TEST_F(TestShaderFromNir, OptimizeOmodAdd_d2) /* Input: an ADD into S4.x followed by a MUL of S4.x with literal 0x3f000000 (the bit pattern of 0.5f). Expected after optimize: a single ADD printed with D2 (presumably the text form of omod_div2) and the export reading S4.xxxx instead of S5.yyyy. */ +{ + const char *input = + R"(VS +CHIPCLASS CAYMAN +INPUT LOC:0 +OUTPUT LOC:0 VARYING_SLOT:0 MASK:15 +REGISTERS R0.x R1.x R2.x +SHADER +BLOCK_START + ALU ADD S4.x : R0.x R1.x {W} + ALU MUL S5.y : S4.x L[0x3f000000] {W} + EXPORT_DONE PARAM 0 S5.yyyy +BLOCK_END)"; 
+ + const char *expect = + R"(VS +CHIPCLASS CAYMAN +INPUT LOC:0 +OUTPUT LOC:0 VARYING_SLOT:0 MASK:15 +REGISTERS R0.x R1.x R2.x +SHADER +BLOCK_START + ALU ADD D2 S4.x : R0.x R1.x {W} + EXPORT_DONE PARAM 0 S4.xxxx +BLOCK_END +)"; + auto sh = from_string(input); + optimize(*sh); + check(sh, expect); +}; TEST_F(TestShaderFromNir, OptimizeIntoGroup) {