r600/sfn: Implement source mod optimization in backend

Signed-off-by: Gert Wollny <gert.wollny@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23702>
This commit is contained in:
Gert Wollny 2023-06-16 17:52:36 +02:00 committed by Marge Bot
parent da92733d5a
commit ae7d904a73
4 changed files with 344 additions and 0 deletions

View file

@ -468,6 +468,33 @@ bool AluInstr::do_replace_source(PRegister old_src, PVirtualValue new_src)
return process;
}
/**
 * Replace source slot \p i with \p new_src and adjust that slot's modifiers.
 *
 * The current source in slot \p i must be a register (asserted). The
 * replacement is only carried out when can_replace_source() accepts it.
 *
 * \param i         index of the source slot to rewrite
 * \param new_src   value that takes the place of the old register
 * \param to_set    modifier bits OR-ed into the two-bit field of slot \p i
 * \param to_clear  modifier bits removed from the field of slot \p i; the
 *                  clear is applied after the set, so it wins on overlap
 * \return true if the source was replaced, false if the replacement was
 *         rejected (the instruction is left untouched in that case)
 */
bool
AluInstr::replace_src(int i, PVirtualValue new_src, uint32_t to_set,
                      SourceMod to_clear)
{
   auto old_src = m_src[i]->as_register();
   assert(old_src);

   // A rejected replacement is a regular outcome that is reported through
   // the return value; it is not worth a message on stderr.
   if (!can_replace_source(old_src, new_src))
      return false;

   // NOTE(review): the use is dropped even if old_src is still referenced by
   // another source slot of this instruction — confirm callers replace all
   // such slots or that del_use() tolerates this.
   old_src->del_use(this);

   m_src[i] = new_src;

   // Only registers track their users.
   if (auto new_reg = new_src->as_register())
      new_reg->add_use(this);

   m_source_modifiers |= to_set << (2 * i);
   m_source_modifiers &= ~(to_clear << (2 * i));

   return true;
}
bool AluInstr::can_replace_source(PRegister old_src, PVirtualValue new_src)
{
if (!check_readport_validation(old_src, new_src))

View file

@ -199,6 +199,9 @@ public:
// Bump the AR use counter (m_num_ar_uses) by one.
void inc_ar_uses()
{
   m_num_ar_uses += 1;
}
// Current value of the AR use counter (m_num_ar_uses).
auto num_ar_uses() const
{
   return m_num_ar_uses;
}
// Replace source slot i with new_src. The bits in to_set are OR-ed into the
// modifier field of that slot and the bits in to_clear are removed from it.
// Returns false, leaving the instruction unchanged, when the replacement is
// not permitted.
bool replace_src(int i, PVirtualValue new_src, uint32_t to_set,
SourceMod to_clear);
// OR the bits of `mod` into the modifier field of source `src`; every source
// owns two consecutive bits of m_source_modifiers.
void set_source_mod(int src, SourceMod mod)
{
   const int bit_offset = 2 * src;
   m_source_modifiers |= mod << bit_offset;
}

View file

@ -50,6 +50,9 @@ public:
// NOTE(review): implementation not visible in this hunk — presumably rewrites
// `alu` into a MOV of its source `src_idx`; confirm against the definition.
void convert_to_mov(AluInstr *alu, int src_idx);
// Fold abs/neg source modifiers of a single-parent MOV into the sources of
// `alu`, replacing the intermediate register by the MOV's own source.
void apply_source_mods(AluInstr *alu);
// For a clamping MOV without source modifiers: move the clamp flag onto the
// ALU instruction that produces the MOV's source, if that op can clamp.
void apply_dest_clamp(AluInstr *alu);
// Set to true as soon as any peephole transformation changed an instruction.
bool progress{false};
};
@ -81,6 +84,10 @@ void
PeepholeVisitor::visit(AluInstr *instr)
{
switch (instr->opcode()) {
case op1_mov:
if (instr->has_alu_flag(alu_dst_clamp))
apply_dest_clamp(instr);
break;
case op2_add:
case op2_add_int:
if (value_is_const_uint(instr->src(0), 0))
@ -110,8 +117,13 @@ PeepholeVisitor::visit(AluInstr *instr)
progress |= visitor.success;
}
}
break;
default:;
}
auto opinfo = alu_ops.at(instr->opcode());
if (opinfo.can_srcmod && !opinfo.is_fp64)
apply_source_mods(instr);
}
void
@ -154,6 +166,108 @@ PeepholeVisitor::visit(IfInstr *instr)
}
}
/**
 * Fold the abs/neg source modifiers of a preceding MOV into \p alu.
 *
 * A source of \p alu qualifies when it is an SSA register with exactly one
 * parent, that parent is an ALU MOV without destination clamp, and the MOV
 * carries an abs and/or neg modifier on its source. The register is then
 * replaced by the MOV's own source and the modifiers are merged with the
 * ones \p alu already applies to that slot.
 *
 * The merge assumes that neg is applied after abs (the IR prints this
 * as -|x|):
 *  - consumer has abs: |(+-|x|)| == |x|, so the consumer's modifiers are
 *    already the final result and the parent's modifiers are dropped;
 *  - otherwise: the parent's abs is taken over and a parent neg toggles the
 *    consumer's neg.
 *
 * `progress` is raised when at least one source was replaced.
 */
void
PeepholeVisitor::apply_source_mods(AluInstr *alu)
{
   // Instructions with three sources per slot are not given an abs modifier.
   const bool has_abs = alu->n_sources() / alu->alu_slots() < 3;

   for (unsigned i = 0; i < alu->n_sources(); ++i) {

      auto reg = alu->psrc(i)->as_register();
      if (!reg)
         continue;
      if (!reg->has_flag(Register::ssa))
         continue;
      if (reg->parents().size() != 1)
         continue;

      auto p = (*reg->parents().begin())->as_alu();
      if (!p)
         continue;

      if (p->opcode() != op1_mov)
         continue;

      const bool parent_abs = p->has_source_mod(0, AluInstr::mod_abs);
      const bool parent_neg = p->has_source_mod(0, AluInstr::mod_neg);

      if (!has_abs && parent_abs)
         continue;

      // A MOV without modifiers is not handled here.
      if (!parent_abs && !parent_neg)
         continue;

      // The clamp is applied to the MOV result and can't be expressed as a
      // source modifier.
      if (p->has_alu_flag(alu_dst_clamp))
         continue;

      auto new_src = p->psrc(0);
      const bool new_src_not_pinned = new_src->pin() == pin_free ||
                                      new_src->pin() == pin_none;

      const bool old_src_not_pinned = reg->pin() == pin_free ||
                                      reg->pin() == pin_none;

      const bool sources_equal_channel = reg->pin() == pin_chan &&
                                         new_src->pin() == pin_chan &&
                                         new_src->chan() == reg->chan();

      if (!new_src_not_pinned &&
          !old_src_not_pinned &&
          !sources_equal_channel)
         continue;

      uint32_t to_set = 0;
      AluInstr::SourceMod to_clear = AluInstr::mod_none;

      // If the consumer already takes the absolute value, the sign the parent
      // produced is irrelevant and the consumer's own neg must be kept as is.
      if (!alu->has_source_mod(i, AluInstr::mod_abs)) {
         if (parent_abs)
            to_set |= AluInstr::mod_abs;

         if (parent_neg) {
            // neg(neg(x)) == x: a second negation cancels the first one.
            if (!alu->has_source_mod(i, AluInstr::mod_neg))
               to_set |= AluInstr::mod_neg;
            else
               to_clear = AluInstr::mod_neg;
         }
      }

      progress |= alu->replace_src(i, new_src, to_set, to_clear);
   }
}
/**
 * Try to push the destination clamp of the MOV \p alu up to the instruction
 * that computes its source.
 *
 * This is done only if the MOV has no abs/neg source modifier, writes an SSA
 * destination, and reads a register that has exactly one parent and exactly
 * one use, where that parent is an ALU instruction whose opcode supports
 * clamping. On success `progress` is set.
 */
void
PeepholeVisitor::apply_dest_clamp(AluInstr *alu)
{
   const bool src_has_abs = alu->has_source_mod(0, AluInstr::mod_abs);
   if (src_has_abs || alu->has_source_mod(0, AluInstr::mod_neg))
      return;

   auto mov_dest = alu->dest();
   assert(mov_dest);

   if (!mov_dest->has_flag(Register::ssa))
      return;

   auto mov_src = alu->psrc(0)->as_register();
   if (!mov_src)
      return;

   // The source must be written once and read only by this MOV.
   if (mov_src->parents().size() != 1 || mov_src->uses().size() != 1)
      return;

   auto producer = (*mov_src->parents().begin())->as_alu();
   if (!producer)
      return;

   auto producer_info = alu_ops.at(producer->opcode());
   if (!producer_info.can_clamp)
      return;

   // Hand the clamp over to the producer; copy propagation removes the
   // now plain MOV later on.
   producer->set_alu_flag(alu_dst_clamp);
   alu->reset_alu_flag(alu_dst_clamp);

   progress = true;
}
static EAluOp
pred_from_op(EAluOp pred_op, EAluOp op)
{

View file

@ -369,6 +369,206 @@ BLOCK_END
check(sh, expect);
};
// MOVs that only apply abs/neg to a constant-buffer value are expected to be
// folded into the ADDs reading them, with the literal MOVs propagated too.
TEST_F(TestShaderFromNir, PeeholeSoureModsSimple)
{
   // Shader before optimization.
   const char *shader_text =
R"(VS
CHIPCLASS CAYMAN
INPUT LOC:0 NAME:15
OUTPUT LOC:0 NAME:0 MASK:15
OUTPUT LOC:1 NAME:5 MASK:15 SID:9 SPI_SID:10
SHADER
BLOCK_START
ALU MOV S2.x@free{s} : I[0] {WL}
ALU MOV S3.y@free{s} : L[0x40c00000] {WL}
ALU MOV S4.z@free{s} : L[0xc1140000] {WL}
ALU MOV S5.w@free{s} : L[0xbfe00000] {WL}
ALU MOV S6.x@free{s} : L[0x3fa00000] {WL}
ALU MOV S7.x{s} : |KC0[0].x| {W}
ALU MOV S7.y{s} : -KC0[0].y {W}
ALU MOV S7.z{s} : -|KC0[0].z| {W}
ALU MOV S7.w{s} : KC0[0].w {WL}
ALU ADD S8.y@free{s} : S3.y@free{s} S7.x{s} {WL}
ALU ADD S9.z@free{s} : S4.z@free{s} S7.y{s} {WL}
ALU ADD S10.w@free{s} : S5.w@free{s} S7.z{s} {WL}
ALU ADD S11.x@free{s} : S6.x@free{s} S7.w{s} {WL}
ALU EXP_IEEE S12.y@free{s} : S8.y@free{s} + S8.y@free{s} + S8.y@free{s} {WL}
ALU EXP_IEEE S13.z@free{s} : S9.z@free{s} + S9.z@free{s} + S9.z@free{s} {WL}
ALU EXP_IEEE S14.x@free{s} : S10.w@free{s} + S10.w@free{s} + S10.w@free{s} {WL}
ALU EXP_IEEE S15.y@free{s} : S11.x@free{s} + S11.x@free{s} + S11.x@free{s} {WL}
ALU MOV S17.x{s} : S12.y@free{s} {W}
ALU MOV S17.y{s} : S13.z@free{s} {W}
ALU MOV S17.z{s} : S14.x@free{s} {W}
ALU MOV S17.w{s} : S15.y@free{s} {WL}
ALU MOV S18.x@group{s} : S17.x{s} {W}
ALU MOV S18.y@group{s} : S17.y{s} {W}
ALU MOV S18.z@group{s} : S17.z{s} {W}
ALU MOV S18.w@group{s} : S17.w{s} {WL}
EXPORT_DONE PARAM 0 S18.xyzw
BLOCK_END)";
   // Shader the optimizer is expected to produce.
   const char *expected_text =
R"(VS
CHIPCLASS CAYMAN
INPUT LOC:0 NAME:15
OUTPUT LOC:0 NAME:0 MASK:15
OUTPUT LOC:1 NAME:5 MASK:15 SID:9 SPI_SID:10
SHADER
BLOCK_START
ALU ADD S8.y@free{s} : L[0x40c00000] |KC0[0].x| {WL}
ALU ADD S9.z@free{s} : L[0xc1140000] -KC0[0].y {WL}
ALU ADD S10.w@free{s} : L[0xbfe00000] -|KC0[0].z| {WL}
ALU ADD S11.x@free{s} : L[0x3fa00000] KC0[0].w {WL}
ALU EXP_IEEE S18.x@group{s} : S8.y@free{s} + S8.y@free{s} + S8.y@free{s} {W}
ALU EXP_IEEE S18.y@group{s} : S9.z@free{s} + S9.z@free{s} + S9.z@free{s} {W}
ALU EXP_IEEE S18.z@group{s} : S10.w@free{s} + S10.w@free{s} + S10.w@free{s} {W}
ALU EXP_IEEE S18.w@group{s} : S11.x@free{s} + S11.x@free{s} + S11.x@free{s} + S11.x@free{s} {WL}
EXPORT_DONE PARAM 0 S18.xyzw
BLOCK_END
)";
   auto shader = from_string(shader_text);
   optimize(*shader);
   check(shader, expected_text);
};
// Two chained MOVs, each applying abs/neg, feed the ADDs. With neg applied
// after abs (printed as -|x|), the chains must fold to:
//   S8.x = | |KC0.x| |   ->  |KC0.x|
//   S8.y = -(-KC0.y)     ->  KC0.y
//   S8.z = -|(-|KC0.z|)| -> -|KC0.z|   (the outer abs swallows the inner neg,
//                                       the outer neg stays)
//   S8.w = -| |KC0.x| |  -> -|KC0.x|
TEST_F(TestShaderFromNir, PeeholeSoureModsAbsNegTwice)
{
   const char *input =
R"(VS
CHIPCLASS CAYMAN
INPUT LOC:0 NAME:15
OUTPUT LOC:0 NAME:0 MASK:15
OUTPUT LOC:1 NAME:5 MASK:15 SID:9 SPI_SID:10
SHADER
BLOCK_START
ALU MOV S2.x@free{s} : I[0] {WL}
ALU MOV S3.y@free{s} : L[0x40c00000] {WL}
ALU MOV S4.z@free{s} : L[0xc1140000] {WL}
ALU MOV S5.w@free{s} : L[0xbfe00000] {WL}
ALU MOV S6.x@free{s} : L[0x3fa00000] {WL}
ALU MOV S7.x{s} : |KC0[0].x| {W}
ALU MOV S7.y{s} : -KC0[0].y {W}
ALU MOV S7.z{s} : -|KC0[0].z| {W}
ALU MOV S7.w{s} : KC0[0].w {WL}
ALU MOV S8.x : |S7.x| {W}
ALU MOV S8.y : -S7.y {W}
ALU MOV S8.z : -|S7.z| {W}
ALU MOV S8.w : -|S7.x| {WL}
ALU ADD S19.y@free{s} : S3.y@free{s} S8.x {WL}
ALU ADD S9.z@free{s} : S4.z@free{s} S8.y {WL}
ALU ADD S10.w@free{s} : S5.w@free{s} S8.z {WL}
ALU ADD S11.x@free{s} : S6.x@free{s} S8.w {WL}
ALU EXP_IEEE S12.y@free{s} : S19.y@free{s} + S19.y@free{s} + S19.y@free{s} {WL}
ALU EXP_IEEE S13.z@free{s} : S9.z@free{s} + S9.z@free{s} + S9.z@free{s} {WL}
ALU EXP_IEEE S14.x@free{s} : S10.w@free{s} + S10.w@free{s} + S10.w@free{s} {WL}
ALU EXP_IEEE S15.y@free{s} : S11.x@free{s} + S11.x@free{s} + S11.x@free{s} {WL}
ALU MOV S17.x{s} : S12.y@free{s} {W}
ALU MOV S17.y{s} : S13.z@free{s} {W}
ALU MOV S17.z{s} : S14.x@free{s} {W}
ALU MOV S17.w{s} : S15.y@free{s} {WL}
ALU MOV S18.x@group{s} : S17.x{s} {W}
ALU MOV S18.y@group{s} : S17.y{s} {W}
ALU MOV S18.z@group{s} : S17.z{s} {W}
ALU MOV S18.w@group{s} : S17.w{s} {WL}
EXPORT_DONE PARAM 0 S18.xyzw
BLOCK_END)";
   const char *expect =
R"(VS
CHIPCLASS CAYMAN
INPUT LOC:0 NAME:15
OUTPUT LOC:0 NAME:0 MASK:15
OUTPUT LOC:1 NAME:5 MASK:15 SID:9 SPI_SID:10
SHADER
BLOCK_START
ALU ADD S19.y@free{s} : L[0x40c00000] |KC0[0].x| {WL}
ALU ADD S9.z@free{s} : L[0xc1140000] KC0[0].y {WL}
ALU ADD S10.w@free{s} : L[0xbfe00000] -|KC0[0].z| {WL}
ALU ADD S11.x@free{s} : L[0x3fa00000] -|KC0[0].x| {WL}
ALU EXP_IEEE S18.x@group{s} : S19.y@free{s} + S19.y@free{s} + S19.y@free{s} {W}
ALU EXP_IEEE S18.y@group{s} : S9.z@free{s} + S9.z@free{s} + S9.z@free{s} {W}
ALU EXP_IEEE S18.z@group{s} : S10.w@free{s} + S10.w@free{s} + S10.w@free{s} {W}
ALU EXP_IEEE S18.w@group{s} : S11.x@free{s} + S11.x@free{s} + S11.x@free{s} + S11.x@free{s} {WL}
EXPORT_DONE PARAM 0 S18.xyzw
BLOCK_END
)";
   auto sh = from_string(input);
   optimize(*sh);
   check(sh, expect);
};
// A trailing clamping MOV is expected to hand its clamp to the ADD producing
// its source, while the abs/neg MOVs feeding that ADD are folded into it.
TEST_F(TestShaderFromNir, PeeholeSoureModsClamp)
{
   // Shader before optimization.
   const char *shader_text =
R"(VS
CHIPCLASS CAYMAN
INPUT LOC:0 NAME:15
OUTPUT LOC:0 NAME:0 MASK:15
SHADER
BLOCK_START
ALU MOV S1.x{s} : |KC0[0].x| {W}
ALU MOV S2.y{s} : -KC0[0].y {W}
ALU ADD S3.x : S1.x S2.y {W}
ALU MOV CLAMP S4.x : S3.x {W}
EXPORT_DONE PARAM 0 S4.xxxx
BLOCK_END)";
   // Shader the optimizer is expected to produce.
   const char *expected_text =
R"(VS
CHIPCLASS CAYMAN
INPUT LOC:0 NAME:15
OUTPUT LOC:0 NAME:0 MASK:15
SHADER
BLOCK_START
ALU ADD CLAMP S3.x : |KC0[0].x| -KC0[0].y {W}
EXPORT_DONE PARAM 0 S3.xxxx
BLOCK_END
)";
   auto shader = from_string(shader_text);
   optimize(*shader);
   check(shader, expected_text);
};
// All eight operands of a DOT4 come from MOVs, with and without abs/neg; the
// expected result reads the MOV sources directly and keeps their modifiers.
TEST_F(TestShaderFromNir, PeeholeSoureModsMuliSlot)
{
   // Shader before optimization.
   const char *shader_text =
R"(VS
CHIPCLASS CAYMAN
INPUT LOC:0 NAME:15
OUTPUT LOC:0 NAME:0 MASK:15
REGISTERS R1.xyzw
SHADER
BLOCK_START
ALU MOV S1.x{s} : |KC0[0].x| {W}
ALU MOV S1.y{s} : -KC0[0].y {W}
ALU MOV S1.z{s} : |KC0[0].z| {W}
ALU MOV S1.w{s} : KC0[0].w {W}
ALU MOV S2.x{s} : |R1.x| {W}
ALU MOV S2.y{s} : R1.y {W}
ALU MOV S2.z{s} : -R1.z {W}
ALU MOV S2.w{s} : -R1.w {W}
ALU DOT4 S5.x : S1.x S2.x + S1.y S2.y + S1.z S2.z + S1.w S2.w {W}
EXPORT_DONE PARAM 0 S5.xxxx
BLOCK_END)";
   // Shader the optimizer is expected to produce.
   const char *expected_text =
R"(VS
CHIPCLASS CAYMAN
INPUT LOC:0 NAME:15
OUTPUT LOC:0 NAME:0 MASK:15
REGISTERS R1.xyzw
SHADER
BLOCK_START
ALU DOT4 S5.x : |KC0[0].x| |R1.x| + -KC0[0].y R1.y + |KC0[0].z| -R1.z + KC0[0].w -R1.w {W}
EXPORT_DONE PARAM 0 S5.xxxx
BLOCK_END
)";
   auto shader = from_string(shader_text);
   optimize(*shader);
   check(shader, expected_text);
};
TEST_F(TestShaderFromNir, OptimizeIntoGroup)
{
const char *input =