r300: merge together MOV and MAD instructions

Assuming they write different channels of the same destination
and they share at least one source or one of the sources is
RC_FILE_NONE.

shader-db with RV530:
total instructions in shared programs: 136033 -> 135673 (-0.26%)
instructions in affected programs: 22987 -> 22627 (-1.57%)
total temps in shared programs: 18977 -> 18965 (-0.06%)
temps in affected programs: 74 -> 62 (-16.22%)

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6188

Reviewed-by: Filip Gawin <filip@gawin.net>
Signed-off-by: Pavel Ondračka <pavel.ondracka@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18288>
This commit is contained in:
Pavel Ondračka 2022-08-11 14:49:15 +02:00 committed by Marge Bot
parent 708b2b611a
commit 9156a7aab5

View file

@ -1007,7 +1007,8 @@ static int have_shared_source(struct rc_instruction * inst1, struct rc_instructi
for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr)
inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
shared_src = i;
}
}
@ -1105,6 +1106,97 @@ static int merge_mov_add_mul(
return 1;
}
/**
* This function will try to merge MOV and MAD instructions with the same
* destination, making use of the constant swizzles. This only works
* if there is a shared source or one of the sources is RC_FILE_NONE.
*
* For example:
* MOV temp[0].x const[0].x
* MAD temp[0].yz const[0].yz const[1].yz input[0].xw
*
* becomes
* MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
*/
static bool merge_mov_mad(
struct radeon_compiler * c,
struct rc_instruction * inst1,
struct rc_instruction * inst2)
{
struct rc_instruction * mov, * mad;
if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
mov = inst1;
mad = inst2;
} else {
mov = inst2;
mad = inst1;
}
int shared_index = have_shared_source(mad, mov);
unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
struct rc_src_register src[3];
src[0] = mad->U.I.SrcReg[0];
src[1] = mad->U.I.SrcReg[1];
src[2] = mad->U.I.SrcReg[2];
/* Shared source is the one for multiplication. */
if (shared_index == 0 || shared_index == 1) {
src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
src[shared_index].Swizzle = merge_swizzles(src[shared_index].Swizzle,
mov->U.I.SrcReg[0].Swizzle);
src[1 - shared_index].Swizzle = fill_swizzle(
src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
/* Shared source is the one for used for addition, or it is none. Additionally,
* if the mov SrcReg is none, we merge it with the addition (third) reg as well
* because than we have the highest change the swizzles will be legal.
*/
} else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
src[2].File == RC_FILE_NONE) {
src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
if (src[2].File == RC_FILE_NONE) {
src[2].File = mov->U.I.SrcReg[0].File;
src[2].Index = mov->U.I.SrcReg[0].Index;
src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
src[2].Abs = mov->U.I.SrcReg[0].Abs;
}
/* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
* fill the other one with ones and the reg for addition with zeros.
*/
} else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
src[none_src] = mov->U.I.SrcReg[0];
src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
src[none_src].Swizzle = merge_swizzles(src[none_src].Swizzle,
mad->U.I.SrcReg[none_src].Swizzle);
src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle,
wmask, RC_SWIZZLE_ONE);
src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
} else {
return false;
}
if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
return false;
inst2->U.I.Opcode = RC_OPCODE_MAD;
inst2->U.I.SrcReg[0] = src[0];
inst2->U.I.SrcReg[1] = src[1];
inst2->U.I.SrcReg[2] = src[2];
inst2->U.I.DstReg.WriteMask = wmask;
rc_remove_instruction(inst1);
return true;
}
static bool inst_combination(
struct rc_instruction * inst1,
struct rc_instruction * inst2,
@ -1187,6 +1279,11 @@ static void merge_channels(struct radeon_compiler * c, struct rc_instruction * i
if (merge_mov_add_mul(c, inst, cur))
return;
}
if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
if (merge_mov_mad(c, inst, cur))
return;
}
}
}
}
@ -1220,6 +1317,7 @@ void rc_optimize(struct radeon_compiler * c, void *user)
inst = inst->Next;
if (cur->U.I.Opcode == RC_OPCODE_MOV ||
cur->U.I.Opcode == RC_OPCODE_ADD ||
cur->U.I.Opcode == RC_OPCODE_MAD ||
cur->U.I.Opcode == RC_OPCODE_MUL)
merge_channels(c, cur);
}