mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-25 13:00:09 +01:00
aco: Try to reassign split vector registers post-RA.
Eliminate unnecessary copies when the operand registers of a p_split_vector instruction are not clobbered between the p_split_vector and the user of its definitions. This happens when p_split_vector doesn't kill its operand and its definitions have a shorter lifespan than the operand. It affects every NGG culling shader among other things. This optimization exists because it's too difficult to solve it in RA, and should be removed after we have solved this in RA. v2 by Daniel Schürmann: - Rearrange and simplify conditions for the new optimization - Fix a few bugs v3 by Daniel Schürmann: - Check number of encoded ALU operands Fossil DB stats on Rembrandt (RDNA2): Totals from 64896 (48.10% of 134906) affected shaders: CodeSize: 175693348 -> 175434944 (-0.15%) Instrs: 33333912 -> 33269388 (-0.19%) Latency: 183766084 -> 183763432 (-0.00%); split: -0.00%, +0.00% InvThroughput: 28589651 -> 28589340 (-0.00%); split: -0.00%, +0.00% Copies: 2806550 -> 2742038 (-2.30%) Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16161>
This commit is contained in:
parent
3d29779a25
commit
75b1027722
1 changed files with 110 additions and 0 deletions
|
|
@ -543,6 +543,114 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
}
|
||||
}
|
||||
|
||||
unsigned
|
||||
num_encoded_alu_operands(const aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (instr->isSALU()) {
|
||||
if (instr->isSOP2())
|
||||
return 2;
|
||||
else if (instr->isSOP1())
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (instr->isVALU()) {
|
||||
if (instr->isVOP1())
|
||||
return 1;
|
||||
else if (instr->isVOPC() || instr->isVOP2())
|
||||
return 2;
|
||||
else if (instr->opcode == aco_opcode::v_writelane_b32_e64 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32)
|
||||
return 2; /* potentially VOP3, but reads VDST as SRC2 */
|
||||
else if (instr->isVOP3() || instr->isVOP3P())
|
||||
return instr->operands.size();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
try_reassign_split_vector(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
/* We are looking for the following pattern:
|
||||
*
|
||||
* sA, sB = p_split_vector s[X:Y]
|
||||
* ... X and Y not overwritten here ...
|
||||
* use sA or sB <--- current instruction
|
||||
*
|
||||
* If possible, we propagate the registers from the p_split_vector
|
||||
* operand into the current instruction and the above is optimized into:
|
||||
*
|
||||
* use sX or sY
|
||||
*
|
||||
* Thereby, we might violate register assignment rules.
|
||||
* This optimization exists because it's too difficult to solve it
|
||||
* in RA, and should be removed after we solved this in RA.
|
||||
*/
|
||||
|
||||
if (!instr->isVALU() && !instr->isSALU())
|
||||
return;
|
||||
|
||||
for (unsigned i = 0; i < num_encoded_alu_operands(instr); i++) {
|
||||
/* Find the instruction that writes the current operand. */
|
||||
const Operand& op = instr->operands[i];
|
||||
Idx op_instr_idx = last_writer_idx(ctx, op);
|
||||
if (!op_instr_idx.found())
|
||||
continue;
|
||||
|
||||
/* Check if the operand is written by p_split_vector. */
|
||||
Instruction* split_vec = ctx.get(op_instr_idx);
|
||||
if (split_vec->opcode != aco_opcode::p_split_vector)
|
||||
continue;
|
||||
|
||||
Operand& split_op = split_vec->operands[0];
|
||||
|
||||
/* Don't do anything if the p_split_vector operand is not a temporary
|
||||
* or is killed by the p_split_vector.
|
||||
* In this case the definitions likely already reuse the same registers as the operand.
|
||||
*/
|
||||
if (!split_op.isTemp() || split_op.isKill())
|
||||
continue;
|
||||
|
||||
/* Only propagate operands of the same type */
|
||||
if (split_op.getTemp().type() != op.getTemp().type())
|
||||
continue;
|
||||
|
||||
/* Check if the p_split_vector operand's registers are overwritten. */
|
||||
if (is_overwritten_since(ctx, split_op, op_instr_idx))
|
||||
continue;
|
||||
|
||||
PhysReg reg = split_op.physReg();
|
||||
for (Definition& def : split_vec->definitions) {
|
||||
if (def.getTemp() != op.getTemp()) {
|
||||
reg = reg.advance(def.bytes());
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Don't propagate misaligned SGPRs.
|
||||
* Note: No ALU instruction can take a variable larger than 64bit.
|
||||
*/
|
||||
if (op.regClass() == s2 && reg.reg() % 2 != 0)
|
||||
break;
|
||||
|
||||
/* If there is only one use (left), recolor the split_vector definition */
|
||||
if (ctx.uses[op.tempId()] == 1)
|
||||
def.setFixed(reg);
|
||||
else
|
||||
ctx.uses[op.tempId()]--;
|
||||
|
||||
/* Use the p_split_vector operand register directly.
|
||||
*
|
||||
* Note: this might violate register assignment rules to some extend
|
||||
* in case the definition does not get recolored, eventually.
|
||||
*/
|
||||
instr->operands[i].setFixed(reg);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
process_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
|
|
@ -559,6 +667,8 @@ process_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
|
||||
try_combine_dpp(ctx, instr);
|
||||
|
||||
try_reassign_split_vector(ctx, instr);
|
||||
|
||||
if (instr)
|
||||
save_reg_writes(ctx, instr);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue