aco: Use v_cmpx pre GFX10.

Foz-DB Vega10:
Totals from 29508 (21.85% of 135041) affected shaders:
CodeSize: 184345656 -> 184345820 (+0.00%)
Instrs: 35906154 -> 35906195 (+0.00%)
Latency: 581696114 -> 581530021 (-0.03%); split: -0.03%, +0.00%
InvThroughput: 245625572 -> 245561351 (-0.03%); split: -0.03%, +0.00%
Copies: 3134925 -> 3278672 (+4.59%)

Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18049>
This commit is contained in:
Georg Lehmann 2022-08-14 19:25:09 +02:00 committed by Marge Bot
parent 393e577435
commit 7b9d3ebe42

View file

@@ -355,24 +355,24 @@ try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const in
if (exec_val->definitions.size() > 1)
return;
/* Check if a suitable v_cmpx opcode exists. */
const aco_opcode v_cmpx_op =
exec_val->isVOPC() ? get_vcmpx(exec_val->opcode) : aco_opcode::num_opcodes;
const bool vopc = v_cmpx_op != aco_opcode::num_opcodes;
/* If s_and_saveexec is used, we'll need to insert a new instruction to save the old exec. */
const bool save_original_exec = exec_copy->opcode == aco_opcode::s_and_saveexec_b32 ||
exec_copy->opcode == aco_opcode::s_and_saveexec_b64;
/* Position where the original exec mask copy should be inserted. */
const int save_original_exec_idx = exec_val_idx;
/* The copy can be removed when it kills its operand. */
const bool can_remove_copy = exec_copy->operands[0].isKill();
/* The copy can be removed when it kills its operand.
* v_cmpx also writes the original destination pre GFX10.
*/
const bool can_remove_copy =
exec_copy->operands[0].isKill() || (vopc && ctx.program->gfx_level < GFX10);
/* Whether exec_val and exec_copy are adjacent (with p_logical_end inbetween). */
const bool val_and_copy_adjacent = exec_val_idx == exec_copy_idx - 2;
/* Only use v_cmpx on GFX10+ where it doesn't always clobber the VCC.
* Also check if a suitable v_cmpx opcode exists.
*/
const aco_opcode v_cmpx_op =
exec_val->isVOPC() ? get_vcmpx(exec_val->opcode) : aco_opcode::num_opcodes;
const bool usable_vcmpx = ctx.program->gfx_level >= GFX10 && v_cmpx_op != aco_opcode::num_opcodes;
const bool vopc = exec_val->isVOPC() && usable_vcmpx;
/* Always allow reassigning when the value is written by (usable) VOPC.
* Note, VOPC implicitly contains "& exec" because it yields zero on inactive lanes.
* Additionally, when value is copied as-is, also allow SALU and parallelcopies.
@@ -422,16 +422,53 @@ try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const in
for (const Operand& op : exec_val->operands)
if (regs_intersect(exec_copy_def, op))
return;
/* We would write over the saved exec value in this case. */
if (((vopc && ctx.program->gfx_level < GFX10) || !can_remove_copy) &&
regs_intersect(exec_copy_def, exec_wr_def))
return;
}
/* Reassign the instruction to write exec directly. */
exec_val->definitions[0] = Definition(exec, ctx.program->lane_mask);
if (vopc) {
/* Add one extra definition for exec and copy the VOP3-specific fields if present. */
if (ctx.program->gfx_level < GFX10) {
if (exec_val->isSDWA() || exec_val->isDPP()) {
/* This might work but it needs testing and more code to copy the instruction. */
return;
}
else if (!exec_val->isVOP3()) {
aco_ptr<Instruction> tmp = std::move(exec_val);
exec_val.reset(create_instruction<VOPC_instruction>(
tmp->opcode, tmp->format, tmp->operands.size(), tmp->definitions.size() + 1));
std::copy(tmp->operands.cbegin(), tmp->operands.cend(), exec_val->operands.begin());
std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(),
exec_val->definitions.begin());
} else {
aco_ptr<Instruction> tmp = std::move(exec_val);
exec_val.reset(create_instruction<VOP3_instruction>(
tmp->opcode, tmp->format, tmp->operands.size(), tmp->definitions.size() + 1));
std::copy(tmp->operands.cbegin(), tmp->operands.cend(), exec_val->operands.begin());
std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(),
exec_val->definitions.begin());
VOP3_instruction& src = tmp->vop3();
VOP3_instruction& dst = exec_val->vop3();
dst.opsel = src.opsel;
dst.omod = src.omod;
dst.clamp = src.clamp;
std::copy(std::cbegin(src.abs), std::cend(src.abs), std::begin(dst.abs));
std::copy(std::cbegin(src.neg), std::cend(src.neg), std::begin(dst.neg));
}
}
/* Set v_cmpx opcode. */
exec_val->opcode = v_cmpx_op;
*exec_val->definitions.rbegin() = Definition(exec, ctx.program->lane_mask);
/* TODO: change instruction from VOP3 to plain VOPC when possible. */
} else {
/* Reassign the instruction to write exec directly. */
exec_val->definitions[0] = Definition(exec, ctx.program->lane_mask);
}
if (!val_and_copy_adjacent) {