aco/optimizer: add new helpers for applying output modifiers

To replace the old instr_mod_labels.

Foz-DB Navi21:
Totals from 683 (0.70% of 97591) affected shaders:
Instrs: 3341288 -> 3340447 (-0.03%); split: -0.03%, +0.00%
CodeSize: 18522460 -> 18520212 (-0.01%); split: -0.01%, +0.00%
Latency: 34359519 -> 34358772 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 9229621 -> 9229494 (-0.00%); split: -0.00%, +0.00%
Copies: 368383 -> 368260 (-0.03%); split: -0.04%, +0.00%
PreSGPRs: 48060 -> 48061 (+0.00%)
SALU: 543991 -> 543150 (-0.15%); split: -0.16%, +0.00%

Changes are caused by optimizing not(salu) without killed scc.

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38658>
This commit is contained in:
Georg Lehmann 2025-01-04 16:59:41 +01:00 committed by Marge Bot
parent fc29821d3b
commit 37d3c63a12

View file

@ -311,6 +311,7 @@ struct opt_ctx {
std::vector<ssa_info> info; std::vector<ssa_info> info;
std::vector<aco_ptr<Instruction>> pre_combine_instrs; std::vector<aco_ptr<Instruction>> pre_combine_instrs;
std::vector<uint16_t> uses; std::vector<uint16_t> uses;
std::unordered_map<Instruction*, aco_ptr<Instruction>> replacement_instr;
}; };
aco_type aco_type
@ -2975,35 +2976,6 @@ original_temp_id(opt_ctx& ctx, Temp tmp)
return tmp.id(); return tmp.id();
} }
Instruction*
follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false)
{
if (!op.isTemp())
return nullptr;
if (!ignore_uses && ctx.uses[op.tempId()] > 1)
return nullptr;
Instruction* instr = ctx.info[op.tempId()].parent_instr;
if (instr->definitions[0].getTemp() != op.getTemp())
return nullptr;
if (instr->definitions.size() == 2) {
unsigned idx =
instr->definitions[1].isTemp() && instr->definitions[1].tempId() == op.tempId();
assert(instr->definitions[idx].isTemp() && instr->definitions[idx].tempId() == op.tempId());
if (instr->definitions[!idx].isTemp() && ctx.uses[instr->definitions[!idx].tempId()])
return nullptr;
}
for (Operand& operand : instr->operands) {
if (fixed_to_exec(operand))
return nullptr;
}
return instr;
}
bool bool
is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value) is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value)
{ {
@ -3372,53 +3344,17 @@ match_and_apply_patterns(opt_ctx& ctx, alu_opt_info& info,
return false; return false;
} }
/* s_not(cmp(a, b)) -> get_vcmp_inverse(cmp)(a, b) */
bool
combine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
if (ctx.uses[instr->definitions[1].tempId()])
return false;
if (!instr->operands[0].isTemp() || ctx.uses[instr->operands[0].tempId()] != 1)
return false;
Instruction* cmp = follow_operand(ctx, instr->operands[0]);
if (!cmp)
return false;
aco_opcode new_opcode = get_vcmp_inverse(cmp->opcode);
if (new_opcode == aco_opcode::num_opcodes)
return false;
/* Invert compare instruction and assign this instruction's definition */
cmp->opcode = new_opcode;
ctx.info[instr->definitions[0].tempId()] = ctx.info[cmp->definitions[0].tempId()];
std::swap(instr->definitions[0], cmp->definitions[0]);
ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get();
ctx.info[cmp->definitions[0].tempId()].parent_instr = cmp;
ctx.uses[instr->operands[0].tempId()]--;
return true;
}
/* v_not(v_xor(a, b)) -> v_xnor(a, b) */ /* v_not(v_xor(a, b)) -> v_xnor(a, b) */
bool Instruction*
combine_not_xor(opt_ctx& ctx, aco_ptr<Instruction>& instr) apply_v_not(opt_ctx& ctx, aco_ptr<Instruction>& instr, Instruction* op_instr)
{ {
if (instr->usesModifiers()) if (ctx.program->gfx_level < GFX10 || instr->usesModifiers() ||
return false; op_instr->opcode != aco_opcode::v_xor_b32 || op_instr->isSDWA())
return nullptr;
Instruction* op_instr = follow_operand(ctx, instr->operands[0]); op_instr->definitions[0] = instr->definitions[0];
if (!op_instr || op_instr->opcode != aco_opcode::v_xor_b32 || op_instr->isSDWA())
return false;
ctx.uses[instr->operands[0].tempId()]--;
std::swap(instr->definitions[0], op_instr->definitions[0]);
op_instr->opcode = aco_opcode::v_xnor_b32; op_instr->opcode = aco_opcode::v_xnor_b32;
ctx.info[op_instr->definitions[0].tempId()].label = 0; return op_instr;
ctx.info[op_instr->definitions[0].tempId()].parent_instr = op_instr;
ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get();
return true;
} }
/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b) /* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
@ -3426,61 +3362,47 @@ combine_not_xor(opt_ctx& ctx, aco_ptr<Instruction>& instr)
* s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b) * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
* s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b) * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
* s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b) * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
* s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */ * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b)
bool * s_not(cmp(a, b)) -> get_vcmp_inverse(cmp)(a, b) */
combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr) Instruction*
apply_s_not(opt_ctx& ctx, aco_ptr<Instruction>& instr, Instruction* op_instr)
{ {
/* checks */ if (op_instr->definitions.size() == 1 && ctx.uses[instr->definitions[1].tempId()])
if (!instr->operands[0].isTemp()) return nullptr;
return false; else if (op_instr->definitions.size() == 2 && ctx.uses[op_instr->definitions[1].tempId()])
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) return nullptr;
return false;
Instruction* op2_instr = follow_operand(ctx, instr->operands[0]); switch (op_instr->opcode) {
if (!op2_instr) case aco_opcode::s_and_b32: op_instr->opcode = aco_opcode::s_nand_b32; break;
return false; case aco_opcode::s_or_b32: op_instr->opcode = aco_opcode::s_nor_b32; break;
switch (op2_instr->opcode) { case aco_opcode::s_xor_b32: op_instr->opcode = aco_opcode::s_xnor_b32; break;
case aco_opcode::s_and_b32: case aco_opcode::s_and_b64: op_instr->opcode = aco_opcode::s_nand_b64; break;
case aco_opcode::s_or_b32: case aco_opcode::s_or_b64: op_instr->opcode = aco_opcode::s_nor_b64; break;
case aco_opcode::s_xor_b32: case aco_opcode::s_xor_b64: op_instr->opcode = aco_opcode::s_xnor_b64; break;
case aco_opcode::s_and_b64: default: {
case aco_opcode::s_or_b64: if (!op_instr->isVOPC())
case aco_opcode::s_xor_b64: break; return nullptr;
default: return false; aco_opcode new_opcode = get_vcmp_inverse(op_instr->opcode);
if (new_opcode == aco_opcode::num_opcodes)
return nullptr;
op_instr->opcode = new_opcode;
}
} }
/* create instruction */ for (unsigned i = 0; i < op_instr->definitions.size(); i++)
std::swap(instr->definitions[0], op2_instr->definitions[0]); op_instr->definitions[i] = instr->definitions[i];
std::swap(instr->definitions[1], op2_instr->definitions[1]);
ctx.uses[instr->operands[0].tempId()]--;
ctx.info[op2_instr->definitions[0].tempId()].label = 0;
ctx.info[op2_instr->definitions[0].tempId()].parent_instr = op2_instr;
ctx.info[op2_instr->definitions[1].tempId()].parent_instr = op2_instr;
ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get();
ctx.info[instr->definitions[1].tempId()].parent_instr = instr.get();
switch (op2_instr->opcode) { return op_instr;
case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break;
case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break;
case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break;
case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break;
case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break;
case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break;
default: break;
}
return true;
} }
/* s_abs_i32(s_sub_[iu]32(a, b)) -> s_absdiff_i32(a, b) /* s_abs_i32(s_sub_[iu]32(a, b)) -> s_absdiff_i32(a, b)
* s_abs_i32(s_add_[iu]32(a, #b)) -> s_absdiff_i32(a, -b) * s_abs_i32(s_add_[iu]32(a, #b)) -> s_absdiff_i32(a, -b)
*/ */
bool Instruction*
combine_sabsdiff(opt_ctx& ctx, aco_ptr<Instruction>& instr) apply_s_abs(opt_ctx& ctx, aco_ptr<Instruction>& instr, Instruction* op_instr)
{ {
Instruction* op_instr = follow_operand(ctx, instr->operands[0], false); if (op_instr->definitions.size() != 2 || ctx.uses[op_instr->definitions[1].tempId()])
if (!op_instr) return nullptr;
return false;
if (op_instr->opcode == aco_opcode::s_add_i32 || op_instr->opcode == aco_opcode::s_add_u32) { if (op_instr->opcode == aco_opcode::s_add_i32 || op_instr->opcode == aco_opcode::s_add_u32) {
for (unsigned i = 0; i < 2; i++) { for (unsigned i = 0; i < 2; i++) {
@ -3489,30 +3411,21 @@ combine_sabsdiff(opt_ctx& ctx, aco_ptr<Instruction>& instr)
!is_operand_constant(ctx, op_instr->operands[i], 32, &constant)) !is_operand_constant(ctx, op_instr->operands[i], 32, &constant))
continue; continue;
if (op_instr->operands[i].isTemp())
ctx.uses[op_instr->operands[i].tempId()]--;
op_instr->operands[0] = op_instr->operands[!i]; op_instr->operands[0] = op_instr->operands[!i];
op_instr->operands[1] = Operand::c32(-int32_t(constant)); op_instr->operands[1] = Operand::c32(-int32_t(constant));
goto use_absdiff; goto use_absdiff;
} }
return false; return nullptr;
} else if (op_instr->opcode != aco_opcode::s_sub_i32 && } else if (op_instr->opcode != aco_opcode::s_sub_i32 &&
op_instr->opcode != aco_opcode::s_sub_u32) { op_instr->opcode != aco_opcode::s_sub_u32) {
return false; return nullptr;
} }
use_absdiff: use_absdiff:
op_instr->opcode = aco_opcode::s_absdiff_i32; op_instr->opcode = aco_opcode::s_absdiff_i32;
std::swap(instr->definitions[0], op_instr->definitions[0]); op_instr->definitions[0] = instr->definitions[0];
std::swap(instr->definitions[1], op_instr->definitions[1]); op_instr->definitions[1] = instr->definitions[1];
ctx.uses[instr->operands[0].tempId()]--; return op_instr;
ctx.info[op_instr->definitions[0].tempId()].label = 0;
ctx.info[op_instr->definitions[0].tempId()].parent_instr = op_instr;
ctx.info[op_instr->definitions[1].tempId()].parent_instr = op_instr;
ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get();
ctx.info[instr->definitions[1].tempId()].parent_instr = instr.get();
return true;
} }
bool bool
@ -3654,18 +3567,9 @@ apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* Remove superfluous extract after ds_read like so: /* Remove superfluous extract after ds_read like so:
* p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN() * p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN()
*/ */
bool Instruction*
apply_load_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract) apply_load_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract, Instruction* load)
{ {
/* Check if p_extract has a usedef operand and is the only user. */
if (ctx.uses[extract->operands[0].tempId()] > 1)
return false;
/* Check if the usedef is the right format. */
Instruction* load = ctx.info[extract->operands[0].tempId()].parent_instr;
if (!load->isDS() && !load->isSMEM() && !load->isMUBUF() && !load->isFlatLike())
return false;
unsigned extract_idx = extract->operands[1].constantValue(); unsigned extract_idx = extract->operands[1].constantValue();
unsigned bits_extracted = extract->operands[2].constantValue(); unsigned bits_extracted = extract->operands[2].constantValue();
bool sign_ext = extract->operands[3].constantValue(); bool sign_ext = extract->operands[3].constantValue();
@ -3698,17 +3602,17 @@ apply_load_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
case aco_opcode::s_buffer_load_ushort: case aco_opcode::s_buffer_load_ushort:
case aco_opcode::buffer_load_ushort: case aco_opcode::buffer_load_ushort:
case aco_opcode::buffer_load_short_d16: bits_loaded = 16; break; case aco_opcode::buffer_load_short_d16: bits_loaded = 16; break;
default: return false; default: return nullptr;
} }
/* TODO: These are doable, but probably don't occur too often. */ /* TODO: These are doable, but probably don't occur too often. */
if (extract_idx || bits_extracted > bits_loaded || dst_bitsize > 32 || if (extract_idx || bits_extracted > bits_loaded || dst_bitsize > 32 ||
(load->definitions[0].regClass().type() != extract->definitions[0].regClass().type())) (load->definitions[0].regClass().type() != extract->definitions[0].regClass().type()))
return false; return nullptr;
/* We can't shrink some loads because that would remove zeroing of the offset/address LSBs. */ /* We can't shrink some loads because that would remove zeroing of the offset/address LSBs. */
if (!can_shrink && bits_extracted < bits_loaded) if (!can_shrink && bits_extracted < bits_loaded)
return false; return nullptr;
/* Shrink the load if the extracted bit size is smaller. */ /* Shrink the load if the extracted bit size is smaller. */
bits_loaded = MIN2(bits_loaded, bits_extracted); bits_loaded = MIN2(bits_loaded, bits_extracted);
@ -3774,12 +3678,8 @@ apply_load_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
} }
/* The load now produces the exact same thing as the extract, remove the extract. */ /* The load now produces the exact same thing as the extract, remove the extract. */
std::swap(load->definitions[0], extract->definitions[0]); load->definitions[0] = extract->definitions[0];
ctx.uses[extract->definitions[0].tempId()] = 0; return load;
ctx.info[load->definitions[0].tempId()].label = 0;
ctx.info[extract->definitions[0].tempId()].parent_instr = extract.get();
ctx.info[load->definitions[0].tempId()].parent_instr = load;
return true;
} }
void void
@ -3940,6 +3840,102 @@ op_info_get_constant(opt_ctx& ctx, alu_opt_op op_info, aco_type type, uint64_t*
return true; return true;
} }
Instruction*
apply_output_impl(opt_ctx& ctx, aco_ptr<Instruction>& instr, Instruction* parent)
{
if (instr->opcode == aco_opcode::p_extract &&
(parent->isDS() || parent->isSMEM() || parent->isMUBUF() || parent->isFlatLike()))
return apply_load_extract(ctx, instr, parent);
else if (instr->opcode == aco_opcode::p_extract)
return nullptr;
else if (instr->opcode == aco_opcode::v_not_b32)
return apply_v_not(ctx, instr, parent);
else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64)
return apply_s_not(ctx, instr, parent);
else if (instr->opcode == aco_opcode::s_abs_i32)
return apply_s_abs(ctx, instr, parent);
else
UNREACHABLE("unhandled opcode");
return nullptr;
}
bool
apply_output(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
switch (instr->opcode) {
case aco_opcode::p_extract:
case aco_opcode::v_not_b32:
case aco_opcode::s_not_b32:
case aco_opcode::s_not_b64:
case aco_opcode::s_abs_i32: break;
default: return false;
}
int temp_idx = -1;
for (unsigned i = 0; i < instr->operands.size(); i++) {
if (temp_idx < 0 && instr->operands[i].isTemp())
temp_idx = i;
else if (instr->operands[i].isConstant())
continue;
else
return false;
}
if (temp_idx < 0)
return false;
unsigned tmpid = instr->operands[temp_idx].tempId();
Instruction* parent = ctx.info[tmpid].parent_instr;
if (ctx.uses[tmpid] != 1 || parent->definitions[0].tempId() != tmpid)
return false;
int64_t alt_idx = ctx.info[tmpid].is_combined() ? ctx.info[tmpid].val : -1;
aco::small_vec<Operand, 4> pre_opt_ops;
for (const Operand& op : parent->operands)
pre_opt_ops.push_back(op);
Instruction* new_instr = apply_output_impl(ctx, instr, parent);
if (new_instr == nullptr)
return false;
for (const Operand& op : parent->operands) {
if (op.isTemp())
ctx.uses[op.tempId()]++;
}
for (const Operand& op : pre_opt_ops) {
if (op.isTemp())
decrease_and_dce(ctx, op.getTemp());
}
ctx.uses[tmpid] = 0;
ctx.info[tmpid].parent_instr = nullptr;
if (new_instr != parent)
ctx.replacement_instr.emplace(parent, new_instr);
if (alt_idx >= 0) {
Instruction* new_pre_combine =
apply_output_impl(ctx, instr, ctx.pre_combine_instrs[alt_idx].get());
if (new_pre_combine != ctx.pre_combine_instrs[alt_idx].get())
ctx.pre_combine_instrs[alt_idx].reset(new_pre_combine);
if (new_pre_combine)
ctx.info[new_instr->definitions[0].tempId()].set_combined(alt_idx);
}
for (Definition& def : new_instr->definitions) {
ctx.info[def.tempId()].parent_instr = new_instr;
ctx.info[def.tempId()].label &=
instr_mod_labels | canonicalized_labels | label_combined_instr;
}
instr.reset();
return true;
}
bool bool
create_fma_cb(opt_ctx& ctx, alu_opt_info& info) create_fma_cb(opt_ctx& ctx, alu_opt_info& info)
{ {
@ -4165,9 +4161,11 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (instr->isDPP()) if (instr->isDPP())
return; return;
if (instr->opcode == aco_opcode::p_extract) { if (!instr->isVALU() && !instr->isSALU() && !instr->isPseudo())
apply_load_extract(ctx, instr); return;
}
if (apply_output(ctx, instr))
return;
/* TODO: There are still some peephole optimizations that could be done: /* TODO: There are still some peephole optimizations that could be done:
* - abs(a - b) -> s_absdiff_i32 * - abs(a - b) -> s_absdiff_i32
@ -4230,15 +4228,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
return; return;
} }
if (instr->opcode == aco_opcode::v_not_b32 && ctx.program->gfx_level >= GFX10) {
combine_not_xor(ctx, instr);
} else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
if (!combine_salu_not_bitwise(ctx, instr))
combine_inverse_comparison(ctx, instr);
} else if (instr->opcode == aco_opcode::s_abs_i32) {
combine_sabsdiff(ctx, instr);
}
alu_opt_info info; alu_opt_info info;
if (!alu_opt_gather_info(ctx, instr.get(), info)) if (!alu_opt_gather_info(ctx, instr.get(), info))
return; return;
@ -4744,12 +4733,29 @@ to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
return true; return true;
} }
void
insert_replacement_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
if (!instr.get() || instr->definitions.empty() ||
ctx.info[instr->definitions[0].tempId()].parent_instr == instr.get())
return;
while (true) {
auto it = ctx.replacement_instr.find(instr.get());
if (it == ctx.replacement_instr.end())
return;
instr = std::move(it->second);
ctx.replacement_instr.erase(it);
}
}
void void
select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{ {
const uint32_t threshold = 4; const uint32_t threshold = 4;
if (is_dead(ctx.uses, instr.get())) { if (!instr.get() || is_dead(ctx.uses, instr.get())) {
instr.reset(); instr.reset();
return; return;
} }
@ -4840,8 +4846,12 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
* no operand instruction was eliminated. * no operand instruction was eliminated.
*/ */
bool use_prev = std::all_of( bool use_prev = std::all_of(
prev_instr->operands.begin(), prev_instr->operands.end(), [&](Operand op) prev_instr->operands.begin(), prev_instr->operands.end(),
{ return !op.isTemp() || !is_dead(ctx.uses, ctx.info[op.tempId()].parent_instr); }); [&](Operand op)
{
return !op.isTemp() || (ctx.info[op.tempId()].parent_instr &&
!is_dead(ctx.uses, ctx.info[op.tempId()].parent_instr));
});
if (use_prev) { if (use_prev) {
for (const Operand& op : prev_instr->operands) { for (const Operand& op : prev_instr->operands) {
@ -5456,6 +5466,14 @@ optimize(Program* program)
combine_instruction(ctx, instr); combine_instruction(ctx, instr);
} }
if (!ctx.replacement_instr.empty()) {
for (Block& block : program->blocks) {
ctx.fp_mode = block.fp_mode;
for (aco_ptr<Instruction>& instr : block.instructions)
insert_replacement_instr(ctx, instr);
}
}
validate_opt_ctx(ctx); validate_opt_ctx(ctx);
/* 4. Top-Down DAG pass (backward) to select instructions (includes DCE) */ /* 4. Top-Down DAG pass (backward) to select instructions (includes DCE) */