aco/optimizer: use new helpers to apply insert

Foz-DB Navi21:
Totals from 505 (0.52% of 97591) affected shaders:
Instrs: 1438254 -> 1436780 (-0.10%); split: -0.11%, +0.01%
CodeSize: 8063364 -> 8054192 (-0.11%); split: -0.13%, +0.01%
Latency: 18596788 -> 18597262 (+0.00%); split: -0.01%, +0.01%
InvThroughput: 5213861 -> 5213061 (-0.02%); split: -0.02%, +0.01%
VClause: 37121 -> 37130 (+0.02%)
Copies: 174744 -> 175222 (+0.27%); split: -0.07%, +0.34%
Branches: 65722 -> 65718 (-0.01%)
VALU: 912967 -> 911074 (-0.21%); split: -0.21%, +0.00%
SALU: 251045 -> 251560 (+0.21%); split: -0.01%, +0.21%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38658>
This commit is contained in:
Georg Lehmann 2025-01-09 22:04:42 +01:00 committed by Marge Bot
parent d60ce9ceef
commit ee28801eae

View file

@ -70,12 +70,11 @@ enum Label {
label_omod4 = 1ull << 34, label_omod4 = 1ull << 34,
label_omod5 = 1ull << 35, label_omod5 = 1ull << 35,
label_clamp = 1ull << 36, label_clamp = 1ull << 36,
label_insert = 1ull << 38,
label_f2f16 = 1ull << 39, label_f2f16 = 1ull << 39,
}; };
static constexpr uint64_t instr_mod_labels = static constexpr uint64_t instr_mod_labels =
label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16; label_omod2 | label_omod4 | label_omod5 | label_clamp | label_f2f16;
static constexpr uint64_t input_mod_labels = static constexpr uint64_t input_mod_labels =
label_abs_fp16 | label_abs_fp32_64 | label_neg_fp16 | label_neg_fp32_64; label_abs_fp16 | label_abs_fp32_64 | label_neg_fp16 | label_neg_fp32_64;
@ -292,16 +291,6 @@ struct ssa_info {
void set_extract() { add_label(label_extract); } void set_extract() { add_label(label_extract); }
bool is_extract() { return label & label_extract; } bool is_extract() { return label & label_extract; }
void set_insert(Instruction* insert)
{
if (label & temp_labels)
return;
add_label(label_insert);
mod_instr = insert;
}
bool is_insert() { return label & label_insert; }
}; };
struct opt_ctx { struct opt_ctx {
@ -986,7 +975,8 @@ alu_opt_info_is_valid(opt_ctx& ctx, alu_opt_info& info)
if (!info.uses_insert()) { if (!info.uses_insert()) {
info.insert = SubdwordSel::dword; info.insert = SubdwordSel::dword;
} else if (info.defs[0].bytes() != 4 || } else if (info.defs[0].bytes() != 4 ||
(!format_is(info.format, Format::VOP1) && !format_is(info.format, Format::VOP2))) { (!format_is(info.format, Format::VOP1) && !format_is(info.format, Format::VOP2)) ||
ctx.program->gfx_level < GFX8 || ctx.program->gfx_level >= GFX11) {
return false; return false;
} else { } else {
info.format = format_combine(info.format, Format::SDWA); info.format = format_combine(info.format, Format::SDWA);
@ -3018,19 +3008,12 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
case aco_opcode::p_extract: { case aco_opcode::p_extract: {
if (instr->operands[0].isTemp()) { if (instr->operands[0].isTemp()) {
ctx.info[instr->definitions[0].tempId()].set_extract(); ctx.info[instr->definitions[0].tempId()].set_extract();
if (instr->definitions[0].bytes() == 4 && instr->operands[0].regClass() == v1 &&
parse_insert(instr.get()))
ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
} }
break; break;
} }
case aco_opcode::p_insert: { case aco_opcode::p_insert: {
if (instr->operands[0].isTemp()) { if (instr->operands[0].isTemp() && parse_extract(instr.get()))
if (instr->operands[0].regClass() == v1) ctx.info[instr->definitions[0].tempId()].set_extract();
ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
if (parse_extract(instr.get()))
ctx.info[instr->definitions[0].tempId()].set_extract();
}
break; break;
} }
case aco_opcode::v_cvt_f16_f32: { case aco_opcode::v_cvt_f16_f32: {
@ -3600,7 +3583,7 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
instr->valu().clamp = true; instr->valu().clamp = true;
instr->definitions[0].swapTemp(def_info.mod_instr->definitions[0]); instr->definitions[0].swapTemp(def_info.mod_instr->definitions[0]);
ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16; ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_f2f16;
ctx.uses[def_info.mod_instr->definitions[0].tempId()]--; ctx.uses[def_info.mod_instr->definitions[0].tempId()]--;
ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get(); ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get();
ctx.info[def_info.mod_instr->definitions[0].tempId()].parent_instr = def_info.mod_instr; ctx.info[def_info.mod_instr->definitions[0].tempId()].parent_instr = def_info.mod_instr;
@ -3609,45 +3592,34 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
} }
/* Combine a p_insert (or p_extract, in some cases) instruction with instr. /* Combine a p_insert (or p_extract, in some cases) instruction with instr.
* p_insert(instr(...)) -> instr_insert(). * p_insert(parent(...)) -> instr_insert().
*/ */
bool Instruction*
apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr) apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr, Instruction* parent)
{ {
if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1) if (instr->definitions[0].regClass() != v1)
return false; return nullptr;
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()]; SubdwordSel sel = parse_insert(instr.get());
if (!def_info.is_insert()) if (!sel)
return false; return nullptr;
/* if the insert instruction is dead, then the single user of this
* instruction is a different instruction */
if (!ctx.uses[def_info.mod_instr->definitions[0].tempId()])
return false;
/* MADs/FMAs are created later, so we don't have to update the original add */ if (ctx.info[instr->operands[0].tempId()].label & temp_labels)
assert(!ctx.info[instr->definitions[0].tempId()].is_combined()); return nullptr;
SubdwordSel sel = parse_insert(def_info.mod_instr); alu_opt_info parent_info;
assert(sel); if (!alu_opt_gather_info(ctx, parent, parent_info))
return nullptr;
if (!can_use_SDWA(ctx.program->gfx_level, instr, true)) if (parent_info.uses_insert())
return false; return nullptr;
convert_to_SDWA(ctx.program->gfx_level, instr); parent_info.insert = sel;
if (instr->sdwa().dst_sel.size() != 4)
return false;
instr->sdwa().dst_sel = sel;
instr->definitions[0].swapTemp(def_info.mod_instr->definitions[0]); parent_info.defs[0].setTemp(instr->definitions[0].getTemp());
ctx.info[instr->definitions[0].tempId()].label = 0; if (!alu_opt_info_is_valid(ctx, parent_info))
ctx.uses[def_info.mod_instr->definitions[0].tempId()]--; return nullptr;
ctx.info[instr->definitions[0].tempId()].label = 0; return alu_opt_info_to_instr(ctx, parent_info, parent);
ctx.info[def_info.mod_instr->definitions[0].tempId()].parent_instr = def_info.mod_instr;
for (const Definition& def : instr->definitions)
ctx.info[def.tempId()].parent_instr = instr.get();
return true;
} }
/* Remove superfluous extract after ds_read like so: /* Remove superfluous extract after ds_read like so:
@ -3946,8 +3918,8 @@ apply_output_impl(opt_ctx& ctx, aco_ptr<Instruction>& instr, Instruction* parent
if (instr->opcode == aco_opcode::p_extract && if (instr->opcode == aco_opcode::p_extract &&
(parent->isDS() || parent->isSMEM() || parent->isMUBUF() || parent->isFlatLike())) (parent->isDS() || parent->isSMEM() || parent->isMUBUF() || parent->isFlatLike()))
return apply_load_extract(ctx, instr, parent); return apply_load_extract(ctx, instr, parent);
else if (instr->opcode == aco_opcode::p_extract) else if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_insert)
return nullptr; return apply_insert(ctx, instr, parent);
else if (instr->opcode == aco_opcode::v_not_b32) else if (instr->opcode == aco_opcode::v_not_b32)
return apply_v_not(ctx, instr, parent); return apply_v_not(ctx, instr, parent);
else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64)
@ -3969,6 +3941,7 @@ apply_output(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{ {
switch (instr->opcode) { switch (instr->opcode) {
case aco_opcode::p_extract: case aco_opcode::p_extract:
case aco_opcode::p_insert:
case aco_opcode::v_not_b32: case aco_opcode::v_not_b32:
case aco_opcode::s_not_b32: case aco_opcode::s_not_b32:
case aco_opcode::s_not_b64: case aco_opcode::s_not_b64:
@ -4244,7 +4217,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (instr->isVALU()) { if (instr->isVALU()) {
while (apply_omod_clamp(ctx, instr) || combine_output_conversion(ctx, instr)) while (apply_omod_clamp(ctx, instr) || combine_output_conversion(ctx, instr))
; ;
apply_insert(ctx, instr);
} }
if (instr->isDPP()) if (instr->isDPP())