aco/optimizer: use new helpers to apply insert

Foz-DB Navi21:
Totals from 505 (0.52% of 97591) affected shaders:
Instrs: 1438254 -> 1436780 (-0.10%); split: -0.11%, +0.01%
CodeSize: 8063364 -> 8054192 (-0.11%); split: -0.13%, +0.01%
Latency: 18596788 -> 18597262 (+0.00%); split: -0.01%, +0.01%
InvThroughput: 5213861 -> 5213061 (-0.02%); split: -0.02%, +0.01%
VClause: 37121 -> 37130 (+0.02%)
Copies: 174744 -> 175222 (+0.27%); split: -0.07%, +0.34%
Branches: 65722 -> 65718 (-0.01%)
VALU: 912967 -> 911074 (-0.21%); split: -0.21%, +0.00%
SALU: 251045 -> 251560 (+0.21%); split: -0.01%, +0.21%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38658>
This commit is contained in:
Georg Lehmann 2025-01-09 22:04:42 +01:00 committed by Marge Bot
parent d60ce9ceef
commit ee28801eae

View file

@ -70,12 +70,11 @@ enum Label {
label_omod4 = 1ull << 34,
label_omod5 = 1ull << 35,
label_clamp = 1ull << 36,
label_insert = 1ull << 38,
label_f2f16 = 1ull << 39,
};
static constexpr uint64_t instr_mod_labels =
label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16;
label_omod2 | label_omod4 | label_omod5 | label_clamp | label_f2f16;
static constexpr uint64_t input_mod_labels =
label_abs_fp16 | label_abs_fp32_64 | label_neg_fp16 | label_neg_fp32_64;
@ -292,16 +291,6 @@ struct ssa_info {
void set_extract() { add_label(label_extract); }
bool is_extract() { return label & label_extract; }
void set_insert(Instruction* insert)
{
if (label & temp_labels)
return;
add_label(label_insert);
mod_instr = insert;
}
bool is_insert() { return label & label_insert; }
};
struct opt_ctx {
@ -986,7 +975,8 @@ alu_opt_info_is_valid(opt_ctx& ctx, alu_opt_info& info)
if (!info.uses_insert()) {
info.insert = SubdwordSel::dword;
} else if (info.defs[0].bytes() != 4 ||
(!format_is(info.format, Format::VOP1) && !format_is(info.format, Format::VOP2))) {
(!format_is(info.format, Format::VOP1) && !format_is(info.format, Format::VOP2)) ||
ctx.program->gfx_level < GFX8 || ctx.program->gfx_level >= GFX11) {
return false;
} else {
info.format = format_combine(info.format, Format::SDWA);
@ -3018,19 +3008,12 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
case aco_opcode::p_extract: {
if (instr->operands[0].isTemp()) {
ctx.info[instr->definitions[0].tempId()].set_extract();
if (instr->definitions[0].bytes() == 4 && instr->operands[0].regClass() == v1 &&
parse_insert(instr.get()))
ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
}
break;
}
case aco_opcode::p_insert: {
if (instr->operands[0].isTemp()) {
if (instr->operands[0].regClass() == v1)
ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
if (parse_extract(instr.get()))
ctx.info[instr->definitions[0].tempId()].set_extract();
}
if (instr->operands[0].isTemp() && parse_extract(instr.get()))
ctx.info[instr->definitions[0].tempId()].set_extract();
break;
}
case aco_opcode::v_cvt_f16_f32: {
@ -3600,7 +3583,7 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
instr->valu().clamp = true;
instr->definitions[0].swapTemp(def_info.mod_instr->definitions[0]);
ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16;
ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_f2f16;
ctx.uses[def_info.mod_instr->definitions[0].tempId()]--;
ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get();
ctx.info[def_info.mod_instr->definitions[0].tempId()].parent_instr = def_info.mod_instr;
@ -3609,45 +3592,34 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
}
/* Combine a p_insert (or p_extract, in some cases) instruction with instr.
* p_insert(instr(...)) -> instr_insert().
* p_insert(parent(...)) -> instr_insert().
*/
bool
apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
Instruction*
apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr, Instruction* parent)
{
if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1)
return false;
if (instr->definitions[0].regClass() != v1)
return nullptr;
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
if (!def_info.is_insert())
return false;
/* if the insert instruction is dead, then the single user of this
* instruction is a different instruction */
if (!ctx.uses[def_info.mod_instr->definitions[0].tempId()])
return false;
SubdwordSel sel = parse_insert(instr.get());
if (!sel)
return nullptr;
/* MADs/FMAs are created later, so we don't have to update the original add */
assert(!ctx.info[instr->definitions[0].tempId()].is_combined());
if (ctx.info[instr->operands[0].tempId()].label & temp_labels)
return nullptr;
SubdwordSel sel = parse_insert(def_info.mod_instr);
assert(sel);
alu_opt_info parent_info;
if (!alu_opt_gather_info(ctx, parent, parent_info))
return nullptr;
if (!can_use_SDWA(ctx.program->gfx_level, instr, true))
return false;
if (parent_info.uses_insert())
return nullptr;
convert_to_SDWA(ctx.program->gfx_level, instr);
if (instr->sdwa().dst_sel.size() != 4)
return false;
instr->sdwa().dst_sel = sel;
parent_info.insert = sel;
instr->definitions[0].swapTemp(def_info.mod_instr->definitions[0]);
ctx.info[instr->definitions[0].tempId()].label = 0;
ctx.uses[def_info.mod_instr->definitions[0].tempId()]--;
ctx.info[instr->definitions[0].tempId()].label = 0;
ctx.info[def_info.mod_instr->definitions[0].tempId()].parent_instr = def_info.mod_instr;
for (const Definition& def : instr->definitions)
ctx.info[def.tempId()].parent_instr = instr.get();
return true;
parent_info.defs[0].setTemp(instr->definitions[0].getTemp());
if (!alu_opt_info_is_valid(ctx, parent_info))
return nullptr;
return alu_opt_info_to_instr(ctx, parent_info, parent);
}
/* Remove superfluous extract after ds_read like so:
@ -3946,8 +3918,8 @@ apply_output_impl(opt_ctx& ctx, aco_ptr<Instruction>& instr, Instruction* parent
if (instr->opcode == aco_opcode::p_extract &&
(parent->isDS() || parent->isSMEM() || parent->isMUBUF() || parent->isFlatLike()))
return apply_load_extract(ctx, instr, parent);
else if (instr->opcode == aco_opcode::p_extract)
return nullptr;
else if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_insert)
return apply_insert(ctx, instr, parent);
else if (instr->opcode == aco_opcode::v_not_b32)
return apply_v_not(ctx, instr, parent);
else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64)
@ -3969,6 +3941,7 @@ apply_output(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
switch (instr->opcode) {
case aco_opcode::p_extract:
case aco_opcode::p_insert:
case aco_opcode::v_not_b32:
case aco_opcode::s_not_b32:
case aco_opcode::s_not_b64:
@ -4244,7 +4217,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (instr->isVALU()) {
while (apply_omod_clamp(ctx, instr) || combine_output_conversion(ctx, instr))
;
apply_insert(ctx, instr);
}
if (instr->isDPP())