From 468ee8b80c7ffc03017d031df10875219430098e Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 1 Dec 2023 16:20:38 +0000 Subject: [PATCH] aco: implement 16-bit fsat on GFX8 GFX8 doesn't have v_med3_f16. Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 6 +++++- src/amd/compiler/aco_optimizer.cpp | 13 +++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index e18b205f222..fe11ad16a23 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -2631,9 +2631,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) break; } Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.regClass() == v2b) { + if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) { bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00), src); + } else if (dst.regClass() == v2b) { + bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), src) + ->valu() + .clamp = true; } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(), Operand::c32(0x3f800000u), src); diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 9fdbffc7994..cd8e8bf787e 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1890,13 +1890,19 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u); VALU_instruction* vop3 = instr->isVOP3() ? 
&instr->valu() : NULL; - if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod)) + if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->omod)) continue; bool abs = vop3 && vop3->abs[i]; bool neg = neg1 ^ (vop3 && vop3->neg[i]); - Temp other = instr->operands[i].getTemp(); + + if (vop3 && vop3->clamp) { + if (!abs && !neg && other.type() == RegType::vgpr) + ctx.info[other.id()].set_clamp(instr.get()); + continue; + } + if (abs && neg && other.type() == RegType::vgpr) ctx.info[instr->definitions[0].tempId()].set_neg_abs(other); else if (abs && !neg && other.type() == RegType::vgpr) @@ -4562,6 +4568,9 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) ctx.mad_infos.emplace_back(nullptr, 0); ctx.info[instr->definitions[0].tempId()].set_mad(ctx.mad_infos.size() - 1); } else if (instr->opcode == aco_opcode::v_med3_f32 || instr->opcode == aco_opcode::v_med3_f16) { + /* Optimize v_med3 to v_add so that it can be dual issued on GFX11. We start with v_med3 in + * case omod can be applied. + */ unsigned idx; if (detect_clamp(instr.get(), &idx)) { instr->format = asVOP3(Format::VOP2);