aco: use -1.0*x and 1.0*|x| for fneg/fabs

Besides -1.0*x being 1 dword smaller than x^0x80000000, this commit also
improves generated code when the application requires that denormals are
flushed.

Future versions of DXVK will require that 32-bit denormals are flushed.

fossil-db (GFX8):
Totals from 21021 (14.22% of 147787) affected shaders:
SGPRs: 1288960 -> 1288944 (-0.00%); split: -0.01%, +0.01%
VGPRs: 792672 -> 792848 (+0.02%); split: -0.01%, +0.03%
CodeSize: 62439228 -> 62403552 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 136182 -> 136181 (-0.00%); split: +0.00%, -0.00%
Instrs: 12230882 -> 12239927 (+0.07%); split: -0.01%, +0.08%

fossil-db (GFX10.3):
Totals from 20191 (13.80% of 146267) affected shaders:
VGPRs: 799992 -> 800032 (+0.01%)
CodeSize: 59763656 -> 59715484 (-0.08%); split: -0.12%, +0.03%
MaxWaves: 525378 -> 525376 (-0.00%)
Instrs: 11511082 -> 11517419 (+0.06%); split: -0.00%, +0.06%

fossil-db (GFX8, d3d float controls):
Totals from 87160 (58.98% of 147787) affected shaders:
SGPRs: 5395072 -> 5408480 (+0.25%); split: -0.06%, +0.31%
VGPRs: 3596716 -> 3581592 (-0.42%); split: -0.55%, +0.13%
CodeSize: 271347396 -> 266814460 (-1.67%); split: -1.67%, +0.00%
MaxWaves: 539669 -> 540400 (+0.14%); split: +0.15%, -0.02%
Instrs: 53395194 -> 52257505 (-2.13%); split: -2.13%, +0.00%

fossil-db (GFX10.3, d3d float controls):
Totals from 82306 (56.27% of 146267) affected shaders:
VGPRs: 3572312 -> 3558848 (-0.38%); split: -0.44%, +0.06%
CodeSize: 273494748 -> 269648968 (-1.41%); split: -1.41%, +0.00%
MaxWaves: 2007156 -> 2009950 (+0.14%); split: +0.15%, -0.01%
Instrs: 52251568 -> 51356424 (-1.71%); split: -1.71%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9079>
This commit is contained in:
Rhys Perry 2020-06-25 11:44:26 +01:00 committed by Marge Bot
parent 561fcfb50f
commit e3c283e0bc
3 changed files with 174 additions and 115 deletions

View file

@ -2073,13 +2073,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
if (ctx->block->fp_mode.must_flush_denorms16_64)
src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand((uint16_t)0xbc00u), as_vgpr(ctx, src));
} else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0xbf800000u), as_vgpr(ctx, src));
} else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64)
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src));
@ -2095,13 +2091,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
case nir_op_fabs: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
if (ctx->block->fp_mode.must_flush_denorms16_64)
src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
Instruction *mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand((uint16_t)0x3c00), as_vgpr(ctx, src)).instr;
mul->vop3().abs[1] = true;
} else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
Instruction *mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), as_vgpr(ctx, src)).instr;
mul->vop3().abs[1] = true;
} else if (dst.regClass() == v2) {
if (ctx->block->fp_mode.must_flush_denorms16_64)
src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src));

View file

@ -117,7 +117,8 @@ enum Label {
label_fcanonicalize = 1 << 28,
label_constant_16bit = 1 << 29,
label_usedef = 1 << 30, /* generic label */
label_vop3p = 1 << 31,
label_vop3p = 1ull << 31, /* 1ull to prevent sign extension */
label_canonicalized = 1ull << 32,
};
static constexpr uint64_t instr_usedef_labels = label_vec | label_mul | label_mad | label_add_sub | label_vop3p |
@ -524,10 +525,21 @@ struct ssa_info {
return label & label_fcanonicalize;
}
void set_canonicalized()
{
add_label(label_canonicalized);
}
/* Returns true if label_canonicalized has been set on this SSA value. */
bool is_canonicalized()
{
   return (label & label_canonicalized) != 0;
}
};
struct opt_ctx {
Program* program;
float_mode fp_mode;
std::vector<aco_ptr<Instruction>> instructions;
ssa_info* info;
std::pair<uint32_t,Temp> last_literal;
@ -908,12 +920,40 @@ bool does_fp_op_flush_denorms(opt_ctx &ctx, aco_opcode op)
return op != aco_opcode::v_cndmask_b32;
}
bool can_eliminate_fcanonicalize(opt_ctx &ctx, aco_opcode op)
/* Decide whether an fcanonicalize of `tmp` feeding `instr` can be skipped,
 * i.e. whether `instr` may read `tmp` directly.
 *
 * This is allowed when:
 *  - `tmp` is already labelled as canonicalized, or
 *  - the denormal mode for the size of `tmp` is fp_denorm_keep, or
 *  - `instr` is an opcode that accepts input modifiers and that
 *    does_fp_op_flush_denorms() reports as flushing denormals.
 */
bool can_eliminate_fcanonicalize(opt_ctx &ctx, aco_ptr<Instruction>& instr, Temp tmp)
{
   if (ctx.info[tmp.id()].is_canonicalized())
      return true;

   /* 32-bit values have their own denormal mode; every other size shares
    * the 16/64-bit one. */
   const float_mode &mode = ctx.fp_mode;
   if ((tmp.bytes() == 4 ? mode.denorm32 : mode.denorm16_64) == fp_denorm_keep)
      return true;

   const aco_opcode opcode = instr->opcode;
   if (!instr_info.can_use_input_modifiers[(int)opcode])
      return false;
   return does_fp_op_flush_denorms(ctx, opcode);
}
void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
/* Returns true if `info` describes a value that `instr` may replace with
 * info.temp: either a plain temp copy, or an fcanonicalize that
 * can_eliminate_fcanonicalize() says is removable for this instruction. */
bool is_copy_label(opt_ctx &ctx, aco_ptr<Instruction>& instr, ssa_info& info)
{
   if (info.is_temp())
      return true;
   if (!info.is_fcanonicalize())
      return false;
   return can_eliminate_fcanonicalize(ctx, instr, info.temp);
}
/* Returns true if operand `op` is known not to need canonicalization.
 *
 * That is the case when:
 *  - it is a temp labelled as canonicalized, or
 *  - the denormal mode for its size is fp_denorm_keep, or
 *  - its value is known (inline constant, or a temp labelled as a 32-bit
 *    constant/literal) and the bit pattern is not a denormal.
 *
 * Known values are only classified for 2-byte and 4-byte operands; any
 * other size yields false.
 */
bool is_op_canonicalized(opt_ctx &ctx, Operand op)
{
   const bool is_temp = op.isTemp();
   if (is_temp && ctx.info[op.tempId()].is_canonicalized())
      return true;

   const float_mode &mode = ctx.fp_mode;
   const auto size = op.bytes();
   if ((size == 4 ? mode.denorm32 : mode.denorm16_64) == fp_denorm_keep)
      return true;

   const bool value_known =
      op.isConstant() || (is_temp && ctx.info[op.tempId()].is_constant_or_literal(32));
   if (!value_known)
      return false;

   const uint32_t bits = is_temp ? ctx.info[op.tempId()].val : op.constantValue();
   switch (size) {
   case 2: {
      /* Drop the sign bit: zero is fine, and anything above the 10-bit
       * mantissa range has a non-zero exponent. */
      const uint32_t magnitude = bits & 0x7fff;
      return magnitude == 0 || magnitude > 0x3ff;
   }
   case 4: {
      /* Same test with the 23-bit mantissa of a 32-bit float. */
      const uint32_t magnitude = bits & 0x7fffffff;
      return magnitude == 0 || magnitude > 0x7fffff;
   }
   default:
      return false;
   }
}
void label_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) {
ASSERTED bool all_const = false;
@ -962,8 +1002,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
/* VALU: propagate neg, abs & inline constants */
else if (instr->isVALU()) {
bool is_fp = can_eliminate_fcanonicalize(ctx, instr->opcode);
if ((info.is_temp() || (info.is_fcanonicalize() && is_fp)) && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) {
if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) {
instr->operands[i].setTemp(info.temp);
info = ctx.info[info.temp.id()];
}
@ -982,7 +1021,24 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
else
can_use_mod = can_use_mod && (instr->isDPP() || can_use_VOP3(ctx, instr));
if (info.is_abs() && can_use_mod) {
if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) {
instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
instr->operands[i].setTemp(info.temp);
} else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) {
instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
instr->operands[i].setTemp(info.temp);
} else if (info.is_neg() && can_use_mod && can_eliminate_fcanonicalize(ctx, instr, info.temp)) {
if (!instr->isDPP() && !instr->isSDWA())
to_VOP3(ctx, instr);
instr->operands[i].setTemp(info.temp);
if (instr->isDPP() && !instr->dpp().abs[i])
instr->dpp().neg[i] = true;
else if (instr->isSDWA() && !instr->sdwa().abs[i])
instr->sdwa().neg[i] = true;
else if (instr->isVOP3() && !instr->vop3().abs[i])
instr->vop3().neg[i] = true;
}
if (info.is_abs() && can_use_mod && can_eliminate_fcanonicalize(ctx, instr, info.temp)) {
if (!instr->isDPP() && !instr->isSDWA())
to_VOP3(ctx, instr);
instr->operands[i] = Operand(info.temp);
@ -992,25 +1048,6 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
instr->sdwa().abs[i] = true;
else
instr->vop3().abs[i] = true;
}
if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) {
instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
instr->operands[i].setTemp(info.temp);
continue;
} else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) {
instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
instr->operands[i].setTemp(info.temp);
continue;
} else if (info.is_neg() && can_use_mod) {
if (!instr->isDPP() && !instr->isSDWA())
to_VOP3(ctx, instr);
instr->operands[i].setTemp(info.temp);
if (instr->isDPP())
instr->dpp().neg[i] = true;
else if (instr->isSDWA())
instr->sdwa().neg[i] = true;
else
instr->vop3().neg[i] = true;
continue;
}
unsigned bits = get_operand_size(instr, i);
@ -1164,13 +1201,27 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
if (instr->definitions.empty())
return;
if (instr->isVOPC()) {
ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
return;
}
if (instr->isVOP3P()) {
ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
return;
if (instr->isVALU() || instr->isVINTRP()) {
if (instr_info.can_use_output_modifiers[(int)instr->opcode] || instr->isVINTRP() ||
instr->opcode == aco_opcode::v_cndmask_b32) {
bool canonicalized = true;
if (!does_fp_op_flush_denorms(ctx, instr->opcode)) {
unsigned ops = instr->opcode == aco_opcode::v_cndmask_b32 ? 2 : instr->operands.size();
for (unsigned i = 0; canonicalized && (i < ops); i++)
canonicalized = is_op_canonicalized(ctx, instr->operands[i]);
}
if (canonicalized)
ctx.info[instr->definitions[0].tempId()].set_canonicalized();
}
if (instr->isVOPC()) {
ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
return;
}
if (instr->isVOP3P()) {
ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
return;
}
}
switch (instr->opcode) {
@ -1324,6 +1375,8 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, instr->operands[0].constantValue64());
} else if (instr->operands[0].isTemp()) {
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
if (ctx.info[instr->operands[0].tempId()].is_canonicalized())
ctx.info[instr->definitions[0].tempId()].set_canonicalized();
} else {
assert(instr->operands[0].isFixed());
}
@ -1337,27 +1390,42 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
/* TODO: try to move the negate/abs modifier to the consumer instead */
if (instr->usesModifiers())
break;
bool uses_mods = instr->usesModifiers();
bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
for (unsigned i = 0; i < 2; i++) {
if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
if (instr->operands[!i].constantValue() == (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
if (!instr->isDPP() && !instr->isSDWA() &&
(instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */
instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */
bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u);
VOP3_instruction *vop3 = instr->isVOP3() ? &instr->vop3() : NULL;
if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod))
continue;
bool abs = vop3 && vop3->abs[i];
bool neg = neg1 ^ (vop3 && vop3->neg[i]);
Temp other = instr->operands[i].getTemp();
if (abs && neg && other.type() == RegType::vgpr)
ctx.info[instr->definitions[0].tempId()].set_neg_abs(other);
else if (abs && !neg && other.type() == RegType::vgpr)
ctx.info[instr->definitions[0].tempId()].set_abs(other);
else if (!abs && neg && other.type() == RegType::vgpr)
ctx.info[instr->definitions[0].tempId()].set_neg(other);
else if (!abs && !neg)
ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other);
} else if (uses_mods) {
continue;
} else if (instr->operands[!i].constantValue() == (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());
} else if (instr->operands[!i].constantValue() == (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());
} else if (instr->operands[!i].constantValue() == (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */
ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());
} else if (instr->operands[!i].constantValue() == (fp16 ? 0x3c00 : 0x3f800000) &&
!(fp16 ? block.fp_mode.must_flush_denorms16_64 : block.fp_mode.must_flush_denorms32) &&
!instr->definitions[0].isPrecise()) { /* 1.0 */
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp());
} else if (instr->operands[!i].constantValue() == (fp16 ? 0x3c00 : 0x3f800000)) {
ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(instr->operands[i].getTemp());
} else if (instr->operands[!i].constantValue() == 0u &&
!(fp16 ? block.fp_mode.preserve_signed_zero_inf_nan16_64 : block.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */
!(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64 : ctx.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u);
} else {
continue;
@ -1376,36 +1444,6 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
case aco_opcode::v_mul_u32_u24:
ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
break;
case aco_opcode::v_and_b32: { /* abs */
if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
instr->operands[1].getTemp().type() == RegType::vgpr &&
((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x7FFFFFFFu)) ||
(instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x7FFFu))))
ctx.info[instr->definitions[0].tempId()].set_abs(instr->operands[1].getTemp());
else
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
break;
}
case aco_opcode::v_xor_b32: { /* neg */
if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
((instr->definitions[0].bytes() == 4 && instr->operands[0].constantEquals(0x80000000u)) ||
(instr->definitions[0].bytes() == 2 && instr->operands[0].constantEquals(0x8000u)))) {
if (ctx.info[instr->operands[1].tempId()].is_neg()) {
ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp);
} else if (instr->operands[1].getTemp().type() == RegType::vgpr) {
if (ctx.info[instr->operands[1].tempId()].is_abs()) { /* neg(abs(x)) */
instr->operands[1].setTemp(ctx.info[instr->operands[1].tempId()].temp);
instr->opcode = aco_opcode::v_or_b32;
ctx.info[instr->definitions[0].tempId()].set_neg_abs(instr->operands[1].getTemp());
} else {
ctx.info[instr->definitions[0].tempId()].set_neg(instr->operands[1].getTemp());
}
}
} else {
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
}
break;
}
case aco_opcode::v_med3_f16:
case aco_opcode::v_med3_f32: { /* clamp */
VOP3_instruction& vop3 = instr->vop3();
@ -1529,6 +1567,8 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
case aco_opcode::v_or_b32:
case aco_opcode::v_lshlrev_b32:
case aco_opcode::v_bcnt_u32_b32:
case aco_opcode::v_and_b32:
case aco_opcode::v_xor_b32:
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
break;
case aco_opcode::v_min_f32:
@ -1564,6 +1604,12 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
}
break;
case aco_opcode::s_mul_i32:
/* Testing every uint32_t shows that 0x3f800000*n is never a denormal.
* This pattern is created from a uniform nir_op_b2f. */
if (instr->operands[0].constantEquals(0x3f800000u))
ctx.info[instr->definitions[0].tempId()].set_canonicalized();
break;
default:
break;
}
@ -2574,8 +2620,7 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr<Instruction>& instr)
sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
}
ssa_info& info = ctx.info[instr->operands[i].tempId()];
if ((info.is_temp() || (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr->opcode))) &&
info.temp.type() == RegType::sgpr)
if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::sgpr)
operand_mask |= 1u << i;
}
unsigned max_sgprs = 1;
@ -2656,7 +2701,7 @@ bool apply_omod_clamp_helper(opt_ctx &ctx, T *instr, ssa_info& def_info)
}
/* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
bool apply_omod_clamp(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
!instr_info.can_use_output_modifiers[(int)instr->opcode])
@ -2669,11 +2714,11 @@ bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
/* omod flushes -0 to +0 and has no effect if denormals are enabled */
bool can_use_omod = (can_vop3 || ctx.program->chip_class >= GFX9); /* SDWA omod is GFX9+ */
if (instr->definitions[0].bytes() == 4)
can_use_omod = can_use_omod && block.fp_mode.denorm32 == 0 &&
!block.fp_mode.preserve_signed_zero_inf_nan32;
can_use_omod = can_use_omod && ctx.fp_mode.denorm32 == 0 &&
!ctx.fp_mode.preserve_signed_zero_inf_nan32;
else
can_use_omod = can_use_omod && block.fp_mode.denorm16_64 == 0 &&
!block.fp_mode.preserve_signed_zero_inf_nan16_64;
can_use_omod = can_use_omod && ctx.fp_mode.denorm16_64 == 0 &&
!ctx.fp_mode.preserve_signed_zero_inf_nan16_64;
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
@ -2813,7 +2858,7 @@ void propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opse
}
}
void combine_vop3p(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
void combine_vop3p(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
VOP3P_instruction* vop3p = &instr->vop3p();
@ -2950,7 +2995,7 @@ void combine_vop3p(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
// TODO: we could possibly move the whole label_instruction pass to combine_instruction:
// this would mean that we'd have to fix the instruction uses while value propagation
void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
void combine_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
return;
@ -2958,11 +3003,11 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
if (instr->isVALU()) {
if (can_apply_sgprs(ctx, instr))
apply_sgprs(ctx, instr);
while (apply_omod_clamp(ctx, block, instr)) ;
while (apply_omod_clamp(ctx, instr)) ;
}
if (instr->isVOP3P())
return combine_vop3p(ctx, block, instr);
return combine_vop3p(ctx, instr);
if (ctx.info[instr->definitions[0].tempId()].is_vcc_hint()) {
instr->definitions[0].setHint(vcc);
@ -3030,8 +3075,8 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
instr->opcode == aco_opcode::v_sub_f16 ||
instr->opcode == aco_opcode::v_subrev_f16;
if (mad16 || mad32) {
bool need_fma = mad32 ? (block.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) :
(block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
bool need_fma = mad32 ? (ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) :
(ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
if (need_fma && instr->definitions[0].isPrecise())
return;
if (need_fma && mad32 && !ctx.program->dev.has_fast_fma32)
@ -3630,21 +3675,24 @@ void optimize(Program* program)
/* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
for (Block& block : program->blocks) {
ctx.fp_mode = block.fp_mode;
for (aco_ptr<Instruction>& instr : block.instructions)
label_instruction(ctx, block, instr);
label_instruction(ctx, instr);
}
ctx.uses = dead_code_analysis(program);
/* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
for (Block& block : program->blocks) {
ctx.fp_mode = block.fp_mode;
for (aco_ptr<Instruction>& instr : block.instructions)
combine_instruction(ctx, block, instr);
combine_instruction(ctx, instr);
}
/* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend(); ++block_rit) {
Block* block = &(*block_rit);
ctx.fp_mode = block->fp_mode;
for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend(); ++instr_rit)
select_instruction(ctx, *instr_rit);
}
@ -3652,6 +3700,7 @@ void optimize(Program* program)
/* 4. Add literals to instructions */
for (Block& block : program->blocks) {
ctx.instructions.clear();
ctx.fp_mode = block.fp_mode;
for (aco_ptr<Instruction>& instr : block.instructions)
apply_literals(ctx, instr);
block.instructions.swap(ctx.instructions);

View file

@ -25,6 +25,18 @@
using namespace aco;
/* Test helper: emit a 32-bit float negation of `src` as v_mul_f32 by the
 * constant 0xbf800000 (-1.0f) and return the result. */
Temp fneg(Temp src)
{
   Temp negated = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf800000u), src);
   return negated;
}
/* Test helper: emit a 32-bit float absolute value of `src` as v_mul_f32 by
 * the constant 0x3f800000 (1.0f), with the abs modifier set on the second
 * operand (`src`), and return the result. */
Temp fabs(Temp src)
{
   /* The e64 form is built so the instruction has VOP3 modifier fields. */
   Builder::Result mul =
      bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), src);
   mul.instr->vop3().abs[1] = true;
   return mul;
}
BEGIN_TEST(optimize.neg)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
@ -33,31 +45,30 @@ BEGIN_TEST(optimize.neg)
//! v1: %res0 = v_mul_f32 %a, -%b
//! p_unit_test 0, %res0
Temp neg_b = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), inputs[1]);
Temp neg_b = fneg(inputs[1]);
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
//! v1: %neg_a = v_xor_b32 0x80000000, %a
//~gfx[6-9]! v1: %res1 = v_mul_f32 0x123456, %neg_a
//~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
//~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
//~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
//! p_unit_test 1, %res1
Temp neg_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), inputs[0]);
Temp neg_a = fneg(inputs[0]);
writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x123456u), neg_a));
//! v1: %res2 = v_mul_f32 %a, %b
//! p_unit_test 2, %res2
Temp neg_neg_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), neg_a);
Temp neg_neg_a = fneg(neg_a);
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
/* we could optimize this case into just an abs(), but NIR already does this */
//! v1: %res3 = v_mul_f32 |%neg_a|, %b
//! v1: %res3 = v_mul_f32 |%a|, %b
//! p_unit_test 3, %res3
Temp abs_neg_a = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), neg_a);
Temp abs_neg_a = fabs(neg_a);
writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
//! v1: %res4 = v_mul_f32 -|%a|, %b
//! p_unit_test 4, %res4
Temp abs_a = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), inputs[0]);
Temp neg_abs_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), abs_a);
Temp abs_a = fabs(inputs[0]);
Temp neg_abs_a = fneg(abs_a);
writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
//! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
@ -74,9 +85,14 @@ BEGIN_TEST(optimize.neg)
//! v1: %res8 = v_mul_f32 %a, -%c
//! p_unit_test 8, %res8
Temp neg_c = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), bld.copy(bld.def(v1), inputs[2]));
Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
// //! v1: %res9 = v_mul_f32 |%neg_a|, %b
// //! p_unit_test 9, %res9
Temp abs_neg_abs_a = fabs(neg_abs_a);
writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));
finish_opt_test();
}
END_TEST
@ -750,22 +766,22 @@ BEGIN_TEST(optimize.add3)
END_TEST
BEGIN_TEST(optimize.minmax)
for (unsigned i = GFX8; i <= GFX10; i++) {
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a = p_startpgm
if (!setup_cs("v1", (chip_class)i))
continue;
//! v1: %res0 = v_max3_f32 0, -0, %a
//! p_unit_test 0, %res0
Temp xor0 = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), Operand(inputs[0]));
Temp xor0 = fneg(inputs[0]);
Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0u), xor0);
Temp xor1 = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), min);
Temp xor1 = fneg(min);
writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), xor1));
//! v1: %res1 = v_max3_f32 0, -0, -%a
//! p_unit_test 1, %res1
min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0u), Operand(inputs[0]));
xor1 = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), min);
xor1 = fneg(min);
writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), xor1));
finish_opt_test();