diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index d22059574ad..75184742eda 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1271,10 +1271,6 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) assert(dst.regClass() == v1b || dst.regClass() == v2b); bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11; - /* We need the v_perm_b32 (VOP3) to be able to take literals, and that's a GFX10+ feature. */ - bool can_use_perm = ctx->program->gfx_level >= GFX10 && - (op.constantEquals(0) || op.constantEquals(0xff) || - op.constantEquals(0xffff) || op.constantEquals(0xff00)); if (dst.regClass() == v1b && use_sdwa) { uint8_t val = op.constantValue(); Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u)); @@ -1303,24 +1299,10 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) } else { bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero()); } - } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10 && - (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in)) { - if (dst.physReg().byte() == 2) { - Operand def_lo(dst.physReg().advance(-2), v2b); - Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, def_lo, op); - instr->valu().opsel = 0; - } else { - assert(dst.physReg().byte() == 0); - Operand def_hi(dst.physReg().advance(2), v2b); - Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi); - instr->valu().opsel = 2; - } - } else if (can_use_perm) { - uint8_t swiz[] = {4, 5, 6, 7}; - swiz[dst.physReg().byte()] = op.constantValue() & 0xff ? bperm_255 : bperm_0; - if (dst.bytes() == 2) - swiz[dst.physReg().byte() + 1] = op.constantValue() >> 8 ? bperm_255 : bperm_0; - create_bperm(bld, swiz, dst, Operand::zero()); + } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) { + op = Operand::c32(op.constantValue()); + Instruction* instr = bld.vop3(aco_opcode::v_add_u16_e64, dst, op, Operand::c32(0)); + instr->valu().opsel[3] = dst.physReg().byte() == 2; } else { uint32_t offset = dst.physReg().byte() * 8u; uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset; diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index fad4010273e..f8eb019d4f6 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -268,7 +268,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //! p_unit_test 7 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0] - //~gfx10! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32]) + //~gfx10! v2b: %_:v[0][0:16] = v_add_u16_e64 0x4205, 0 //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0x4205 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x4205)); @@ -276,7 +276,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //! p_unit_test 8 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0x42050000, %_:v[0] - //~gfx10! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205 + //~gfx10! v2b: %_:v[0][16:32] = v_add_u16_e64 0x4205, 0 opsel_hi //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0x4205 opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0x4205)); @@ -323,7 +323,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //>> p_unit_test 13 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0xff, %_:v[0] - //~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d + //~gfx10! v2b: %_:v[0][0:16] = v_add_u16_e64 0xff, 0 //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0xff bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x00ff)); @@ -331,7 +331,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //! p_unit_test 14 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0xff000000, %_:v[0] - //~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504 + //~gfx10! v2b: %_:v[0][16:32] = v_add_u16_e64 0xff00, 0 opsel_hi //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0xffffff00 opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0xff00));