diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index ffb47115836..076e850c584 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -795,8 +795,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       encoding |= dpp.neg[1] << 22;
       encoding |= dpp.abs[0] << 21;
       encoding |= dpp.neg[0] << 20;
-      if (ctx.gfx_level >= GFX10)
-         encoding |= 1 << 18; /* set Fetch Inactive */
+      encoding |= dpp.fetch_inactive << 18;
       encoding |= dpp.bound_ctrl << 19;
       encoding |= dpp.dpp_ctrl << 8;
       encoding |= reg(ctx, dpp_op, 8);
@@ -809,7 +808,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
 
       /* first emit the instruction without the DPP operand */
       Operand dpp_op = instr->operands[0];
-      instr->operands[0] = Operand(PhysReg{234}, v1);
+      instr->operands[0] = Operand(PhysReg{233u + dpp.fetch_inactive}, v1);
       instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP8);
       emit_instruction(ctx, out, instr);
       uint32_t encoding = reg(ctx, dpp_op, 8);
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index d41a0e489dc..9cfa5b8b230 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -456,11 +456,13 @@ convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
    if (dpp8) {
       DPP8_instruction* dpp = &instr->dpp8();
       dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
+      dpp->fetch_inactive = gfx_level >= GFX10;
    } else {
       DPP16_instruction* dpp = &instr->dpp16();
       dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
       dpp->row_mask = 0xf;
       dpp->bank_mask = 0xf;
+      dpp->fetch_inactive = gfx_level >= GFX10;
    }
 
    instr->valu().neg = tmp->valu().neg;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 2009dd8193e..7874096da8a 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1456,13 +1456,15 @@ struct DPP16_instruction : public VALU_instruction {
    uint8_t row_mask : 4;
    uint8_t bank_mask : 4;
    bool bound_ctrl : 1;
-   uint8_t padding3 : 7;
+   uint8_t fetch_inactive : 1;
+   uint8_t padding3 : 6;
 };
 static_assert(sizeof(DPP16_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
 
 struct DPP8_instruction : public VALU_instruction {
    uint32_t lane_sel : 24;
-   uint32_t padding : 8;
+   uint32_t fetch_inactive : 1;
+   uint32_t padding : 7;
 };
 static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
 
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 3ebcc1c7ce8..86c75115c1d 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -160,9 +160,11 @@ class Format(Enum):
          return [('uint16_t', 'dpp_ctrl', None),
                  ('uint8_t', 'row_mask', '0xF'),
                  ('uint8_t', 'bank_mask', '0xF'),
-                 ('bool', 'bound_ctrl', 'true')]
+                 ('bool', 'bound_ctrl', 'true'),
+                 ('bool', 'fetch_inactive', 'true')]
       elif self == Format.DPP8:
-         return [('uint32_t', 'lane_sel', 0)]
+         return [('uint32_t', 'lane_sel', 0),
+                 ('bool', 'fetch_inactive', 'true')]
       elif self == Format.VOP3P:
          return [('uint8_t', 'opsel_lo', None),
                  ('uint8_t', 'opsel_hi', None)]
@@ -194,6 +196,8 @@ class Format(Enum):
          for i in range(min(num_operands, 2)):
            res += 'instr->sel[{0}] = SubdwordSel(op{0}.op.bytes(), 0, false);'.format(i)
          res += 'instr->dst_sel = SubdwordSel(def0.bytes(), 0, false);\n'
+      elif self in [Format.DPP16, Format.DPP8]:
+         res += 'instr->fetch_inactive &= program->gfx_level >= GFX10;\n'
       return res
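Note on the encoding change above: DPP16 carries Fetch-Inactive as a dedicated bit (bit 18 of the extra DPP dword, next to bound_ctrl at bit 19), while DPP8 has no FI bit and instead signals it through the special src0 operand encoding: 233 (0xE9) selects DPP8 and 234 (0xEA) selects DPP8 with FI, which is why the assembler now writes PhysReg{233u + dpp.fetch_inactive} where it previously hardcoded 234. A minimal standalone sketch of that rule (hypothetical helpers, not ACO code):

   #include <cstdint>

   /* DPP16: FI lives in bit 18 of the second instruction dword. */
   uint32_t dpp16_extra_dword(uint16_t dpp_ctrl, bool bound_ctrl, bool fetch_inactive,
                              uint8_t src0_vgpr)
   {
      uint32_t w = 0;
      w |= (uint32_t)fetch_inactive << 18; /* FI */
      w |= (uint32_t)bound_ctrl << 19;     /* BC */
      w |= (uint32_t)dpp_ctrl << 8;        /* lane-swizzle pattern */
      w |= src0_vgpr;                      /* real source VGPR */
      return w;
   }

   /* DPP8: FI is selected by which special src0 encoding the main dword uses. */
   uint8_t dpp8_src0_encoding(bool fetch_inactive)
   {
      return 233 + fetch_inactive; /* 0xE9 = DPP8, 0xEA = DPP8_FI */
   }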
diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
index 9043b356370..f939466f3cf 100644
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@@ -181,12 +181,13 @@ struct InstrPred {
          DPP16_instruction& bDPP = b->dpp16();
          return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
                 aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask &&
-                aDPP.bound_ctrl == bDPP.bound_ctrl;
+                aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.fetch_inactive == bDPP.fetch_inactive;
       }
       if (a->isDPP8()) {
          DPP8_instruction& aDPP = a->dpp8();
          DPP8_instruction& bDPP = b->dpp8();
-         return aDPP.pass_flags == bDPP.pass_flags && aDPP.lane_sel == bDPP.lane_sel;
+         return aDPP.pass_flags == bDPP.pass_flags && aDPP.lane_sel == bDPP.lane_sel &&
+                aDPP.fetch_inactive == bDPP.fetch_inactive;
       }
       if (a->isSDWA()) {
          SDWA_instruction& aSDWA = a->sdwa();
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 32fdb97b119..d53475c54c0 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -4866,12 +4866,14 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (dpp8) {
          DPP8_instruction* dpp = &instr->dpp8();
          dpp->lane_sel = info.instr->dpp8().lane_sel;
+         dpp->fetch_inactive = info.instr->dpp8().fetch_inactive;
          if (mov_uses_mods)
            instr->format = asVOP3(instr->format);
       } else {
          DPP16_instruction* dpp = &instr->dpp16();
          dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl;
          dpp->bound_ctrl = info.instr->dpp16().bound_ctrl;
+         dpp->fetch_inactive = info.instr->dpp16().fetch_inactive;
       }
 
       instr->valu().neg[0] ^= info.instr->valu().neg[0] && !instr->valu().abs[0];
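The value-numbering change matters for correctness, not just bookkeeping: two DPP instructions that agree in everything except fetch_inactive can read different values from lanes that are inactive in exec, so CSE must keep them distinct. A minimal model of the idea (simplified stand-in types, not ACO's):

   #include <cstdint>

   struct Dpp8Key {
      uint32_t lane_sel : 24;
      uint32_t fetch_inactive : 1;
   };

   /* Dropping fetch_inactive from this comparison would let value numbering
    * merge a read that sees inactive lanes with one that does not. */
   bool same_dpp8(const Dpp8Key& a, const Dpp8Key& b)
   {
      return a.lane_sel == b.lane_sel && a.fetch_inactive == b.fetch_inactive;
   }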
diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp
index 32ecb05814b..48ada196926 100644
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@@ -507,8 +507,10 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (is_overwritten_since(ctx, mov->operands[0], op_instr_idx))
          continue;
 
-      /* GFX8/9 don't have fetch-inactive. */
-      if (ctx.program->gfx_level < GFX10 &&
+      bool dpp8 = mov->isDPP8();
+
+      /* Fetch-inactive means exec is ignored, which allows us to combine across exec changes. */
+      if (!(dpp8 ? mov->dpp8().fetch_inactive : mov->dpp16().fetch_inactive) &&
           is_overwritten_since(ctx, Operand(exec, ctx.program->lane_mask), op_instr_idx))
          continue;
 
@@ -519,7 +521,6 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (op_used_twice)
          continue;
 
-      bool dpp8 = mov->isDPP8();
       bool input_mods = can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i) &&
                         get_operand_size(instr, i) == 32;
       bool mov_uses_mods = mov->valu().neg[0] || mov->valu().abs[0];
@@ -548,12 +549,14 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (dpp8) {
          DPP8_instruction* dpp = &instr->dpp8();
          dpp->lane_sel = mov->dpp8().lane_sel;
+         dpp->fetch_inactive = mov->dpp8().fetch_inactive;
          if (mov_uses_mods)
            instr->format = asVOP3(instr->format);
       } else {
          DPP16_instruction* dpp = &instr->dpp16();
          dpp->dpp_ctrl = mov->dpp16().dpp_ctrl;
          dpp->bound_ctrl = true;
+         dpp->fetch_inactive = mov->dpp16().fetch_inactive;
       }
       instr->valu().neg[0] ^= mov->valu().neg[0] && !instr->valu().abs[0];
       instr->valu().abs[0] |= mov->valu().abs[0];
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index d5f35e5672d..3eed711b7ed 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -707,12 +707,16 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
          fprintf(output, " bank_mask:0x%.1x", dpp.bank_mask);
       if (dpp.bound_ctrl)
          fprintf(output, " bound_ctrl:1");
+      if (dpp.fetch_inactive)
+         fprintf(output, " fi");
    } else if (instr->isDPP8()) {
       const DPP8_instruction& dpp = instr->dpp8();
       fprintf(output, " dpp8:[");
       for (unsigned i = 0; i < 8; i++)
         fprintf(output, "%s%u", i ? "," : "", (dpp.lane_sel >> (i * 3)) & 0x7);
       fprintf(output, "]");
+      if (dpp.fetch_inactive)
+         fprintf(output, " fi");
    } else if (instr->isSDWA()) {
       const SDWA_instruction& sdwa = instr->sdwa();
       if (!instr->isVOPC()) {
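The rewritten guard in try_combine_dpp encodes the key semantic point: with FI set, the DPP fetch reads its source lanes regardless of exec, so the mov's result does not depend on the exec mask that was live when it executed, and it may be folded into a user even if exec changed in between. Without FI, an exec write between the mov and its user blocks the combine. A standalone sketch of that predicate (hypothetical helper, not the pass itself):

   /* Safe to fold a DPP v_mov_b32 into its user across an exec write only
    * when the fetch ignores exec (FI) -- otherwise inactive-lane reads would
    * observe a different mask at the new location. */
   bool can_fold_dpp_across_exec(bool fetch_inactive, bool exec_written_between)
   {
      return fetch_inactive || !exec_written_between;
   }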
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 32c3d798dea..05258490af4 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -146,6 +146,11 @@ validate_ir(Program* program)
                   "Format cannot have DPP applied", instr.get());
             check((!instr->isVOP3() && !instr->isVOP3P()) || program->gfx_level >= GFX11,
                   "VOP3+DPP is GFX11+ only", instr.get());
+
+            bool fi =
+               instr->isDPP8() ? instr->dpp8().fetch_inactive : instr->dpp16().fetch_inactive;
+            check(!fi || program->gfx_level >= GFX10, "DPP Fetch-Inactive is GFX10+ only",
+                  instr.get());
          }
 
          /* check SDWA */
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index ee0299e124b..3654e3daed7 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -526,8 +526,8 @@ BEGIN_TEST(d3d11_derivs.fddxy)
    pbld.add_vsfs(vs, fs);
 
    /* Must be before BB1 */
-   //>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[1,1,3,3] bound_ctrl:1
-   //>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[2,2,2,2] bound_ctrl:1
+   //>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[1,1,3,3] bound_ctrl:1 fi
+   //>> v1: %_ = v_sub_f32 (kill)%_, (kill)%_ quad_perm:[2,2,2,2] bound_ctrl:1 fi
    //>> BB1
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 END_TEST
@@ -598,12 +598,12 @@ BEGIN_TEST(d3d11_derivs.get_lod)
    //>> v1: %y = v_interp_p2_f32 (kill)%_, (kill)%_:m0, (kill)%_ attr0.y
    //>> v2: %vec = p_create_vector %x, %y
    //>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
-   //>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1
-   //>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1
-   //>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1
-   //>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1
-   //>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1
-   //>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1
+   //>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1 fi
+   //>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
+   //>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
+   //>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1 fi
+   //>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1 fi
+   //>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1 fi
    //>> BB1
    //>> v2: %_ = image_get_lod (kill)%_, (kill)%_, v1: undef, %wqm 2d
    //>> BB2
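Because the builder initialization generated from aco_opcodes.py masks fetch_inactive with gfx_level >= GFX10, IR built through the Builder can only trip the new validator check if the flag is forced by hand. A hypothetical negative test in the style of the suites below (not part of this patch):

   /* Forcing FI on GFX9 should now be rejected by validate_ir with
    * "DPP Fetch-Inactive is GFX10+ only". */
   auto mov = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   mov->dpp16().fetch_inactive = true; /* the builder had cleared it on gfx_level < GFX10 */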
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index f49ddc55506..da06a0cccb2 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -59,7 +59,8 @@ BEGIN_TEST(optimize.neg)
    Temp neg_abs_a = fneg(abs_a);
    writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
 
-   //! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
+   //~gfx9! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
+   //~gfx10! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1 fi
    //! p_unit_test 5, %res5
    writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
 
@@ -999,42 +1000,42 @@ BEGIN_TEST(optimizer.dpp)
    Operand d(inputs[3]);
 
    /* basic optimization */
-   //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
+   //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1 fi
    //! p_unit_test 0, %res0
    Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
    Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
    writeout(0, res0);
 
    /* operand swapping */
-   //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
+   //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1 fi
    //! p_unit_test 1, %res1
    Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
    Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
    writeout(1, res1);
 
-   //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
-   //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
+   //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
+   //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1 fi
    //! p_unit_test 2, %res2
    Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
    Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
    writeout(2, res2);
 
    /* modifiers */
-   //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
+   //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 fi
    //! p_unit_test 3, %res3
    auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
    tmp3->dpp16().neg[0] = true;
    Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
    writeout(3, res3);
 
-   //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
+   //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 fi
    //! p_unit_test 4, %res4
    Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
    auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
    res4->valu().neg[0] = true;
    writeout(4, res4);
 
-   //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
+   //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
    //! v1: %res5 = v_add_f32 %tmp5, %b clamp
    //! p_unit_test 5, %res5
    Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
@@ -1042,7 +1043,7 @@ BEGIN_TEST(optimizer.dpp)
    res5->valu().clamp = true;
    writeout(5, res5);
 
-   //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
+   //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1 fi
    //! p_unit_test 6, %res6
    auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
    tmp6->dpp16().neg[0] = true;
@@ -1050,14 +1051,14 @@ BEGIN_TEST(optimizer.dpp)
    res6->valu().abs[0] = true;
    writeout(6, res6);
 
-   //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
+   //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1 fi
    //! p_unit_test 7, %res7
    Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
    auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
    res7->valu().abs[0] = true;
    writeout(7, res7);
 
-   //! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1
+   //! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
    //! v1: %res11 = v_add_u32 %tmp11, %b
    //! p_unit_test 11, %res11
    auto tmp11 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
@@ -1065,7 +1066,7 @@ BEGIN_TEST(optimizer.dpp)
    Temp res11 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), tmp11, b);
    writeout(11, res11);
 
-   //! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1
+   //! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
    //! v1: %res12 = v_add_f16 %tmp12, %b
    //! p_unit_test 12, %res12
    auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
@@ -1074,21 +1075,21 @@ BEGIN_TEST(optimizer.dpp)
    writeout(12, res12);
 
    /* vcc */
-   //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
+   //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1 fi
    //! p_unit_test 8, %res8
    Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
    Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
    writeout(8, res8);
 
    /* sgprs */
-   //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1
+   //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
    //! v1: %res9 = v_add_f32 %tmp9, %d
    //! p_unit_test 9, %res9
    Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
    Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
    writeout(9, res9);
 
-   //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1
+   //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
    //! v1: %res10 = v_add_f32 %d, %tmp10
    //! p_unit_test 10, %res10
    Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
@@ -1109,7 +1110,7 @@ BEGIN_TEST(optimize.dpp_prop)
    Temp one = bld.copy(bld.def(v1), Operand::c32(1));
    writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
 
-   //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
+   //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1 fi
    //! p_unit_test 1, %res1
    writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
 
@@ -1120,7 +1121,7 @@ BEGIN_TEST(optimize.dpp_prop)
            bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
 
    //! v1: %literal2 = p_parallelcopy 0x12345679
-   //! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
+   //! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1 fi
    //! p_unit_test 3, %res3
    Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
    writeout(3,
@@ -1132,7 +1133,7 @@ BEGIN_TEST(optimize.dpp_prop)
    Temp b_v = bld.copy(bld.def(v1), inputs[1]);
    writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
 
-   //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
+   //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1 fi
    //! p_unit_test 5, %res5
    writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
 
@@ -2006,11 +2007,11 @@ BEGIN_TEST(optimize.dpp_opsel)
    Temp b_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(1));
    Temp b_lo = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(0));
 
-   //! v2b: %res0 = v_add_f16 hi(%a), hi(%b) row_mirror bound_ctrl:1
+   //! v2b: %res0 = v_add_f16 hi(%a), hi(%b) row_mirror bound_ctrl:1 fi
    //! p_unit_test 0, %res0
    writeout(0, fadd(dpp16_hi, b_hi));
 
-   //! v2b: %res1 = v_add_f16 hi(%a), %b dpp8:[0,0,0,0,0,0,0,0]
+   //! v2b: %res1 = v_add_f16 hi(%a), %b dpp8:[0,0,0,0,0,0,0,0] fi
    //! p_unit_test 1, %res1
    writeout(1, fadd(b_lo, dpp8_hi));
 
diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp
index 900993ad8c7..811e762399b 100644
--- a/src/amd/compiler/tests/test_optimizer_postRA.cpp
+++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp
@@ -365,21 +365,21 @@ BEGIN_TEST(optimizer_postRA.dpp)
    Operand d(inputs[3], PhysReg(0));
 
    /* basic optimization */
-   //! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
    //! p_unit_test 0, %res0:v[2]
    Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
    writeout(0, Operand(res0, reg_v2));
 
    /* operand swapping */
-   //! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
    //! p_unit_test 1, %res1:v[2]
    Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp1, reg_v2));
    writeout(1, Operand(res1, reg_v2));
 
-   //! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
-   //! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1
+   //! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
+   //! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1 fi
    //! p_unit_test 2, %res2:v[2]
    Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2),
@@ -387,21 +387,21 @@ BEGIN_TEST(optimizer_postRA.dpp)
    writeout(2, Operand(res2, reg_v2));
 
    /* modifiers */
-   //! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
    //! p_unit_test 3, %res3:v[2]
    auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    tmp3->dpp16().neg[0] = true;
    Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp3, reg_v2), b);
    writeout(3, Operand(res3, reg_v2));
 
-   //! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
    //! p_unit_test 4, %res4:v[2]
    Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp4, reg_v2), b);
    res4->valu().neg[0] = true;
    writeout(4, Operand(res4, reg_v2));
 
-   //! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+   //! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
    //! v1: %res5:v[2] = v_add_f32 %tmp5:v[2], %b:v[1] clamp
    //! p_unit_test 5, %res5:v[2]
    Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -409,7 +409,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
    res5->valu().clamp = true;
    writeout(5, Operand(res5, reg_v2));
 
-   //! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1
+   //! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1 fi
    //! p_unit_test 6, %res6:v[2]
    auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    tmp6->dpp16().neg[0] = true;
@@ -417,14 +417,14 @@ BEGIN_TEST(optimizer_postRA.dpp)
    res6->valu().abs[0] = true;
    writeout(6, Operand(res6, reg_v2));
 
-   //! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1
+   //! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1 fi
    //! p_unit_test 7, %res7:v[2]
    Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp7, reg_v2));
    res7->valu().abs[0] = true;
    writeout(7, Operand(res7, reg_v2));
 
-   //! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1
+   //! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
    //! v1: %res12:v[2] = v_add_u32 %tmp12:v[2], %b:v[1]
    //! p_unit_test 12, %res12:v[2]
    auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -432,7 +432,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
    Temp res12 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1, reg_v2), Operand(tmp12, reg_v2), b);
    writeout(12, Operand(res12, reg_v2));
 
-   //! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1
+   //! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
    //! v1: %res13:v[2] = v_add_f16 %tmp13:v[2], %b:v[1]
    //! p_unit_test 13, %res13:v[2]
    auto tmp13 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -441,14 +441,14 @@ BEGIN_TEST(optimizer_postRA.dpp)
    writeout(13, Operand(res13, reg_v2));
 
    /* vcc */
-   //! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1
+   //! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1 fi
    //! p_unit_test 8, %res8:v[2]
    Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    Temp res8 =
       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c);
    writeout(8, Operand(res8, reg_v2));
 
-   //! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+   //! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
    //! v1: %res9:v[2] = v_cndmask_b32 %tmp9:v[2], %b:v[1], %d:s[0-1]
    //! p_unit_test 9, %res9:v[2]
    Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -459,7 +459,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
    /* control flow */
    //! BB1
    //! /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
-   //! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
    //! p_unit_test 10, %res10:v[2]
    Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
 
@@ -473,7 +473,7 @@ BEGIN_TEST(optimizer_postRA.dpp)
    writeout(10, Operand(res10, reg_v2));
 
    /* can't combine if the v_mov_b32's operand is modified */
-   //! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+   //! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
    //! v1: %tmp11_2:v[0] = v_mov_b32 0
    //! v1: %res11:v[2] = v_add_f32 %tmp11_1:v[2], %b:v[1]
    //! p_unit_test 11, %res11_1:v[2], %tmp11_2:v[0]
@@ -501,7 +501,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_exec)
    //~gfx9! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
    //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
    //~gfx9! v1: %res0:v[2] = v_add_f32 %tmp0:v[2], %b:v[1]
-   //~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
    //! p_unit_test 0, %res0:v[2]
    Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
    bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
@@ -525,7 +525,7 @@ BEGIN_TEST(optimizer_postRA.dpp_vcmpx)
    Operand a(inputs[0], PhysReg(256));
    Operand b(inputs[1], PhysReg(257));
 
-   //! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+   //! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
    //! s2: %res0:exec = v_cmpx_lt_f32 %tmp0:v[2], %b:v[1]
    //! p_unit_test 0, %res0:exec
    Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
@@ -605,7 +605,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf)
    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
 
-   //! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
    //! p_unit_test 10, %res10:v[12]
    Temp result =
       bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b);
@@ -635,7 +635,7 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten)
    Operand f(inputs[5], PhysReg(2)); /* buffer store address (scalar) */
    PhysReg reg_v12(268);             /* temporary register */
 
-   //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+   //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
    Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror);
 
    //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec