From dedfff9dbf04b2f8b649fb63ac880411bede4c39 Mon Sep 17 00:00:00 2001
From: Georg Lehmann
Date: Thu, 25 Jul 2024 17:15:15 +0200
Subject: [PATCH] aco: only set latekill in live_var_analysis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cleaner to have this all in one place, in my opinion.

Reviewed-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Part-of:
---
 .../compiler/aco_instruction_selection.cpp    | 60 ++++---------
 src/amd/compiler/aco_live_var_analysis.cpp    | 33 +++++++++-
 src/amd/compiler/aco_reduce_assign.cpp        | 12 +---
 src/amd/compiler/aco_spill.cpp                |  6 +-
 src/amd/compiler/aco_validate.cpp             |  5 --
 src/amd/compiler/tests/test_d3d11_derivs.cpp  | 24 ++++----
 src/amd/compiler/tests/test_reduce_assign.cpp |  4 +-
 src/amd/compiler/tests/test_regalloc.cpp      | 52 ++++++++--------
 8 files changed, 87 insertions(+), 109 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 3cfb2f52f71..c788dcb1269 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -180,13 +180,8 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
 
    if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
       /* GFX6-7: there is no bpermute instruction */
-      Operand index_op(index);
-      Operand input_data(data);
-      index_op.setLateKill(true);
-      input_data.setLateKill(true);
-
       return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
-                        bld.def(bld.lm, vcc), index_op, input_data);
+                        bld.def(bld.lm, vcc), index, data);
    } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
       /* GFX10 wave64 mode: emulate full-wave bpermute */
@@ -199,11 +194,6 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
       Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                                      index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
       Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
-      Operand input_data(data);
-
-      index_x4.setLateKill(true);
-      input_data.setLateKill(true);
-      same_half.setLateKill(true);
 
       if (ctx->options->gfx_level <= GFX10_3) {
          /* We need one pair of shared VGPRs:
@@ -212,11 +202,10 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
          ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
 
          return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
-                           bld.def(s1, scc), index_x4, input_data, same_half);
+                           bld.def(s1, scc), index_x4, data, same_half);
       } else {
          return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
-                           bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data,
-                           same_half);
+                           bld.def(s1, scc), Operand(v1.as_linear()), index_x4, data, same_half);
       }
    } else {
       /* GFX8-9 or GFX10 wave32: bpermute works normally */
@@ -3610,11 +3599,8 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
       Temp ref = get_alu_src(ctx, instr->src[0]);
       Temp src = get_alu_src(ctx, instr->src[1], 2);
       Temp accum = get_alu_src(ctx, instr->src[2], 4);
-      Builder::Result res = bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src),
-                                     as_vgpr(ctx, ref), as_vgpr(ctx, accum));
-      res.instr->operands[0].setLateKill(true);
-      res.instr->operands[1].setLateKill(true);
-      res.instr->operands[2].setLateKill(true);
+      bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src), as_vgpr(ctx, ref),
+               as_vgpr(ctx, accum));
       emit_split_vector(ctx, dst, 4);
       break;
    }
@@ -5613,13 +5599,9 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
    Builder bld(ctx->program, ctx->block);
 
    if (in_exec_divergent_or_in_loop(ctx)) {
-      Operand prim_mask_op = bld.m0(prim_mask);
-      prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
-      Operand coord2_op(coord2);
-      coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
       bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
                  Operand::c32(idx), Operand::c32(component), Operand::c32(high_16bits), coord1,
-                 coord2_op, prim_mask_op);
+                 coord2, bld.m0(prim_mask));
      return;
    }
 
@@ -5676,11 +5658,8 @@ emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src,
       }
    } else {
       assert(!high_16bits);
-      Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
-                                             bld.m0(prim_mask), idx, component);
-
-      if (ctx->program->dev.has_16bank_lds)
-         interp_p1->operands[0].setLateKill(true);
+      Temp interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
+                                  bld.m0(prim_mask), idx, component);
 
       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask),
                  interp_p1, idx, component);
@@ -5696,11 +5675,9 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig
    if (ctx->options->gfx_level >= GFX11) {
       uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
       if (in_exec_divergent_or_in_loop(ctx)) {
-         Operand prim_mask_op = bld.m0(prim_mask);
-         prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
         bld.pseudo(aco_opcode::p_interp_gfx11, Definition(tmp), Operand(v1.as_linear()),
                    Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
-                   prim_mask_op);
+                   bld.m0(prim_mask));
      } else {
         Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx,
                             component);
@@ -6189,11 +6166,8 @@ emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::v
    mimg->operands[0] = Operand(rsrc);
    mimg->operands[1] = samp;
    mimg->operands[2] = vdata;
-   for (unsigned i = 0; i < coords.size(); i++) {
+   for (unsigned i = 0; i < coords.size(); i++)
       mimg->operands[3 + i] = Operand(coords[i]);
-      if (coords[i].regClass().is_linear_vgpr())
-         mimg->operands[3 + i].setLateKill(true);
-   }
    mimg->mimg().strict_wqm = strict_wqm;
 
    return &bld.insert(std::move(mimg))->mimg();
@@ -8219,9 +8193,7 @@ create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt*
       create_instruction(aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
    for (unsigned i = 0; i < 4; i++) {
       exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
-      exp->operands[i].setLateKill(true);
       exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
-      exp->operands[i + 4].setLateKill(true);
    }
 
    RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
@@ -8267,9 +8239,6 @@ visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
    Operand B(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)));
    Operand C(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
 
-   A.setLateKill(true);
-   B.setLateKill(true);
-
    VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0)->valu();
    vop3p.neg_lo[0] = (signed_mask & 0x1) != 0;
    vop3p.neg_lo[1] = (signed_mask & 0x2) != 0;
@@ -10501,9 +10470,7 @@ visit_block(isel_context* ctx, nir_block* block)
    if (ctx->block->kind & block_kind_top_level) {
       Builder bld(ctx->program, ctx->block);
       for (Temp tmp : ctx->unended_linear_vgprs) {
-         Operand op(tmp);
-         op.setLateKill(true);
-         bld.pseudo(aco_opcode::p_end_linear_vgpr, op);
+         bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
      }
      ctx->unended_linear_vgprs.clear();
   }
@@ -11462,16 +11429,13 @@ add_startpgm(struct isel_context* ctx)
    } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
       /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog. */
-      Operand scratch_offset = Operand(get_arg(ctx, ctx->args->scratch_offset));
-      scratch_offset.setLateKill(true);
-
       Operand scratch_addr = ctx->args->ring_offsets.used
                                 ? Operand(get_arg(ctx, ctx->args->ring_offsets))
                                 : Operand(s2);
 
       Builder bld(ctx->program, ctx->block);
       bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
-                 scratch_offset);
+                 get_arg(ctx, ctx->args->scratch_offset));
    }
 }
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index fac93e84898..757af0e3202 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -179,8 +179,13 @@ process_live_temps_per_block(live_ctx& ctx, Block* block)
       ctx.program->needs_vcc |= instr_needs_vcc(insn);
       insn->register_demand = RegisterDemand(new_demand.vgpr, new_demand.sgpr);
 
+      bool has_vgpr_def = false;
+
       /* KILL */
       for (Definition& definition : insn->definitions) {
+         has_vgpr_def |= definition.regClass().type() == RegType::vgpr &&
+                         !definition.regClass().is_linear_vgpr();
+
          if (!definition.isTemp()) {
            continue;
         }
@@ -212,13 +217,39 @@ process_live_temps_per_block(live_ctx& ctx, Block* block)
            if (insn->operands[op_idx].isOfType(RegType::sgpr))
               insn->operands[op_idx].setLateKill(true);
         }
+      } else if (insn->opcode == aco_opcode::p_bpermute_readlane ||
+                 insn->opcode == aco_opcode::p_bpermute_permlane ||
+                 insn->opcode == aco_opcode::p_bpermute_shared_vgpr ||
+                 insn->opcode == aco_opcode::p_dual_src_export_gfx11 ||
+                 insn->opcode == aco_opcode::v_mqsad_u32_u8) {
+         for (Operand& op : insn->operands)
+            op.setLateKill(true);
+      } else if (insn->opcode == aco_opcode::p_interp_gfx11) {
+         insn->operands.back().setLateKill(true); /* we don't want the bld.lm def to use m0 */
+         if (insn->operands.size() == 7)
+            insn->operands[5].setLateKill(true); /* we re-use the destination reg in the middle */
+      } else if (insn->opcode == aco_opcode::v_interp_p1_f32 && ctx.program->dev.has_16bank_lds) {
+         insn->operands[0].setLateKill(true);
+      } else if (insn->opcode == aco_opcode::p_init_scratch) {
+         insn->operands.back().setLateKill(true);
+      } else if (instr_info.classes[(int)insn->opcode] == instr_class::wmma) {
+         insn->operands[0].setLateKill(true);
+         insn->operands[1].setLateKill(true);
       }
 
       /* we need to do this in a separate loop because the next one can
        * setKill() for several operands at once and we don't want to
        * overwrite that in a later iteration */
-      for (Operand& op : insn->operands)
+      for (Operand& op : insn->operands) {
          op.setKill(false);
 
+         /* Linear vgprs must be late kill: this is to ensure linear VGPR operands and
+          * normal VGPR definitions don't try to use the same register, which is problematic
+          * because of assignment restrictions.
+          */
+         if (op.hasRegClass() && op.regClass().is_linear_vgpr() && !op.isUndefined() &&
+             has_vgpr_def)
+            op.setLateKill(true);
+      }
       /* GEN */
       for (unsigned i = 0; i < insn->operands.size(); ++i) {
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index d280306a8e9..11c0d2022e4 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -64,11 +64,9 @@ setup_reduce_temp(Program* program)
       aco_ptr<Instruction> end{create_instruction(
          aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_inserted_at >= 0 ? 2 : 1, 0)};
       end->operands[0] = Operand(reduceTmp);
-      end->operands[0].setLateKill(true);
-      if (vtmp_inserted_at >= 0) {
+      if (vtmp_inserted_at >= 0)
          end->operands[1] = Operand(vtmp);
-         end->operands[1].setLateKill(true);
-      }
+
       /* insert after the phis of the block */
       std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
       while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
@@ -164,16 +162,12 @@ setup_reduce_temp(Program* program)
 
          if (instr->isReduction()) {
             instr->operands[1] = Operand(reduceTmp);
-            instr->operands[1].setLateKill(true);
-            if (need_vtmp) {
+            if (need_vtmp)
                instr->operands[2] = Operand(vtmp);
-               instr->operands[2].setLateKill(true);
-            }
         } else {
            assert(instr->opcode == aco_opcode::p_interp_gfx11 ||
                   instr->opcode == aco_opcode::p_bpermute_permlane);
            instr->operands[0] = Operand(reduceTmp);
-           instr->operands[0].setLateKill(true);
         }
      }
   }
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index d252f1bba6c..86bfbb39119 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1419,10 +1419,8 @@ end_unused_spill_vgprs(spill_ctx& ctx, Block& block, std::vector<Temp>& vgpr_spi
    aco_ptr<Instruction> destr{
       create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, temps.size(), 0)};
-   for (unsigned i = 0; i < temps.size(); i++) {
+   for (unsigned i = 0; i < temps.size(); i++)
       destr->operands[i] = Operand(temps[i]);
-      destr->operands[i].setLateKill(true);
-   }
 
    std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
    while (is_phi(*it))
@@ -1540,7 +1538,6 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
             /* spill sgpr: just add the vgpr temp to operands */
             Instruction* spill = create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
             spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
-            spill->operands[0].setLateKill(true);
             spill->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
             spill->operands[2] = (*it)->operands[0];
             instructions.emplace_back(aco_ptr<Instruction>(spill));
@@ -1586,7 +1583,6 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
             /* reload sgpr: just add the vgpr temp to operands */
             Instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1);
             reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
-            reload->operands[0].setLateKill(true);
             reload->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
             reload->definitions[0] = (*it)->definitions[0];
             instructions.emplace_back(aco_ptr<Instruction>(reload));
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index d7816ecc708..5733b14b382 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -372,11 +372,6 @@ validate_ir(Program* program)
                 op.isUndefined())
                continue;
 
-            /* Check that linear vgprs are late kill: this is to ensure linear VGPR operands and
-             * normal VGPR definitions don't try to use the same register, which is problematic
-             * because of assignment restrictions. */
-            check(op.isLateKill(), "Linear VGPR operands must be late kill", instr.get());
-
            /* Only kill linear VGPRs in top-level blocks. Otherwise, we might have to move linear
             * VGPRs to make space for normal ones and that isn't possible inside control flow. */
            if (op.isKill()) {
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index 5a1ce39378e..b14b135c376 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -39,7 +39,7 @@ BEGIN_TEST(d3d11_derivs.simple)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -80,7 +80,7 @@ BEGIN_TEST(d3d11_derivs.constant)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -151,7 +151,7 @@ BEGIN_TEST(d3d11_derivs.bias)
    //>> v4: %_ = image_sample_b (kill)%_, (kill)%_, v1: undef, (latekill)%wqm, (kill)%bias 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -194,7 +194,7 @@ BEGIN_TEST(d3d11_derivs.offset)
    //>> v4: %_ = image_sample_o (kill)%_, (kill)%_, v1: undef, (latekill)%wqm, (kill)%offset 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -239,7 +239,7 @@ BEGIN_TEST(d3d11_derivs.array)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2darray da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.z      ; $_
@@ -286,7 +286,7 @@ BEGIN_TEST(d3d11_derivs.bias_array)
    //>> v4: %_ = image_sample_b (kill)%_, (kill)%_, v1: undef, (latekill)%wqm, (kill)%bias 2darray da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.z      ; $_
@@ -331,7 +331,7 @@ BEGIN_TEST(d3d11_derivs._1d_gfx9)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x      ; $_
@@ -373,7 +373,7 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2darray da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y      ; $_
@@ -420,7 +420,7 @@ BEGIN_TEST(d3d11_derivs.cube)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm cube da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_cubeid_f32 v#rf_tmp, v#_, v#_, v#_      ; $_ $_
@@ -467,7 +467,7 @@ BEGIN_TEST(d3d11_derivs.cube_array)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm cube da
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_cubeid_f32 v#rf, v#_, v#_, v#_      ; $_ $_
@@ -548,7 +548,7 @@ BEGIN_TEST(d3d11_derivs.bc_optimize)
    //>> v4: %_ = image_sample (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
 END_TEST
@@ -589,7 +589,7 @@ BEGIN_TEST(d3d11_derivs.get_lod)
    //>> v2: %_ = image_get_lod (kill)%_, (kill)%_, v1: undef, (latekill)%wqm 2d
    //>> BB2
    //>> BB6
-   //>> p_end_linear_vgpr (latekill)(kill)%wqm
+   //>> p_end_linear_vgpr (kill)%wqm
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
 END_TEST
diff --git a/src/amd/compiler/tests/test_reduce_assign.cpp b/src/amd/compiler/tests/test_reduce_assign.cpp
index b211ca8ff5b..651ebb1c2aa 100644
--- a/src/amd/compiler/tests/test_reduce_assign.cpp
+++ b/src/amd/compiler/tests/test_reduce_assign.cpp
@@ -33,7 +33,7 @@ BEGIN_TEST(setup_reduce_temp.divergent_if_phi)
       program.get(), bld, Operand(inputs[0]),
      [&]() -> void
      {
-         //>> s1: %_, s2: %_, s1: %_:scc = p_reduce %a, (latekill)%lv, lv1: undef op:umin32 cluster_size:64
+         //>> s1: %_, s2: %_, s1: %_:scc = p_reduce %a, %lv, lv1: undef op:umin32 cluster_size:64
         Instruction* reduce =
            bld.reduction(aco_opcode::p_reduce, bld.def(s1), bld.def(bld.lm), bld.def(s1, scc),
                          inputs[1], Operand(v1.as_linear()), Operand(v1.as_linear()), umin32);
@@ -45,7 +45,7 @@ BEGIN_TEST(setup_reduce_temp.divergent_if_phi)
      });
   bld.pseudo(aco_opcode::p_phi, bld.def(v1), Operand::c32(1), Operand::zero());
   //>> /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
-  //! p_end_linear_vgpr (latekill)%lv
+  //! p_end_linear_vgpr %lv
 
   finish_setup_reduce_temp_test();
 END_TEST
diff --git a/src/amd/compiler/tests/test_regalloc.cpp b/src/amd/compiler/tests/test_regalloc.cpp
index 90265a7156e..1b878c2f46d 100644
--- a/src/amd/compiler/tests/test_regalloc.cpp
+++ b/src/amd/compiler/tests/test_regalloc.cpp
@@ -297,9 +297,7 @@ END_TEST
 static void
 end_linear_vgpr(Temp tmp)
 {
-   Operand op(tmp);
-   op.setLateKill(true);
-   bld.pseudo(aco_opcode::p_end_linear_vgpr, op);
+   bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
 }
 
 BEGIN_TEST(regalloc.linear_vgpr.alloc.basic)
@@ -308,10 +306,10 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.basic)
 
    //>> lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    //! lv1: %ltmp2:v[31] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp1:v[30]
-   //! p_end_linear_vgpr (latekill)%ltmp2:v[31]
+   //! p_end_linear_vgpr %ltmp1:v[30]
+   //! p_end_linear_vgpr %ltmp2:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    end_linear_vgpr(ltmp0);
@@ -331,7 +329,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_grow)
 
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    end_linear_vgpr(ltmp0);
@@ -347,8 +345,8 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_grow)
    //! lv2: %ltmp2:v[29-30] = p_start_linear_vgpr
    Temp ltmp2 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v2.as_linear()));
 
-   //! p_end_linear_vgpr (latekill)%ltmp1_2:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp2:v[29-30]
+   //! p_end_linear_vgpr %ltmp1_2:v[31]
+   //! p_end_linear_vgpr %ltmp2:v[29-30]
    end_linear_vgpr(ltmp1);
    end_linear_vgpr(ltmp2);
 
@@ -371,9 +369,9 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_shrink)
    //! lv1: %ltmp2:v[29] = p_start_linear_vgpr
    //! lv1: %ltmp3:v[28] = p_start_linear_vgpr
    //! lv1: %ltmp4:v[27] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp2:v[29]
-   //! p_end_linear_vgpr (latekill)%ltmp4:v[27]
+   //! p_end_linear_vgpr %ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp2:v[29]
+   //! p_end_linear_vgpr %ltmp4:v[27]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp2 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
@@ -392,9 +390,9 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_shrink)
    //! v28: %_:v[0-27] = p_unit_test
    bld.pseudo(aco_opcode::p_unit_test, bld.def(RegClass::get(RegType::vgpr, 28 * 4)));
 
-   //! p_end_linear_vgpr (latekill)%ltmp1_2:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp3_2:v[30]
-   //! p_end_linear_vgpr (latekill)%ltmp5:v[28-29]
+   //! p_end_linear_vgpr %ltmp1_2:v[31]
+   //! p_end_linear_vgpr %ltmp3_2:v[30]
+   //! p_end_linear_vgpr %ltmp5:v[28-29]
    end_linear_vgpr(ltmp1);
    end_linear_vgpr(ltmp3);
    end_linear_vgpr(ltmp5);
@@ -412,7 +410,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_for_normal)
 
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    end_linear_vgpr(ltmp0);
@@ -421,7 +419,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_for_normal)
    //! v31: %_:v[0-30] = p_unit_test
    bld.pseudo(aco_opcode::p_unit_test, bld.def(RegClass::get(RegType::vgpr, 31 * 4)));
 
-   //! p_end_linear_vgpr (latekill)%ltmp1_2:v[31]
+   //! p_end_linear_vgpr %ltmp1_2:v[31]
    end_linear_vgpr(ltmp1);
 
    finish_ra_test(ra_test_policy{pessimistic});
@@ -437,7 +435,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_for_vec)
 
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    end_linear_vgpr(ltmp0);
@@ -447,7 +445,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.compact_for_vec)
    RegClass v31 = RegClass::get(RegType::vgpr, 31 * 4);
    bld.pseudo(aco_opcode::p_create_vector, bld.def(v31), Operand(v31));
 
-   //! p_end_linear_vgpr (latekill)%ltmp1_2:v[31]
+   //! p_end_linear_vgpr %ltmp1_2:v[31]
    end_linear_vgpr(ltmp1);
 
    finish_ra_test(ra_test_policy{pessimistic});
@@ -467,7 +465,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.killed_op)
    Temp tmp1 = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1));
 
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr %tmp1:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()), tmp1);
    end_linear_vgpr(ltmp0);
 
@@ -494,7 +492,7 @@ BEGIN_TEST(regalloc.linear_vgpr.alloc.move_killed_op)
    //~gfx8_optimistic! v1: %tmp1_2:v[31], v1: %tmp2_2:v[30] = p_parallelcopy %tmp1:v[30], %tmp2:v[31]
    //~gfx8_pessimistic! v1: %tmp2_2:v[30], v1: %tmp1_2:v[31] = p_parallelcopy %tmp2:v[31], %tmp1:v[30]
    //! lv1: %ltmp0:v[31] = p_start_linear_vgpr %tmp1_2:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp0:v[31]
+   //! p_end_linear_vgpr %ltmp0:v[31]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()), tmp1);
    end_linear_vgpr(ltmp0);
 
@@ -514,7 +512,7 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_def)
    //>> lv2: %ltmp0:v[30-31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[29] = p_start_linear_vgpr
    //! lv1: %ltmp2:v[28] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp1:v[29]
+   //! p_end_linear_vgpr %ltmp1:v[29]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v2.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp2 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
@@ -549,8 +547,8 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_def)
    program->blocks[2].logical_preds.push_back(1);
    program->blocks[2].kind |= block_kind_top_level;
 
-   //! p_end_linear_vgpr (latekill)%ltmp0_2:v[30-31]
-   //! p_end_linear_vgpr (latekill)%ltmp2_2:v[29]
+   //! p_end_linear_vgpr %ltmp0_2:v[30-31]
+   //! p_end_linear_vgpr %ltmp2_2:v[29]
    end_linear_vgpr(ltmp0);
    end_linear_vgpr(ltmp2);
 
@@ -578,7 +576,7 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_phis)
    //>> lv1: %ltmp0:v[31] = p_start_linear_vgpr
    //! lv1: %ltmp1:v[30] = p_start_linear_vgpr
    //! lv1: %ltmp2:v[29] = p_start_linear_vgpr
-   //! p_end_linear_vgpr (latekill)%ltmp1:v[30]
+   //! p_end_linear_vgpr %ltmp1:v[30]
    Temp ltmp0 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp1 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
    Temp ltmp2 = bld.pseudo(aco_opcode::p_start_linear_vgpr, bld.def(v1.as_linear()));
@@ -614,8 +612,8 @@ BEGIN_TEST(regalloc.linear_vgpr.compact_for_future_phis)
    Temp tmp = bld.pseudo(aco_opcode::p_phi, bld.def(v30), Operand(v30));
    bld.pseudo(aco_opcode::p_unit_test, tmp);
 
-   //! p_end_linear_vgpr (latekill)%ltmp0_2:v[31]
-   //! p_end_linear_vgpr (latekill)%ltmp2_2:v[30]
+   //! p_end_linear_vgpr %ltmp0_2:v[31]
+   //! p_end_linear_vgpr %ltmp2_2:v[30]
    end_linear_vgpr(ltmp0);
    end_linear_vgpr(ltmp2);