From e5b48da908723f4eaf3fab7a96e06ae9f85bce7d Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Wed, 17 Jul 2024 23:15:14 +0200 Subject: [PATCH] aco: remove optimize_cmp_subgroup_invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new NIR optimization pass handles all these cases and more. No Foz-DB changes. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 118 +---------------------------- 1 file changed, 1 insertion(+), 117 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 947e0b5aaa2..9f0560380bd 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -81,13 +81,12 @@ enum Label { label_f2f32 = 1ull << 37, label_f2f16 = 1ull << 38, label_split = 1ull << 39, - label_subgroup_invocation = 1ull << 40, }; static constexpr uint64_t instr_usedef_labels = label_vec | label_mul | label_add_sub | label_vop3p | label_bitwise | label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract | label_dpp16 | label_dpp8 | - label_f2f32 | label_subgroup_invocation; + label_f2f32; static constexpr uint64_t instr_mod_labels = label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16; @@ -452,14 +451,6 @@ struct ssa_info { } bool is_split() { return label & label_split; } - - void set_subgroup_invocation(Instruction* label_instr) - { - add_label(label_subgroup_invocation); - instr = label_instr; - } - - bool is_subgroup_invocation() { return label & label_subgroup_invocation; } }; struct opt_ctx { @@ -2051,27 +2042,6 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); break; } - case aco_opcode::v_mbcnt_lo_u32_b32: { - if (instr->operands[0].constantEquals(-1) && instr->operands[1].constantEquals(0)) { - if (ctx.program->wave_size == 32) - ctx.info[instr->definitions[0].tempId()].set_subgroup_invocation(instr.get()); - else - ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); - } - break; - } - case aco_opcode::v_mbcnt_hi_u32_b32: - case aco_opcode::v_mbcnt_hi_u32_b32_e64: { - if (instr->operands[0].constantEquals(-1) && instr->operands[1].isTemp() && - ctx.info[instr->operands[1].tempId()].is_usedef()) { - Instruction* usedef_instr = ctx.info[instr->operands[1].tempId()].instr; - if (usedef_instr->opcode == aco_opcode::v_mbcnt_lo_u32_b32 && - usedef_instr->operands[0].constantEquals(-1) && - usedef_instr->operands[1].constantEquals(0)) - ctx.info[instr->definitions[0].tempId()].set_subgroup_invocation(instr.get()); - } - break; - } case aco_opcode::v_cvt_f16_f32: { if (instr->operands[0].isTemp()) ctx.info[instr->operands[0].tempId()].set_f2f16(instr.get()); @@ -2151,87 +2121,6 @@ follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false) return instr; } -/* Optimize v_cmp of constant with subgroup invocation to a constant mask. - * Ideally, we can trade v_cmp for a constant (or literal). - * In a less ideal case, we trade v_cmp for a SALU instruction, which is still a win. - */ -bool -optimize_cmp_subgroup_invocation(opt_ctx& ctx, aco_ptr& instr) -{ - /* This optimization only applies to VOPC with 2 operands. */ - if (instr->operands.size() != 2) - return false; - - /* Find the constant operand or return early if there isn't one. */ - const int const_op_idx = instr->operands[0].isConstant() ? 0 - : instr->operands[1].isConstant() ? 1 - : -1; - if (const_op_idx == -1) - return false; - - /* Find the operand that has the subgroup invocation. */ - const int mbcnt_op_idx = 1 - const_op_idx; - const Operand mbcnt_op = instr->operands[mbcnt_op_idx]; - if (!mbcnt_op.isTemp() || !ctx.info[mbcnt_op.tempId()].is_subgroup_invocation()) - return false; - - /* Adjust opcode so we don't have to care about const_op_idx below. */ - const aco_opcode op = const_op_idx == 0 ? get_vcmp_swapped(instr->opcode) : instr->opcode; - const unsigned wave_size = ctx.program->wave_size; - const unsigned val = instr->operands[const_op_idx].constantValue(); - - /* Find suitable constant bitmask corresponding to the value. */ - unsigned first_bit = 0, num_bits = 0; - switch (op) { - case aco_opcode::v_cmp_eq_u32: - case aco_opcode::v_cmp_eq_i32: - first_bit = val; - num_bits = val >= wave_size ? 0 : 1; - break; - case aco_opcode::v_cmp_le_u32: - case aco_opcode::v_cmp_le_i32: - first_bit = 0; - num_bits = val >= wave_size ? wave_size : (val + 1); - break; - case aco_opcode::v_cmp_lt_u32: - case aco_opcode::v_cmp_lt_i32: - first_bit = 0; - num_bits = val >= wave_size ? wave_size : val; - break; - case aco_opcode::v_cmp_ge_u32: - case aco_opcode::v_cmp_ge_i32: - first_bit = val; - num_bits = val >= wave_size ? 0 : (wave_size - val); - break; - case aco_opcode::v_cmp_gt_u32: - case aco_opcode::v_cmp_gt_i32: - first_bit = val + 1; - num_bits = val >= wave_size ? 0 : (wave_size - val - 1); - break; - default: return false; - } - - Instruction* cpy = NULL; - const uint64_t mask = BITFIELD64_RANGE(first_bit, num_bits); - if (!Operand::is_constant_representable(mask, wave_size / 8, true, false)) { - /* Mask can't be represented as a 64-bit constant or literal, create a vector */ - cpy = create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, 2, 1); - cpy->operands[0] = Operand::c32(mask); - cpy->operands[1] = Operand::c32(mask >> 32); - } else { - /* Copy mask as a literal constant. */ - cpy = create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1); - cpy->operands[0] = wave_size == 32 ? Operand::c32((uint32_t)mask) : Operand::c64(mask); - } - - cpy->definitions[0] = instr->definitions[0]; - ctx.info[instr->definitions[0].tempId()].label = 0; - decrease_uses(ctx, ctx.info[mbcnt_op.tempId()].instr); - instr.reset(cpy); - - return true; -} - bool is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value) { @@ -3862,11 +3751,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) apply_ds_extract(ctx, instr); } - if (instr->isVOPC()) { - if (optimize_cmp_subgroup_invocation(ctx, instr)) - return; - } - /* TODO: There are still some peephole optimizations that could be done: * - abs(a - b) -> s_absdiff_i32 * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32