From 26fce534b51c1ce723174c1311eea2547afd14eb Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 2 Oct 2023 15:44:49 +0100 Subject: [PATCH] aco: shrink DPP8_instruction Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/amd/compiler/aco_assembler.cpp | 3 +-- src/amd/compiler/aco_instruction_selection.cpp | 9 ++++----- src/amd/compiler/aco_ir.cpp | 3 +-- src/amd/compiler/aco_ir.h | 5 +++-- src/amd/compiler/aco_lower_to_hw_instr.cpp | 17 +++++++---------- src/amd/compiler/aco_opcodes.py | 2 ++ src/amd/compiler/aco_opt_value_numbering.cpp | 3 +-- src/amd/compiler/aco_optimizer.cpp | 3 +-- src/amd/compiler/aco_optimizer_postRA.cpp | 2 +- src/amd/compiler/aco_print_ir.cpp | 7 ++++--- 10 files changed, 25 insertions(+), 29 deletions(-) diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index a081597119d..ffb47115836 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -814,8 +814,7 @@ emit_instruction(asm_context& ctx, std::vector& out, Instruction* inst emit_instruction(ctx, out, instr); uint32_t encoding = reg(ctx, dpp_op, 8); encoding |= dpp.opsel[0] && !instr->isVOP3() ? 
128 : 0; - for (unsigned i = 0; i < 8; ++i) - encoding |= dpp.lane_sel[i] << (8 + i * 3); + encoding |= dpp.lane_sel << 8; out.push_back(encoding); return; } else if (instr->isVOP3()) { diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index ab712f1441b..4c0d3af5d12 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -275,11 +275,10 @@ emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask) } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && xor_mask < 0x10) { dpp_ctrl = dpp_row_xmask(xor_mask); } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && xor_mask < 8) { - Builder::Result ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src); - for (unsigned i = 0; i < 8; i++) { - ret->dpp8().lane_sel[i] = ((i & and_mask) ^ xor_mask); - } - return ret; + uint32_t lane_sel = 0; + for (unsigned i = 0; i < 8; i++) + lane_sel |= ((i & and_mask) ^ xor_mask) << (i * 3); + return bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src, lane_sel); } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10) { uint64_t lane_mask = 0; for (unsigned i = 0; i < 16; i++) diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index d38a6263b7d..d41a0e489dc 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -455,8 +455,7 @@ convert_to_DPP(amd_gfx_level gfx_level, aco_ptr& instr, bool dpp8) if (dpp8) { DPP8_instruction* dpp = &instr->dpp8(); - for (unsigned i = 0; i < 8; i++) - dpp->lane_sel[i] = i; + dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */ } else { DPP16_instruction* dpp = &instr->dpp16(); dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 808c6096c04..2009dd8193e 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1461,9 +1461,10 @@ struct 
DPP16_instruction : public VALU_instruction { static_assert(sizeof(DPP16_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding"); struct DPP8_instruction : public VALU_instruction { - uint8_t lane_sel[8]; + uint32_t lane_sel : 24; + uint32_t padding : 8; }; -static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 8, "Unexpected padding"); +static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding"); struct SubdwordSel { enum sdwa_sel : uint8_t { diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index a8d7e3be566..e26860d2184 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2831,12 +2831,12 @@ lower_to_hw_instr(Program* program) Operand src0 = instr->operands[i]; Operand src1 = instr->operands[i + 4]; + uint32_t lane_sel_xor1 = 0; + for (unsigned j = 0; j < 8; j++) + lane_sel_xor1 |= (j ^ 1) << (j * 3); + /* Swap odd, even lanes of mrt0. */ - Builder::Result ret = - bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0); - for (unsigned j = 0; j < 8; j++) { - ret->dpp8().lane_sel[j] = j ^ 1; - } + bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0, lane_sel_xor1); /* Swap even lanes between mrt0 and mrt1. */ bld.vop2(aco_opcode::v_cndmask_b32, tmp, Operand(dst0, v1), src1, @@ -2845,11 +2845,8 @@ lower_to_hw_instr(Program* program) Operand(clobber_vcc.physReg(), bld.lm)); /* Swap odd, even lanes of mrt0 again. 
*/ - ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), - Operand(tmp.physReg(), v1)); - for (unsigned j = 0; j < 8; j++) { - ret->dpp8().lane_sel[j] = j ^ 1; - } + bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), + Operand(tmp.physReg(), v1), lane_sel_xor1); mrt0[i] = Operand(dst0, v1); mrt1[i] = Operand(dst1, v1); diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index dc559198a66..3ebcc1c7ce8 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -161,6 +161,8 @@ class Format(Enum): ('uint8_t', 'row_mask', '0xF'), ('uint8_t', 'bank_mask', '0xF'), ('bool', 'bound_ctrl', 'true')] + elif self == Format.DPP8: + return [('uint32_t', 'lane_sel', 0)] elif self == Format.VOP3P: return [('uint8_t', 'opsel_lo', None), ('uint8_t', 'opsel_hi', None)] diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 6e9d768e97b..9043b356370 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -186,8 +186,7 @@ struct InstrPred { if (a->isDPP8()) { DPP8_instruction& aDPP = a->dpp8(); DPP8_instruction& bDPP = b->dpp8(); - return aDPP.pass_flags == bDPP.pass_flags && - !memcmp(aDPP.lane_sel, bDPP.lane_sel, sizeof(aDPP.lane_sel)); + return aDPP.pass_flags == bDPP.pass_flags && aDPP.lane_sel == bDPP.lane_sel; } if (a->isSDWA()) { SDWA_instruction& aSDWA = a->sdwa(); diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index b58fc3674e2..32fdb97b119 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -4865,8 +4865,7 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) if (dpp8) { DPP8_instruction* dpp = &instr->dpp8(); - for (unsigned j = 0; j < 8; ++j) - dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j]; + dpp->lane_sel = info.instr->dpp8().lane_sel; if (mov_uses_mods) instr->format = asVOP3(instr->format); } else { diff --git 
a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index 4ef73792fd3..32ecb05814b 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -547,7 +547,7 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr& instr) if (dpp8) { DPP8_instruction* dpp = &instr->dpp8(); - memcpy(dpp->lane_sel, mov->dpp8().lane_sel, sizeof(dpp->lane_sel)); + dpp->lane_sel = mov->dpp8().lane_sel; if (mov_uses_mods) instr->format = asVOP3(instr->format); } else { diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index ef8132f694f..d5f35e5672d 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -709,9 +709,10 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins fprintf(output, " bound_ctrl:1"); } else if (instr->isDPP8()) { const DPP8_instruction& dpp = instr->dpp8(); - fprintf(output, " dpp8:[%d,%d,%d,%d,%d,%d,%d,%d]", dpp.lane_sel[0], dpp.lane_sel[1], - dpp.lane_sel[2], dpp.lane_sel[3], dpp.lane_sel[4], dpp.lane_sel[5], dpp.lane_sel[6], - dpp.lane_sel[7]); + fprintf(output, " dpp8:["); + for (unsigned i = 0; i < 8; i++) + fprintf(output, "%s%u", i ? "," : "", (dpp.lane_sel >> (i * 3)) & 0x7); + fprintf(output, "]"); } else if (instr->isSDWA()) { const SDWA_instruction& sdwa = instr->sdwa(); if (!instr->isVOPC()) {