From 47d824a64416b2cf662237cd8d32594cd6d806a5 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Tue, 23 Apr 2024 17:52:05 +0200 Subject: [PATCH] aco/lower_to_hw: fix 16bit p_insert on gfx8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 19 +++++++-- src/amd/compiler/tests/test_to_hw_instr.cpp | 47 +++++++++++++++------ 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 96a46b9eee5..9568908b346 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2683,9 +2683,22 @@ lower_to_hw_instr(Program* program) } } else { assert(dst.regClass() == v2b); - bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op) - ->sdwa() - .sel[1] = SubdwordSel::ubyte; + if (!offset) { + bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().sel[0] = + SubdwordSel::ubyte; + } else if (program->gfx_level >= GFX9) { + bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op) + ->sdwa() + .sel[1] = SubdwordSel::ubyte; + } else { + assert(offset == 8); + Definition dst_hi = Definition(dst.physReg().advance(1), v1b); + bld.vop1_sdwa(aco_opcode::v_mov_b32, dst_hi, op)->sdwa().sel[0] = + SubdwordSel::ubyte; + uint32_t c = ~(BITFIELD_MASK(offset) << (dst.physReg().byte() * 8)); + bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(c), + Operand(PhysReg(op.physReg().reg()), v1)); + } } break; } diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index 81667082c25..e78893b2459 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -715,29 +715,52 @@ BEGIN_TEST(to_hw_instr.insert) #undef INS -#define INS(idx, def_b) \ - bld.pseudo(aco_opcode::p_insert, Definition(v0_lo.advance(def_b), v2b), Operand(v1_lo, v2b), \ - Operand::c32(idx), Operand::c32(8u)); +#define INS(idx, def_b, op_b) \ + bld.pseudo(aco_opcode::p_insert, Definition(v0_lo.advance(def_b), v2b), \ + Operand(v1_lo.advance(op_b), v2b), Operand::c32(idx), Operand::c32(8u)); //>> p_unit_test 2 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); //~gfx7! v2b: %_:v[0][0:16] = v_bfe_u32 %_:v[1][0:16], 0, 8 - //~gfx(8|9)! v2b: %0:v[0][0:16] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0 + //~gfx(8|9)! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:ubyte0 //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060c00 - INS(0, 0) - //~gfx(8|9)! v2b: %0:v[0][16:32] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0 + INS(0, 0, 0) + //~gfx(8|9)! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:ubyte0 //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000504 if (lvl != GFX7) - INS(0, 2) + INS(0, 2, 0) + //~gfx(8|9)! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:ubyte2 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060c02 + if (lvl != GFX7) + INS(0, 0, 2) + //~gfx(8|9)! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:ubyte2 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc020504 + if (lvl != GFX7) + INS(0, 2, 2) //~gfx7! v2b: %_:v[0][0:16] = v_lshlrev_b32 8, %_:v[1][0:16] - //~gfx(8|9)! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0 + //~gfx8! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][0:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte0 + //~gfx8! v2b: %0:v[0][0:16] = v_and_b32 0xffffff00, %0:v[1] + //~gfx9! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0 //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x706000c - INS(1, 0) - //~gfx(8|9)! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0 + INS(1, 0, 0) + //~gfx8! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][0:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte0 + //~gfx8! v2b: %0:v[0][16:32] = v_and_b32 0xff00ffff, %0:v[1] + //~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0 //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc0504 if (lvl != GFX7) - INS(1, 2) - + INS(1, 2, 0) + //~gfx8! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][16:32] dst_sel:ubyte1 dst_preserve src0_sel:ubyte2 + //~gfx8! v2b: %0:v[0][0:16] = v_and_b32 0xffffff00, %0:v[1] + //~gfx9! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte2 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x706020c + if (lvl != GFX7) + INS(1, 0, 2) + //~gfx8! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][16:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte2 + //~gfx8! v2b: %0:v[0][16:32] = v_and_b32 0xff00ffff, %0:v[1] + //~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte2 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x20c0504 + if (lvl != GFX7) + INS(1, 2, 2) #undef INS finish_to_hw_instr_test();