From fb13ed6ff04512d9513eec73aeb76b4afc290e2d Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 18 Aug 2022 14:31:06 +0100 Subject: [PATCH] aco: fix long-jump version of discard early exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It isn't safe to modify the exec mask before the discard block, and the definition interferes with GFX11 NOP insertion. Just use s[0:1] instead, since we won't be using it. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_assembler.cpp | 20 +++++++++---- src/amd/compiler/aco_ir.h | 1 + src/amd/compiler/aco_lower_to_hw_instr.cpp | 4 +-- src/amd/compiler/tests/test_assembler.cpp | 33 ++++++++++++++++++++++ 4 files changed, 50 insertions(+), 8 deletions(-) diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 4c9c69627ae..6a46844983e 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -902,10 +902,18 @@ emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards, { Builder bld(ctx.program); - Definition def_tmp_lo(branch->definitions[0].physReg(), s1); - Operand op_tmp_lo(branch->definitions[0].physReg(), s1); - Definition def_tmp_hi(branch->definitions[0].physReg().advance(4), s1); - Operand op_tmp_hi(branch->definitions[0].physReg().advance(4), s1); + Definition def; + if (branch->definitions.empty()) { + assert(ctx.program->blocks[branch->block].kind & block_kind_discard_early_exit); + def = Definition(PhysReg(0), s2); /* The discard early exit block doesn't use SGPRs. 
*/ + } else { + def = branch->definitions[0]; + } + + Definition def_tmp_lo(def.physReg(), s1); + Operand op_tmp_lo(def.physReg(), s1); + Definition def_tmp_hi(def.physReg().advance(4), s1); + Operand op_tmp_hi(def.physReg().advance(4), s1); aco_ptr<Instruction> instr; @@ -926,7 +934,7 @@ emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards, } /* create the new PC and stash SCC in the LSB */ - instr.reset(bld.sop1(aco_opcode::s_getpc_b64, branch->definitions[0]).instr); + instr.reset(bld.sop1(aco_opcode::s_getpc_b64, def).instr); emit_instruction(ctx, out, instr.get()); instr.reset( @@ -944,7 +952,7 @@ emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards, /* create the s_setpc_b64 to jump */ instr.reset( - bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr); + bld.sop1(aco_opcode::s_setpc_b64, Operand(def.physReg(), s2)).instr); emit_instruction(ctx, out, instr.get()); } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 4394cee2284..45266b64a09 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1836,6 +1836,7 @@ enum block_kind { block_kind_branch = 1 << 8, block_kind_merge = 1 << 9, block_kind_invert = 1 << 10, + block_kind_discard_early_exit = 1 << 11, block_kind_uses_discard = 1 << 12, block_kind_needs_lowering = 1 << 13, block_kind_export_end = 1 << 15, diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 6250b7f285d..dd3345adabb 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2121,6 +2121,7 @@ lower_to_hw_instr(Program* program) if (!discard_block) { discard_block = program->create_and_insert_block(); + discard_block->kind = block_kind_discard_early_exit; block = &program->blocks[block_idx]; bld.reset(discard_block); @@ -2133,8 +2134,7 @@ lower_to_hw_instr(Program* program) } assert(instr->operands[0].physReg() == scc); - 
bld.sopp(aco_opcode::s_cbranch_scc0, Definition(exec, s2), instr->operands[0], - discard_block->index); + bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index); discard_block->linear_preds.push_back(block->index); block->linear_succs.push_back(discard_block->index); diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp index 3e1669f44e8..1cca50e7d8f 100644 --- a/src/amd/compiler/tests/test_assembler.cpp +++ b/src/amd/compiler/tests/test_assembler.cpp @@ -226,6 +226,39 @@ BEGIN_TEST(assembler.long_jump.constaddr) finish_assembler_test(); END_TEST +BEGIN_TEST(assembler.long_jump.discard_early_exit) + if (!setup_cs(NULL, (amd_gfx_level)GFX10)) + return; + + //! BB0: + //! s_cbranch_scc1 BB1 ; bf850006 + //! s_getpc_b64 s[0:1] ; be801f00 + //! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014 + //! s_bitcmp1_b32 s0, 0 ; bf0d8000 + //! s_bitset0_b32 s0, 0 ; be801b80 + //! s_setpc_b64 s[0:1] ; be802000 + bld.sopp(aco_opcode::s_cbranch_scc0, 2); + + bld.reset(program->create_and_insert_block()); + + //! BB1: + //! s_nop 1 ; bf800001 + //!(then repeated 32766 times) + //! s_endpgm ; bf810000 + for (unsigned i = 0; i < INT16_MAX; i++) + bld.sopp(aco_opcode::s_nop, -1, 1); + + //! BB2: + //! s_endpgm ; bf810000 + bld.reset(program->create_and_insert_block()); + + program->blocks[1].linear_preds.push_back(0u); + program->blocks[2].linear_preds.push_back(0u); + program->blocks[2].kind = block_kind_discard_early_exit; + + finish_assembler_test(); +END_TEST + BEGIN_TEST(assembler.v_add3) for (unsigned i = GFX9; i <= GFX10; i++) { if (!setup_cs(NULL, (amd_gfx_level)i))