From 369c9b642524cb543c59d006ad7e3ce089fa6879 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Tue, 15 Nov 2022 05:51:24 +0000
Subject: [PATCH] aco: fix p_interp_gfx11 to not overwrite SCC

s_wqm_b64 clobbers SCC. Found this while working on dual source blending.

Fixes: 6113ee650a2 ("aco/gfx11: fix FS input loads in quad-divergent control flow")
Signed-off-by: Samuel Pitoiset
Reviewed-by: Georg Lehmann
Reviewed-by: Rhys Perry
Part-of:
---
 src/amd/compiler/aco_builder_h.py              | 2 +-
 src/amd/compiler/aco_instruction_selection.cpp | 2 +-
 src/amd/compiler/aco_lower_to_hw_instr.cpp     | 5 ++++-
 src/amd/compiler/aco_opcodes.py                | 4 ++--
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index cff50c89132..41912825be7 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -522,7 +522,7 @@ public:
    }
 <%
 import itertools
-formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8), (2, 6)]),
+formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8), (2, 6), (3,6)]),
            ("sop1", [Format.SOP1], 'SOP1_instruction', [(0, 1), (1, 0), (1, 1), (2, 1), (3, 2)]),
            ("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])),
            ("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])),
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 83a853742ef..50ef490c30f 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -5329,7 +5329,7 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
          prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
          Operand coord2_op(coord2);
          coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
-         bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm),
+         bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm), bld.def(s1, scc),
                     Operand(v1.as_linear()), Operand::c32(idx), Operand::c32(component), coord1,
                     coord2_op, prim_mask_op);
          return;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 74244b0907e..7984237c154 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2384,12 +2384,14 @@ lower_to_hw_instr(Program* program)
                assert(instr->definitions[0].regClass() == v1 ||
                       instr->definitions[0].regClass() == v2b);
                assert(instr->definitions[1].regClass() == bld.lm);
+               assert(instr->definitions[2].isFixed() && instr->definitions[2].physReg() == scc);
                assert(instr->operands[0].regClass() == v1.as_linear());
                assert(instr->operands[1].isConstant());
                assert(instr->operands[2].isConstant());
                assert(instr->operands.back().physReg() == m0);
                Definition dst = instr->definitions[0];
                PhysReg exec_tmp = instr->definitions[1].physReg();
+               Definition clobber_scc = instr->definitions[2];
                PhysReg lin_vgpr = instr->operands[0].physReg();
                unsigned attribute = instr->operands[1].constantValue();
                unsigned component = instr->operands[2].constantValue();
@@ -2406,7 +2408,8 @@ lower_to_hw_instr(Program* program)
                }
 
                bld.sop1(Builder::s_mov, Definition(exec_tmp, bld.lm), Operand(exec, bld.lm));
-               bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), Operand(exec, bld.lm));
+               bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
+                        Operand(exec, bld.lm));
                bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
                           attribute, component);
                bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(exec_tmp, bld.lm));
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 2594d4f5cb2..cd89b402707 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -336,8 +336,8 @@ opcode("p_init_scratch")
 opcode("p_jump_to_epilog")
 
 # loads and interpolates a fragment shader input with a correct exec mask
-#dst0=result, dst1=exec_tmp, src0=linear_vgpr, src1=attribute, src2=component, src3=coord1, src4=coord2, src5=m0
-#dst0=result, dst1=exec_tmp, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0
+#dst0=result, dst1=exec_tmp, dst2=clobber_scc, src0=linear_vgpr, src1=attribute, src2=component, src3=coord1, src4=coord2, src5=m0
+#dst0=result, dst1=exec_tmp, dst2=clobber_scc, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0
 opcode("p_interp_gfx11")
 
 # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)