From ec347ee9bc41f99dc8e398c652d873cc192bc99c Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Mon, 9 Nov 2020 19:42:22 +0100 Subject: [PATCH] aco: fix combining add/sub to b2i if a new dest needs to be allocated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The uses vector needs to be expanded to avoid out of bounds access and to make sure the number of uses is initialized to 0. This fixes combining more v_and(a, v_subbrev_co_u32). fossil-db (Vega10): Totals from 4574 (3.28% of 139517) affected shaders: SGPRs: 291625 -> 292217 (+0.20%); split: -0.01%, +0.21% VGPRs: 276368 -> 276188 (-0.07%); split: -0.07%, +0.01% SpillSGPRs: 455 -> 533 (+17.14%) SpillVGPRs: 76 -> 78 (+2.63%) CodeSize: 23327500 -> 23304152 (-0.10%); split: -0.17%, +0.07% MaxWaves: 22044 -> 22066 (+0.10%) Instrs: 4583064 -> 4576301 (-0.15%); split: -0.15%, +0.01% Cycles: 47925276 -> 47871968 (-0.11%); split: -0.13%, +0.01% VMEM: 1599363 -> 1597473 (-0.12%); split: +0.08%, -0.19% SMEM: 331461 -> 331126 (-0.10%); split: +0.08%, -0.18% VClause: 80639 -> 80696 (+0.07%); split: -0.02%, +0.09% SClause: 155992 -> 155993 (+0.00%); split: -0.02%, +0.02% Copies: 333482 -> 333318 (-0.05%); split: -0.12%, +0.07% Branches: 70967 -> 70968 (+0.00%) PreSGPRs: 187078 -> 187711 (+0.34%); split: -0.01%, +0.35% PreVGPRs: 244918 -> 244785 (-0.05%) Signed-off-by: Samuel Pitoiset Reviewed-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 13 ++++++++++--- src/amd/compiler/tests/test_optimizer.cpp | 6 ++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index aa9fad589b3..9d6219037de 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -2260,9 +2260,16 @@ bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode n } ctx.uses[instr->operands[i].tempId()]--; new_instr->definitions[0] = 
instr->definitions[0]; - new_instr->definitions[1] = - instr->definitions.size() == 2 ? instr->definitions[1] : - Definition(ctx.program->allocateTmp(ctx.program->lane_mask)); + if (instr->definitions.size() == 2) { + new_instr->definitions[1] = instr->definitions[1]; + } else { + new_instr->definitions[1] = + Definition(ctx.program->allocateTmp(ctx.program->lane_mask)); + /* Make sure the uses vector is large enough and the number of + * uses properly initialized to 0. + */ + ctx.uses.push_back(0); + } new_instr->definitions[1].setHint(vcc); new_instr->operands[0] = Operand(0u); new_instr->operands[1] = instr->operands[!i]; diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 4bb5898e236..f43ae731072 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -119,6 +119,12 @@ BEGIN_TEST(optimize.cndmask) Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev); writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev)); + //! v1: %res4 = v_cndmask_b32 0, %a, %c + //! p_unit_test 4, %res4 + Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), Operand(inputs[2])); + Temp sub = bld.vsub32(bld.def(v1), Operand(0u), cndmask); + writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub)); + finish_opt_test(); } END_TEST