From ee40beb60d2877efc88fa21097cfc0439fd4223e Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 24 Apr 2024 16:57:10 +0100
Subject: [PATCH] aco/waitcnt: fix DS/VMEM ordered writes when mixed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry
Reviewed-by: Georg Lehmann
Reviewed-by: Daniel Schürmann
Cc: mesa-stable
Part-of:
(cherry picked from commit 5b1b09ad429f180dd0b7758487b7e0557c8b094b)
---
 .pick_status.json                          |  2 +-
 src/amd/compiler/aco_insert_waitcnt.cpp    |  8 ++-
 .../compiler/tests/test_insert_waitcnt.cpp | 68 +++++++++++++++++++
 3 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index bf802c0cfa1..9908c3c5f63 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -244,7 +244,7 @@
         "description": "aco/waitcnt: fix DS/VMEM ordered writes when mixed",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index ae94582f6ce..51f02d47549 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -411,18 +411,20 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i
          if (it == ctx.gpr_map.end())
             continue;
 
+         wait_imm reg_imm = it->second.imm;
+
          /* Vector Memory reads and writes return in the order they were issued */
          uint8_t vmem_type = get_vmem_type(instr);
          if (vmem_type && ((it->second.events & vm_events) == event_vmem) &&
              it->second.vmem_types == vmem_type)
-            continue;
+            reg_imm.vm = wait_imm::unset_counter;
 
          /* LDS reads and writes return in the order they were issued. same for GDS */
          if (instr->isDS() &&
              (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
-            continue;
+            reg_imm.lgkm = wait_imm::unset_counter;
 
-         wait.combine(it->second.imm);
+         wait.combine(reg_imm);
       }
    }
 }
diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp
index b9fb5b7e12a..e1a933a62d7 100644
--- a/src/amd/compiler/tests/test_insert_waitcnt.cpp
+++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp
@@ -111,3 +111,71 @@ BEGIN_TEST(insert_waitcnt.clause)
 
    finish_waitcnt_test();
 END_TEST
+
+BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.vmem)
+   if (!setup_cs(NULL, GFX10))
+      return;
+
+   Definition def_v4(PhysReg(260), v1);
+   Operand op_v0(PhysReg(256), v1);
+   Operand desc0(PhysReg(0), s4);
+
+   //>> BB0
+   //! /* logical preds: / linear preds: / kind: top-level, */
+   //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
+   bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
+
+   //>> BB1
+   //! /* logical preds: / linear preds: / kind: */
+   //! v1: %0:v[4] = ds_read_b32 %0:v[0]
+   bld.reset(program->create_and_insert_block());
+   bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
+
+   bld.reset(program->create_and_insert_block());
+   program->blocks[2].linear_preds.push_back(0);
+   program->blocks[2].linear_preds.push_back(1);
+   program->blocks[2].logical_preds.push_back(0);
+   program->blocks[2].logical_preds.push_back(1);
+
+   //>> BB2
+   //! /* logical preds: BB0, BB1, / linear preds: BB0, BB1, / kind: uniform, */
+   //! s_waitcnt lgkmcnt(0)
+   //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
+   bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
+
+   finish_waitcnt_test();
+END_TEST
+
+BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.lds)
+   if (!setup_cs(NULL, GFX10))
+      return;
+
+   Definition def_v4(PhysReg(260), v1);
+   Operand op_v0(PhysReg(256), v1);
+   Operand desc0(PhysReg(0), s4);
+
+   //>> BB0
+   //! /* logical preds: / linear preds: / kind: top-level, */
+   //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
+   bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
+
+   //>> BB1
+   //! /* logical preds: / linear preds: / kind: */
+   //! v1: %0:v[4] = ds_read_b32 %0:v[0]
+   bld.reset(program->create_and_insert_block());
+   bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
+
+   bld.reset(program->create_and_insert_block());
+   program->blocks[2].linear_preds.push_back(0);
+   program->blocks[2].linear_preds.push_back(1);
+   program->blocks[2].logical_preds.push_back(0);
+   program->blocks[2].logical_preds.push_back(1);
+
+   //>> BB2
+   //! /* logical preds: BB0, BB1, / linear preds: BB0, BB1, / kind: uniform, */
+   //! s_waitcnt vmcnt(0)
+   //! v1: %0:v[4] = ds_read_b32 %0:v[0]
+   bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
+
+   finish_waitcnt_test();
+END_TEST