diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 02498c01195..2e43fa2d62a 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -1019,6 +1019,37 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector& register_demand, + Instruction* current, int idx) +{ + hazard_query hq; + init_hazard_query(ctx, &hq); + + DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true); + unsigned skip = 0; + + for (int i = 0; i < VMEM_CLAUSE_MAX_GRAB_DIST; i++) { + aco_ptr& candidate = block->instructions[cursor.source_idx]; + if (candidate->opcode == aco_opcode::p_logical_start) + break; + + if (!should_form_clause(current, candidate.get())) { + add_to_hazard_query(&hq, candidate.get()); + ctx.mv.downwards_skip(cursor); + continue; + } + + if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success || + ctx.mv.downwards_move(cursor, true) != move_success) + break; + + skip++; + } + + return skip; +} + void schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars) { @@ -1028,6 +1059,7 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars) ctx.mv.register_demand = live_vars.register_demand[block->index].data(); /* go through all instructions and find memory loads */ + unsigned num_stores = 0; for (unsigned idx = 0; idx < block->instructions.size(); idx++) { Instruction* current = block->instructions[idx].get(); @@ -1040,8 +1072,10 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars) } } - if (current->definitions.empty()) + if (current->definitions.empty()) { + num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0; continue; + } if (current->isVMEM() || current->isFlatLike()) { ctx.mv.current = current; @@ -1054,6 +1088,19 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars) } } + /* GFX11 benefits from creating VMEM store clauses. */ + if (num_stores > 1 && program->gfx_level >= GFX11) { + for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { + Instruction* current = block->instructions[idx].get(); + if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike())) + continue; + + ctx.mv.current = current; + idx -= + schedule_VMEM_store(ctx, block, live_vars.register_demand[block->index], current, idx); + } + } + /* resummarize the block's register demand */ block->register_demand = RegisterDemand(); for (unsigned idx = 0; idx < block->instructions.size(); idx++) {