From bbbb2be123e709dc703d52cc2d81b732b6213213 Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Wed, 3 Sep 2025 16:15:31 +0200 Subject: [PATCH] r600/sfn: emit 64 bit predicates like normal ALU ops Also clean up the scheduler changes we did to deal with one-slot and two slot predicate ops at the same time. Signed-off-by: Gert Wollny Part-of: --- .../drivers/r600/sfn/sfn_scheduler.cpp | 69 +++++++++++++++++-- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp b/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp index 7f847caf572..a4e25281009 100644 --- a/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp @@ -161,6 +161,9 @@ private: bool collect_ready_alu_vec(std::list& ready, std::list& available, AluInstr **predicate); + bool collect_ready_alu_multislot(std::list& ready, + std::list& available, + AluInstr **predicate); bool schedule_tex(Shader::ShaderBlocks& out_blocks); bool schedule_vtx(Shader::ShaderBlocks& out_blocks); @@ -1138,7 +1141,6 @@ BlockScheduler::collect_ready(CollectInstructions& available) sfn_log << SfnLog::schedule << "Ready instructions\n"; bool result = false; result |= collect_ready_type(alu_trans_ready, available.alu_trans); - result |= collect_ready_type(alu_multi_slot_ready, available.alu_multi_slot); result |= collect_ready_type(alu_groups_ready, available.alu_groups); result |= collect_ready_type(gds_ready, available.gds_instr); result |= collect_ready_type(tex_ready, available.tex); @@ -1146,13 +1148,34 @@ BlockScheduler::collect_ready(CollectInstructions& available) result |= collect_ready_type(free_ready, available.free_instr); result |= collect_ready_type(waitacks_ready, available.waitacks); - if (!result && available.predicate && available.alu_groups.empty() && - available.gds_instr.empty() && available.tex.empty() && - available.fetches.empty() && available.free_instr.empty()) - result |= - collect_ready_alu_vec(alu_vec_ready, available.alu_vec, &available.predicate); - else + bool may_emit_predicate = + !result && available.predicate && available.alu_groups.empty() && + available.alu_trans.empty() && available.gds_instr.empty() && + available.tex.empty() && available.fetches.empty() && available.free_instr.empty(); + + if (may_emit_predicate) { + if (available.predicate->alu_slots() > 1) { + result |= collect_ready_alu_vec(alu_vec_ready, available.alu_vec, nullptr); + + if (!result) + result |= collect_ready_alu_multislot(alu_multi_slot_ready, + available.alu_multi_slot, + &available.predicate); + } else { + result |= collect_ready_alu_multislot(alu_multi_slot_ready, + available.alu_multi_slot, + nullptr); + if (!result) + result |= collect_ready_alu_vec(alu_vec_ready, + available.alu_vec, + &available.predicate); + } + } else { result |= collect_ready_alu_vec(alu_vec_ready, available.alu_vec, nullptr); + result |= collect_ready_alu_multislot(alu_multi_slot_ready, + available.alu_multi_slot, + nullptr); + } sfn_log << SfnLog::schedule << "\n"; return result; @@ -1225,6 +1248,7 @@ BlockScheduler::collect_ready_alu_vec(std::list& ready, if (predicate && *predicate && available.empty() && ready.size() < 16 && (*predicate)->ready()) { + assert((*predicate)->alu_slots() == 1); ready.push_back(*predicate); *predicate = nullptr; } @@ -1242,6 +1266,37 @@ BlockScheduler::collect_ready_alu_vec(std::list& ready, return !ready.empty(); } +bool +BlockScheduler::collect_ready_alu_multislot(std::list& ready, + std::list& available, + AluInstr **predicate) +{ + auto i = available.begin(); + auto e = available.end(); + + int lookahead = 16; + while (i != e && ready.size() < 16 && lookahead-- > 0) { + if ((*i)->ready()) { + ready.push_back(*i); + auto old_i = i; + ++i; + available.erase(old_i); + } else + ++i; + } + + if (predicate && *predicate && available.empty() && ready.size() < 16 && + (*predicate)->ready()) { + ready.push_back(*predicate); + *predicate = nullptr; + } + + for (auto& i : ready) + sfn_log << SfnLog::schedule << "M: " << *i << "\n"; + + return !ready.empty(); +} + template struct type_char { };