2023-10-12 10:52:45 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright 2023 Valve Corporation
|
|
|
|
|
* SPDX-License-Identifier: MIT
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "aco_ir.h"
|
|
|
|
|
|
|
|
|
|
#include "util/bitscan.h"
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
#include "util/bitset.h"
|
2023-10-12 10:52:45 +02:00
|
|
|
#include "util/macros.h"
|
|
|
|
|
|
|
|
|
|
#include <limits>
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* This pass implements a simple forward list-scheduler which works on a small
|
|
|
|
|
* partial DAG of 16 nodes at any time. Only ALU instructions are scheduled
|
|
|
|
|
* entirely freely. Memory load instructions must be kept in-order and any other
|
|
|
|
|
* instruction must not be re-scheduled at all.
|
|
|
|
|
*
|
|
|
|
|
* The main goal of this scheduler is to create more memory clauses, schedule
|
|
|
|
|
* memory loads early, and to improve ALU instruction level parallelism.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
namespace aco {
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
|
|
/* Number of instruction slots in the partial DAG the scheduler works on. */
constexpr unsigned num_nodes = 16;

/* Bitmask type with (at least) one bit per DAG node. */
using mask_t = uint16_t;

static_assert(std::numeric_limits<mask_t>::digits >= num_nodes,
              "mask_t must provide one bit per DAG node");
|
|
|
|
|
|
2024-01-17 20:47:27 +00:00
|
|
|
struct VOPDInfo {
|
2025-04-09 15:14:48 +01:00
|
|
|
VOPDInfo() : can_be_opx(0), is_dst_odd(0), src_banks(0), has_literal(0), is_commutative(0) {}
|
|
|
|
|
uint16_t can_be_opx : 1;
|
2024-01-17 20:47:27 +00:00
|
|
|
uint16_t is_dst_odd : 1;
|
|
|
|
|
uint16_t src_banks : 10; /* 0-3: src0, 4-7: src1, 8-9: src2 */
|
|
|
|
|
uint16_t has_literal : 1;
|
2024-02-06 11:17:02 +00:00
|
|
|
uint16_t is_commutative : 1;
|
2024-01-17 20:47:27 +00:00
|
|
|
aco_opcode op = aco_opcode::num_opcodes;
|
|
|
|
|
uint32_t literal = 0;
|
2025-03-26 16:01:29 +00:00
|
|
|
uint8_t port_vgprs[2] = {0, 0};
|
2024-01-17 20:47:27 +00:00
|
|
|
};
|
|
|
|
|
|
2023-10-12 10:52:45 +02:00
|
|
|
struct InstrInfo {
|
|
|
|
|
Instruction* instr;
|
2025-01-28 10:10:14 +01:00
|
|
|
int16_t wait_cycles; /* estimated remaining cycles until instruction can be issued. */
|
2023-10-12 10:52:45 +02:00
|
|
|
mask_t dependency_mask; /* bitmask of nodes which have to be scheduled before this node. */
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
mask_t write_for_read_mask; /* bitmask of nodes in the DAG that have a RaW dependency. */
|
2023-10-12 10:52:45 +02:00
|
|
|
uint8_t next_non_reorderable; /* index of next non-reorderable instruction node after this one. */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct RegisterInfo {
|
|
|
|
|
mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
|
aco/sched_ilp: use more realistic memory latencies
The last commit changes order of instructions more aggressively,
and because the memory load latencies here are wastly underestimated,
it ruins some of the work of pre-RA memory scheduling.
With the new heuristic large latency values work fine, so use them.
Foz-DB GFX1150:
Totals from 71343 (89.88% of 79377) affected shaders:
Instrs: 41627671 -> 41915029 (+0.69%); split: -0.01%, +0.70%
CodeSize: 215901308 -> 217051132 (+0.53%); split: -0.01%, +0.54%
Latency: 288714439 -> 286556159 (-0.75%); split: -0.76%, +0.02%
InvThroughput: 55834139 -> 55645301 (-0.34%); split: -0.35%, +0.01%
VClause: 829066 -> 828984 (-0.01%); split: -0.04%, +0.03%
SClause: 1237366 -> 1237448 (+0.01%); split: -0.02%, +0.02%
VALU: 23643291 -> 23643292 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 70576 (88.91% of 79377) affected shaders:
Instrs: 40928125 -> 41211820 (+0.69%); split: -0.01%, +0.70%
CodeSize: 215770956 -> 216897948 (+0.52%); split: -0.00%, +0.53%
Latency: 288139802 -> 286038405 (-0.73%); split: -0.75%, +0.02%
InvThroughput: 46391629 -> 46300275 (-0.20%); split: -0.20%, +0.01%
VClause: 829987 -> 829997 (+0.00%); split: -0.02%, +0.02%
SClause: 1229345 -> 1229425 (+0.01%); split: -0.02%, +0.02%
VALU: 24515334 -> 24515335 (+0.00%)
Foz-DB Navi21:
Instrs: 45512672 -> 45527322 (+0.03%); split: -0.01%, +0.04%
CodeSize: 244254716 -> 244311472 (+0.02%); split: -0.01%, +0.03%
Latency: 314034443 -> 311473726 (-0.82%); split: -0.83%, +0.01%
InvThroughput: 73373201 -> 73220438 (-0.21%); split: -0.21%, +0.00%
VClause: 914819 -> 914853 (+0.00%); split: -0.02%, +0.02%
SClause: 1283331 -> 1283302 (-0.00%); split: -0.01%, +0.01%
Foz-DB Vega10:
Totals from 41908 (66.49% of 63026) affected shaders:
Instrs: 22770415 -> 22779136 (+0.04%); split: -0.01%, +0.04%
CodeSize: 118195752 -> 118230540 (+0.03%); split: -0.00%, +0.03%
Latency: 242119940 -> 239665380 (-1.01%); split: -1.02%, +0.01%
InvThroughput: 131459884 -> 131182979 (-0.21%); split: -0.21%, +0.00%
VClause: 493311 -> 493215 (-0.02%); split: -0.05%, +0.03%
SClause: 758814 -> 758761 (-0.01%); split: -0.02%, +0.01%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 15:57:44 +01:00
|
|
|
uint16_t latency : 11; /* estimated outstanding latency of last register write outside the DAG. */
|
|
|
|
|
uint16_t direct_dependency : 4; /* node that has to be scheduled before any other access. */
|
|
|
|
|
uint16_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
|
2023-10-12 10:52:45 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct SchedILPContext {
|
|
|
|
|
Program* program;
|
2024-01-17 20:47:27 +00:00
|
|
|
bool is_vopd = false;
|
2023-10-12 10:52:45 +02:00
|
|
|
InstrInfo nodes[num_nodes];
|
|
|
|
|
RegisterInfo regs[512];
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
BITSET_DECLARE(reg_has_latency, 512) = { 0 };
|
2023-10-12 10:52:45 +02:00
|
|
|
mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
|
|
|
|
|
mask_t active_mask = 0; /* bitmask of valid instruction nodes. */
|
|
|
|
|
uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
|
|
|
|
|
uint8_t last_non_reorderable = UINT8_MAX; /* index of last node which should not be reordered. */
|
2025-01-19 11:18:01 +01:00
|
|
|
bool potential_partial_clause; /* indicates that last_non_reorderable is the last instruction in
|
|
|
|
|
the DAG, meaning the clause might continue outside of it. */
|
2024-01-17 20:47:27 +00:00
|
|
|
|
|
|
|
|
/* VOPD scheduler: */
|
|
|
|
|
VOPDInfo vopd[num_nodes];
|
|
|
|
|
VOPDInfo prev_vopd_info;
|
|
|
|
|
InstrInfo prev_info;
|
|
|
|
|
|
|
|
|
|
mask_t vopd_odd_mask = 0;
|
|
|
|
|
mask_t vopd_even_mask = 0;
|
2023-10-12 10:52:45 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns true for side-effect free SALU and VALU instructions.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
can_reorder(const Instruction* const instr)
|
|
|
|
|
{
|
2025-01-24 08:42:00 +01:00
|
|
|
if (instr->isVALU() || instr->isVINTRP())
|
2023-10-12 10:52:45 +02:00
|
|
|
return true;
|
|
|
|
|
if (!instr->isSALU() || instr->isSOPP())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (instr->opcode) {
|
|
|
|
|
/* SOP2 */
|
|
|
|
|
case aco_opcode::s_cbranch_g_fork:
|
|
|
|
|
case aco_opcode::s_rfe_restore_b64:
|
|
|
|
|
/* SOP1 */
|
|
|
|
|
case aco_opcode::s_setpc_b64:
|
|
|
|
|
case aco_opcode::s_swappc_b64:
|
|
|
|
|
case aco_opcode::s_rfe_b64:
|
|
|
|
|
case aco_opcode::s_cbranch_join:
|
|
|
|
|
case aco_opcode::s_set_gpr_idx_idx:
|
|
|
|
|
case aco_opcode::s_sendmsg_rtn_b32:
|
|
|
|
|
case aco_opcode::s_sendmsg_rtn_b64:
|
2024-05-16 16:54:49 +01:00
|
|
|
case aco_opcode::s_barrier_signal:
|
|
|
|
|
case aco_opcode::s_barrier_signal_isfirst:
|
|
|
|
|
case aco_opcode::s_get_barrier_state:
|
|
|
|
|
case aco_opcode::s_barrier_init:
|
|
|
|
|
case aco_opcode::s_barrier_join:
|
|
|
|
|
case aco_opcode::s_wakeup_barrier:
|
2023-10-12 10:52:45 +02:00
|
|
|
/* SOPK */
|
|
|
|
|
case aco_opcode::s_cbranch_i_fork:
|
|
|
|
|
case aco_opcode::s_getreg_b32:
|
|
|
|
|
case aco_opcode::s_setreg_b32:
|
|
|
|
|
case aco_opcode::s_setreg_imm32_b32:
|
|
|
|
|
case aco_opcode::s_call_b64:
|
|
|
|
|
case aco_opcode::s_waitcnt_vscnt:
|
|
|
|
|
case aco_opcode::s_waitcnt_vmcnt:
|
|
|
|
|
case aco_opcode::s_waitcnt_expcnt:
|
|
|
|
|
case aco_opcode::s_waitcnt_lgkmcnt:
|
|
|
|
|
case aco_opcode::s_subvector_loop_begin:
|
|
|
|
|
case aco_opcode::s_subvector_loop_end:
|
|
|
|
|
/* SOPC */
|
|
|
|
|
case aco_opcode::s_setvskip:
|
|
|
|
|
case aco_opcode::s_set_gpr_idx_on: return false;
|
|
|
|
|
default: break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-17 20:47:27 +00:00
|
|
|
/* Builds the VOPDInfo for 'instr'. A default-constructed VOPDInfo (op == num_opcodes) is
 * returned whenever the instruction cannot be encoded as one half of a VOPD instruction.
 */
VOPDInfo
get_vopd_info(const SchedILPContext& ctx, const Instruction* instr)
{
   /* Only VOP1/VOP2 encoded instructions have dual-issue counterparts. */
   const bool is_vop1_or_vop2 = instr->format == Format::VOP1 || instr->format == Format::VOP2;
   if (!is_vop1_or_vop2)
      return VOPDInfo();

   const aco_opcode opcode = instr->opcode;
   const bool is_bfrev = opcode == aco_opcode::v_bfrev_b32;
   const bool is_fmamk = opcode == aco_opcode::v_fmamk_f32;

   /* Start optimistic; the opcode table below removes capabilities where needed. */
   VOPDInfo info;
   info.can_be_opx = true;
   info.is_commutative = true;

   switch (opcode) {
   case aco_opcode::v_fmac_f32: info.op = aco_opcode::v_dual_fmac_f32; break;
   case aco_opcode::v_fmaak_f32: info.op = aco_opcode::v_dual_fmaak_f32; break;
   case aco_opcode::v_fmamk_f32:
      info.op = aco_opcode::v_dual_fmamk_f32;
      info.is_commutative = false;
      break;
   case aco_opcode::v_mul_f32: info.op = aco_opcode::v_dual_mul_f32; break;
   case aco_opcode::v_add_f32: info.op = aco_opcode::v_dual_add_f32; break;
   case aco_opcode::v_sub_f32: info.op = aco_opcode::v_dual_sub_f32; break;
   case aco_opcode::v_subrev_f32: info.op = aco_opcode::v_dual_subrev_f32; break;
   case aco_opcode::v_mul_legacy_f32: info.op = aco_opcode::v_dual_mul_dx9_zero_f32; break;
   case aco_opcode::v_mov_b32: info.op = aco_opcode::v_dual_mov_b32; break;
   case aco_opcode::v_bfrev_b32:
      /* A bit-reverse of a constant is treated as a move of the reversed constant. */
      if (!instr->operands[0].isConstant())
         return VOPDInfo();
      info.op = aco_opcode::v_dual_mov_b32;
      break;
   case aco_opcode::v_cndmask_b32:
      /* Each instruction may use at most one SGPR. */
      if (instr->operands[0].isOfType(RegType::sgpr))
         return VOPDInfo();
      info.op = aco_opcode::v_dual_cndmask_b32;
      info.is_commutative = false;
      break;
   case aco_opcode::v_max_f32: info.op = aco_opcode::v_dual_max_f32; break;
   case aco_opcode::v_min_f32: info.op = aco_opcode::v_dual_min_f32; break;
   case aco_opcode::v_dot2c_f32_f16: info.op = aco_opcode::v_dual_dot2acc_f32_f16; break;
   case aco_opcode::v_add_u32:
      info.op = aco_opcode::v_dual_add_nc_u32;
      info.can_be_opx = false;
      break;
   case aco_opcode::v_lshlrev_b32:
      info.op = aco_opcode::v_dual_lshlrev_b32;
      info.can_be_opx = false;
      info.is_commutative = false;
      break;
   case aco_opcode::v_and_b32:
      info.op = aco_opcode::v_dual_and_b32;
      info.can_be_opx = false;
      break;
   default: return VOPDInfo();
   }

   info.is_dst_odd = instr->definitions[0].physReg().reg() & 0x1;

   /* Number of register-index bits that select the bank, per source port. */
   static const unsigned bank_mask[3] = {0x3, 0x3, 0x1};

   bool reads_sgpr = false;
   const unsigned num_operands = instr->operands.size();
   for (unsigned idx = 0; idx < num_operands; idx++) {
      Operand operand = instr->operands[idx];
      if (is_bfrev) {
         /* Replace the constant by its bit-reversed value, matching the v_dual_mov_b32. */
         const uint32_t reversed = util_bitreverse(operand.constantValue());
         operand = Operand::get_const(ctx.program->gfx_level, reversed, 4);
      }

      /* For v_fmamk_f32, operand 1 is accounted to the src2 port. */
      const unsigned port = (is_fmamk && idx == 1) ? 2 : idx;

      if (operand.isOfType(RegType::vgpr)) {
         const unsigned reg = operand.physReg().reg();
         info.src_banks |= 1 << (port * 4 + (reg & bank_mask[port]));
         if (port < 2)
            info.port_vgprs[port] = reg;
      }

      /* Literals can show up in any operand slot (fmaak/fmamk), so inspect all of them. */
      if (operand.isLiteral()) {
         assert(!info.has_literal || info.literal == operand.constantValue());
         info.has_literal = true;
         info.literal = operand.constantValue();
      }

      /* SGPRs can show up in any operand slot as well (cndmask). */
      if (!operand.isConstant() && operand.isOfType(RegType::sgpr))
         reads_sgpr = true;
   }

   /* An instruction can't use both a literal and an SGPR. */
   if (reads_sgpr && info.has_literal)
      return VOPDInfo();

   /* Operands are only considered exchangeable if the first one is a VGPR. */
   if (!instr->operands[0].isOfType(RegType::vgpr))
      info.is_commutative = false;

   return info;
}
|
|
|
|
|
|
2025-03-26 15:47:40 +00:00
|
|
|
bool
|
2025-03-26 15:53:18 +00:00
|
|
|
are_src_banks_compatible(enum amd_gfx_level gfx_level, const VOPDInfo& a, const VOPDInfo& b,
|
|
|
|
|
bool swap)
|
2025-03-26 15:47:40 +00:00
|
|
|
{
|
2025-03-26 15:53:18 +00:00
|
|
|
if (gfx_level >= GFX12 && a.op == aco_opcode::v_dual_mov_b32 &&
|
|
|
|
|
b.op == aco_opcode::v_dual_mov_b32) {
|
|
|
|
|
/* On GFX12+, OPY uses src2 if both OPX and OPY are v_dual_mov_b32, so there are no
|
|
|
|
|
* compatibility issues. */
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-26 15:47:40 +00:00
|
|
|
uint16_t a_src_banks = a.src_banks;
|
2025-03-26 16:01:29 +00:00
|
|
|
uint8_t a_port_vgprs[2] = {a.port_vgprs[0], a.port_vgprs[1]};
|
2025-03-26 15:47:40 +00:00
|
|
|
if (swap) {
|
|
|
|
|
uint16_t src0 = a.src_banks & 0xf;
|
|
|
|
|
uint16_t src1 = a.src_banks & 0xf0;
|
|
|
|
|
uint16_t src2 = a.src_banks & 0x300;
|
|
|
|
|
a_src_banks = (src0 << 4) | (src1 >> 4) | src2;
|
2025-03-26 16:01:29 +00:00
|
|
|
std::swap(a_port_vgprs[0], a_port_vgprs[1]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* On GFX12+, we can skip checking a src0/src1 port if both SRCx and SRCy use the same VGPR and
|
|
|
|
|
* the same sized operand.
|
|
|
|
|
*/
|
|
|
|
|
if (gfx_level >= GFX12) {
|
|
|
|
|
bool a_is_dot2cc =
|
|
|
|
|
a.op == aco_opcode::v_dual_dot2acc_f32_f16 || a.op == aco_opcode::v_dual_dot2acc_f32_bf16;
|
|
|
|
|
bool b_is_dot2cc =
|
|
|
|
|
b.op == aco_opcode::v_dual_dot2acc_f32_f16 || b.op == aco_opcode::v_dual_dot2acc_f32_bf16;
|
|
|
|
|
if (a_port_vgprs[0] == b.port_vgprs[0] && a_is_dot2cc == b_is_dot2cc)
|
|
|
|
|
a_src_banks &= ~0xf;
|
|
|
|
|
if (a_port_vgprs[1] == b.port_vgprs[1] && a_is_dot2cc == b_is_dot2cc)
|
|
|
|
|
a_src_banks &= ~0xf0;
|
2025-03-26 15:47:40 +00:00
|
|
|
}
|
2025-03-26 16:01:29 +00:00
|
|
|
|
2025-03-26 15:47:40 +00:00
|
|
|
return (a_src_banks & b.src_banks) == 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-31 17:36:27 +01:00
|
|
|
/* Bit flags describing how two instructions can be combined into a VOPD instruction.
 * A value of zero means the pair cannot be combined at all.
 */
enum vopd_compatibility {
   vopd_incompatible = 0,
   vopd_first_is_opx = 1 << 0,  /* the first instruction may take the OPX slot */
   vopd_second_is_opx = 1 << 1, /* the second instruction may take the OPX slot */
   vopd_need_swap = 1 << 2,     /* operands of one instruction have to be swapped */
};
|
|
|
|
|
|
|
|
|
|
unsigned
|
2025-03-26 15:53:18 +00:00
|
|
|
is_vopd_compatible(enum amd_gfx_level gfx_level, const VOPDInfo& a, const VOPDInfo& b)
|
2024-02-06 11:17:02 +00:00
|
|
|
{
|
2025-04-09 15:14:48 +01:00
|
|
|
if ((!a.can_be_opx && !b.can_be_opx) || (a.is_dst_odd == b.is_dst_odd))
|
2025-03-31 17:36:27 +01:00
|
|
|
return vopd_incompatible;
|
2024-02-06 11:17:02 +00:00
|
|
|
|
|
|
|
|
/* Both can use a literal, but it must be the same literal. */
|
|
|
|
|
if (a.has_literal && b.has_literal && a.literal != b.literal)
|
2025-03-31 17:36:27 +01:00
|
|
|
return vopd_incompatible;
|
2024-02-06 11:17:02 +00:00
|
|
|
|
2025-03-31 17:36:27 +01:00
|
|
|
unsigned compat = vopd_incompatible;
|
2025-03-27 17:21:09 +00:00
|
|
|
|
2024-02-06 11:17:02 +00:00
|
|
|
/* The rest is checking src VGPR bank compatibility. */
|
2025-03-26 15:53:18 +00:00
|
|
|
if (are_src_banks_compatible(gfx_level, a, b, false)) {
|
2025-03-31 17:36:27 +01:00
|
|
|
if (a.can_be_opx)
|
|
|
|
|
compat |= vopd_first_is_opx;
|
|
|
|
|
if (b.can_be_opx)
|
|
|
|
|
compat |= vopd_second_is_opx;
|
|
|
|
|
return compat;
|
|
|
|
|
}
|
2024-02-06 11:17:02 +00:00
|
|
|
|
2025-03-26 15:47:40 +00:00
|
|
|
/* The rest of this function checks if we can resolve the VGPR bank incompatibility by swapping
|
|
|
|
|
* the operands of one of the instructions.
|
|
|
|
|
*/
|
2024-02-06 11:17:02 +00:00
|
|
|
if (!a.is_commutative && !b.is_commutative)
|
2025-03-31 17:36:27 +01:00
|
|
|
return vopd_incompatible;
|
2024-02-06 11:17:02 +00:00
|
|
|
|
2025-03-26 15:53:18 +00:00
|
|
|
if (!are_src_banks_compatible(gfx_level, a, b, true))
|
2025-03-31 17:36:27 +01:00
|
|
|
return vopd_incompatible;
|
2024-02-06 11:17:02 +00:00
|
|
|
|
2025-03-31 17:36:27 +01:00
|
|
|
/* Swapping v_mov_b32 makes it become an OPY-only opcode. */
|
|
|
|
|
if (a.can_be_opx && (b.is_commutative || a.op != aco_opcode::v_dual_mov_b32))
|
|
|
|
|
compat |= vopd_first_is_opx;
|
|
|
|
|
if (b.can_be_opx && (a.is_commutative || b.op != aco_opcode::v_dual_mov_b32))
|
|
|
|
|
compat |= vopd_second_is_opx;
|
2025-03-27 17:21:09 +00:00
|
|
|
|
2025-03-31 17:36:27 +01:00
|
|
|
return compat ? (compat | vopd_need_swap) : vopd_incompatible;
|
2024-02-06 11:17:02 +00:00
|
|
|
}
|
|
|
|
|
|
2025-03-31 17:36:27 +01:00
|
|
|
unsigned
|
|
|
|
|
can_use_vopd(const SchedILPContext& ctx, unsigned idx)
|
2024-01-17 20:47:27 +00:00
|
|
|
{
|
2025-03-31 17:36:27 +01:00
|
|
|
VOPDInfo first_info = ctx.vopd[idx];
|
|
|
|
|
VOPDInfo second_info = ctx.prev_vopd_info;
|
2024-01-17 20:47:27 +00:00
|
|
|
Instruction* first = ctx.nodes[idx].instr;
|
|
|
|
|
Instruction* second = ctx.prev_info.instr;
|
|
|
|
|
|
|
|
|
|
if (!second)
|
2025-03-31 17:36:27 +01:00
|
|
|
return 0;
|
2024-01-17 20:47:27 +00:00
|
|
|
|
2025-03-31 17:36:27 +01:00
|
|
|
if (second_info.op == aco_opcode::num_opcodes || first_info.op == aco_opcode::num_opcodes)
|
|
|
|
|
return 0;
|
2024-01-17 20:47:27 +00:00
|
|
|
|
2025-03-26 15:53:18 +00:00
|
|
|
unsigned compat = is_vopd_compatible(ctx.program->gfx_level, first_info, second_info);
|
2025-03-31 17:36:27 +01:00
|
|
|
if (!compat)
|
|
|
|
|
return 0;
|
2025-03-27 17:21:09 +00:00
|
|
|
|
2024-01-17 20:47:27 +00:00
|
|
|
assert(first->definitions.size() == 1);
|
|
|
|
|
assert(first->definitions[0].size() == 1);
|
|
|
|
|
assert(second->definitions.size() == 1);
|
|
|
|
|
assert(second->definitions[0].size() == 1);
|
|
|
|
|
|
|
|
|
|
/* Check for WaW dependency. */
|
|
|
|
|
if (first->definitions[0].physReg() == second->definitions[0].physReg())
|
2025-03-31 17:36:27 +01:00
|
|
|
return 0;
|
2024-01-17 20:47:27 +00:00
|
|
|
|
|
|
|
|
/* Check for RaW dependency. */
|
|
|
|
|
for (Operand op : second->operands) {
|
|
|
|
|
assert(op.size() == 1);
|
|
|
|
|
if (first->definitions[0].physReg() == op.physReg())
|
2025-03-31 17:36:27 +01:00
|
|
|
return 0;
|
2024-01-17 20:47:27 +00:00
|
|
|
}
|
|
|
|
|
|
2025-03-27 17:21:09 +00:00
|
|
|
/* WaR dependencies are not a concern before GFX12. */
|
|
|
|
|
if (ctx.program->gfx_level >= GFX12) {
|
|
|
|
|
/* From RDNA4 ISA doc:
|
|
|
|
|
* The OPX instruction must not overwrite sources of the OPY instruction".
|
|
|
|
|
*/
|
|
|
|
|
bool war = false;
|
|
|
|
|
for (Operand op : first->operands) {
|
|
|
|
|
assert(op.size() == 1);
|
|
|
|
|
if (second->definitions[0].physReg() == op.physReg())
|
|
|
|
|
war = true;
|
|
|
|
|
}
|
2025-03-31 17:36:27 +01:00
|
|
|
if (war) {
|
|
|
|
|
compat &= ~vopd_second_is_opx;
|
|
|
|
|
compat = compat & vopd_first_is_opx ? compat : 0;
|
|
|
|
|
}
|
2025-03-27 17:21:09 +00:00
|
|
|
}
|
|
|
|
|
|
2025-03-31 17:36:27 +01:00
|
|
|
return compat;
|
2024-01-17 20:47:27 +00:00
|
|
|
}
|
|
|
|
|
|
aco/sched_ilp: base latency and issue cycles on aco_statistics
This matters for trans and scalar fpu instructions.
Foz-DB GFX1150:
Totals from 53894 (67.90% of 79377) affected shaders:
Instrs: 38528421 -> 38481337 (-0.12%); split: -0.16%, +0.04%
CodeSize: 200206016 -> 200023916 (-0.09%); split: -0.12%, +0.03%
Latency: 265011734 -> 264303762 (-0.27%); split: -0.28%, +0.02%
InvThroughput: 53804490 -> 53696097 (-0.20%); split: -0.21%, +0.01%
VClause: 736996 -> 736988 (-0.00%); split: -0.00%, +0.00%
SClause: 1118494 -> 1118474 (-0.00%); split: -0.01%, +0.01%
VALU: 21982349 -> 21982358 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 50791 (63.99% of 79377) affected shaders:
Instrs: 37511862 -> 37495712 (-0.04%); split: -0.11%, +0.07%
CodeSize: 197990892 -> 197925104 (-0.03%); split: -0.09%, +0.06%
Latency: 261929261 -> 261273534 (-0.25%); split: -0.27%, +0.01%
InvThroughput: 43978329 -> 43921618 (-0.13%); split: -0.14%, +0.01%
VClause: 727683 -> 727695 (+0.00%); split: -0.00%, +0.00%
SClause: 1092527 -> 1092544 (+0.00%); split: -0.01%, +0.01%
VALU: 22646553 -> 22646566 (+0.00%)
Foz-DB Navi21:
Totals from 43899 (55.30% of 79377) affected shaders:
Instrs: 35649081 -> 35649110 (+0.00%); split: -0.00%, +0.00%
CodeSize: 192336212 -> 192337276 (+0.00%); split: -0.00%, +0.00%
Latency: 270621538 -> 270221431 (-0.15%); split: -0.16%, +0.02%
InvThroughput: 66757841 -> 66715918 (-0.06%); split: -0.07%, +0.01%
VClause: 734884 -> 734867 (-0.00%); split: -0.01%, +0.01%
SClause: 1072956 -> 1072951 (-0.00%); split: -0.01%, +0.01%
Foz-DB Vega10:
Totals from 52687 (83.60% of 63026) affected shaders:
Instrs: 24595280 -> 24595693 (+0.00%); split: -0.01%, +0.01%
CodeSize: 127199836 -> 127200164 (+0.00%); split: -0.01%, +0.01%
Latency: 252281578 -> 252497934 (+0.09%); split: -0.03%, +0.12%
InvThroughput: 136551527 -> 136577609 (+0.02%); split: -0.01%, +0.03%
VClause: 536798 -> 536718 (-0.01%); split: -0.04%, +0.03%
SClause: 819978 -> 819693 (-0.03%); split: -0.04%, +0.01%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 16:31:23 +01:00
|
|
|
/**
 * Return the cycle info of \p instr as reported by get_cycle_info(), with the
 * latency field replaced by a fixed estimate for memory-like instructions.
 *
 * The estimates are based on get_wait_counter_info in aco_statistics.cpp.
 * Instructions matching none of the categories below keep the latency
 * computed by get_cycle_info().
 */
Instruction_cycle_info
get_cycle_info_with_mem_latency(const SchedILPContext& ctx, const Instruction* const instr)
{
   Instruction_cycle_info info = get_cycle_info(*ctx.program, *instr);

   if (instr->isVMEM() || instr->isFlatLike()) {
      info.latency = 320;
      return info;
   }

   if (instr->isSMEM()) {
      if (instr->operands.empty()) {
         info.latency = 1;
         return info;
      }

      /* A two-dword first operand, or constant second (and, if present,
       * third) operands, are treated as "likely cached".
       */
      const bool likely_cached =
         instr->operands[0].size() == 2 ||
         (instr->operands[1].isConstant() &&
          (instr->operands.size() < 3 || instr->operands[2].isConstant()));

      info.latency = likely_cached ? 30 : 200;
      return info;
   }

   if (instr->isLDSDIR())
      info.latency = 13;
   else if (instr->isDS())
      info.latency = 20;

   return info;
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
is_memory_instr(const Instruction* const instr)
|
|
|
|
|
{
|
|
|
|
|
/* For memory instructions, we allow to reorder them with ALU if it helps
|
|
|
|
|
* to form larger clauses or to increase def-use distances.
|
|
|
|
|
*/
|
2024-07-17 14:43:40 +01:00
|
|
|
return instr->isVMEM() || instr->isFlatLike() || instr->isSMEM() || instr->accessesLDS() ||
|
|
|
|
|
instr->isEXP();
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* add_entry() does no per-register dependency tracking for operands whose
 * register index is >= max_sgpr and < min_vgpr, except for scc. */
constexpr unsigned max_sgpr = 128;
|
|
|
|
|
/* Operands with a register index >= min_vgpr are always tracked per register
 * by add_entry(); presumably the first VGPR index (TODO confirm). */
constexpr unsigned min_vgpr = 256;
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
|
|
|
|
|
{
|
|
|
|
|
InstrInfo& entry = ctx.nodes[idx];
|
|
|
|
|
entry.instr = instr;
|
2025-01-28 10:10:14 +01:00
|
|
|
entry.wait_cycles = 0;
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
entry.write_for_read_mask = 0;
|
2023-10-12 10:52:45 +02:00
|
|
|
const mask_t mask = BITFIELD_BIT(idx);
|
|
|
|
|
bool reorder = can_reorder(instr);
|
|
|
|
|
ctx.active_mask |= mask;
|
|
|
|
|
|
2024-01-17 20:47:27 +00:00
|
|
|
if (ctx.is_vopd) {
|
2024-06-26 12:53:19 +01:00
|
|
|
VOPDInfo vopd = get_vopd_info(ctx, entry.instr);
|
2024-01-17 20:47:27 +00:00
|
|
|
|
|
|
|
|
ctx.vopd[idx] = vopd;
|
|
|
|
|
ctx.vopd_odd_mask &= ~mask;
|
|
|
|
|
ctx.vopd_odd_mask |= vopd.is_dst_odd ? mask : 0;
|
|
|
|
|
ctx.vopd_even_mask &= ~mask;
|
|
|
|
|
ctx.vopd_even_mask |= vopd.is_dst_odd || vopd.op == aco_opcode::num_opcodes ? 0 : mask;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-12 10:52:45 +02:00
|
|
|
for (const Operand& op : instr->operands) {
|
|
|
|
|
assert(op.isFixed());
|
|
|
|
|
unsigned reg = op.physReg();
|
|
|
|
|
if (reg >= max_sgpr && reg != scc && reg < min_vgpr) {
|
|
|
|
|
reorder &= reg != pops_exiting_wave_id;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < op.size(); i++) {
|
|
|
|
|
RegisterInfo& reg_info = ctx.regs[reg + i];
|
|
|
|
|
|
|
|
|
|
/* Add register reads. */
|
|
|
|
|
reg_info.read_mask |= mask;
|
|
|
|
|
|
|
|
|
|
if (reg_info.has_direct_dependency) {
|
|
|
|
|
/* A previous dependency is still part of the DAG. */
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
|
2023-10-12 10:52:45 +02:00
|
|
|
entry.dependency_mask |= BITFIELD_BIT(reg_info.direct_dependency);
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
} else if (BITSET_TEST(ctx.reg_has_latency, reg + i)) {
|
2025-01-28 10:10:14 +01:00
|
|
|
entry.wait_cycles = MAX2(entry.wait_cycles, reg_info.latency);
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Check if this instructions reads implicit registers. */
|
|
|
|
|
if (needs_exec_mask(instr)) {
|
|
|
|
|
for (unsigned reg = exec_lo; reg <= exec_hi; reg++) {
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
if (ctx.regs[reg].has_direct_dependency) {
|
2023-10-12 10:52:45 +02:00
|
|
|
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
|
|
|
|
|
}
|
2023-10-12 10:52:45 +02:00
|
|
|
ctx.regs[reg].read_mask |= mask;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
|
|
|
|
|
for (unsigned reg = flat_scr_lo; reg <= flat_scr_hi; reg++) {
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
if (ctx.regs[reg].has_direct_dependency) {
|
2023-10-12 10:52:45 +02:00
|
|
|
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
|
|
|
|
|
}
|
2023-10-12 10:52:45 +02:00
|
|
|
ctx.regs[reg].read_mask |= mask;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-23 10:35:51 +01:00
|
|
|
mask_t write_dep_mask = 0;
|
2023-10-12 10:52:45 +02:00
|
|
|
for (const Definition& def : instr->definitions) {
|
|
|
|
|
for (unsigned i = 0; i < def.size(); i++) {
|
|
|
|
|
RegisterInfo& reg_info = ctx.regs[def.physReg().reg() + i];
|
|
|
|
|
|
|
|
|
|
/* Add all previous register reads and writes to the dependencies. */
|
2025-01-23 10:35:51 +01:00
|
|
|
write_dep_mask |= reg_info.read_mask;
|
2023-10-12 10:52:45 +02:00
|
|
|
reg_info.read_mask = mask;
|
|
|
|
|
|
|
|
|
|
/* This register write is a direct dependency for all following reads. */
|
|
|
|
|
reg_info.has_direct_dependency = 1;
|
|
|
|
|
reg_info.direct_dependency = idx;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!reorder) {
|
|
|
|
|
ctx.non_reorder_mask |= mask;
|
|
|
|
|
|
|
|
|
|
/* Set this node as last non-reorderable instruction */
|
|
|
|
|
if (ctx.next_non_reorderable == UINT8_MAX) {
|
|
|
|
|
ctx.next_non_reorderable = idx;
|
|
|
|
|
} else {
|
|
|
|
|
ctx.nodes[ctx.last_non_reorderable].next_non_reorderable = idx;
|
|
|
|
|
}
|
|
|
|
|
ctx.last_non_reorderable = idx;
|
|
|
|
|
entry.next_non_reorderable = UINT8_MAX;
|
|
|
|
|
|
|
|
|
|
/* Just don't reorder these at all. */
|
|
|
|
|
if (!is_memory_instr(instr) || instr->definitions.empty() ||
|
2024-01-17 20:47:27 +00:00
|
|
|
get_sync_info(instr).semantics & semantic_volatile || ctx.is_vopd) {
|
2023-10-12 10:52:45 +02:00
|
|
|
/* Add all previous instructions as dependencies. */
|
2025-01-23 10:35:51 +01:00
|
|
|
entry.dependency_mask = ctx.active_mask & ~ctx.non_reorder_mask;
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Remove non-reorderable instructions from dependencies, since WaR dependencies can interfere
|
|
|
|
|
* with clause formation. This should be fine, since these are always scheduled in-order and
|
|
|
|
|
* any cases that are actually a concern for clause formation are added as transitive
|
|
|
|
|
* dependencies. */
|
2025-01-23 10:35:51 +01:00
|
|
|
write_dep_mask &= ~ctx.non_reorder_mask;
|
2025-01-19 11:18:01 +01:00
|
|
|
ctx.potential_partial_clause = true;
|
2023-10-12 10:52:45 +02:00
|
|
|
} else if (ctx.last_non_reorderable != UINT8_MAX) {
|
2025-01-19 11:18:01 +01:00
|
|
|
ctx.potential_partial_clause = false;
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
|
2025-01-23 10:35:51 +01:00
|
|
|
entry.dependency_mask |= write_dep_mask;
|
2023-10-12 10:52:45 +02:00
|
|
|
entry.dependency_mask &= ~mask;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < num_nodes; i++) {
|
|
|
|
|
if (!ctx.nodes[i].instr || i == idx)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* Add transitive dependencies. */
|
|
|
|
|
if (entry.dependency_mask & BITFIELD_BIT(i))
|
|
|
|
|
entry.dependency_mask |= ctx.nodes[i].dependency_mask;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_t idx)
|
|
|
|
|
{
|
|
|
|
|
const mask_t mask = ~BITFIELD_BIT(idx);
|
|
|
|
|
ctx.active_mask &= mask;
|
|
|
|
|
|
aco/sched_ilp: base latency and issue cycles on aco_statistics
This matters for trans and scalar fpu instructions.
Foz-DB GFX1150:
Totals from 53894 (67.90% of 79377) affected shaders:
Instrs: 38528421 -> 38481337 (-0.12%); split: -0.16%, +0.04%
CodeSize: 200206016 -> 200023916 (-0.09%); split: -0.12%, +0.03%
Latency: 265011734 -> 264303762 (-0.27%); split: -0.28%, +0.02%
InvThroughput: 53804490 -> 53696097 (-0.20%); split: -0.21%, +0.01%
VClause: 736996 -> 736988 (-0.00%); split: -0.00%, +0.00%
SClause: 1118494 -> 1118474 (-0.00%); split: -0.01%, +0.01%
VALU: 21982349 -> 21982358 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 50791 (63.99% of 79377) affected shaders:
Instrs: 37511862 -> 37495712 (-0.04%); split: -0.11%, +0.07%
CodeSize: 197990892 -> 197925104 (-0.03%); split: -0.09%, +0.06%
Latency: 261929261 -> 261273534 (-0.25%); split: -0.27%, +0.01%
InvThroughput: 43978329 -> 43921618 (-0.13%); split: -0.14%, +0.01%
VClause: 727683 -> 727695 (+0.00%); split: -0.00%, +0.00%
SClause: 1092527 -> 1092544 (+0.00%); split: -0.01%, +0.01%
VALU: 22646553 -> 22646566 (+0.00%)
Foz-DB Navi21:
Totals from 43899 (55.30% of 79377) affected shaders:
Instrs: 35649081 -> 35649110 (+0.00%); split: -0.00%, +0.00%
CodeSize: 192336212 -> 192337276 (+0.00%); split: -0.00%, +0.00%
Latency: 270621538 -> 270221431 (-0.15%); split: -0.16%, +0.02%
InvThroughput: 66757841 -> 66715918 (-0.06%); split: -0.07%, +0.01%
VClause: 734884 -> 734867 (-0.00%); split: -0.01%, +0.01%
SClause: 1072956 -> 1072951 (-0.00%); split: -0.01%, +0.01%
Foz-DB Vega10:
Totals from 52687 (83.60% of 63026) affected shaders:
Instrs: 24595280 -> 24595693 (+0.00%); split: -0.01%, +0.01%
CodeSize: 127199836 -> 127200164 (+0.00%); split: -0.01%, +0.01%
Latency: 252281578 -> 252497934 (+0.09%); split: -0.03%, +0.12%
InvThroughput: 136551527 -> 136577609 (+0.02%); split: -0.01%, +0.03%
VClause: 536798 -> 536718 (-0.01%); split: -0.04%, +0.03%
SClause: 819978 -> 819693 (-0.03%); split: -0.04%, +0.01%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 16:31:23 +01:00
|
|
|
int latency = 0;
|
|
|
|
|
int stall = 1;
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
if (!ctx.is_vopd) {
|
aco/sched_ilp: base latency and issue cycles on aco_statistics
This matters for trans and scalar fpu instructions.
Foz-DB GFX1150:
Totals from 53894 (67.90% of 79377) affected shaders:
Instrs: 38528421 -> 38481337 (-0.12%); split: -0.16%, +0.04%
CodeSize: 200206016 -> 200023916 (-0.09%); split: -0.12%, +0.03%
Latency: 265011734 -> 264303762 (-0.27%); split: -0.28%, +0.02%
InvThroughput: 53804490 -> 53696097 (-0.20%); split: -0.21%, +0.01%
VClause: 736996 -> 736988 (-0.00%); split: -0.00%, +0.00%
SClause: 1118494 -> 1118474 (-0.00%); split: -0.01%, +0.01%
VALU: 21982349 -> 21982358 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 50791 (63.99% of 79377) affected shaders:
Instrs: 37511862 -> 37495712 (-0.04%); split: -0.11%, +0.07%
CodeSize: 197990892 -> 197925104 (-0.03%); split: -0.09%, +0.06%
Latency: 261929261 -> 261273534 (-0.25%); split: -0.27%, +0.01%
InvThroughput: 43978329 -> 43921618 (-0.13%); split: -0.14%, +0.01%
VClause: 727683 -> 727695 (+0.00%); split: -0.00%, +0.00%
SClause: 1092527 -> 1092544 (+0.00%); split: -0.01%, +0.01%
VALU: 22646553 -> 22646566 (+0.00%)
Foz-DB Navi21:
Totals from 43899 (55.30% of 79377) affected shaders:
Instrs: 35649081 -> 35649110 (+0.00%); split: -0.00%, +0.00%
CodeSize: 192336212 -> 192337276 (+0.00%); split: -0.00%, +0.00%
Latency: 270621538 -> 270221431 (-0.15%); split: -0.16%, +0.02%
InvThroughput: 66757841 -> 66715918 (-0.06%); split: -0.07%, +0.01%
VClause: 734884 -> 734867 (-0.00%); split: -0.01%, +0.01%
SClause: 1072956 -> 1072951 (-0.00%); split: -0.01%, +0.01%
Foz-DB Vega10:
Totals from 52687 (83.60% of 63026) affected shaders:
Instrs: 24595280 -> 24595693 (+0.00%); split: -0.01%, +0.01%
CodeSize: 127199836 -> 127200164 (+0.00%); split: -0.01%, +0.01%
Latency: 252281578 -> 252497934 (+0.09%); split: -0.03%, +0.12%
InvThroughput: 136551527 -> 136577609 (+0.02%); split: -0.01%, +0.03%
VClause: 536798 -> 536718 (-0.01%); split: -0.04%, +0.03%
SClause: 819978 -> 819693 (-0.03%); split: -0.04%, +0.01%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 16:31:23 +01:00
|
|
|
Instruction_cycle_info cycle_info = get_cycle_info_with_mem_latency(ctx, instr);
|
|
|
|
|
latency = cycle_info.latency;
|
|
|
|
|
stall = cycle_info.issue_cycles;
|
|
|
|
|
|
|
|
|
|
if (ctx.nodes[idx].wait_cycles > 0) {
|
|
|
|
|
/* Add remaining latency stall. */
|
|
|
|
|
stall += ctx.nodes[idx].wait_cycles;
|
|
|
|
|
}
|
|
|
|
|
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
unsigned i;
|
|
|
|
|
BITSET_FOREACH_SET (i, ctx.reg_has_latency, 512) {
|
|
|
|
|
if (ctx.regs[i].latency <= stall) {
|
|
|
|
|
ctx.regs[i].latency = 0;
|
|
|
|
|
BITSET_CLEAR(ctx.reg_has_latency, i);
|
|
|
|
|
} else {
|
|
|
|
|
ctx.regs[i].latency -= stall;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-12 10:52:45 +02:00
|
|
|
for (const Operand& op : instr->operands) {
|
|
|
|
|
const unsigned reg = op.physReg();
|
|
|
|
|
if (reg >= max_sgpr && reg != scc && reg < min_vgpr)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < op.size(); i++) {
|
|
|
|
|
RegisterInfo& reg_info = ctx.regs[reg + i];
|
|
|
|
|
reg_info.read_mask &= mask;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (needs_exec_mask(instr)) {
|
|
|
|
|
ctx.regs[exec_lo].read_mask &= mask;
|
|
|
|
|
ctx.regs[exec_hi].read_mask &= mask;
|
|
|
|
|
}
|
|
|
|
|
if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
|
|
|
|
|
ctx.regs[flat_scr_lo].read_mask &= mask;
|
|
|
|
|
ctx.regs[flat_scr_hi].read_mask &= mask;
|
|
|
|
|
}
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
|
2023-10-12 10:52:45 +02:00
|
|
|
for (const Definition& def : instr->definitions) {
|
|
|
|
|
for (unsigned i = 0; i < def.size(); i++) {
|
|
|
|
|
unsigned reg = def.physReg().reg() + i;
|
|
|
|
|
ctx.regs[reg].read_mask &= mask;
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
if (ctx.regs[reg].has_direct_dependency && ctx.regs[reg].direct_dependency == idx) {
|
|
|
|
|
ctx.regs[reg].has_direct_dependency = false;
|
|
|
|
|
if (!ctx.is_vopd) {
|
|
|
|
|
BITSET_SET(ctx.reg_has_latency, reg);
|
|
|
|
|
ctx.regs[reg].latency = latency;
|
|
|
|
|
}
|
|
|
|
|
}
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
for (unsigned i = 0; i < num_nodes; i++) {
|
2023-10-12 10:52:45 +02:00
|
|
|
ctx.nodes[i].dependency_mask &= mask;
|
2025-01-28 10:10:14 +01:00
|
|
|
ctx.nodes[i].wait_cycles -= stall;
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
if (ctx.nodes[idx].write_for_read_mask & BITFIELD_BIT(i) && !ctx.is_vopd) {
|
2025-01-28 10:10:14 +01:00
|
|
|
ctx.nodes[i].wait_cycles = MAX2(ctx.nodes[i].wait_cycles, latency);
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
}
|
|
|
|
|
}
|
2023-10-12 10:52:45 +02:00
|
|
|
|
|
|
|
|
if (ctx.next_non_reorderable == idx) {
|
|
|
|
|
ctx.non_reorder_mask &= mask;
|
|
|
|
|
ctx.next_non_reorderable = ctx.nodes[idx].next_non_reorderable;
|
2025-01-19 11:18:01 +01:00
|
|
|
if (ctx.last_non_reorderable == idx) {
|
2023-10-12 10:52:45 +02:00
|
|
|
ctx.last_non_reorderable = UINT8_MAX;
|
2025-01-19 11:18:01 +01:00
|
|
|
ctx.potential_partial_clause = false;
|
|
|
|
|
}
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns a bitfield of nodes which have to be scheduled before the
|
|
|
|
|
* next non-reorderable instruction.
|
|
|
|
|
* If the next non-reorderable instruction can form a clause, returns the
|
|
|
|
|
* dependencies of the entire clause.
|
|
|
|
|
*/
|
|
|
|
|
mask_t
|
|
|
|
|
collect_clause_dependencies(const SchedILPContext& ctx, const uint8_t next, mask_t clause_mask)
|
|
|
|
|
{
|
|
|
|
|
const InstrInfo& entry = ctx.nodes[next];
|
|
|
|
|
mask_t dependencies = entry.dependency_mask;
|
2025-01-19 11:18:01 +01:00
|
|
|
clause_mask |= BITFIELD_BIT(next);
|
|
|
|
|
|
|
|
|
|
/* If we dependent on the clause, don't add our dependencies. */
|
|
|
|
|
if (dependencies & clause_mask)
|
|
|
|
|
return 0;
|
2023-10-12 10:52:45 +02:00
|
|
|
|
|
|
|
|
if (!is_memory_instr(entry.instr))
|
|
|
|
|
return dependencies;
|
|
|
|
|
|
|
|
|
|
/* If this is potentially an "open" clause, meaning that the clause might
|
|
|
|
|
* consist of instruction not yet added to the DAG, consider all previous
|
|
|
|
|
* instructions as dependencies. This prevents splitting of larger, already
|
|
|
|
|
* formed clauses.
|
|
|
|
|
*/
|
2025-01-19 11:18:01 +01:00
|
|
|
if (next == ctx.last_non_reorderable && ctx.potential_partial_clause)
|
2023-10-12 10:52:45 +02:00
|
|
|
return (~clause_mask & ctx.active_mask) | dependencies;
|
|
|
|
|
|
|
|
|
|
/* Check if this can form a clause with the following non-reorderable instruction */
|
2025-01-19 11:18:01 +01:00
|
|
|
if (entry.next_non_reorderable != UINT8_MAX &&
|
|
|
|
|
should_form_clause(entry.instr, ctx.nodes[entry.next_non_reorderable].instr)) {
|
|
|
|
|
dependencies |= collect_clause_dependencies(ctx, entry.next_non_reorderable, clause_mask);
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return dependencies;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns the index of the next instruction to be selected.
|
|
|
|
|
*/
|
|
|
|
|
unsigned
|
2024-01-17 20:47:27 +00:00
|
|
|
select_instruction_ilp(const SchedILPContext& ctx)
|
2023-10-12 10:52:45 +02:00
|
|
|
{
|
|
|
|
|
mask_t mask = ctx.active_mask;
|
|
|
|
|
|
2025-01-19 12:03:02 +01:00
|
|
|
/* First, continue the currently open clause.
|
|
|
|
|
* Otherwise collect all dependencies of the next non-reorderable instruction(s).
|
2023-10-12 10:52:45 +02:00
|
|
|
* These make up the list of possible candidates.
|
|
|
|
|
*/
|
2025-01-19 12:03:02 +01:00
|
|
|
if (ctx.next_non_reorderable != UINT8_MAX) {
|
|
|
|
|
if (ctx.prev_info.instr && ctx.nodes[ctx.next_non_reorderable].dependency_mask == 0 &&
|
|
|
|
|
should_form_clause(ctx.prev_info.instr, ctx.nodes[ctx.next_non_reorderable].instr))
|
|
|
|
|
return ctx.next_non_reorderable;
|
2023-10-12 10:52:45 +02:00
|
|
|
mask = collect_clause_dependencies(ctx, ctx.next_non_reorderable, 0);
|
2025-01-19 12:03:02 +01:00
|
|
|
}
|
2023-10-12 10:52:45 +02:00
|
|
|
|
2025-01-24 08:42:00 +01:00
|
|
|
/* VINTRP(gfx6-10.3) can be handled like alu, but switching between VINTRP and other
|
|
|
|
|
* alu has a cost. So if the previous instr was VINTRP, try to keep selecting VINTRP.
|
|
|
|
|
*/
|
|
|
|
|
bool prefer_vintrp = ctx.prev_info.instr && ctx.prev_info.instr->isVINTRP();
|
|
|
|
|
|
2025-01-28 10:10:14 +01:00
|
|
|
/* Select the instruction with lowest wait_cycles of all candidates. */
|
2023-10-12 10:52:45 +02:00
|
|
|
unsigned idx = -1u;
|
2025-01-24 08:42:00 +01:00
|
|
|
bool idx_vintrp = false;
|
2025-01-28 10:10:14 +01:00
|
|
|
int32_t wait_cycles = INT32_MAX;
|
2023-10-12 10:52:45 +02:00
|
|
|
u_foreach_bit (i, mask) {
|
|
|
|
|
const InstrInfo& candidate = ctx.nodes[i];
|
|
|
|
|
|
|
|
|
|
/* Check if the candidate has pending dependencies. */
|
|
|
|
|
if (candidate.dependency_mask)
|
|
|
|
|
continue;
|
|
|
|
|
|
2025-01-24 08:42:00 +01:00
|
|
|
bool is_vintrp = prefer_vintrp && candidate.instr->isVINTRP();
|
|
|
|
|
|
|
|
|
|
if (idx == -1u || (is_vintrp && !idx_vintrp) ||
|
2025-01-28 10:10:14 +01:00
|
|
|
(is_vintrp == idx_vintrp && candidate.wait_cycles < wait_cycles)) {
|
2023-10-12 10:52:45 +02:00
|
|
|
idx = i;
|
2025-01-24 08:42:00 +01:00
|
|
|
idx_vintrp = is_vintrp;
|
2025-01-28 10:10:14 +01:00
|
|
|
wait_cycles = candidate.wait_cycles;
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-19 11:18:01 +01:00
|
|
|
if (idx != -1u)
|
|
|
|
|
return idx;
|
|
|
|
|
|
|
|
|
|
/* Select the next non-reorderable instruction. (it must have no dependencies) */
|
|
|
|
|
assert(ctx.next_non_reorderable != UINT8_MAX);
|
|
|
|
|
assert(ctx.nodes[ctx.next_non_reorderable].dependency_mask == 0);
|
|
|
|
|
return ctx.next_non_reorderable;
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
|
2024-01-17 20:47:27 +00:00
|
|
|
bool
|
2025-03-31 17:36:27 +01:00
|
|
|
compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, unsigned* vopd_compat,
|
|
|
|
|
unsigned current, unsigned candidate)
|
2024-01-17 20:47:27 +00:00
|
|
|
{
|
2025-03-31 17:36:27 +01:00
|
|
|
unsigned candidate_compat = can_use_vopd(ctx, candidate);
|
|
|
|
|
if (candidate_compat) {
|
2024-01-17 20:47:27 +00:00
|
|
|
/* If we can form a VOPD instruction, always prefer to do so. */
|
2025-03-31 17:36:27 +01:00
|
|
|
if (!*vopd_compat) {
|
|
|
|
|
*vopd_compat = candidate_compat;
|
2024-01-17 20:47:27 +00:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
2025-03-31 17:36:27 +01:00
|
|
|
if (*vopd_compat)
|
2024-01-17 20:47:27 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* Neither current nor candidate can form a VOPD instruction with the previously scheduled
|
|
|
|
|
* instruction. */
|
|
|
|
|
VOPDInfo current_vopd = ctx.vopd[current];
|
|
|
|
|
VOPDInfo candidate_vopd = ctx.vopd[candidate];
|
|
|
|
|
|
|
|
|
|
/* Delay scheduling VOPD-capable instructions in case an opportunity appears later. */
|
|
|
|
|
bool current_vopd_capable = current_vopd.op != aco_opcode::num_opcodes;
|
|
|
|
|
bool candidate_vopd_capable = candidate_vopd.op != aco_opcode::num_opcodes;
|
|
|
|
|
if (current_vopd_capable != candidate_vopd_capable)
|
|
|
|
|
return !candidate_vopd_capable;
|
|
|
|
|
|
|
|
|
|
/* If we have to select from VOPD-capable instructions, prefer maintaining a balance of
|
|
|
|
|
* odd/even instructions, in case selecting this instruction fails to make a pair.
|
|
|
|
|
*/
|
|
|
|
|
if (current_vopd_capable && num_vopd_odd_minus_even != 0) {
|
|
|
|
|
assert(candidate_vopd_capable);
|
|
|
|
|
bool prefer_vopd_dst_odd = num_vopd_odd_minus_even > 0;
|
|
|
|
|
if (current_vopd.is_dst_odd != candidate_vopd.is_dst_odd)
|
|
|
|
|
return prefer_vopd_dst_odd ? candidate_vopd.is_dst_odd : !candidate_vopd.is_dst_odd;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-31 17:36:27 +01:00
|
|
|
if (ctx.nodes[candidate].wait_cycles < ctx.nodes[current].wait_cycles) {
|
|
|
|
|
*vopd_compat = candidate_compat;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
2024-01-17 20:47:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned
|
2025-03-31 17:36:27 +01:00
|
|
|
select_instruction_vopd(const SchedILPContext& ctx, unsigned* vopd_compat)
|
2024-01-17 20:47:27 +00:00
|
|
|
{
|
2025-03-31 17:36:27 +01:00
|
|
|
*vopd_compat = 0;
|
2024-01-17 20:47:27 +00:00
|
|
|
|
|
|
|
|
mask_t mask = ctx.active_mask;
|
|
|
|
|
if (ctx.next_non_reorderable != UINT8_MAX)
|
|
|
|
|
mask = ctx.nodes[ctx.next_non_reorderable].dependency_mask;
|
|
|
|
|
|
|
|
|
|
if (mask == 0)
|
|
|
|
|
return ctx.next_non_reorderable;
|
|
|
|
|
|
|
|
|
|
int num_vopd_odd_minus_even =
|
|
|
|
|
(int)util_bitcount(ctx.vopd_odd_mask & mask) - (int)util_bitcount(ctx.vopd_even_mask & mask);
|
|
|
|
|
|
|
|
|
|
unsigned cur = -1u;
|
|
|
|
|
u_foreach_bit (i, mask) {
|
|
|
|
|
const InstrInfo& candidate = ctx.nodes[i];
|
|
|
|
|
|
|
|
|
|
/* Check if the candidate has pending dependencies. */
|
|
|
|
|
if (candidate.dependency_mask)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (cur == -1u) {
|
|
|
|
|
cur = i;
|
2025-03-31 17:36:27 +01:00
|
|
|
*vopd_compat = can_use_vopd(ctx, i);
|
|
|
|
|
} else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, vopd_compat, cur, i)) {
|
2024-01-17 20:47:27 +00:00
|
|
|
cur = i;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
assert(cur != -1u);
|
|
|
|
|
return cur;
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-05 14:51:49 +00:00
|
|
|
void
|
2024-06-26 12:53:19 +01:00
|
|
|
get_vopd_opcode_operands(const SchedILPContext& ctx, Instruction* instr, const VOPDInfo& info,
|
|
|
|
|
bool swap, aco_opcode* op, unsigned* num_operands, Operand* operands)
|
2024-02-05 14:51:49 +00:00
|
|
|
{
|
|
|
|
|
*op = info.op;
|
|
|
|
|
*num_operands += instr->operands.size();
|
|
|
|
|
std::copy(instr->operands.begin(), instr->operands.end(), operands);
|
2024-02-06 11:17:02 +00:00
|
|
|
|
2024-06-26 12:53:19 +01:00
|
|
|
if (instr->opcode == aco_opcode::v_bfrev_b32) {
|
|
|
|
|
operands[0] = Operand::get_const(ctx.program->gfx_level,
|
|
|
|
|
util_bitreverse(operands[0].constantValue()), 4);
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-07 11:16:18 +00:00
|
|
|
if (swap && info.op == aco_opcode::v_dual_mov_b32) {
|
|
|
|
|
*op = aco_opcode::v_dual_add_nc_u32;
|
|
|
|
|
(*num_operands)++;
|
2024-06-26 12:53:19 +01:00
|
|
|
operands[1] = operands[0];
|
2024-02-07 11:16:18 +00:00
|
|
|
operands[0] = Operand::zero();
|
|
|
|
|
} else if (swap) {
|
2024-02-06 11:17:02 +00:00
|
|
|
if (info.op == aco_opcode::v_dual_sub_f32)
|
|
|
|
|
*op = aco_opcode::v_dual_subrev_f32;
|
|
|
|
|
else if (info.op == aco_opcode::v_dual_subrev_f32)
|
|
|
|
|
*op = aco_opcode::v_dual_sub_f32;
|
|
|
|
|
std::swap(operands[0], operands[1]);
|
|
|
|
|
}
|
2024-02-05 14:51:49 +00:00
|
|
|
}
|
|
|
|
|
|
2024-01-17 20:47:27 +00:00
|
|
|
/**
 * Builds a VOPD instruction from the previously selected instruction
 * (ctx.prev_info) and the node at `idx`. `compat` carries the vopd_* flags
 * that say whether operands must be swapped and whether the previous
 * instruction may be the OPX half.
 */
Instruction*
create_vopd_instruction(const SchedILPContext& ctx, unsigned idx, unsigned compat)
{
   Instruction* opx_instr = ctx.prev_info.instr;  /* second */
   Instruction* opy_instr = ctx.nodes[idx].instr; /* first */
   VOPDInfo opx_info = ctx.prev_vopd_info;
   VOPDInfo opy_info = ctx.vopd[idx];
   opx_info.can_be_opx = opx_info.can_be_opx && (compat & vopd_second_is_opx);

   bool opx_swap = false;
   bool opy_swap = false;
   if (compat & vopd_need_swap) {
      assert(opx_info.is_commutative || opy_info.is_commutative);

      const bool opx_is_mov = opx_info.op == aco_opcode::v_dual_mov_b32;
      const bool opy_is_mov = opy_info.op == aco_opcode::v_dual_mov_b32;

      /* Avoid swapping v_mov_b32 because it will become an OPY-only opcode. */
      if (opx_is_mov && opy_is_mov) {
         opx_swap = !opx_info.can_be_opx;
         opy_swap = !opx_swap;
      } else if (opx_is_mov && !opy_info.is_commutative) {
         opx_swap = true;
         opx_info.can_be_opx = false;
      } else {
         opx_swap = opx_info.is_commutative && !opx_is_mov;
         opy_swap = opy_info.is_commutative && !opx_swap;
      }

      /* A swapped move can no longer be the OPX half. */
      opy_info.can_be_opx &= !(opy_swap && opy_is_mov);
   }

   /* Exchange the two halves if the first one cannot be OPX. */
   if (!opx_info.can_be_opx) {
      std::swap(opx_instr, opy_instr);
      std::swap(opx_info, opy_info);
      std::swap(opx_swap, opy_swap);
   }
   assert(opx_info.can_be_opx);

   aco_opcode x_op, y_op;
   unsigned num_operands = 0;
   Operand operands[6];
   get_vopd_opcode_operands(ctx, opx_instr, opx_info, opx_swap, &x_op, &num_operands, operands);

   /* The OPY operands are appended right after the OPX operands. */
   Operand* const opy_operands = operands + num_operands;
   get_vopd_opcode_operands(ctx, opy_instr, opy_info, opy_swap, &y_op, &num_operands,
                            opy_operands);

   Instruction* dual = create_instruction(x_op, Format::VOPD, num_operands, 2);
   dual->vopd().opy = y_op;
   dual->definitions[0] = opx_instr->definitions[0];
   dual->definitions[1] = opy_instr->definitions[0];
   std::copy(operands, operands + num_operands, dual->operands.begin());

   return dual;
}
|
|
|
|
|
|
2024-01-12 19:27:48 +00:00
|
|
|
template <typename It>
|
|
|
|
|
void
|
|
|
|
|
do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_begin,
|
|
|
|
|
It instructions_end)
|
|
|
|
|
{
|
|
|
|
|
for (unsigned i = 0; i < num_nodes; i++) {
|
|
|
|
|
if (remove_it == instructions_end)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
add_entry(ctx, (remove_it++)->get(), i);
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-17 20:47:27 +00:00
|
|
|
ctx.prev_info.instr = NULL;
|
2025-03-31 17:36:27 +01:00
|
|
|
unsigned vopd_compat = 0;
|
2024-01-17 20:47:27 +00:00
|
|
|
|
2024-01-12 19:27:48 +00:00
|
|
|
while (ctx.active_mask) {
|
2025-03-31 17:36:27 +01:00
|
|
|
unsigned next_idx =
|
|
|
|
|
ctx.is_vopd ? select_instruction_vopd(ctx, &vopd_compat) : select_instruction_ilp(ctx);
|
2024-01-12 19:27:48 +00:00
|
|
|
Instruction* next_instr = ctx.nodes[next_idx].instr;
|
|
|
|
|
|
2025-03-31 17:36:27 +01:00
|
|
|
if (vopd_compat) {
|
|
|
|
|
std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx, vopd_compat));
|
2024-01-17 20:47:27 +00:00
|
|
|
ctx.prev_info.instr = NULL;
|
|
|
|
|
} else {
|
|
|
|
|
(insert_it++)->reset(next_instr);
|
|
|
|
|
ctx.prev_info = ctx.nodes[next_idx];
|
|
|
|
|
ctx.prev_vopd_info = ctx.vopd[next_idx];
|
|
|
|
|
}
|
2024-01-12 19:27:48 +00:00
|
|
|
|
|
|
|
|
remove_entry(ctx, next_instr, next_idx);
|
|
|
|
|
ctx.nodes[next_idx].instr = NULL;
|
|
|
|
|
|
|
|
|
|
if (remove_it != instructions_end) {
|
|
|
|
|
add_entry(ctx, (remove_it++)->get(), next_idx);
|
|
|
|
|
} else if (ctx.last_non_reorderable != UINT8_MAX) {
|
2025-01-19 11:18:01 +01:00
|
|
|
ctx.potential_partial_clause = false;
|
2024-01-12 19:27:48 +00:00
|
|
|
ctx.last_non_reorderable = UINT8_MAX;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-12 10:52:45 +02:00
|
|
|
} // namespace
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
schedule_ilp(Program* program)
|
|
|
|
|
{
|
|
|
|
|
SchedILPContext ctx = {program};
|
|
|
|
|
|
|
|
|
|
for (Block& block : program->blocks) {
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
if (block.instructions.empty())
|
|
|
|
|
continue;
|
2023-10-12 10:52:45 +02:00
|
|
|
auto it = block.instructions.begin();
|
|
|
|
|
auto insert_it = block.instructions.begin();
|
2024-01-12 19:27:48 +00:00
|
|
|
do_schedule(ctx, insert_it, it, block.instructions.begin(), block.instructions.end());
|
|
|
|
|
block.instructions.resize(insert_it - block.instructions.begin());
|
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after
the write was scheduled. This means we rely a lot less on the input
order of instructions for good results.
Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%
Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%
Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%
Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
2025-01-26 13:41:53 +01:00
|
|
|
if (block.linear_succs.empty() || block.instructions.back()->opcode == aco_opcode::s_branch)
|
|
|
|
|
BITSET_ZERO(ctx.reg_has_latency);
|
2023-10-12 10:52:45 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-17 20:47:27 +00:00
|
|
|
void
|
|
|
|
|
schedule_vopd(Program* program)
|
|
|
|
|
{
|
|
|
|
|
if (program->gfx_level < GFX11 || program->wave_size != 32)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
SchedILPContext ctx = {program};
|
|
|
|
|
ctx.is_vopd = true;
|
|
|
|
|
|
|
|
|
|
for (Block& block : program->blocks) {
|
|
|
|
|
auto it = block.instructions.rbegin();
|
|
|
|
|
auto insert_it = block.instructions.rbegin();
|
|
|
|
|
do_schedule(ctx, insert_it, it, block.instructions.rbegin(), block.instructions.rend());
|
|
|
|
|
block.instructions.erase(block.instructions.begin(), insert_it.base());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-12 10:52:45 +02:00
|
|
|
} // namespace aco
|