nak: fix MMA latencies on Ampere
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Acked-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Fixes: 7a01953a39 ("nak: Add Ampere and Ada latency information")
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37941>
This commit is contained in:
Karol Herbst 2025-10-21 13:28:29 +02:00 committed by Marge Bot
parent cf4df97093
commit e7dca5a6ca

View file

@@ -602,8 +602,9 @@ impl RegLatencySM80 {
 | FP16 | FP16_Alu | FP16_F32 => 1,
 HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 3),
 Clmad => pred(has_pred, 5, 3),
-IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
-MMA_2x_collect => pred(has_pred, 12, 1),
+IMMA_88 => pred(has_pred, 8, 1),
+MMA_1x_collect => pred(has_pred, 11, 1),
+MMA_2x_collect => pred(has_pred, 19, 1),
 DMMA => pred(has_pred, 20, 1),
 Cbu => 1,
 Decoupled => 1,
@@ -617,8 +618,9 @@ impl RegLatencySM80 {
 | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 => 1,
 HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 1),
 Clmad => pred(has_pred, 5, 1),
-IMMA_88 | MMA_1x_collect => 8,
-MMA_2x_collect => 12,
+IMMA_88 => 8,
+MMA_1x_collect => 11,
+MMA_2x_collect => 19,
 DMMA => 20,
 Cbu => 1,
 Decoupled => 1,
@@ -634,8 +636,9 @@ impl RegLatencySM80 {
 IMADWideWriteDH => pred(has_pred, 1, 1),
 HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 3),
 Clmad => pred(has_pred, 5, 3),
-IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
-MMA_2x_collect => pred(has_pred, 12, 1),
+IMMA_88 => pred(has_pred, 8, 1),
+MMA_1x_collect => pred(has_pred, 11, 1),
+MMA_2x_collect => pred(has_pred, 19, 1),
 DMMA => pred(has_pred, 20, 1),
 Cbu => 1,
 Decoupled => 1,
@@ -653,8 +656,9 @@ impl RegLatencySM80 {
 FP16 | FP16_Alu | FP16_F32 => pred(has_pred, 1, 2),
 HFMA2_MMA | RedirectedFP64 => pred(has_pred, 5, 3),
 Clmad => pred(has_pred, 5, 5),
-IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 3),
-MMA_2x_collect => pred(has_pred, 12, 3),
+IMMA_88 => pred(has_pred, 8, 3),
+MMA_1x_collect => pred(has_pred, 11, 3),
+MMA_2x_collect => pred(has_pred, 19, 3),
 DMMA => pred(has_pred, 20, 3),
 Cbu => 1,
 Decoupled => 1,
@@ -671,8 +675,9 @@ impl RegLatencySM80 {
 | FP16_F32 => 1,
 HFMA2_MMA | RedirectedFP64 => pred(has_pred, 5, 1),
 Clmad => pred(has_pred, 5, 3),
-IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
-MMA_2x_collect => pred(has_pred, 12, 1),
+IMMA_88 => pred(has_pred, 8, 1),
+MMA_1x_collect => pred(has_pred, 11, 1),
+MMA_2x_collect => pred(has_pred, 19, 1),
 DMMA => pred(has_pred, 20, 1),
 Cbu => 1,
 Decoupled => 1,
@@ -689,8 +694,9 @@ impl RegLatencySM80 {
 | FP16_F32 => 1,
 HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 3),
 Clmad => pred(has_pred, 5, 3),
-IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
-MMA_2x_collect => pred(has_pred, 12, 1),
+IMMA_88 => pred(has_pred, 8, 1),
+MMA_1x_collect => pred(has_pred, 11, 1),
+MMA_2x_collect => pred(has_pred, 19, 1),
 DMMA => pred(has_pred, 20, 1),
 Cbu => 1,
 Decoupled => 1,
@@ -704,8 +710,9 @@ impl RegLatencySM80 {
 | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 => 1,
 HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 2),
 Clmad => pred(has_pred, 5, 2),
-IMMA_88 | MMA_1x_collect => 8,
-MMA_2x_collect => 12,
+IMMA_88 => 8,
+MMA_1x_collect => 11,
+MMA_2x_collect => 19,
 DMMA => 20,
 Cbu => 1,
 Decoupled => 1,
@@ -720,8 +727,9 @@ impl RegLatencySM80 {
 HFMA2_MMA => 2,
 RedirectedFP64 => 3,
 Clmad => pred(has_pred, 5, 1),
-IMMA_88 | MMA_1x_collect => 8,
-MMA_2x_collect => 12,
+IMMA_88 => 8,
+MMA_1x_collect => 11,
+MMA_2x_collect => 19,
 DMMA => 20,
 Cbu => 1,
 Decoupled => 1,
@@ -736,8 +744,9 @@ impl RegLatencySM80 {
 HFMA2_MMA => 2,
 RedirectedFP64 => 2,
 Clmad => pred(has_pred, 4, 2),
-IMMA_88 | MMA_1x_collect => 7,
-MMA_2x_collect => 11,
+IMMA_88 => 7,
+MMA_1x_collect => 10,
+MMA_2x_collect => 18,
 DMMA => 19,
 Cbu => 1,
 Decoupled => 1,
@@ -750,8 +759,9 @@ impl RegLatencySM80 {
 CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
 | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
 | RedirectedFP64 | Clmad => 2,
-IMMA_88 | MMA_1x_collect => 7,
-MMA_2x_collect => 11,
+IMMA_88 => 7,
+MMA_1x_collect => 10,
+MMA_2x_collect => 18,
 DMMA => 19,
 Cbu => 1,
 Decoupled => 1,
@@ -764,8 +774,9 @@ impl RegLatencySM80 {
 CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
 | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
 | RedirectedFP64 | Clmad => 2,
-IMMA_88 | MMA_1x_collect => 4,
-MMA_2x_collect => 8,
+IMMA_88 => 4,
+MMA_1x_collect => 8,
+MMA_2x_collect => 16,
 DMMA => 17,
 Cbu => 1,
 Decoupled => 1,
@@ -778,8 +789,9 @@ impl RegLatencySM80 {
 CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
 | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
 | RedirectedFP64 | Clmad => 2,
-IMMA_88 | MMA_1x_collect => 4,
-MMA_2x_collect => 8,
+IMMA_88 => 4,
+MMA_1x_collect => 8,
+MMA_2x_collect => 16,
 DMMA => 16,
 Cbu => 1,
 Decoupled => 1,
@@ -795,8 +807,9 @@ impl RegLatencySM80 {
 }
 HFMA2_MMA | RedirectedFP64 => pred(has_pred, 1, 9),
 Clmad => pred(has_pred, 1, 11),
-IMMA_88 | MMA_1x_collect => pred(has_pred, 7, 6),
-MMA_2x_collect => pred(has_pred, 11, 6),
+IMMA_88 => pred(has_pred, 7, 6),
+MMA_1x_collect => pred(has_pred, 10, 5),
+MMA_2x_collect => pred(has_pred, 18, 5),
 DMMA => pred(has_pred, 19, 6),
 Cbu => 1,
 Decoupled => 1,
@@ -815,15 +828,25 @@ impl RegLatencySM80 {
 use RegLatencySM80::*;
 match writer {
 CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
-| IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
-| RedirectedFP64 => match reader {
-MMA_2x_collect => 7,
-_ => 1,
-},
-Clmad | IMMA_88 | MMA_1x_collect | MMA_2x_collect | DMMA | Cbu
+| IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA => {
+match reader {
+MMA_2x_collect => 7,
+_ => 1,
+}
+}
+RedirectedFP64 => 1,
+Clmad | IMMA_88 | MMA_1x_collect | MMA_2x_collect | DMMA
 | Decoupled | DecoupledAgu => match reader {
 CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideReadAB
-| IMADWideReadCL | IMADWideReadCH => 2,
+| IMADWideReadCL | IMADWideReadCH | FP16 | FP16_Alu
+| FP16_F32 | HFMA2_MMA => 2,
 _ => 1,
 },
+Cbu => match reader {
+CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideReadAB
+| IMADWideReadCL | IMADWideReadCH | FP16 | FP16_Alu
+| FP16_F32 | HFMA2_MMA => 2,
+MMA_2x_collect => 7,
+_ => 1,
+},
 _ => {