nvk: add sm120 latencies via csv files.
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Two differences from the initial B100 values:

all raw seem to need a +1
hmma seems to need a +7
and +1 for raw hmma for good luck makes 9.

Cc: 25.2
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36217>
This commit is contained in:
Dave Airlie 2025-06-30 16:28:59 +10:00 committed by Marge Bot
parent 0fce848b54
commit 477533ee00
20 changed files with 1072 additions and 7 deletions

View file

@ -0,0 +1,208 @@
#! /usr/bin/env python3
#
# Copyright © 2024 Collabora Ltd. and Red Hat Inc.
# SPDX-License-Identifier: MIT
# This script takes a list of latency CSV files, each named
# <regfile>_<raw|war|waw>.csv, and generates a Rust file containing a latency
# enum and lookup functions for each register file.
import argparse
import csv
import os
import sys
from mako import template
# Mako template for the generated Rust file.  For every register file it
# emits one enum (one variant per CSV column category) and one lookup function
# per CSV table (named after the first cell of the table's header row).
# Cells whose value is "none" get no match arm of their own; instead the inner
# match receives a panicking `_` arm.  `has_none` tracks whether the current
# inner match needs that arm and is reset for every outer arm.
TEMPLATE_RS = template.Template(text="""\
// Copyright 2024 Red Hat Inc.
// SPDX-License-Identifier: MIT
// This file is generated by lat_rs_gen.py. DO NOT EDIT!
#![allow(unused_variables)]
const fn pred(has_pred: bool, a: u32, b: u32) -> u32 {
if has_pred {
a + b
} else {
b
}
}
% for reg_file, cats in file_cats.items():
<% enum_name = to_camel(reg_file) + 'Latency' + sm.upper() %>
#[derive(PartialEq)]
pub enum ${enum_name} {
% for category in cats[0].header.cats:
${to_camel(category)},
% endfor
}
impl ${to_camel(reg_file)}Latency${sm.upper()} {
% for bigcat in cats:
pub fn ${bigcat.header.latcat}(
${bigcat.header.cat0}: ${enum_name},
${bigcat.header.cat1}: ${enum_name},
has_pred: bool
) -> u32 {
use ${enum_name}::*;
match ${bigcat.header.cat1} {
% for cat in bigcat.header.cats:
${to_camel(cat)} => match ${bigcat.header.cat0} {
<% has_none = False %>
% for cat2 in bigcat.header.cats:
% if bigcat.fields[loop.parent.index].flds[cat2].pred == True:
${to_camel(cat2)} => pred(
has_pred,
${bigcat.fields[loop.parent.index].flds[cat2].value},
${bigcat.fields[loop.parent.index].flds[cat2].pred_val}
),
% elif bigcat.fields[loop.parent.index].flds[cat2].value != "none":
${to_camel(cat2)} => ${bigcat.fields[loop.parent.index].flds[cat2].value},
% else:
<% has_none = True %>
% endif
% endfor
% if has_none:
_ => panic!("Illegal ${bigcat.header.cat0} value in ${bigcat.header.latcat} for ${to_camel(cat)}"),
% endif
}
% endfor
}
}
% endfor
}
% endfor
""")
def to_camel(snake_str):
    """Convert a snake_case name to CamelCase.

    Each underscore-separated word is title-cased and the words are joined.
    If the result starts with a digit it is prefixed with "_".
    """
    camel = ''.join(map(str.title, snake_str.split('_')))
    if camel[0].isdigit():
        return '_' + camel
    return camel
def reader(csvfile):
    """Yield the rows of a CSV file, dropping blank rows and comment rows.

    A row is treated as a comment when its first cell starts with '#'.
    """
    # csv.reader pulls lines from the file lazily, so the file has to stay
    # open for as long as rows are still being produced.
    with open(csvfile, 'r') as handle:
        for row in csv.reader(handle):
            if not row or row[0].startswith('#'):
                continue
            yield row
class Fld(object):
    """One cell of a latency table, parsed from its raw CSV text.

    Attributes:
      valid      -- False when the text contains "none", True otherwise.
      pred       -- True when the text contains "+" ("<value>+<pred_val>").
      value      -- the part before "+", or the text with a trailing
                    " & sb" removed, or the stripped text (which is the
                    literal "none" for a "none" cell).
      pred_val   -- the part after "+"; None when pred is False.
      scoreboard -- True when the text has no "+" and contains " & sb".
    """

    def __init__(self, line):
        self.valid = "none" not in line
        # Give every attribute a value up front so that each one exists no
        # matter which of the three cell forms is parsed below.
        self.pred = False
        self.pred_val = None
        self.scoreboard = False
        if "+" in line:
            self.pred = True
            part = line.split("+")
            self.value = part[0]
            self.pred_val = part[1]
        elif " & sb" in line:
            self.scoreboard = True
            self.value = line.removesuffix(" & sb")
        else:
            self.value = line.strip()
class Header(object):
    """Header row of a latency table.

    The first cell names the table (latcat); the remaining cells are the
    category names (cats).  For the "raw", "war" and "waw" tables, cat0 and
    cat1 hold the parameter names used for the two lookup arguments; for any
    other name they are left unset.
    """

    def __init__(self, line):
        self.latcat = line[0].strip()
        self.cats = line[1:]
        roles = {
            "raw": ("writer", "reader"),
            "war": ("reader", "writer"),
            "waw": ("writer1", "writer2"),
        }
        if self.latcat in roles:
            self.cat0, self.cat1 = roles[self.latcat]
class Fields(object):
    """One data row of a latency table.

    fldcat is the stripped first cell of the row; flds maps each category
    name from the header to the Fld parsed from the cell in that column.
    """

    def __init__(self, header, line):
        self.fldcat = line[0].strip()
        # Column 0 is the row label, so category i lives in cell i + 1.
        self.flds = {
            cat: Fld(line[pos + 1]) for pos, cat in enumerate(header.cats)
        }
class Category(object):
    """A parsed latency table: its header plus the list of its data rows."""

    def __init__(self, header, fields):
        self.header, self.fields = header, fields
# Table-name prefixes that emit_sm() iterates over, passing each one to
# emit_cats() as `lat`.
lattypes = ["reg", "ureg", "pred", "upred"]
# Reads the raw/war/waw CSV tables of one table prefix (`lat`) for one SM from
# <dirname>sm<sm>/<lat>_<cat>.csv and renders them through TEMPLATE_RS into
# the open file object `f`.
# NOTE(review): main() in this file does not go through this path, and the
# render() call below does not pass `file_cats` or `to_camel`, which
# TEMPLATE_RS as defined in this file references.  This looks like leftover
# code -- confirm before relying on it.
def emit_cats(dirname, f, sm, lat):
cats = []
for index, cat in enumerate(["raw", "war", "waw"]):
# The first non-comment row of each file is its header; all later rows are
# data rows parsed against that header.
first_line = False
fields = []
for l in reader(dirname + "sm" + sm + "/" + lat + "_" + cat + ".csv"):
if first_line == False:
header = Header(l)
first_line = True
else:
fields.append(Fields(header, l))
cats.append(Category(header, fields))
try:
f.write(TEMPLATE_RS.render(lat=lat, sm=sm, cats=cats))
except Exception:
# In the event there's an error, import some helpers from mako to print
# a useful stack trace to stderr, then exit with status 1.
import sys
from mako import exceptions
print(exceptions.text_error_template().render(), file=sys.stderr)
sys.exit(1)
def emit_sm(dirname, f, sm):
    """Call emit_cats() once for every entry of lattypes, in list order."""
    for table_prefix in lattypes:
        emit_cats(dirname, f, sm, table_prefix)
def main():
    """Parse the command line, load every CSV table and write the Rust file.

    Each CSV path must have a basename of the form <reg_file>_<latcat>.csv.
    The tables are grouped by <reg_file>, in command-line order, and handed to
    util.write_template_rs() together with the SM name and to_camel.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--import-path', required=True)
    parser.add_argument('--out-rs', required=True, help='Output Rust file.')
    parser.add_argument('--sm', help='SM', required=True)
    parser.add_argument('csv_files', metavar='FILE', nargs='*',
                        action='append',
                        help='Input CSV filename')
    args = parser.parse_args()

    # `util` is found through the path given on the command line, so it can
    # only be imported once that path has been added.
    sys.path.insert(0, args.import_path)
    import util

    file_cats = {}
    for csv_path in args.csv_files[0]:
        stem = os.path.basename(csv_path).removesuffix('.csv')
        name_parts = stem.split('_')
        assert len(name_parts) == 2
        reg_file = name_parts[0]

        rows = reader(csv_path)
        header = Header(next(rows))
        table = Category(header, [Fields(header, row) for row in rows])

        if reg_file not in file_cats:
            file_cats[reg_file] = []
        file_cats[reg_file].append(table)

    util.write_template_rs(
        args.out_rs,
        TEMPLATE_RS,
        dict(sm=args.sm, file_cats=file_cats, to_camel=to_camel),
    )


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,39 @@
#! /usr/bin/env python3
#
# Copyright © 2024 Collabora Ltd. and Red Hat Inc.
# SPDX-License-Identifier: MIT
# This script takes a list of module names and constructs a lib.rs which
# declares each of them as a public module (`pub mod <name>;`).
import argparse
import sys
from mako.template import Template
# Mako template for the generated lib.rs: a fixed header followed by one
# `pub mod <name>;` line per entry of `mods`.
TEMPLATE_RS = Template("""\
// Copyright © 2024 Collabora Ltd. and Red Hat Inc.
// SPDX-License-Identifier: MIT
// This file is generated by lib_rs_gen.py. DO NOT EDIT!
% for mod in mods:
pub mod ${mod};
% endfor
""")
def main():
    """Parse the command line and write lib.rs for the given module names."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--import-path', required=True)
    parser.add_argument('--out-rs', required=True, help='Output Rust file.')
    parser.add_argument('mods', metavar='MOD', nargs='*',
                        action='append', help='Submodule')
    args = parser.parse_args()

    # `util` is found through the path given on the command line, so it can
    # only be imported once that path has been added.
    sys.path.insert(0, args.import_path)
    import util

    environment = dict(mods=args.mods[0])
    util.write_template_rs(args.out_rs, TEMPLATE_RS, environment)


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,59 @@
# Copyright © 2025 Collabora, Ltd.
# SPDX-License-Identifier: MIT
# Latency tables for sm100, named <register file>_<raw|war|waw>.csv.
_nak_lat_sm100_files = [
'pred_raw.csv',
'pred_war.csv',
'pred_waw.csv',
'reg_raw.csv',
'reg_war.csv',
'reg_waw.csv',
'upred_raw.csv',
'upred_war.csv',
'upred_waw.csv',
'ureg_raw.csv',
'ureg_war.csv',
'ureg_waw.csv',
]
# Maps each SM name (also used as the CSV subdirectory and the generated
# module name) to its list of CSV files.
_nak_lat_sms = {
'sm100': _nak_lat_sm100_files,
}
_lat_rs_gen = files('lat_rs_gen.py')

# Generate one <sm>.rs per entry of _nak_lat_sms by running lat_rs_gen.py on
# that SM's CSV files.
_lat_rs_generated = []
foreach sm, csvs : _nak_lat_sms
  csv_files = []
  foreach csv : csvs
    csv_files += files(sm + '/' + csv)
  endforeach

  _lat_rs_generated += custom_target(
    sm + '.rs',
    input : [_lat_rs_gen, csv_files],
    output : [sm + '.rs'],
    command : [
      prog_python, '@INPUT@', '-p', nouveau_util_py_path,
      '--out-rs', '@OUTPUT0@', '--sm', sm,
    ],
    depend_files : nouveau_util_py,
  )
endforeach
# Generate lib.rs, which declares one public module per SM name.  Only the
# first input (lib_rs_gen.py) is put on the command line; the generated
# per-SM files and the util module are listed as additional inputs.
_nak_latencies_lib_rs = custom_target(
'lib.rs',
input : ['lib_rs_gen.py', _lat_rs_generated, nouveau_util_py],
output : ['lib.rs'],
command : [
prog_python, '@INPUT0@', '-p', nouveau_util_py_path,
'--out-rs', '@OUTPUT0@', _nak_lat_sms.keys()
],
)
# Rust static library built from the generated lib.rs.
libnak_latencies_rs = static_library(
'nak_latencies',
_nak_latencies_lib_rs,
gnu_symbol_visibility : 'hidden',
rust_abi : 'rust',
)

View file

@ -0,0 +1,13 @@
raw,disp_alu,disp_dual_alu,coupled,dualalu,r2p,r2ur,fma,fp16,hfma2_mma,redirected_fp64,decoupled,guard
disp_alu,13,13,13,13,13,13,13,13,14,14,1 & sb,none
disp_dual_alu,13,13,13,13,13,13,13,13,14,14,1 & sb,none
coupled,4,4,4,4,5,8,5,5,6,6,1 & sb,none
dualalu,4,4,4,4,5,8,5,5,6,6,1 & sb,none
r2p,4,4,4,4,5,8,5,5,6,6,1 & sb,none
r2ur,none,none,none,none,none,none,none,none,none,none,none,none
fma,5,5,5,5,5,13,4,5,6,6,1 & sb,none
fp16,13,13,13,13,13,13,13,5,14,14,1 & sb,none
hfma2_mma,13,13,13,13,13,13,13,13,6,6,1 & sb,none
redirected_fp64,13,13,13,13,13,13,13,13,6,6,1 & sb,none
decoupled,13,13,13,13,13,13,13,13,14,14,1 & sb,none
guard,13,13,13,13,13,13,13,13,14,14,1 & sb,none
1 raw disp_alu disp_dual_alu coupled dualalu r2p r2ur fma fp16 hfma2_mma redirected_fp64 decoupled guard
2 disp_alu 13 13 13 13 13 13 13 13 14 14 1 & sb none
3 disp_dual_alu 13 13 13 13 13 13 13 13 14 14 1 & sb none
4 coupled 4 4 4 4 5 8 5 5 6 6 1 & sb none
5 dualalu 4 4 4 4 5 8 5 5 6 6 1 & sb none
6 r2p 4 4 4 4 5 8 5 5 6 6 1 & sb none
7 r2ur none none none none none none none none none none none none
8 fma 5 5 5 5 5 13 4 5 6 6 1 & sb none
9 fp16 13 13 13 13 13 13 13 5 14 14 1 & sb none
10 hfma2_mma 13 13 13 13 13 13 13 13 6 6 1 & sb none
11 redirected_fp64 13 13 13 13 13 13 13 13 6 6 1 & sb none
12 decoupled 13 13 13 13 13 13 13 13 14 14 1 & sb none
13 guard 13 13 13 13 13 13 13 13 14 14 1 & sb none

View file

@ -0,0 +1,14 @@
war,disp_alu,disp_dual_alu,coupled,dualalu,r2p,r2ur,fma,fp16,hfma2_mma,redirected_fp64,decoupled,guard
disp_alu,1,1,1,1,1,none,1,1,1,1,1,1
disp_dual_alu,1,1,1,1,1,none,1,1,1,1,1,1
coupled,1,1,1,1,1,none,1,1,1,1,1,1
dualalu,1,1,1,1,1,none,1,1,1,1,1,1
r2p,1,1,1,1,1,none,1,1,1,1,1,1
r2ur,1,1,1,1,1,none,1,1,1,1,1,1
fma,1,1,1,1,1,none,1,1,1,1,1,1
fp16,1,1,1,1,1,none,1,1,1,1,1,1
hfma2_mma,1,1,1,1,1,none,1,1,1,1,1,1
redirected_fp64,1,1,1,1,1,none,1,1,1,1,1,1
decoupled,1,1,1,1,1,none,1,1,1,1,1,1
guard,none,none,none,none,none,none,none,none,none,none,none,none
1 war disp_alu disp_dual_alu coupled dualalu r2p r2ur fma fp16 hfma2_mma redirected_fp64 decoupled guard
2 disp_alu 1 1 1 1 1 none 1 1 1 1 1 1
3 disp_dual_alu 1 1 1 1 1 none 1 1 1 1 1 1
4 coupled 1 1 1 1 1 none 1 1 1 1 1 1
5 dualalu 1 1 1 1 1 none 1 1 1 1 1 1
6 r2p 1 1 1 1 1 none 1 1 1 1 1 1
7 r2ur 1 1 1 1 1 none 1 1 1 1 1 1
8 fma 1 1 1 1 1 none 1 1 1 1 1 1
9 fp16 1 1 1 1 1 none 1 1 1 1 1 1
10 hfma2_mma 1 1 1 1 1 none 1 1 1 1 1 1
11 redirected_fp64 1 1 1 1 1 none 1 1 1 1 1 1
12 decoupled 1 1 1 1 1 none 1 1 1 1 1 1
13 guard none none none none none none none none none none none none

View file

@ -0,0 +1,13 @@
waw,disp_alu,disp_dual_alu,coupled,dualalu,r2p,r2ur,fma,fp16,hfma2_mma,redirected_fp64,decoupled,guard
disp_alu,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
disp_dual_alu,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
coupled,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
dualalu,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
r2p,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
r2ur,1,1,1,1,1,1,1,1,2,2,1 & sb,none
fma,1,1,1,1,1,1+8,1,1,2,2,1 & sb,none
fp16,2+6,2+6,2+6,2+6,2+6,2+6,2+6,1,2+7,2+7,1 & sb,none
hfma2_mma,2+5,2+5,2+5,2+5,2+5,2+5,2+5,2+5,1,1,1 & sb,none
redirected_fp64,2+5,2+5,2+5,2+5,2+5,2+5,2+5,2+5,1,1,1 & sb,none
decoupled,2+10,2+10,2+10,2+10,2+10,2+10,2+10,2+10,1+12,1+12,1 & sb,none
guard,none,none,none,none,none,none,none,none,none,none,none,none
1 waw disp_alu disp_dual_alu coupled dualalu r2p r2ur fma fp16 hfma2_mma redirected_fp64 decoupled guard
2 disp_alu 1 1 1 1 1 1+7 1 1 2 2 1 & sb none
3 disp_dual_alu 1 1 1 1 1 1+7 1 1 2 2 1 & sb none
4 coupled 1 1 1 1 1 1+7 1 1 2 2 1 & sb none
5 dualalu 1 1 1 1 1 1+7 1 1 2 2 1 & sb none
6 r2p 1 1 1 1 1 1+7 1 1 2 2 1 & sb none
7 r2ur 1 1 1 1 1 1 1 1 2 2 1 & sb none
8 fma 1 1 1 1 1 1+8 1 1 2 2 1 & sb none
9 fp16 2+6 2+6 2+6 2+6 2+6 2+6 2+6 1 2+7 2+7 1 & sb none
10 hfma2_mma 2+5 2+5 2+5 2+5 2+5 2+5 2+5 2+5 1 1 1 & sb none
11 redirected_fp64 2+5 2+5 2+5 2+5 2+5 2+5 2+5 2+5 1 1 1 & sb none
12 decoupled 2+10 2+10 2+10 2+10 2+10 2+10 2+10 2+10 1+12 1+12 1 & sb none
13 guard none none none none none none none none none none none none

View file

@ -0,0 +1,23 @@
raw,alu,dualalu,disp_64,fma,fma_alu,imad_wide_read_ab,imad_wide_read_cl,imad_wide_read_ch,imad_wide_write_dl,imad_wide_write_dh,fp16,fp16_alu,fp16_f32,hfma2_mma,redirected_fp64,imma,hmma,dmma,branch,decoupled,decoupled_agu
alu,4,4,6,5,5,none,none,none,3,5,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
dualalu,4,4,6,5,5,none,none,none,3,5,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
disp_64,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
fma,5,5,6,4,4,none,none,none,2,4,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
fma_alu,5,5,6,4,4,none,none,none,2,4,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
imad_wide_read_ab,5,5,6,4,4,none,none,none,4,6,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
imad_wide_read_cl,5,5,6,4,4,none,none,none,2,4,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
imad_wide_read_ch,3,3,4,2,2,none,none,none,2,2,3,3,3,8,8,19,19,1 & sb,1 & sb,1 & sb,1 & sb
imad_wide_write_dl,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
imad_wide_write_dh,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
fp16,5,5,6,5,5,none,none,none,3,5,4,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
fp16_alu,5,5,6,5,5,none,none,none,3,5,5,4,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
fp16_f32,5,5,6,5,5,none,none,none,3,5,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
hfma2_mma,6,6,6,6,6,none,none,none,6,6,6,6,6,8,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
redirected_fp64,6,6,6,6,6,none,none,none,6,6,6,6,6,8,8,19,19,1 & sb,1 & sb,1 & sb,1 & sb
imma,7,7,7,7,7,none,none,none,7,7,7,7,7,11,11,20,20,1 & sb,1 & sb,1 & sb,1 & sb
hmma,7,7,7,7,7,none,none,none,7,7,7,7,7,11,11,20,20,1 & sb,1 & sb,1 & sb,1 & sb
dmma,7,7,7,7,7,none,none,none,7,7,7,7,7,11,11,20,20,1 & sb,1 & sb,1 & sb,1 & sb
branch,4,4,4,4,4,none,none,none,4,4,4,4,4,6,6,19,19,1 & sb,1 & sb,1 & sb,1 & sb
decoupled,4,4,4,4,4,none,none,none,4,4,4,4,4,6,6,19,19,1 & sb,1 & sb,1 & sb,1 & sb
decoupled_agu,5,5,5,5,5,none,none,none,5,5,5,5,5,7,7,19,19,1 & sb,1 & sb,1 & sb,1 & sb
1 raw alu dualalu disp_64 fma fma_alu imad_wide_read_ab imad_wide_read_cl imad_wide_read_ch imad_wide_write_dl imad_wide_write_dh fp16 fp16_alu fp16_f32 hfma2_mma redirected_fp64 imma hmma dmma branch decoupled decoupled_agu
2 alu 4 4 6 5 5 none none none 3 5 5 5 5 10 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
3 dualalu 4 4 6 5 5 none none none 3 5 5 5 5 10 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
4 disp_64 none none none none none none none none none none none none none none none none none none none none none
5 fma 5 5 6 4 4 none none none 2 4 5 5 5 10 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
6 fma_alu 5 5 6 4 4 none none none 2 4 5 5 5 10 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
7 imad_wide_read_ab 5 5 6 4 4 none none none 4 6 5 5 5 10 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
8 imad_wide_read_cl 5 5 6 4 4 none none none 2 4 5 5 5 10 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
9 imad_wide_read_ch 3 3 4 2 2 none none none 2 2 3 3 3 8 8 19 19 1 & sb 1 & sb 1 & sb 1 & sb
10 imad_wide_write_dl none none none none none none none none none none none none none none none none none none none none none
11 imad_wide_write_dh none none none none none none none none none none none none none none none none none none none none none
12 fp16 5 5 6 5 5 none none none 3 5 4 5 5 10 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
13 fp16_alu 5 5 6 5 5 none none none 3 5 5 4 5 10 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
14 fp16_f32 5 5 6 5 5 none none none 3 5 5 5 5 10 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
15 hfma2_mma 6 6 6 6 6 none none none 6 6 6 6 6 8 10 19 19 1 & sb 1 & sb 1 & sb 1 & sb
16 redirected_fp64 6 6 6 6 6 none none none 6 6 6 6 6 8 8 19 19 1 & sb 1 & sb 1 & sb 1 & sb
17 imma 7 7 7 7 7 none none none 7 7 7 7 7 11 11 20 20 1 & sb 1 & sb 1 & sb 1 & sb
18 hmma 7 7 7 7 7 none none none 7 7 7 7 7 11 11 20 20 1 & sb 1 & sb 1 & sb 1 & sb
19 dmma 7 7 7 7 7 none none none 7 7 7 7 7 11 11 20 20 1 & sb 1 & sb 1 & sb 1 & sb
20 branch 4 4 4 4 4 none none none 4 4 4 4 4 6 6 19 19 1 & sb 1 & sb 1 & sb 1 & sb
21 decoupled 4 4 4 4 4 none none none 4 4 4 4 4 6 6 19 19 1 & sb 1 & sb 1 & sb 1 & sb
22 decoupled_agu 5 5 5 5 5 none none none 5 5 5 5 5 7 7 19 19 1 & sb 1 & sb 1 & sb 1 & sb

View file

@ -0,0 +1,23 @@
war,alu,dualalu,disp_64,fma,fma_alu,imad_wide_read_ab,imad_wide_read_cl,imad_wide_read_ch,imad_wide_write_dl,imad_wide_write_dh,fp16,fp16_alu,fp16_f32,hfma2_mma,redirected_fp64,imma,hmma,dmma,branch,decoupled,decoupled_agu
alu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
dualalu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
disp_64,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
fma,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
fma_alu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
imad_wide_read_ab,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
imad_wide_read_cl,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
imad_wide_read_ch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
imad_wide_write_dl,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
imad_wide_write_dh,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
fp16,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
fp16_alu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
fp16_f32,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
hfma2_mma,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
redirected_fp64,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
imma,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
hmma,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
dmma,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
branch,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
decoupled,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
decoupled_agu,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
1 war alu dualalu disp_64 fma fma_alu imad_wide_read_ab imad_wide_read_cl imad_wide_read_ch imad_wide_write_dl imad_wide_write_dh fp16 fp16_alu fp16_f32 hfma2_mma redirected_fp64 imma hmma dmma branch decoupled decoupled_agu
2 alu 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
3 dualalu 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
4 disp_64 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
5 fma 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
6 fma_alu 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
7 imad_wide_read_ab none none none none none none none none none none none none none none none none none none none none none
8 imad_wide_read_cl none none none none none none none none none none none none none none none none none none none none none
9 imad_wide_read_ch none none none none none none none none none none none none none none none none none none none none none
10 imad_wide_write_dl 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
11 imad_wide_write_dh 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
12 fp16 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
13 fp16_alu 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
14 fp16_f32 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
15 hfma2_mma 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
16 redirected_fp64 1 1 1 1 1 1 1 1 none none 1 1 1 1 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
17 imma 2 2 2 2 2 2 2 2 none none 2 2 2 2 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
18 hmma 2 2 2 2 2 2 2 2 none none 2 2 2 2 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
19 dmma 2 2 2 2 2 2 2 2 none none 2 2 2 2 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
20 branch 2 2 2 2 2 2 2 2 none none 2 2 2 2 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
21 decoupled 2 2 2 2 2 2 2 2 none none 2 2 2 2 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb
22 decoupled_agu 2 2 2 2 2 2 2 2 none none 2 2 2 2 1 1 1 1 & sb 1 & sb 1 & sb 1 & sb

View file

@ -0,0 +1,22 @@
waw,alu,dualalu,disp_64,fma,fma_alu,imad_wide_read_ab,imad_wide_read_cl,imad_wide_read_ch,imad_wide_write_dl,imad_wide_write_dh,fp16,fp16_alu,fp16_f32,hfma2_mma,redirected_fp64,imma,hmma,dmma,branch,decoupled,decoupled_agu
alu,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
dualalu,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
disp_64,1,1,1,1,1,none,none,none,1,1,1,1,1,3+1,3+1,14+1,14+1,1 & sb,1 & sb,1 & sb,1 & sb
fma,1,1,1+1,1,1,none,none,none,1,1+1,1,1,1,3+3,3+3,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb
fma_alu,1,1,1+1,1,1,none,none,none,1,1+1,1,1,1,3+3,3+3,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb
imad_wide_read_ab,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
imad_wide_read_cl,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
imad_wide_read_ch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
imad_wide_write_dl,1+2,1+2,1+3,1+1,1+1,none,none,none,1,1+1,1+2,1+2,1+2,5+3,5+3,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb
imad_wide_write_dh,1,1,1+1,1,1,none,none,none,1,1,1,1,1,5+1,5+1,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb
fp16,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
fp16_alu,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
fp16_f32,1,1,1,1,1,none,none,none,1,1,1,1,1,3+2,3+2,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
hfma2_mma,1,1,1,1,1,none,none,none,1,1,1,1,1,1,3,14,14,1 & sb,1 & sb,1 & sb,1 & sb
redirected_fp64,1,1,1,1,1,none,none,none,1,1,1,1,1,2,1,13,13,1 & sb,1 & sb,1 & sb,1 & sb
imma,2,2,2,2,2,none,none,none,2,2,2,2,2,2,2,1,1,1 & sb,1 & sb,1 & sb,1 & sb
hmma,2,2,2,2,2,none,none,none,2,2,2,2,2,2,2,1,1,1 & sb,1 & sb,1 & sb,1 & sb
dmma,2+4,2+4,2+4,2+4,2+4,none,none,none,2+4,2+4,2+4,2+4,2+4,2+8,2+8,10+9,10+9,1 & sb,1 & sb,1 & sb,1 & sb
branch,1+5,1+5,1+5,1+5,1+5,none,none,none,1+5,1+5,1+5,1+5,1+5,1+9,1+9,13+6,13+6,1 & sb,1 & sb,1 & sb,1 & sb
decoupled,1+5,1+5,1+5,1+5,1+5,none,none,none,1+5,1+5,1+5,1+5,1+5,1+9,1+9,13+6,13+6,1 & sb,1 & sb,1 & sb,1 & sb
decoupled_agu,1+5,1+5,1+5,1+5,1+5,none,none,none,1+5,1+5,1+5,1+5,1+5,1+9,1+9,13+6,13+6,1 & sb,1 & sb,1 & sb,1 & sb
1 waw alu dualalu disp_64 fma fma_alu imad_wide_read_ab imad_wide_read_cl imad_wide_read_ch imad_wide_write_dl imad_wide_write_dh fp16 fp16_alu fp16_f32 hfma2_mma redirected_fp64 imma hmma dmma branch decoupled decoupled_agu
2 alu 1 1 1+1 1 1 none none none 1 1 1 1 1 3+3 3+3 14+2 14+2 1 & sb 1 & sb 1 & sb 1 & sb
3 dualalu 1 1 1+1 1 1 none none none 1 1 1 1 1 3+3 3+3 14+2 14+2 1 & sb 1 & sb 1 & sb 1 & sb
4 disp_64 1 1 1 1 1 none none none 1 1 1 1 1 3+1 3+1 14+1 14+1 1 & sb 1 & sb 1 & sb 1 & sb
5 fma 1 1 1+1 1 1 none none none 1 1+1 1 1 1 3+3 3+3 14+3 14+3 1 & sb 1 & sb 1 & sb 1 & sb
6 fma_alu 1 1 1+1 1 1 none none none 1 1+1 1 1 1 3+3 3+3 14+3 14+3 1 & sb 1 & sb 1 & sb 1 & sb
7 imad_wide_read_ab none none none none none none none none none none none none none none none none none none none none none
8 imad_wide_read_cl none none none none none none none none none none none none none none none none none none none none none
9 imad_wide_read_ch none none none none none none none none none none none none none none none none none none none none none
10 imad_wide_write_dl 1+2 1+2 1+3 1+1 1+1 none none none 1 1+1 1+2 1+2 1+2 5+3 5+3 14+3 14+3 1 & sb 1 & sb 1 & sb 1 & sb
11 imad_wide_write_dh 1 1 1+1 1 1 none none none 1 1 1 1 1 5+1 5+1 14+3 14+3 1 & sb 1 & sb 1 & sb 1 & sb
12 fp16 1 1 1+1 1 1 none none none 1 1 1 1 1 3+3 3+3 14+2 14+2 1 & sb 1 & sb 1 & sb 1 & sb
13 fp16_alu 1 1 1+1 1 1 none none none 1 1 1 1 1 3+3 3+3 14+2 14+2 1 & sb 1 & sb 1 & sb 1 & sb
14 fp16_f32 1 1 1 1 1 none none none 1 1 1 1 1 3+2 3+2 14+2 14+2 1 & sb 1 & sb 1 & sb 1 & sb
15 hfma2_mma 1 1 1 1 1 none none none 1 1 1 1 1 1 3 14 14 1 & sb 1 & sb 1 & sb 1 & sb
16 redirected_fp64 1 1 1 1 1 none none none 1 1 1 1 1 2 1 13 13 1 & sb 1 & sb 1 & sb 1 & sb
17 imma 2 2 2 2 2 none none none 2 2 2 2 2 2 2 1 1 1 & sb 1 & sb 1 & sb 1 & sb
18 hmma 2 2 2 2 2 none none none 2 2 2 2 2 2 2 1 1 1 & sb 1 & sb 1 & sb 1 & sb
19 dmma 2+4 2+4 2+4 2+4 2+4 none none none 2+4 2+4 2+4 2+4 2+4 2+8 2+8 10+9 10+9 1 & sb 1 & sb 1 & sb 1 & sb
20 branch 1+5 1+5 1+5 1+5 1+5 none none none 1+5 1+5 1+5 1+5 1+5 1+9 1+9 13+6 13+6 1 & sb 1 & sb 1 & sb 1 & sb
21 decoupled 1+5 1+5 1+5 1+5 1+5 none none none 1+5 1+5 1+5 1+5 1+5 1+9 1+9 13+6 13+6 1 & sb 1 & sb 1 & sb 1 & sb
22 decoupled_agu 1+5 1+5 1+5 1+5 1+5 none none none 1+5 1+5 1+5 1+5 1+5 1+9 1+9 13+6 13+6 1 & sb 1 & sb 1 & sb 1 & sb

View file

@ -0,0 +1,8 @@
raw,coupled,udp,voteu,u_guard,bra_jmp,uldc_mma,usetmaxreg
coupled,none,6,1,none,none,none,1 & sb
udp,none,4,1,none,none,none,1 & sb
voteu,none,none,none,none,none,none,none
u_guard,none,11,5,none,none,none,1 & sb
bra_jmp,none,9,2,none,none,none,1 & sb
uldc_mma,none,11,5,none,none,none,1 & sb
usetmaxreg,none,none,none,none,none,none,none
1 raw coupled udp voteu u_guard bra_jmp uldc_mma usetmaxreg
2 coupled none 6 1 none none none 1 & sb
3 udp none 4 1 none none none 1 & sb
4 voteu none none none none none none none
5 u_guard none 11 5 none none none 1 & sb
6 bra_jmp none 9 2 none none none 1 & sb
7 uldc_mma none 11 5 none none none 1 & sb
8 usetmaxreg none none none none none none none

View file

@ -0,0 +1,9 @@
war,coupled,udp,voteu,u_guard,bra_jmp,uldc_mma,usetmaxreg
coupled,none,none,none,none,none,none,none
udp,1,1,none,1,1,1,none
voteu,2,2,none,1,1,1,none
u_guard,none,none,none,none,none,none,none
bra_jmp,none,none,none,none,none,none,none
uldc_mma,none,none,none,none,none,none,none
usetmaxreg,1,1,none,1,1,1,none
1 war coupled udp voteu u_guard bra_jmp uldc_mma usetmaxreg
2 coupled none none none none none none none
3 udp 1 1 none 1 1 1 none
4 voteu 2 2 none 1 1 1 none
5 u_guard none none none none none none none
6 bra_jmp none none none none none none none
7 uldc_mma none none none none none none none
8 usetmaxreg 1 1 none 1 1 1 none

View file

@ -0,0 +1,8 @@
waw,coupled,udp,voteu,u_guard,bra_jmp,uldc_mma,usetmaxreg
coupled,none,none,none,none,none,none,none
udp,none,1,1,none,none,none,1 & sb
voteu,none,7,1,none,none,none,1 & sb
u_guard,none,none,none,none,none,none,none
bra_jmp,none,none,none,none,none,none,none
uldc_mma,none,none,none,none,none,none,none
usetmaxreg,none,8+2,8,none,none,none,1 & sb
1 waw coupled udp voteu u_guard bra_jmp uldc_mma usetmaxreg
2 coupled none none none none none none none
3 udp none 1 1 none none none 1 & sb
4 voteu none 7 1 none none none 1 & sb
5 u_guard none none none none none none none
6 bra_jmp none none none none none none none
7 uldc_mma none none none none none none none
8 usetmaxreg none 8+2 8 none none none 1 & sb

View file

@ -0,0 +1,18 @@
raw,coupled,coupled_mma,decoupled,branch,coupled_bindless,decoupled_bindless,hfma2_mma,to_ur,tex,tma,rpcmov_64,udp,uldc,umov,elect,r2ur,voteu
coupled,none,none,none,none,none,none,none,1 & sb,none,none,none,6,2,2,2,13,2
coupled_mma,none,none,none,none,none,none,none,1 & sb,none,none,none,6,2,2,2,13,2
decoupled,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
branch,none,none,none,none,none,none,none,1 & sb,none,none,none,10,3,3,3,13,3
coupled_bindless,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,15,5
decoupled_bindless,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,15,5
hfma2_mma,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
to_ur,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,13,5
tex,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
tma,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
rpcmov_64,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
udp,none,none,none,none,none,none,none,1 & sb,none,none,none,4,2,2,2,13,2
uldc,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,15,5
umov,none,none,none,none,none,none,none,1 & sb,none,none,none,7,2,2,2,13,2
elect,none,none,none,none,none,none,none,1 & sb,none,none,none,8,2,2,2,13,2
r2ur,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
voteu,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
1 raw coupled coupled_mma decoupled branch coupled_bindless decoupled_bindless hfma2_mma to_ur tex tma rpcmov_64 udp uldc umov elect r2ur voteu
2 coupled none none none none none none none 1 & sb none none none 6 2 2 2 13 2
3 coupled_mma none none none none none none none 1 & sb none none none 6 2 2 2 13 2
4 decoupled none none none none none none none 1 & sb none none none 9 2 2 2 13 2
5 branch none none none none none none none 1 & sb none none none 10 3 3 3 13 3
6 coupled_bindless none none none none none none none 1 & sb none none none 12 5 5 5 15 5
7 decoupled_bindless none none none none none none none 1 & sb none none none 12 5 5 5 15 5
8 hfma2_mma none none none none none none none 1 & sb none none none 9 2 2 2 13 2
9 to_ur none none none none none none none 1 & sb none none none 12 5 5 5 13 5
10 tex none none none none none none none 1 & sb none none none 9 2 2 2 13 2
11 tma none none none none none none none 1 & sb none none none 9 2 2 2 13 2
12 rpcmov_64 none none none none none none none 1 & sb none none none 9 2 2 2 13 2
13 udp none none none none none none none 1 & sb none none none 4 2 2 2 13 2
14 uldc none none none none none none none 1 & sb none none none 12 5 5 5 15 5
15 umov none none none none none none none 1 & sb none none none 7 2 2 2 13 2
16 elect none none none none none none none 1 & sb none none none 8 2 2 2 13 2
17 r2ur none none none none none none none none none none none none none none none none none
18 voteu none none none none none none none none none none none none none none none none none

View file

@ -0,0 +1,18 @@
war,coupled,coupled_mma,decoupled,branch,coupled_bindless,decoupled_bindless,hfma2_mma,to_ur,tex,tma,rpcmov_64,udp,uldc,umov,elect,r2ur,voteu
coupled,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
coupled_mma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
decoupled,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
branch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
coupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
decoupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
hfma2_mma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
to_ur,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,1,1,1,1,none,none
tex,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
tma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
rpcmov_64,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
udp,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,1,1,1,1,none,none
uldc,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none
umov,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none
elect,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none
r2ur,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,1,1,1,1,none,none
voteu,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none
1 war coupled coupled_mma decoupled branch coupled_bindless decoupled_bindless hfma2_mma to_ur tex tma rpcmov_64 udp uldc umov elect r2ur voteu
2 coupled none none none none none none none none none none none none none none none none none
3 coupled_mma none none none none none none none none none none none none none none none none none
4 decoupled none none none none none none none none none none none none none none none none none
5 branch none none none none none none none none none none none none none none none none none
6 coupled_bindless none none none none none none none none none none none none none none none none none
7 decoupled_bindless none none none none none none none none none none none none none none none none none
8 hfma2_mma none none none none none none none none none none none none none none none none none
9 to_ur 1 1 1 1 1 1 1 1 & sb 1 1 & sb 1 1 1 1 1 none none
10 tex none none none none none none none none none none none none none none none none none
11 tma none none none none none none none none none none none none none none none none none
12 rpcmov_64 none none none none none none none none none none none none none none none none none
13 udp 1 1 1 1 1 1 1 1 & sb 1 1 & sb 1 1 1 1 1 none none
14 uldc 1 1 1 1 1 1 1 1 & sb 1 1 & sb 1 3 1 1 1 none none
15 umov 1 1 1 1 1 1 1 1 & sb 1 1 & sb 1 3 1 1 1 none none
16 elect 1 1 1 1 1 1 1 1 & sb 1 1 & sb 1 3 1 1 1 none none
17 r2ur 1 1 1 1 1 1 1 1 & sb 1 1 & sb 1 1 1 1 1 none none
18 voteu 1 1 1 1 1 1 1 1 & sb 1 1 & sb 1 3 1 1 1 none none

View file

@ -0,0 +1,19 @@
waw,coupled,coupled_mma,decoupled,branch,coupled_bindless,decoupled_bindless,hfma2_mma,to_ur,tex,tma,rpcmov_64,udp,uldc,umov,elect,r2ur,voteu
coupled,none,none,none,none,none,none,none,1 & sb,none,none,none,none,none,none,none,none,none
coupled_mma,none,none,none,none,none,none,none,1 & sb,none,none,none,none,none,none,none,none,none
decoupled,none,none,none,none,none,none,none,1 & sb,none,none,none,none,none,none,none,none,none
branch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
coupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
decoupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
hfma2_mma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
to_ur,none,none,none,none,none,none,none,1 & sb,none,none,none,4+7,4,4,4,4+10,4
tex,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
tma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
rpcmov_64,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
udp,none,none,none,none,none,none,none,1 & sb,none,none,none,1,1,1,1,4+5,1
uldc,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1
umov,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1
elect,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1
r2ur,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,1,1
voteu,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1
1 waw coupled coupled_mma decoupled branch coupled_bindless decoupled_bindless hfma2_mma to_ur tex tma rpcmov_64 udp uldc umov elect r2ur voteu
2 coupled none none none none none none none 1 & sb none none none none none none none none none
3 coupled_mma none none none none none none none 1 & sb none none none none none none none none none
4 decoupled none none none none none none none 1 & sb none none none none none none none none none
5 branch none none none none none none none none none none none none none none none none none
6 coupled_bindless none none none none none none none none none none none none none none none none none
7 decoupled_bindless none none none none none none none none none none none none none none none none none
8 hfma2_mma none none none none none none none none none none none none none none none none none
9 to_ur none none none none none none none 1 & sb none none none 4+7 4 4 4 4+10 4
10 tex none none none none none none none none none none none none none none none none none
11 tma none none none none none none none none none none none none none none none none none
12 rpcmov_64 none none none none none none none none none none none none none none none none none
13 udp none none none none none none none 1 & sb none none none 1 1 1 1 4+5 1
14 uldc none none none none none none none 1 & sb none none none 7 1 1 1 10+1 1
15 umov none none none none none none none 1 & sb none none none 7 1 1 1 10+1 1
16 elect none none none none none none none 1 & sb none none none 7 1 1 1 10+1 1
17 r2ur none none none none none none none 1 & sb none none none 7 1 1 1 1 1
18 voteu none none none none none none none 1 & sb none none none 7 1 1 1 10+1 1

View file

@ -1,6 +1,8 @@
# Copyright © 2022 Collabora, Ltd.
# SPDX-License-Identifier: MIT
subdir('latencies')
dep_paste = dependency('paste',
version : '>= 1.0.14',
fallback : ['paste', 'dep_paste'],
@ -114,6 +116,7 @@ _libnak_rs = static_library(
link_with : [
_libnak_bindings_rs,
_libnak_ir_proc_rs,
libnak_latencies_rs,
],
)

View file

@ -27,6 +27,7 @@ mod opt_uniform_instrs;
mod qmd;
mod reg_tracker;
mod repair_ssa;
mod sm120_instr_latencies;
mod sm20;
mod sm30_instr_latencies;
mod sm32;

View file

@ -0,0 +1,552 @@
// Copyright © 2025 Red Hat.
// SPDX-License-Identifier: MIT
#![allow(non_camel_case_types)]
use crate::ir::*;
use nak_latencies::sm100::*;
// This contains the register scheduling information provided by NVIDIA. This
// file is for Blackwell only.
//
// These latencies come from B100 (SM100) and not the consumer RTX chips
// (SM120). We have to add some padding to get everything passing on the RTX
// chips so that's done in this file while using the sm100 CSVs.
// Coupled instructions are ones with fixed latencies; they need delays but not
// scoreboards. Decoupled instructions are ones with variable latencies; they
// need scoreboards but not delays. There are also redirected instructions
// which, depending on the SM, can be coupled or decoupled, so both delays and
// scoreboards need to be provided.
/// Maps `op` to the SM100 GPR latency category used to index the GPR
/// latency tables.
///
/// `reader` is `true` when the category is wanted for `op` as the reader of
/// the register and `false` when `op` is the writer.  `op_reg_idx` is the
/// index of the source (reader) or destination (writer) in question.  Only
/// `IMad64` and `Redux` depend on `reader`, and only `IMad64` depends on
/// `op_reg_idx`.
///
/// # Panics
///
/// Panics when `op` has no GPR category here, when an `IMad64` reader index
/// is greater than 2, when `Redux` is queried as a writer, and when a `CS2R`
/// destination is not a register.
fn op_reg_latency(op: &Op, reader: bool, op_reg_idx: usize) -> RegLatencySM100 {
    use RegLatencySM100::*;
    match op {
        // this will need updating if imad grows support for input predicates
        Op::IMad(_) | Op::IMul(_) => Fma,
        Op::IMad64(_) => {
            if reader {
                match op_reg_idx {
                    0 | 1 => ImadWideReadAb,
                    2 => ImadWideReadCl, // vs upper C operand - work it out
                    _ => {
                        panic!("Illegal field in imadwide")
                    }
                }
            } else {
                ImadWideWriteDh // as above this needs more work
            }
        }
        Op::PopC(_) => Decoupled,
        Op::IAdd3(_) | Op::IAdd3X(_) => Alu,
        Op::BMsk(_) => Alu,
        // Sgxt => Alu,
        Op::Lop3(_) => Alu,
        Op::Flo(_) => Decoupled,
        Op::ISetP(_) => Dualalu,
        Op::IAbs(_) => Alu,
        Op::Lea(_) => Alu,
        Op::LeaX(_) => Alu,
        Op::IMnMx(_) => Dualalu,
        Op::I2I(_) => Alu,
        // I2IP => alu
        Op::Shf(_) => Alu,
        Op::F2FP(_) => Alu,
        Op::FFma(_) => Fma,
        Op::FAdd(_) => Fma,
        Op::FMul(_) => Fma,
        Op::FMnMx(_) => Dualalu,
        Op::FSwzAdd(_) => Fma,
        Op::FSet(_) => Dualalu,
        // FSel => Alu,
        Op::FSetP(_) => Dualalu,
        // FChk => Decoupled,
        Op::DAdd(_) | Op::DFma(_) | Op::DMul(_) | Op::DSetP(_) => {
            RedirectedFp64
        }
        Op::DMnMx(_) => RedirectedFp64, // not in docs
        Op::HAdd2(hadd2) => {
            if hadd2.f32 {
                Fp16F32
            } else {
                Fp16
            }
        }
        Op::HFma2(_) | Op::HMul2(_) => Fp16,
        Op::HSet2(_) | Op::HSetP2(_) | Op::HMnMx2(_) => Fp16Alu,
        Op::Hmma(_) => Hmma,
        Op::Ipa(_) => DecoupledAgu,
        Op::MuFu(_) => Decoupled,
        // Conversion functions all Decoupled
        Op::F2F(_) => Decoupled,
        Op::F2I(_) => Decoupled,
        Op::I2F(_) => Decoupled,
        Op::FRnd(_) => Decoupled,
        Op::AL2P(_) => Decoupled,
        Op::Mov(_) => Dualalu,
        Op::Sel(_) => Dualalu,
        Op::BRev(_) => Decoupled,
        // P2R => Alu,
        // R2P => Alu,
        Op::PLop3(_) => Alu,
        Op::Prmt(_) => Alu,
        Op::Nop(_) => Disp64,
        Op::Vote(_) => Dualalu,
        Op::Match(_) => Decoupled,
        Op::S2R(_) => DecoupledAgu,
        Op::R2UR(_) => Alu,
        Op::Redux(_) => {
            // Redux only has a GPR category as a reader; asking for it as a
            // GPR writer is a caller bug.
            if reader {
                Decoupled
            } else {
                panic!("Illegal Redux");
            }
        }
        Op::CS2R(cs2r) => {
            // A two-component destination is treated as the 64-bit form.
            if cs2r.dst.as_reg().unwrap().comps() == 2 {
                Disp64
            } else {
                Dualalu
            }
        }
        // B2R => DecoupledAgu,
        // LEPC => Disp64
        // Every BMov is Branch, whatever kind of destination it has.
        Op::BMov(_) => Branch,
        // RPCMOV.32 => Alu,
        // RPCMOV.64 => Disp64
        // PMTRIG => Disp64
        // CSMTEST => Alu,
        Op::Bar(_) => DecoupledAgu,
        Op::Imma(_) => Imma,
        Op::IDp4(_) => Fma,
        Op::BClear(_) => Decoupled,
        Op::Bra(_) => Decoupled,
        Op::BSSy(_) => Decoupled,
        Op::Kill(_) => Decoupled,
        Op::Exit(_) => Decoupled,
        Op::BSync(_) => Decoupled,
        Op::Tex(_) => Decoupled,
        Op::Tld(_) => Decoupled,
        Op::Tld4(_) => Decoupled,
        Op::Tmml(_) => Decoupled,
        Op::Txd(_) => Decoupled,
        Op::Txq(_) => Decoupled,
        Op::Ldc(_) => Decoupled,
        Op::ALd(_) => DecoupledAgu,
        Op::ASt(_) => DecoupledAgu,
        Op::Out(_) => DecoupledAgu,
        Op::OutFinal(_) => DecoupledAgu,
        Op::Ld(_) => DecoupledAgu,
        Op::St(_) => DecoupledAgu,
        Op::Atom(_) => DecoupledAgu,
        //CCtl.i,c are coupled
        Op::CCtl(_) => DecoupledAgu,
        Op::MemBar(_) => Decoupled,
        Op::SuLd(_) => Decoupled,
        Op::SuSt(_) => Decoupled,
        Op::SuAtom(_) => Decoupled,
        Op::PixLd(_) => DecoupledAgu,
        Op::Isberd(_) => DecoupledAgu,
        Op::LdTram(_) => DecoupledAgu,
        Op::Shfl(_) => DecoupledAgu,
        //Op::LdSm(_) => DecoupledAgu
        x => {
            panic!("Illegal instuction in reg category {}", x);
        }
    }
}
/// Maps `op` to the SM100 predicate-register latency category.
///
/// The arms are grouped by resulting category rather than by opcode.
///
/// # Panics
///
/// Panics when `op` has no predicate category listed here.
fn op_pred_latency(op: &Op) -> PredLatencySM100 {
    use PredLatencySM100::*;
    match op {
        Op::IAdd3(_)
        | Op::IAdd3X(_)
        | Op::Lea(_)
        | Op::LeaX(_)
        | Op::PLop3(_)
        | Op::PSetP(_) => Coupled,

        Op::Atom(_)
        | Op::Ipa(_)
        | Op::Ld(_)
        | Op::PixLd(_)
        | Op::Shfl(_)
        | Op::SuLd(_)
        | Op::SuSt(_)
        | Op::Tex(_)
        | Op::Tld(_)
        | Op::Tld4(_)
        | Op::Tmml(_)
        | Op::Txd(_)
        | Op::Txq(_)
        | Op::Match(_) => Decoupled,

        Op::FMnMx(_)
        | Op::FSetP(_)
        | Op::IMnMx(_)
        | Op::ISetP(_)
        | Op::Sel(_) => Dualalu,

        Op::IMad(_) | Op::IMad64(_) | Op::IMul(_) => Fma,

        Op::HFma2(_) | Op::HMnMx2(_) | Op::HSetP2(_) => Fp16,

        Op::DSetP(_) => RedirectedFp64,
        Op::R2UR(_) => R2Ur,
        Op::Vote(_) => DispDualAlu,

        other => panic!("Illegal op in sm120 pred latency {}", other),
    }
}
/// Maps `op` to the SM100 uniform-GPR latency category.
///
/// `reader` is `true` when `op` reads the uniform register and `false` when
/// it writes it; `op_reg_idx` is the index of the source or destination in
/// question.  The source at `op_reg_idx` is only inspected when `reader` is
/// set.
///
/// # Panics
///
/// Panics when `op` has no uniform-GPR category here, and when `R2UR` or
/// `Redux` is queried as a reader.  Indexing the sources also panics if
/// `reader` is set and `op_reg_idx` is out of range.
fn op_ureg_latency(
    op: &Op,
    reader: bool,
    op_reg_idx: usize,
) -> UregLatencySM100 {
    use UregLatencySM100::*;
    // this decides between the category types for readers.
    let bindless = reader && op.srcs_as_slice()[op_reg_idx].is_bindless_cbuf();
    let coupled = if bindless { CoupledBindless } else { Coupled };
    let decoupled = if bindless {
        DecoupledBindless
    } else {
        Decoupled
    };
    // if this is a reader from a ureg, it could be a U* instruction or a
    // regular instruction.  Uniform ops use Udp in place of both the coupled
    // and the decoupled category.
    let uniform_op = op.is_uniform();
    let coupled = if uniform_op { Udp } else { coupled };
    let decoupled = if uniform_op { Udp } else { decoupled };
    match op {
        Op::BMsk(_) => coupled,
        Op::BRev(_) => decoupled,
        // uclea?
        Op::Flo(_) => decoupled,
        Op::IAdd3(_) | Op::IAdd3X(_) => coupled,
        Op::IAbs(_) => coupled,
        Op::IDp4(_) => coupled,
        Op::IMnMx(_) => coupled,
        Op::IMad(_) => coupled,
        Op::IMad64(_) => coupled,
        Op::ISetP(_) => coupled,
        Op::Ldc(_) => {
            if uniform_op {
                ToUr
            } else {
                decoupled
            }
        }
        Op::Lea(_) => coupled,
        Op::LeaX(_) => coupled,
        Op::Lop2(_) | Op::Lop3(_) => coupled,
        Op::MuFu(_) => decoupled,
        Op::Mov(_) => {
            if uniform_op {
                Umov
            } else {
                coupled
            }
        }
        // mov32i => uldc
        // p2ur => udp,
        Op::PLop3(_) => coupled,
        Op::PopC(_) => {
            if uniform_op {
                coupled
            } else {
                decoupled
            }
        }
        Op::Prmt(_) => coupled,
        Op::PSetP(_) => coupled,
        // UR2UP
        Op::Sel(_) => coupled,
        // SGXT
        Op::Shf(_) => coupled,
        Op::Shfl(_) => decoupled,
        Op::I2F(_) => decoupled,
        Op::F2I(_) => decoupled,
        Op::F2F(_) => decoupled,
        Op::R2UR(_) => {
            // R2UR only has a uniform-GPR category as a writer.
            if !reader {
                R2Ur
            } else {
                panic!("Illegal R2UR in ureg");
            }
        }
        Op::Redux(_) => {
            // Redux only has a uniform-GPR category as a writer.
            if !reader {
                ToUr
            } else {
                panic!("Illegal Redux in ureg");
            }
        }
        Op::Vote(_) => Voteu,
        Op::S2R(_) => ToUr,
        Op::Tex(_) | Op::Tld(_) | Op::Tld4(_) | Op::Txq(_) => Tex,
        Op::FRnd(_) => decoupled,
        Op::F2FP(_)
        | Op::FAdd(_)
        | Op::FMul(_)
        | Op::FFma(_)
        | Op::FSet(_)
        | Op::FSetP(_)
        | Op::FMnMx(_)
        | Op::HAdd2(_)
        | Op::HMul2(_)
        | Op::HSet2(_)
        | Op::HFma2(_)
        | Op::HMnMx2(_)
        | Op::HSetP2(_) => coupled,
        Op::DMul(_) | Op::DFma(_) | Op::DAdd(_) | Op::DSetP(_) => decoupled,
        _ => {
            panic!("Illegal instuction in ureg category {}", op);
        }
    }
}
/// Maps `op` to the SM100 uniform-predicate latency category.
///
/// # Panics
///
/// Panics when `op` has no uniform-predicate category here, and for a
/// `Vote` that is not a uniform op.
fn op_upred_latency(op: &Op) -> UpredLatencySM100 {
    use UpredLatencySM100::*;
    let is_uniform = op.is_uniform();
    match op {
        Op::Ldc(_) => UldcMma,

        // These are Udp whether or not the op is uniform.
        Op::BMsk(_)
        | Op::BRev(_)
        | Op::Flo(_)
        | Op::IAdd3(_)
        | Op::IAdd3X(_)
        | Op::IMad(_)
        | Op::ISetP(_)
        | Op::Lea(_)
        | Op::LeaX(_)
        | Op::Lop3(_)
        | Op::Mov(_) => Udp,

        // These are Udp in their uniform form and Coupled otherwise.
        Op::PLop3(_) | Op::PSetP(_) | Op::Sel(_) => {
            if is_uniform {
                Udp
            } else {
                Coupled
            }
        }

        Op::Vote(_) if is_uniform => Voteu,
        Op::Vote(_) => panic!("Illegal Vote in upred"),

        other => panic!("Illegal instuction in upred category {}", other),
    }
}
pub struct SM120Latency {}
impl SM120Latency {
pub fn needs_scoreboards(op: &Op) -> bool {
if op.is_uniform() {
match op_ureg_latency(op, false, 0) {
UregLatencySM100::Uldc
| UregLatencySM100::ToUr
| UregLatencySM100::Tex => true,
_ => false,
}
} else {
match op_reg_latency(op, false, 0) {
RegLatencySM100::Dmma
| RegLatencySM100::Hmma
| RegLatencySM100::RedirectedFp64
| RegLatencySM100::Branch
| RegLatencySM100::Decoupled
| RegLatencySM100::DecoupledAgu => true,
_ => false,
}
}
}
pub fn raw(
write: &Op,
dst_idx: usize,
read: Option<&Op>,
src_idx: usize,
) -> u32 {
let dst_file = match &write.dsts_as_slice()[dst_idx] {
Dst::None => return 0,
Dst::SSA(vec) => vec.file().unwrap(),
Dst::Reg(reg) => reg.file(),
};
match dst_file {
RegFile::GPR => {
let write_latency = op_reg_latency(write, false, dst_idx);
let read_latency = match read {
Some(op) => op_reg_latency(op, true, src_idx),
None => RegLatencySM100::RedirectedFp64,
};
// The latencies are for SM100 docs, but some chips need large
// one just override here.
if write_latency == RegLatencySM100::Hmma
|| read_latency == RegLatencySM100::Hmma
{
RegLatencySM100::raw(write_latency, read_latency, false) + 9
} else {
RegLatencySM100::raw(write_latency, read_latency, false) + 1
}
}
RegFile::UGPR => {
let write_latency = op_ureg_latency(write, false, dst_idx);
let read_latency = match read {
Some(op) => op_ureg_latency(op, true, src_idx),
None => UregLatencySM100::Uldc,
};
UregLatencySM100::raw(write_latency, read_latency, false) + 1
}
RegFile::Pred => {
let write_latency = op_pred_latency(write);
let read_latency = match read {
Some(op) => op_pred_latency(op),
None => PredLatencySM100::RedirectedFp64,
};
PredLatencySM100::raw(write_latency, read_latency, false) + 1
}
RegFile::UPred => {
let write_latency = op_upred_latency(write);
let read_latency = match read {
Some(op) => op_upred_latency(op),
None => UpredLatencySM100::UGuard,
};
UpredLatencySM100::raw(write_latency, read_latency, false) + 1
}
RegFile::Bar => 0, // Barriers have a HW scoreboard
_ => panic!("Not a register"),
}
}
pub fn war(read: &Op, src_idx: usize, write: &Op, dst_idx: usize) -> u32 {
let dst_file = match &write.dsts_as_slice()[dst_idx] {
Dst::None => return 0,
Dst::SSA(vec) => vec.file().unwrap(),
Dst::Reg(reg) => reg.file(),
};
match dst_file {
RegFile::GPR => {
let write_latency = op_reg_latency(write, false, dst_idx);
let read_latency = op_reg_latency(read, true, src_idx);
if write_latency == RegLatencySM100::Hmma
|| read_latency == RegLatencySM100::Hmma
{
RegLatencySM100::war(read_latency, write_latency, false) + 7
} else {
RegLatencySM100::war(read_latency, write_latency, false)
}
}
RegFile::UGPR => {
let write_latency = op_ureg_latency(write, false, dst_idx);
let read_latency = op_ureg_latency(read, true, src_idx);
UregLatencySM100::war(read_latency, write_latency, false)
}
RegFile::Pred => {
let write_latency = op_pred_latency(write);
let read_latency = op_pred_latency(read);
PredLatencySM100::war(read_latency, write_latency, false)
}
RegFile::UPred => {
let write_latency = op_upred_latency(write);
let read_latency = op_upred_latency(read);
UpredLatencySM100::war(read_latency, write_latency, false)
}
_ => panic!("Not a register"),
}
}
pub fn waw(
a: &Op,
a_dst_idx: usize,
b: &Op,
b_dst_idx: usize,
a_op_pred: bool,
) -> u32 {
let dst_file = match &a.dsts_as_slice()[a_dst_idx] {
Dst::None => return 0,
Dst::SSA(vec) => vec.file().unwrap(),
Dst::Reg(reg) => reg.file(),
};
match dst_file {
RegFile::GPR => {
let write1_latency = op_reg_latency(a, false, a_dst_idx);
let write2_latency = op_reg_latency(b, false, b_dst_idx);
if write1_latency == RegLatencySM100::Hmma
|| write2_latency == RegLatencySM100::Hmma
{
RegLatencySM100::waw(
write1_latency,
write2_latency,
a_op_pred,
) + 7
} else {
RegLatencySM100::waw(
write1_latency,
write2_latency,
a_op_pred,
)
}
}
RegFile::UGPR => {
let write1_latency = op_ureg_latency(a, false, a_dst_idx);
let write2_latency = op_ureg_latency(b, false, b_dst_idx);
UregLatencySM100::waw(write1_latency, write2_latency, a_op_pred)
}
RegFile::Pred => {
let write1_latency = op_pred_latency(a);
let write2_latency = op_pred_latency(b);
PredLatencySM100::waw(write1_latency, write2_latency, a_op_pred)
}
RegFile::UPred => {
let write1_latency = op_upred_latency(a);
let write2_latency = op_upred_latency(b);
UpredLatencySM100::waw(write1_latency, write2_latency, false)
}
_ => panic!("Not a register"),
}
}
}

View file

@ -3,10 +3,10 @@
use crate::ir::*;
use crate::legalize::LegalizeBuilder;
use crate::sm120_instr_latencies::SM120Latency;
use crate::sm70_encode::*;
use crate::sm75_instr_latencies::SM75Latency;
use crate::sm80_instr_latencies::SM80Latency;
pub struct ShaderModel70 {
sm: u8,
}
@ -154,7 +154,9 @@ impl ShaderModel for ShaderModel70 {
return false;
}
if self.is_ampere() || self.is_ada() {
if self.is_blackwell() {
SM120Latency::needs_scoreboards(op)
} else if self.is_ampere() || self.is_ada() {
SM80Latency::needs_scoreboards(op)
} else if self.is_turing() {
SM75Latency::needs_scoreboards(op)
@ -188,7 +190,9 @@ impl ShaderModel for ShaderModel70 {
read: &Op,
src_idx: usize,
) -> u32 {
if self.is_ampere() || self.is_ada() {
if self.is_blackwell() {
SM120Latency::raw(write, dst_idx, Some(read), src_idx)
} else if self.is_ampere() || self.is_ada() {
SM80Latency::raw(write, dst_idx, Some(read), src_idx)
} else if self.is_turing() {
SM75Latency::raw(write, dst_idx, Some(read), src_idx)
@ -204,7 +208,9 @@ impl ShaderModel for ShaderModel70 {
write: &Op,
dst_idx: usize,
) -> u32 {
if self.is_ampere() || self.is_ada() {
if self.is_blackwell() {
SM120Latency::war(read, src_idx, write, dst_idx)
} else if self.is_ampere() || self.is_ada() {
SM80Latency::war(read, src_idx, write, dst_idx)
} else if self.is_turing() {
SM75Latency::war(read, src_idx, write, dst_idx)
@ -223,7 +229,9 @@ impl ShaderModel for ShaderModel70 {
b: &Op,
b_dst_idx: usize,
) -> u32 {
if self.is_ampere() || self.is_ada() {
if self.is_blackwell() {
SM120Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred)
} else if self.is_ampere() || self.is_ada() {
SM80Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred)
} else if self.is_turing() {
SM75Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred)
@ -235,7 +243,9 @@ impl ShaderModel for ShaderModel70 {
}
fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32 {
if self.is_ampere() || self.is_ada() {
if self.is_blackwell() {
SM120Latency::raw(write, dst_idx, None, 0)
} else if self.is_ampere() || self.is_ada() {
SM80Latency::raw(write, dst_idx, None, 0)
} else if self.is_turing() {
SM75Latency::raw(write, dst_idx, None, 0)
@ -250,7 +260,9 @@ impl ShaderModel for ShaderModel70 {
}
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
if self.is_ampere() || self.is_ada() {
if self.is_blackwell() {
SM120Latency::raw(write, dst_idx, None, 0)
} else if self.is_ampere() || self.is_ada() {
SM80Latency::raw(write, dst_idx, None, 0)
} else if self.is_turing() {
SM75Latency::raw(write, dst_idx, None, 0)

View file

@ -77,6 +77,9 @@ executable(
install : with_tools.contains('nouveau'),
)
nouveau_util_py = files('util.py')
nouveau_util_py_path = meson.current_source_dir()
# Only generate Rust bindings for NVK
if with_nouveau_vk
cl_rs_generated = []