mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-25 02:10:11 +01:00
nvk: add sm120 latencies via csv files.
Two differences from the initial B100 values: all raw seem to need a +1; hmma seems to need a +7, and +1 for raw hmma for good luck makes 9. Cc: 25.2 Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.ca> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36217>
This commit is contained in:
parent
0fce848b54
commit
477533ee00
20 changed files with 1072 additions and 7 deletions
208
src/nouveau/compiler/latencies/lat_rs_gen.py
Normal file
208
src/nouveau/compiler/latencies/lat_rs_gen.py
Normal file
|
|
@ -0,0 +1,208 @@
|
|||
#! /usr/bin/env python3
|
||||
#
|
||||
# Copyright © 2024 Collabora Ltd. and Red Hat Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# This script takes a list of latency CSV files, each named
# <regfile>_<latcat>.csv, and generates a Rust module with one latency enum
# per register file plus one lookup function per CSV table.
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
|
||||
from mako import template
|
||||
|
||||
# Mako template for the generated Rust module.
#
# Context variables: `sm` (string, upper-cased into the enum name),
# `file_cats` (dict mapping a register-file name to its list of Category
# objects) and `to_camel`.
#
# For every register file it emits one enum whose variants are the header
# categories of the first table, and one function per table.  Each function
# is a nested match: the outer match is on header.cat1 and selects the CSV
# row (rows are indexed by the position of the category in the header, so
# rows must be listed in header order); the inner match is on header.cat0 and
# selects the column.  Cells equal to "none" produce no arm; `has_none` is
# reset for every row so that the catch-all panic arm is emitted only for
# rows that actually contain a "none" cell.
#
# NOTE(review): the generated pred() helper returns `b` (the part after "+"
# in the CSV cell) when has_pred is false, not `a` (the part before "+") --
# confirm this is the intended meaning of "a+b" cells.
TEMPLATE_RS = template.Template(text="""\
// Copyright 2024 Red Hat Inc.
// SPDX-License-Identifier: MIT

// This file is generated by lat_rs_gen.py. DO NOT EDIT!
#![allow(unused_variables)]

const fn pred(has_pred: bool, a: u32, b: u32) -> u32 {
    if has_pred {
        a + b
    } else {
        b
    }
}

% for reg_file, cats in file_cats.items():
<% enum_name = to_camel(reg_file) + 'Latency' + sm.upper() %>
#[derive(PartialEq)]
pub enum ${enum_name} {
% for category in cats[0].header.cats:
    ${to_camel(category)},
% endfor
}

impl ${to_camel(reg_file)}Latency${sm.upper()} {
% for bigcat in cats:
    pub fn ${bigcat.header.latcat}(
        ${bigcat.header.cat0}: ${enum_name},
        ${bigcat.header.cat1}: ${enum_name},
        has_pred: bool
    ) -> u32 {
        use ${enum_name}::*;
        match ${bigcat.header.cat1} {
% for cat in bigcat.header.cats:
            ${to_camel(cat)} => match ${bigcat.header.cat0} {
<% has_none = False %>
% for cat2 in bigcat.header.cats:
% if bigcat.fields[loop.parent.index].flds[cat2].pred == True:
                ${to_camel(cat2)} => pred(
                    has_pred,
                    ${bigcat.fields[loop.parent.index].flds[cat2].value},
                    ${bigcat.fields[loop.parent.index].flds[cat2].pred_val}
                ),
% elif bigcat.fields[loop.parent.index].flds[cat2].value != "none":
                ${to_camel(cat2)} => ${bigcat.fields[loop.parent.index].flds[cat2].value},
% else:
<% has_none = True %>
% endif
% endfor
% if has_none:
                _ => panic!("Illegal ${bigcat.header.cat0} value in ${bigcat.header.latcat} for ${to_camel(cat)}"),
% endif
            }
% endfor
        }
    }
% endfor
}

% endfor
""")
|
||||
|
||||
## A mere convenience to convert snake_case to CamelCase. Numbers are prefixed
|
||||
## with "_".
|
||||
def to_camel(snake_str):
    """Convert a snake_case name to CamelCase.

    Every '_'-separated piece is passed through str.title() and the pieces
    are concatenated.  If the result starts with a digit it is prefixed with
    '_'.
    """
    camel = ''.join(map(str.title, snake_str.split('_')))
    if camel[0].isdigit():
        return '_' + camel
    return camel
|
||||
|
||||
def reader(csvfile):
    """Yield the rows of a CSV file, skipping comments and blank lines.

    csvfile is the path of the file to read.  Each yielded row is the list
    of strings produced by csv.reader.  Empty rows and rows whose first cell
    starts with '#' are dropped.
    """
    # csv.reader pulls lines from the file lazily, so the file has to stay
    # open until the generator is exhausted.  newline='' is what the csv
    # module asks for so that it can do its own line-ending handling (this
    # matters for quoted cells that contain line breaks).
    with open(csvfile, 'r', newline='', encoding='utf-8') as f:
        for line in csv.reader(f):
            if line and not line[0].startswith('#'):
                yield line
|
||||
|
||||
class Fld(object):
    """One cell of a latency CSV table.

    Recognised cell forms:
      "none"    -- no entry; valid is False and value is "none"
      "A+B"     -- pred is True, value is "A" and pred_val is "B"
      "A & sb"  -- scoreboard is True and value is "A"
      "A"       -- plain value

    All attributes (valid, pred, pred_val, scoreboard, value) are set for
    every cell form so that consumers can read them unconditionally;
    pred_val is None when the cell has no "+" part.
    """

    def __init__(self, line):
        # Tolerate stray whitespace around the cell, otherwise a trailing
        # blank would defeat the " & sb" suffix removal below.
        text = line.strip()

        self.valid = "none" not in text
        self.pred = False
        self.pred_val = None
        self.scoreboard = False

        if "+" in text:
            self.pred = True
            part = text.split("+")
            self.value = part[0].strip()
            self.pred_val = part[1].strip()
        elif " & sb" in text:
            self.scoreboard = True
            self.value = text.removesuffix(" & sb").strip()
        else:
            self.value = text
|
||||
|
||||
class Header(object):
    """First row of a latency CSV table.

    latcat is the stripped first cell and cats is the list of remaining
    cells, kept exactly as read.  For the latcat values "raw", "war" and
    "waw", cat0 and cat1 are set to the two role names used for that table;
    for any other latcat they are left unset.
    """

    def __init__(self, line):
        self.latcat = line[0].strip()
        self.cats = line[1:]

        # (cat0, cat1) role names per known table type.
        roles = {
            "raw": ("writer", "reader"),
            "war": ("reader", "writer"),
            "waw": ("writer1", "writer2"),
        }.get(self.latcat)
        if roles is not None:
            self.cat0, self.cat1 = roles
|
||||
|
||||
|
||||
class Fields(object):
    """One data row of a latency CSV table.

    fldcat is the stripped first cell of the row.  flds maps every category
    name in header.cats to the Fld parsed from the cell in the matching
    column (column 0 holds the row name, so data starts at column 1).
    """

    def __init__(self, header, line):
        self.fldcat = line[0].strip()
        self.flds = {
            name: Fld(line[column])
            for column, name in enumerate(header.cats, start=1)
        }
|
||||
|
||||
class Category:
    """A parsed CSV table: its Header plus the list of Fields rows."""

    def __init__(self, header, fields):
        self.header, self.fields = header, fields
|
||||
|
||||
# File-name prefixes that emit_sm() iterates over; emit_cats() combines each
# one with "_raw", "_war" and "_waw" to build the CSV file names it reads.
lattypes = ["reg", "ureg", "pred", "upred"]
|
||||
|
||||
def emit_cats(dirname, f, sm, lat):
    """Parse the raw/war/waw tables of one register file and render them.

    Reads dirname + "sm" + sm + "/" + lat + "_<raw|war|waw>.csv" (dirname is
    concatenated as-is, so it must already end with a path separator),
    builds one Category per file and writes the rendered TEMPLATE_RS to the
    file object f.

    Raises ValueError if one of the CSV files has no header row.  If
    rendering fails, the Mako error report is printed to stderr and the
    process exits with status 1.

    NOTE(review): TEMPLATE_RS reads `file_cats` and `to_camel`, neither of
    which is passed to render() here, and main() does not call this
    function -- this looks like a leftover code path; confirm before use.
    """
    cats = []
    for cat in ("raw", "war", "waw"):
        path = dirname + "sm" + sm + "/" + lat + "_" + cat + ".csv"
        rows = reader(path)
        try:
            header = Header(next(rows))
        except StopIteration:
            # Without this check an empty file would silently reuse the
            # header of the previously parsed table.
            raise ValueError(path + ": missing header row") from None
        fields = [Fields(header, row) for row in rows]
        cats.append(Category(header, fields))

    try:
        f.write(TEMPLATE_RS.render(lat=lat, sm=sm, cats=cats))
    except Exception:
        # Mako's own report points at the failing template line, which a
        # plain Python traceback does not; print it and exit with status 1.
        from mako import exceptions
        print(exceptions.text_error_template().render(), file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
def emit_sm(dirname, f, sm):
    """Call emit_cats() once for every entry of lattypes, in order."""
    for reg_file in lattypes:
        emit_cats(dirname, f, sm, lat=reg_file)
|
||||
|
||||
def main():
    """Command-line entry point: turn latency CSV tables into a Rust file.

    Options:
      -p/--import-path  directory added to sys.path so `util` can be imported
      --out-rs          path of the Rust file to write
      --sm              SM name handed to the template
      FILE...           input CSV files named <regfile>_<latcat>.csv

    Tables are grouped by the <regfile> part of their file name, in
    command-line order, and rendered through TEMPLATE_RS with
    util.write_template_rs().  A badly named or empty CSV file is reported
    through argparse's error path (message on stderr, exit status 2).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--import-path', required=True)
    parser.add_argument('--out-rs', required=True, help='Output Rust file.')
    parser.add_argument('--sm', help='SM', required=True)
    # nargs='*' combined with action='append' stores a list of lists; the
    # file names are in its first element.
    parser.add_argument('csv_files', metavar='FILE', nargs='*',
                        action='append',
                        help='Input CSV filename')

    args = parser.parse_args()

    # util lives outside this directory, so it can only be imported once the
    # caller-provided path is on sys.path.
    sys.path.insert(0, args.import_path)
    import util

    file_cats = {}
    for csv_file in args.csv_files[0]:
        split = os.path.basename(csv_file).removesuffix('.csv').split('_')
        if len(split) != 2:
            # Explicit check instead of assert, which is stripped under -O.
            parser.error(
                f"{csv_file}: expected a name of the form "
                "<regfile>_<latcat>.csv")
        reg_file = split[0]

        rows = reader(csv_file)
        try:
            header = Header(next(rows))
        except StopIteration:
            parser.error(f"{csv_file}: missing header row")
        fields = [Fields(header, row) for row in rows]

        file_cats.setdefault(reg_file, []).append(Category(header, fields))

    environment = dict(
        sm=args.sm,
        file_cats=file_cats,
        to_camel=to_camel,
    )
    util.write_template_rs(args.out_rs, TEMPLATE_RS, environment)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
39
src/nouveau/compiler/latencies/lib_rs_gen.py
Normal file
39
src/nouveau/compiler/latencies/lib_rs_gen.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
#! /usr/bin/env python3
|
||||
#
|
||||
# Copyright © 2024 Collabora Ltd. and Red Hat Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# This script takes a list of Rust module names and constructs a lib.rs which
# declares each of them with `pub mod <name>;`.
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from mako.template import Template
|
||||
|
||||
# Mako template for the generated lib.rs: a fixed header followed by one
# `pub mod` declaration per entry of the `mods` context variable.
TEMPLATE_RS = Template(text="""\
// Copyright © 2024 Collabora Ltd. and Red Hat Inc.
// SPDX-License-Identifier: MIT

// This file is generated by lib_rs_gen.py. DO NOT EDIT!

% for mod in mods:
pub mod ${mod};
% endfor
""")
|
||||
|
||||
def main():
    """Command-line entry point: write a lib.rs declaring the given modules.

    Options:
      -p/--import-path  directory added to sys.path so `util` can be imported
      --out-rs          path of the Rust file to write
      MOD...            module names handed to TEMPLATE_RS as `mods`
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-p', '--import-path', required=True)
    arg_parser.add_argument('--out-rs', required=True,
                            help='Output Rust file.')
    # nargs='*' combined with action='append' stores a list of lists; the
    # module names are in its first element.
    arg_parser.add_argument('mods', metavar='MOD', nargs='*',
                            action='append', help='Submodule')
    opts = arg_parser.parse_args()

    # util is only importable once the caller-provided path is on sys.path.
    sys.path.insert(0, opts.import_path)
    import util

    environment = {'mods': opts.mods[0]}
    util.write_template_rs(opts.out_rs, TEMPLATE_RS, environment)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
59
src/nouveau/compiler/latencies/meson.build
Normal file
59
src/nouveau/compiler/latencies/meson.build
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
# Copyright © 2025 Collabora, Ltd.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# CSV latency tables that exist in the sm100/ subdirectory.
_nak_lat_sm100_files = [
  'pred_raw.csv',
  'pred_war.csv',
  'pred_waw.csv',
  'reg_raw.csv',
  'reg_war.csv',
  'reg_waw.csv',
  'upred_raw.csv',
  'upred_war.csv',
  'upred_waw.csv',
  'ureg_raw.csv',
  'ureg_war.csv',
  'ureg_waw.csv',
]

# Maps each SM subdirectory / generated Rust module name to its CSV files.
_nak_lat_sms = {
  'sm100': _nak_lat_sm100_files,
}

_lat_rs_gen = files('lat_rs_gen.py')

# One generated <sm>.rs per entry of _nak_lat_sms.
_lat_rs_generated = []
foreach sm, csvs : _nak_lat_sms
  csv_files = []
  foreach csv : csvs
    csv_files += files(sm + '/' + csv)
  endforeach

  # '@INPUT@' expands to the generator script followed by every CSV file, so
  # the CSV paths reach the script as its positional arguments.
  _lat_rs_generated += custom_target(
    sm + '.rs',
    input : [_lat_rs_gen, csv_files],
    output : [sm + '.rs'],
    command : [
      prog_python, '@INPUT@', '-p', nouveau_util_py_path,
      '--out-rs', '@OUTPUT0@', '--sm', sm,
    ],
    depend_files : nouveau_util_py,
  )
endforeach

# lib.rs declares one `pub mod` per SM; the generated modules are listed as
# inputs so they are built before lib.rs.
_nak_latencies_lib_rs = custom_target(
  'lib.rs',
  input : ['lib_rs_gen.py', _lat_rs_generated, nouveau_util_py],
  output : ['lib.rs'],
  command : [
    prog_python, '@INPUT0@', '-p', nouveau_util_py_path,
    '--out-rs', '@OUTPUT0@', _nak_lat_sms.keys()
  ],
)

libnak_latencies_rs = static_library(
  'nak_latencies',
  _nak_latencies_lib_rs,
  gnu_symbol_visibility : 'hidden',
  rust_abi : 'rust',
)
|
||||
13
src/nouveau/compiler/latencies/sm100/pred_raw.csv
Normal file
13
src/nouveau/compiler/latencies/sm100/pred_raw.csv
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
raw,disp_alu,disp_dual_alu,coupled,dualalu,r2p,r2ur,fma,fp16,hfma2_mma,redirected_fp64,decoupled,guard
|
||||
disp_alu,13,13,13,13,13,13,13,13,14,14,1 & sb,none
|
||||
disp_dual_alu,13,13,13,13,13,13,13,13,14,14,1 & sb,none
|
||||
coupled,4,4,4,4,5,8,5,5,6,6,1 & sb,none
|
||||
dualalu,4,4,4,4,5,8,5,5,6,6,1 & sb,none
|
||||
r2p,4,4,4,4,5,8,5,5,6,6,1 & sb,none
|
||||
r2ur,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
fma,5,5,5,5,5,13,4,5,6,6,1 & sb,none
|
||||
fp16,13,13,13,13,13,13,13,5,14,14,1 & sb,none
|
||||
hfma2_mma,13,13,13,13,13,13,13,13,6,6,1 & sb,none
|
||||
redirected_fp64,13,13,13,13,13,13,13,13,6,6,1 & sb,none
|
||||
decoupled,13,13,13,13,13,13,13,13,14,14,1 & sb,none
|
||||
guard,13,13,13,13,13,13,13,13,14,14,1 & sb,none
|
||||
|
14
src/nouveau/compiler/latencies/sm100/pred_war.csv
Normal file
14
src/nouveau/compiler/latencies/sm100/pred_war.csv
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
war,disp_alu,disp_dual_alu,coupled,dualalu,r2p,r2ur,fma,fp16,hfma2_mma,redirected_fp64,decoupled,guard
|
||||
disp_alu,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
disp_dual_alu,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
coupled,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
dualalu,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
r2p,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
r2ur,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
fma,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
fp16,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
hfma2_mma,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
redirected_fp64,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
decoupled,1,1,1,1,1,none,1,1,1,1,1,1
|
||||
guard,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
|
||||
|
13
src/nouveau/compiler/latencies/sm100/pred_waw.csv
Normal file
13
src/nouveau/compiler/latencies/sm100/pred_waw.csv
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
waw,disp_alu,disp_dual_alu,coupled,dualalu,r2p,r2ur,fma,fp16,hfma2_mma,redirected_fp64,decoupled,guard
|
||||
disp_alu,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
|
||||
disp_dual_alu,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
|
||||
coupled,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
|
||||
dualalu,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
|
||||
r2p,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none
|
||||
r2ur,1,1,1,1,1,1,1,1,2,2,1 & sb,none
|
||||
fma,1,1,1,1,1,1+8,1,1,2,2,1 & sb,none
|
||||
fp16,2+6,2+6,2+6,2+6,2+6,2+6,2+6,1,2+7,2+7,1 & sb,none
|
||||
hfma2_mma,2+5,2+5,2+5,2+5,2+5,2+5,2+5,2+5,1,1,1 & sb,none
|
||||
redirected_fp64,2+5,2+5,2+5,2+5,2+5,2+5,2+5,2+5,1,1,1 & sb,none
|
||||
decoupled,2+10,2+10,2+10,2+10,2+10,2+10,2+10,2+10,1+12,1+12,1 & sb,none
|
||||
guard,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
|
23
src/nouveau/compiler/latencies/sm100/reg_raw.csv
Normal file
23
src/nouveau/compiler/latencies/sm100/reg_raw.csv
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
raw,alu,dualalu,disp_64,fma,fma_alu,imad_wide_read_ab,imad_wide_read_cl,imad_wide_read_ch,imad_wide_write_dl,imad_wide_write_dh,fp16,fp16_alu,fp16_f32,hfma2_mma,redirected_fp64,imma,hmma,dmma,branch,decoupled,decoupled_agu
|
||||
alu,4,4,6,5,5,none,none,none,3,5,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
dualalu,4,4,6,5,5,none,none,none,3,5,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
disp_64,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
fma,5,5,6,4,4,none,none,none,2,4,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fma_alu,5,5,6,4,4,none,none,none,2,4,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imad_wide_read_ab,5,5,6,4,4,none,none,none,4,6,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imad_wide_read_cl,5,5,6,4,4,none,none,none,2,4,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imad_wide_read_ch,3,3,4,2,2,none,none,none,2,2,3,3,3,8,8,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imad_wide_write_dl,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
imad_wide_write_dh,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
fp16,5,5,6,5,5,none,none,none,3,5,4,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fp16_alu,5,5,6,5,5,none,none,none,3,5,5,4,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fp16_f32,5,5,6,5,5,none,none,none,3,5,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
hfma2_mma,6,6,6,6,6,none,none,none,6,6,6,6,6,8,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
redirected_fp64,6,6,6,6,6,none,none,none,6,6,6,6,6,8,8,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imma,7,7,7,7,7,none,none,none,7,7,7,7,7,11,11,20,20,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
hmma,7,7,7,7,7,none,none,none,7,7,7,7,7,11,11,20,20,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
dmma,7,7,7,7,7,none,none,none,7,7,7,7,7,11,11,20,20,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
branch,4,4,4,4,4,none,none,none,4,4,4,4,4,6,6,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
decoupled,4,4,4,4,4,none,none,none,4,4,4,4,4,6,6,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
decoupled_agu,5,5,5,5,5,none,none,none,5,5,5,5,5,7,7,19,19,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
|
||||
|
23
src/nouveau/compiler/latencies/sm100/reg_war.csv
Normal file
23
src/nouveau/compiler/latencies/sm100/reg_war.csv
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
war,alu,dualalu,disp_64,fma,fma_alu,imad_wide_read_ab,imad_wide_read_cl,imad_wide_read_ch,imad_wide_write_dl,imad_wide_write_dh,fp16,fp16_alu,fp16_f32,hfma2_mma,redirected_fp64,imma,hmma,dmma,branch,decoupled,decoupled_agu
|
||||
alu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
dualalu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
disp_64,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fma,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fma_alu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imad_wide_read_ab,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
imad_wide_read_cl,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
imad_wide_read_ch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
imad_wide_write_dl,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imad_wide_write_dh,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fp16,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fp16_alu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fp16_f32,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
hfma2_mma,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
redirected_fp64,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imma,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
hmma,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
dmma,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
branch,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
decoupled,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
decoupled_agu,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
|
||||
|
22
src/nouveau/compiler/latencies/sm100/reg_waw.csv
Normal file
22
src/nouveau/compiler/latencies/sm100/reg_waw.csv
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
waw,alu,dualalu,disp_64,fma,fma_alu,imad_wide_read_ab,imad_wide_read_cl,imad_wide_read_ch,imad_wide_write_dl,imad_wide_write_dh,fp16,fp16_alu,fp16_f32,hfma2_mma,redirected_fp64,imma,hmma,dmma,branch,decoupled,decoupled_agu
|
||||
alu,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
dualalu,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
disp_64,1,1,1,1,1,none,none,none,1,1,1,1,1,3+1,3+1,14+1,14+1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fma,1,1,1+1,1,1,none,none,none,1,1+1,1,1,1,3+3,3+3,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fma_alu,1,1,1+1,1,1,none,none,none,1,1+1,1,1,1,3+3,3+3,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imad_wide_read_ab,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
imad_wide_read_cl,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
imad_wide_read_ch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
imad_wide_write_dl,1+2,1+2,1+3,1+1,1+1,none,none,none,1,1+1,1+2,1+2,1+2,5+3,5+3,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imad_wide_write_dh,1,1,1+1,1,1,none,none,none,1,1,1,1,1,5+1,5+1,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fp16,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fp16_alu,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
fp16_f32,1,1,1,1,1,none,none,none,1,1,1,1,1,3+2,3+2,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
hfma2_mma,1,1,1,1,1,none,none,none,1,1,1,1,1,1,3,14,14,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
redirected_fp64,1,1,1,1,1,none,none,none,1,1,1,1,1,2,1,13,13,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
imma,2,2,2,2,2,none,none,none,2,2,2,2,2,2,2,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
hmma,2,2,2,2,2,none,none,none,2,2,2,2,2,2,2,1,1,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
dmma,2+4,2+4,2+4,2+4,2+4,none,none,none,2+4,2+4,2+4,2+4,2+4,2+8,2+8,10+9,10+9,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
branch,1+5,1+5,1+5,1+5,1+5,none,none,none,1+5,1+5,1+5,1+5,1+5,1+9,1+9,13+6,13+6,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
decoupled,1+5,1+5,1+5,1+5,1+5,none,none,none,1+5,1+5,1+5,1+5,1+5,1+9,1+9,13+6,13+6,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
decoupled_agu,1+5,1+5,1+5,1+5,1+5,none,none,none,1+5,1+5,1+5,1+5,1+5,1+9,1+9,13+6,13+6,1 & sb,1 & sb,1 & sb,1 & sb
|
||||
|
8
src/nouveau/compiler/latencies/sm100/upred_raw.csv
Normal file
8
src/nouveau/compiler/latencies/sm100/upred_raw.csv
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
raw,coupled,udp,voteu,u_guard,bra_jmp,uldc_mma,usetmaxreg
|
||||
coupled,none,6,1,none,none,none,1 & sb
|
||||
udp,none,4,1,none,none,none,1 & sb
|
||||
voteu,none,none,none,none,none,none,none
|
||||
u_guard,none,11,5,none,none,none,1 & sb
|
||||
bra_jmp,none,9,2,none,none,none,1 & sb
|
||||
uldc_mma,none,11,5,none,none,none,1 & sb
|
||||
usetmaxreg,none,none,none,none,none,none,none
|
||||
|
9
src/nouveau/compiler/latencies/sm100/upred_war.csv
Normal file
9
src/nouveau/compiler/latencies/sm100/upred_war.csv
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
war,coupled,udp,voteu,u_guard,bra_jmp,uldc_mma,usetmaxreg
|
||||
coupled,none,none,none,none,none,none,none
|
||||
udp,1,1,none,1,1,1,none
|
||||
voteu,2,2,none,1,1,1,none
|
||||
u_guard,none,none,none,none,none,none,none
|
||||
bra_jmp,none,none,none,none,none,none,none
|
||||
uldc_mma,none,none,none,none,none,none,none
|
||||
usetmaxreg,1,1,none,1,1,1,none
|
||||
|
||||
|
8
src/nouveau/compiler/latencies/sm100/upred_waw.csv
Normal file
8
src/nouveau/compiler/latencies/sm100/upred_waw.csv
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
waw,coupled,udp,voteu,u_guard,bra_jmp,uldc_mma,usetmaxreg
|
||||
coupled,none,none,none,none,none,none,none
|
||||
udp,none,1,1,none,none,none,1 & sb
|
||||
voteu,none,7,1,none,none,none,1 & sb
|
||||
u_guard,none,none,none,none,none,none,none
|
||||
bra_jmp,none,none,none,none,none,none,none
|
||||
uldc_mma,none,none,none,none,none,none,none
|
||||
usetmaxreg,none,8+2,8,none,none,none,1 & sb
|
||||
|
18
src/nouveau/compiler/latencies/sm100/ureg_raw.csv
Normal file
18
src/nouveau/compiler/latencies/sm100/ureg_raw.csv
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
raw,coupled,coupled_mma,decoupled,branch,coupled_bindless,decoupled_bindless,hfma2_mma,to_ur,tex,tma,rpcmov_64,udp,uldc,umov,elect,r2ur,voteu
|
||||
coupled,none,none,none,none,none,none,none,1 & sb,none,none,none,6,2,2,2,13,2
|
||||
coupled_mma,none,none,none,none,none,none,none,1 & sb,none,none,none,6,2,2,2,13,2
|
||||
decoupled,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
|
||||
branch,none,none,none,none,none,none,none,1 & sb,none,none,none,10,3,3,3,13,3
|
||||
coupled_bindless,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,15,5
|
||||
decoupled_bindless,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,15,5
|
||||
hfma2_mma,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
|
||||
to_ur,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,13,5
|
||||
tex,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
|
||||
tma,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
|
||||
rpcmov_64,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2
|
||||
udp,none,none,none,none,none,none,none,1 & sb,none,none,none,4,2,2,2,13,2
|
||||
uldc,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,15,5
|
||||
umov,none,none,none,none,none,none,none,1 & sb,none,none,none,7,2,2,2,13,2
|
||||
elect,none,none,none,none,none,none,none,1 & sb,none,none,none,8,2,2,2,13,2
|
||||
r2ur,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
voteu,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
|
18
src/nouveau/compiler/latencies/sm100/ureg_war.csv
Normal file
18
src/nouveau/compiler/latencies/sm100/ureg_war.csv
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
war,coupled,coupled_mma,decoupled,branch,coupled_bindless,decoupled_bindless,hfma2_mma,to_ur,tex,tma,rpcmov_64,udp,uldc,umov,elect,r2ur,voteu
|
||||
coupled,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
coupled_mma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
decoupled,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
branch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
coupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
decoupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
hfma2_mma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
to_ur,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,1,1,1,1,none,none
|
||||
tex,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
tma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
rpcmov_64,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
udp,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,1,1,1,1,none,none
|
||||
uldc,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none
|
||||
umov,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none
|
||||
elect,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none
|
||||
r2ur,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,1,1,1,1,none,none
|
||||
voteu,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none
|
||||
|
19
src/nouveau/compiler/latencies/sm100/ureg_waw.csv
Normal file
19
src/nouveau/compiler/latencies/sm100/ureg_waw.csv
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
waw,coupled,coupled_mma,decoupled,branch,coupled_bindless,decoupled_bindless,hfma2_mma,to_ur,tex,tma,rpcmov_64,udp,uldc,umov,elect,r2ur,voteu
|
||||
coupled,none,none,none,none,none,none,none,1 & sb,none,none,none,none,none,none,none,none,none
|
||||
coupled_mma,none,none,none,none,none,none,none,1 & sb,none,none,none,none,none,none,none,none,none
|
||||
decoupled,none,none,none,none,none,none,none,1 & sb,none,none,none,none,none,none,none,none,none
|
||||
branch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
coupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
decoupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
hfma2_mma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
to_ur,none,none,none,none,none,none,none,1 & sb,none,none,none,4+7,4,4,4,4+10,4
|
||||
tex,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
tma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
rpcmov_64,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none
|
||||
udp,none,none,none,none,none,none,none,1 & sb,none,none,none,1,1,1,1,4+5,1
|
||||
uldc,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1
|
||||
umov,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1
|
||||
elect,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1
|
||||
r2ur,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,1,1
|
||||
voteu,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
# Copyright © 2022 Collabora, Ltd.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
subdir('latencies')
|
||||
|
||||
dep_paste = dependency('paste',
|
||||
version : '>= 1.0.14',
|
||||
fallback : ['paste', 'dep_paste'],
|
||||
|
|
@ -114,6 +116,7 @@ _libnak_rs = static_library(
|
|||
link_with : [
|
||||
_libnak_bindings_rs,
|
||||
_libnak_ir_proc_rs,
|
||||
libnak_latencies_rs,
|
||||
],
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ mod opt_uniform_instrs;
|
|||
mod qmd;
|
||||
mod reg_tracker;
|
||||
mod repair_ssa;
|
||||
mod sm120_instr_latencies;
|
||||
mod sm20;
|
||||
mod sm30_instr_latencies;
|
||||
mod sm32;
|
||||
|
|
|
|||
552
src/nouveau/compiler/nak/sm120_instr_latencies.rs
Normal file
552
src/nouveau/compiler/nak/sm120_instr_latencies.rs
Normal file
|
|
@ -0,0 +1,552 @@
|
|||
// Copyright © 2025 Red Hat.
|
||||
// SPDX-License-Identifier: MIT
|
||||
#![allow(non_camel_case_types)]
|
||||
|
||||
use crate::ir::*;
|
||||
|
||||
use nak_latencies::sm100::*;
|
||||
|
||||
// This contains the register scheduling information provided by NVIDIA. This
|
||||
// file is for Blackwell only.
|
||||
//
|
||||
// These latencies come from B100 (SM100) and not the consumer RTX chips
|
||||
// (SM120). We have to add some padding to get everything passing on the RTX
|
||||
// chips so that's done in this file while using the sm100 CSVs.
|
||||
|
||||
// Coupled instructions are ones with fixed latencies; they need delays but not
// scoreboards. Decoupled instructions are ones with variable latencies; they
// need scoreboards but not delays. There are also redirected instructions
// which, depending on the SM, can be coupled or decoupled, so both delays and
// scoreboards need to be provided.
|
||||
|
||||
fn op_reg_latency(op: &Op, reader: bool, op_reg_idx: usize) -> RegLatencySM100 {
|
||||
use RegLatencySM100::*;
|
||||
match op {
|
||||
// this will need updating if imad grows support for input predicates
|
||||
Op::IMad(_) | Op::IMul(_) => Fma,
|
||||
Op::IMad64(_) => {
|
||||
if reader {
|
||||
match op_reg_idx {
|
||||
0 | 1 => ImadWideReadAb,
|
||||
2 => ImadWideReadCl, // vs upper C operand - work it out
|
||||
_ => {
|
||||
panic!("Illegal field in imadwide")
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ImadWideWriteDh // as above this needs more work
|
||||
}
|
||||
}
|
||||
|
||||
Op::PopC(_) => Decoupled,
|
||||
Op::IAdd3(_) | Op::IAdd3X(_) => Alu,
|
||||
|
||||
Op::BMsk(_) => Alu,
|
||||
// Sgxt => Alu,
|
||||
Op::Lop3(_) => Alu,
|
||||
Op::Flo(_) => Decoupled,
|
||||
Op::ISetP(_) => Dualalu,
|
||||
Op::IAbs(_) => Alu,
|
||||
Op::Lea(_) => Alu,
|
||||
Op::LeaX(_) => Alu,
|
||||
Op::IMnMx(_) => Dualalu,
|
||||
Op::I2I(_) => Alu,
|
||||
// I2IP => alu
|
||||
Op::Shf(_) => Alu,
|
||||
|
||||
Op::F2FP(_) => Alu,
|
||||
Op::FFma(_) => Fma,
|
||||
Op::FAdd(_) => Fma,
|
||||
Op::FMul(_) => Fma,
|
||||
Op::FMnMx(_) => Dualalu,
|
||||
Op::FSwzAdd(_) => Fma,
|
||||
Op::FSet(_) => Dualalu,
|
||||
// FSel => Alu,
|
||||
Op::FSetP(_) => Dualalu,
|
||||
// FChk => Decoupled,
|
||||
Op::DAdd(_) | Op::DFma(_) | Op::DMul(_) | Op::DSetP(_) => {
|
||||
RedirectedFp64
|
||||
}
|
||||
|
||||
Op::DMnMx(_) => RedirectedFp64, // not in docs
|
||||
|
||||
Op::HAdd2(hadd2) => {
|
||||
if hadd2.f32 {
|
||||
Fp16F32
|
||||
} else {
|
||||
Fp16
|
||||
}
|
||||
}
|
||||
Op::HFma2(_) | Op::HMul2(_) => Fp16,
|
||||
|
||||
Op::HSet2(_) | Op::HSetP2(_) | Op::HMnMx2(_) => Fp16Alu,
|
||||
Op::Hmma(_) => Hmma,
|
||||
Op::Ipa(_) => DecoupledAgu,
|
||||
Op::MuFu(_) => Decoupled,
|
||||
|
||||
// Conversion functions all Decoupled
|
||||
Op::F2F(_) => Decoupled,
|
||||
Op::F2I(_) => Decoupled,
|
||||
Op::I2F(_) => Decoupled,
|
||||
Op::FRnd(_) => Decoupled,
|
||||
Op::AL2P(_) => Decoupled,
|
||||
|
||||
Op::Mov(_) => Dualalu,
|
||||
Op::Sel(_) => Dualalu,
|
||||
Op::BRev(_) => Decoupled,
|
||||
// P2R => Alu,
|
||||
// R2P => Alu,
|
||||
Op::PLop3(_) => Alu,
|
||||
Op::Prmt(_) => Alu,
|
||||
Op::Nop(_) => Disp64,
|
||||
Op::Vote(_) => Dualalu,
|
||||
Op::Match(_) => Decoupled,
|
||||
Op::S2R(_) => DecoupledAgu,
|
||||
Op::R2UR(_) => Alu,
|
||||
Op::Redux(_) => {
|
||||
if reader {
|
||||
Decoupled
|
||||
} else {
|
||||
panic!("Illegal R2UR");
|
||||
}
|
||||
}
|
||||
Op::CS2R(cs2r) => {
|
||||
if cs2r.dst.as_reg().unwrap().comps() == 2 {
|
||||
Disp64
|
||||
} else {
|
||||
Dualalu
|
||||
}
|
||||
}
|
||||
// B2R => DecoupledAgu,
|
||||
// LEPC => Disp64
|
||||
Op::BMov(bmov) => match bmov.dst {
|
||||
Dst::Reg(_) => Branch,
|
||||
_ => Branch,
|
||||
},
|
||||
// RPCMOV.32 => Alu,
|
||||
// RPCMOV.64 => Disp64
|
||||
// PMTRIG => Disp64
|
||||
// CSMTEST => Alu,
|
||||
Op::Bar(_) => DecoupledAgu,
|
||||
Op::Imma(_) => Imma,
|
||||
Op::IDp4(_) => Fma,
|
||||
Op::BClear(_) => Decoupled,
|
||||
Op::Bra(_) => Decoupled,
|
||||
Op::BSSy(_) => Decoupled,
|
||||
Op::Kill(_) => Decoupled,
|
||||
Op::Exit(_) => Decoupled,
|
||||
Op::BSync(_) => Decoupled,
|
||||
Op::Tex(_) => Decoupled,
|
||||
Op::Tld(_) => Decoupled,
|
||||
Op::Tld4(_) => Decoupled,
|
||||
Op::Tmml(_) => Decoupled,
|
||||
Op::Txd(_) => Decoupled,
|
||||
Op::Txq(_) => Decoupled,
|
||||
Op::Ldc(_) => Decoupled,
|
||||
Op::ALd(_) => DecoupledAgu,
|
||||
Op::ASt(_) => DecoupledAgu,
|
||||
Op::Out(_) => DecoupledAgu,
|
||||
Op::OutFinal(_) => DecoupledAgu,
|
||||
Op::Ld(_) => DecoupledAgu,
|
||||
Op::St(_) => DecoupledAgu,
|
||||
Op::Atom(_) => DecoupledAgu,
|
||||
//CCtl.i,c are coupled
|
||||
Op::CCtl(_) => DecoupledAgu,
|
||||
Op::MemBar(_) => Decoupled,
|
||||
Op::SuLd(_) => Decoupled,
|
||||
Op::SuSt(_) => Decoupled,
|
||||
Op::SuAtom(_) => Decoupled,
|
||||
Op::PixLd(_) => DecoupledAgu,
|
||||
Op::Isberd(_) => DecoupledAgu,
|
||||
Op::LdTram(_) => DecoupledAgu,
|
||||
Op::Shfl(_) => DecoupledAgu,
|
||||
//Op::LdSm(_) => DecoupledAgu
|
||||
x => {
|
||||
panic!("Illegal instuction in reg category {}", x);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn op_pred_latency(op: &Op) -> PredLatencySM100 {
|
||||
use PredLatencySM100::*;
|
||||
match op {
|
||||
Op::Atom(_) => Decoupled,
|
||||
Op::DSetP(_) => RedirectedFp64,
|
||||
Op::FMnMx(_) | Op::FSetP(_) => Dualalu,
|
||||
Op::HFma2(_) => Fp16,
|
||||
Op::HMnMx2(_) => Fp16,
|
||||
Op::HSetP2(_) => Fp16,
|
||||
Op::IAdd3(_) => Coupled,
|
||||
Op::IAdd3X(_) => Coupled,
|
||||
Op::IMad(_) => Fma,
|
||||
Op::IMad64(_) => Fma,
|
||||
Op::IMnMx(_) => Dualalu,
|
||||
Op::IMul(_) => Fma,
|
||||
Op::Ipa(_) => Decoupled,
|
||||
Op::ISetP(_) => Dualalu,
|
||||
|
||||
Op::Ld(_) => Decoupled,
|
||||
|
||||
Op::Lea(_) | Op::LeaX(_) => Coupled,
|
||||
Op::PixLd(_) => Decoupled,
|
||||
Op::PLop3(_) => Coupled,
|
||||
Op::PSetP(_) => Coupled,
|
||||
Op::R2UR(_) => R2Ur,
|
||||
Op::Sel(_) => Dualalu,
|
||||
Op::Shfl(_) => Decoupled,
|
||||
Op::SuLd(_) => Decoupled,
|
||||
Op::SuSt(_) => Decoupled,
|
||||
Op::Tex(_) => Decoupled,
|
||||
Op::Tld(_) => Decoupled,
|
||||
Op::Tld4(_) => Decoupled,
|
||||
Op::Tmml(_) => Decoupled,
|
||||
Op::Txd(_) => Decoupled,
|
||||
Op::Txq(_) => Decoupled,
|
||||
|
||||
Op::Vote(_) => DispDualAlu,
|
||||
Op::Match(_) => Decoupled,
|
||||
_ => {
|
||||
panic!("Illegal op in sm120 pred latency {}", op);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn op_ureg_latency(
|
||||
op: &Op,
|
||||
reader: bool,
|
||||
op_reg_idx: usize,
|
||||
) -> UregLatencySM100 {
|
||||
use UregLatencySM100::*;
|
||||
// this decides between the category types for readers.
|
||||
let bindless = reader && op.srcs_as_slice()[op_reg_idx].is_bindless_cbuf();
|
||||
|
||||
let coupled = if bindless { CoupledBindless } else { Coupled };
|
||||
let decoupled = if bindless {
|
||||
DecoupledBindless
|
||||
} else {
|
||||
Decoupled
|
||||
};
|
||||
|
||||
// if this is a reader from a ureg, it could be a U* instruction or a
|
||||
// regular instruction.
|
||||
let uniform_op = op.is_uniform();
|
||||
|
||||
let coupled = if uniform_op { Udp } else { coupled };
|
||||
let decoupled = if uniform_op { Udp } else { decoupled };
|
||||
|
||||
match op {
|
||||
Op::BMsk(_) => coupled,
|
||||
Op::BRev(_) => decoupled,
|
||||
// uclea?
|
||||
Op::Flo(_) => decoupled,
|
||||
Op::IAdd3(_) | Op::IAdd3X(_) => coupled,
|
||||
Op::IAbs(_) => coupled,
|
||||
Op::IDp4(_) => coupled,
|
||||
Op::IMnMx(_) => coupled,
|
||||
Op::IMad(_) => coupled,
|
||||
|
||||
Op::IMad64(_) => coupled,
|
||||
Op::ISetP(_) => coupled,
|
||||
Op::Ldc(_) => {
|
||||
if uniform_op {
|
||||
ToUr
|
||||
} else {
|
||||
decoupled
|
||||
}
|
||||
}
|
||||
Op::Lea(_) => coupled,
|
||||
Op::LeaX(_) => coupled,
|
||||
Op::Lop2(_) | Op::Lop3(_) => coupled,
|
||||
|
||||
Op::MuFu(_) => decoupled,
|
||||
Op::Mov(_) => {
|
||||
if uniform_op {
|
||||
Umov
|
||||
} else {
|
||||
coupled
|
||||
}
|
||||
}
|
||||
|
||||
// mov32i => uldc
|
||||
// p2ur => udp,
|
||||
Op::PLop3(_) => coupled,
|
||||
Op::PopC(_) => {
|
||||
if uniform_op {
|
||||
coupled
|
||||
} else {
|
||||
decoupled
|
||||
}
|
||||
}
|
||||
Op::Prmt(_) => coupled,
|
||||
Op::PSetP(_) => coupled,
|
||||
// UR2UP
|
||||
Op::Sel(_) => coupled,
|
||||
// SGXT
|
||||
Op::Shf(_) => coupled,
|
||||
Op::Shfl(_) => decoupled,
|
||||
|
||||
Op::I2F(_) => decoupled,
|
||||
Op::F2I(_) => decoupled,
|
||||
Op::F2F(_) => decoupled,
|
||||
Op::R2UR(_) => {
|
||||
if !reader {
|
||||
R2Ur
|
||||
} else {
|
||||
panic!("Illegal R2UR in ureg");
|
||||
}
|
||||
}
|
||||
Op::Redux(_) => {
|
||||
if !reader {
|
||||
ToUr
|
||||
} else {
|
||||
panic!("Illegal R2UR in ureg");
|
||||
}
|
||||
}
|
||||
Op::Vote(_) => Voteu,
|
||||
Op::S2R(_) => ToUr,
|
||||
|
||||
Op::Tex(_) | Op::Tld(_) | Op::Tld4(_) | Op::Txq(_) => Tex,
|
||||
Op::FRnd(_) => decoupled,
|
||||
Op::F2FP(_)
|
||||
| Op::FAdd(_)
|
||||
| Op::FMul(_)
|
||||
| Op::FFma(_)
|
||||
| Op::FSet(_)
|
||||
| Op::FSetP(_)
|
||||
| Op::FMnMx(_)
|
||||
| Op::HAdd2(_)
|
||||
| Op::HMul2(_)
|
||||
| Op::HSet2(_)
|
||||
| Op::HFma2(_)
|
||||
| Op::HMnMx2(_)
|
||||
| Op::HSetP2(_) => coupled,
|
||||
Op::DMul(_) | Op::DFma(_) | Op::DAdd(_) | Op::DSetP(_) => decoupled,
|
||||
_ => {
|
||||
panic!("Illegal instuction in ureg category {}", op);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn op_upred_latency(op: &Op) -> UpredLatencySM100 {
|
||||
use UpredLatencySM100::*;
|
||||
let uniform_op = op.is_uniform();
|
||||
match op {
|
||||
Op::BMsk(_)
|
||||
| Op::BRev(_)
|
||||
| Op::Flo(_)
|
||||
| Op::IAdd3(_)
|
||||
| Op::IAdd3X(_)
|
||||
| Op::IMad(_)
|
||||
| Op::ISetP(_)
|
||||
| Op::Lea(_)
|
||||
| Op::LeaX(_)
|
||||
| Op::Lop3(_)
|
||||
| Op::Mov(_) => Udp,
|
||||
Op::Ldc(_) => UldcMma,
|
||||
Op::PLop3(_) => {
|
||||
if uniform_op {
|
||||
Udp
|
||||
} else {
|
||||
Coupled
|
||||
}
|
||||
}
|
||||
Op::PSetP(_) => {
|
||||
if uniform_op {
|
||||
Udp
|
||||
} else {
|
||||
Coupled
|
||||
}
|
||||
}
|
||||
Op::Sel(_) => {
|
||||
if uniform_op {
|
||||
Udp
|
||||
} else {
|
||||
Coupled
|
||||
}
|
||||
}
|
||||
Op::Vote(_) => {
|
||||
if uniform_op {
|
||||
Voteu
|
||||
} else {
|
||||
panic!("Illegal Vote in upred");
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
panic!("Illegal instuction in upred category {}", op);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Field-less type whose associated functions answer the SM120 latency
/// queries (`needs_scoreboards`, `raw`, `war`, `waw`). The lookups go through
/// the `*LatencySM100` tables with SM120-specific padding added on top.
pub struct SM120Latency {}
|
||||
|
||||
impl SM120Latency {
|
||||
pub fn needs_scoreboards(op: &Op) -> bool {
|
||||
if op.is_uniform() {
|
||||
match op_ureg_latency(op, false, 0) {
|
||||
UregLatencySM100::Uldc
|
||||
| UregLatencySM100::ToUr
|
||||
| UregLatencySM100::Tex => true,
|
||||
_ => false,
|
||||
}
|
||||
} else {
|
||||
match op_reg_latency(op, false, 0) {
|
||||
RegLatencySM100::Dmma
|
||||
| RegLatencySM100::Hmma
|
||||
| RegLatencySM100::RedirectedFp64
|
||||
| RegLatencySM100::Branch
|
||||
| RegLatencySM100::Decoupled
|
||||
| RegLatencySM100::DecoupledAgu => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn raw(
|
||||
write: &Op,
|
||||
dst_idx: usize,
|
||||
read: Option<&Op>,
|
||||
src_idx: usize,
|
||||
) -> u32 {
|
||||
let dst_file = match &write.dsts_as_slice()[dst_idx] {
|
||||
Dst::None => return 0,
|
||||
Dst::SSA(vec) => vec.file().unwrap(),
|
||||
Dst::Reg(reg) => reg.file(),
|
||||
};
|
||||
|
||||
match dst_file {
|
||||
RegFile::GPR => {
|
||||
let write_latency = op_reg_latency(write, false, dst_idx);
|
||||
let read_latency = match read {
|
||||
Some(op) => op_reg_latency(op, true, src_idx),
|
||||
None => RegLatencySM100::RedirectedFp64,
|
||||
};
|
||||
// The latencies are for SM100 docs, but some chips need large
|
||||
// one just override here.
|
||||
if write_latency == RegLatencySM100::Hmma
|
||||
|| read_latency == RegLatencySM100::Hmma
|
||||
{
|
||||
RegLatencySM100::raw(write_latency, read_latency, false) + 9
|
||||
} else {
|
||||
RegLatencySM100::raw(write_latency, read_latency, false) + 1
|
||||
}
|
||||
}
|
||||
RegFile::UGPR => {
|
||||
let write_latency = op_ureg_latency(write, false, dst_idx);
|
||||
let read_latency = match read {
|
||||
Some(op) => op_ureg_latency(op, true, src_idx),
|
||||
None => UregLatencySM100::Uldc,
|
||||
};
|
||||
UregLatencySM100::raw(write_latency, read_latency, false) + 1
|
||||
}
|
||||
RegFile::Pred => {
|
||||
let write_latency = op_pred_latency(write);
|
||||
let read_latency = match read {
|
||||
Some(op) => op_pred_latency(op),
|
||||
None => PredLatencySM100::RedirectedFp64,
|
||||
};
|
||||
PredLatencySM100::raw(write_latency, read_latency, false) + 1
|
||||
}
|
||||
RegFile::UPred => {
|
||||
let write_latency = op_upred_latency(write);
|
||||
let read_latency = match read {
|
||||
Some(op) => op_upred_latency(op),
|
||||
None => UpredLatencySM100::UGuard,
|
||||
};
|
||||
UpredLatencySM100::raw(write_latency, read_latency, false) + 1
|
||||
}
|
||||
RegFile::Bar => 0, // Barriers have a HW scoreboard
|
||||
_ => panic!("Not a register"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn war(read: &Op, src_idx: usize, write: &Op, dst_idx: usize) -> u32 {
|
||||
let dst_file = match &write.dsts_as_slice()[dst_idx] {
|
||||
Dst::None => return 0,
|
||||
Dst::SSA(vec) => vec.file().unwrap(),
|
||||
Dst::Reg(reg) => reg.file(),
|
||||
};
|
||||
|
||||
match dst_file {
|
||||
RegFile::GPR => {
|
||||
let write_latency = op_reg_latency(write, false, dst_idx);
|
||||
let read_latency = op_reg_latency(read, true, src_idx);
|
||||
|
||||
if write_latency == RegLatencySM100::Hmma
|
||||
|| read_latency == RegLatencySM100::Hmma
|
||||
{
|
||||
RegLatencySM100::war(read_latency, write_latency, false) + 7
|
||||
} else {
|
||||
RegLatencySM100::war(read_latency, write_latency, false)
|
||||
}
|
||||
}
|
||||
RegFile::UGPR => {
|
||||
let write_latency = op_ureg_latency(write, false, dst_idx);
|
||||
let read_latency = op_ureg_latency(read, true, src_idx);
|
||||
UregLatencySM100::war(read_latency, write_latency, false)
|
||||
}
|
||||
RegFile::Pred => {
|
||||
let write_latency = op_pred_latency(write);
|
||||
let read_latency = op_pred_latency(read);
|
||||
PredLatencySM100::war(read_latency, write_latency, false)
|
||||
}
|
||||
RegFile::UPred => {
|
||||
let write_latency = op_upred_latency(write);
|
||||
let read_latency = op_upred_latency(read);
|
||||
UpredLatencySM100::war(read_latency, write_latency, false)
|
||||
}
|
||||
_ => panic!("Not a register"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn waw(
|
||||
a: &Op,
|
||||
a_dst_idx: usize,
|
||||
b: &Op,
|
||||
b_dst_idx: usize,
|
||||
a_op_pred: bool,
|
||||
) -> u32 {
|
||||
let dst_file = match &a.dsts_as_slice()[a_dst_idx] {
|
||||
Dst::None => return 0,
|
||||
Dst::SSA(vec) => vec.file().unwrap(),
|
||||
Dst::Reg(reg) => reg.file(),
|
||||
};
|
||||
|
||||
match dst_file {
|
||||
RegFile::GPR => {
|
||||
let write1_latency = op_reg_latency(a, false, a_dst_idx);
|
||||
let write2_latency = op_reg_latency(b, false, b_dst_idx);
|
||||
if write1_latency == RegLatencySM100::Hmma
|
||||
|| write2_latency == RegLatencySM100::Hmma
|
||||
{
|
||||
RegLatencySM100::waw(
|
||||
write1_latency,
|
||||
write2_latency,
|
||||
a_op_pred,
|
||||
) + 7
|
||||
} else {
|
||||
RegLatencySM100::waw(
|
||||
write1_latency,
|
||||
write2_latency,
|
||||
a_op_pred,
|
||||
)
|
||||
}
|
||||
}
|
||||
RegFile::UGPR => {
|
||||
let write1_latency = op_ureg_latency(a, false, a_dst_idx);
|
||||
let write2_latency = op_ureg_latency(b, false, b_dst_idx);
|
||||
UregLatencySM100::waw(write1_latency, write2_latency, a_op_pred)
|
||||
}
|
||||
RegFile::Pred => {
|
||||
let write1_latency = op_pred_latency(a);
|
||||
let write2_latency = op_pred_latency(b);
|
||||
PredLatencySM100::waw(write1_latency, write2_latency, a_op_pred)
|
||||
}
|
||||
RegFile::UPred => {
|
||||
let write1_latency = op_upred_latency(a);
|
||||
let write2_latency = op_upred_latency(b);
|
||||
UpredLatencySM100::waw(write1_latency, write2_latency, false)
|
||||
}
|
||||
_ => panic!("Not a register"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
use crate::ir::*;
|
||||
use crate::legalize::LegalizeBuilder;
|
||||
use crate::sm120_instr_latencies::SM120Latency;
|
||||
use crate::sm70_encode::*;
|
||||
use crate::sm75_instr_latencies::SM75Latency;
|
||||
use crate::sm80_instr_latencies::SM80Latency;
|
||||
|
||||
pub struct ShaderModel70 {
|
||||
sm: u8,
|
||||
}
|
||||
|
|
@ -154,7 +154,9 @@ impl ShaderModel for ShaderModel70 {
|
|||
return false;
|
||||
}
|
||||
|
||||
if self.is_ampere() || self.is_ada() {
|
||||
if self.is_blackwell() {
|
||||
SM120Latency::needs_scoreboards(op)
|
||||
} else if self.is_ampere() || self.is_ada() {
|
||||
SM80Latency::needs_scoreboards(op)
|
||||
} else if self.is_turing() {
|
||||
SM75Latency::needs_scoreboards(op)
|
||||
|
|
@ -188,7 +190,9 @@ impl ShaderModel for ShaderModel70 {
|
|||
read: &Op,
|
||||
src_idx: usize,
|
||||
) -> u32 {
|
||||
if self.is_ampere() || self.is_ada() {
|
||||
if self.is_blackwell() {
|
||||
SM120Latency::raw(write, dst_idx, Some(read), src_idx)
|
||||
} else if self.is_ampere() || self.is_ada() {
|
||||
SM80Latency::raw(write, dst_idx, Some(read), src_idx)
|
||||
} else if self.is_turing() {
|
||||
SM75Latency::raw(write, dst_idx, Some(read), src_idx)
|
||||
|
|
@ -204,7 +208,9 @@ impl ShaderModel for ShaderModel70 {
|
|||
write: &Op,
|
||||
dst_idx: usize,
|
||||
) -> u32 {
|
||||
if self.is_ampere() || self.is_ada() {
|
||||
if self.is_blackwell() {
|
||||
SM120Latency::war(read, src_idx, write, dst_idx)
|
||||
} else if self.is_ampere() || self.is_ada() {
|
||||
SM80Latency::war(read, src_idx, write, dst_idx)
|
||||
} else if self.is_turing() {
|
||||
SM75Latency::war(read, src_idx, write, dst_idx)
|
||||
|
|
@ -223,7 +229,9 @@ impl ShaderModel for ShaderModel70 {
|
|||
b: &Op,
|
||||
b_dst_idx: usize,
|
||||
) -> u32 {
|
||||
if self.is_ampere() || self.is_ada() {
|
||||
if self.is_blackwell() {
|
||||
SM120Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred)
|
||||
} else if self.is_ampere() || self.is_ada() {
|
||||
SM80Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred)
|
||||
} else if self.is_turing() {
|
||||
SM75Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred)
|
||||
|
|
@ -235,7 +243,9 @@ impl ShaderModel for ShaderModel70 {
|
|||
}
|
||||
|
||||
fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32 {
|
||||
if self.is_ampere() || self.is_ada() {
|
||||
if self.is_blackwell() {
|
||||
SM120Latency::raw(write, dst_idx, None, 0)
|
||||
} else if self.is_ampere() || self.is_ada() {
|
||||
SM80Latency::raw(write, dst_idx, None, 0)
|
||||
} else if self.is_turing() {
|
||||
SM75Latency::raw(write, dst_idx, None, 0)
|
||||
|
|
@ -250,7 +260,9 @@ impl ShaderModel for ShaderModel70 {
|
|||
}
|
||||
|
||||
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
|
||||
if self.is_ampere() || self.is_ada() {
|
||||
if self.is_blackwell() {
|
||||
SM120Latency::raw(write, dst_idx, None, 0)
|
||||
} else if self.is_ampere() || self.is_ada() {
|
||||
SM80Latency::raw(write, dst_idx, None, 0)
|
||||
} else if self.is_turing() {
|
||||
SM75Latency::raw(write, dst_idx, None, 0)
|
||||
|
|
|
|||
|
|
@ -77,6 +77,9 @@ executable(
|
|||
install : with_tools.contains('nouveau'),
|
||||
)
|
||||
|
||||
nouveau_util_py = files('util.py')
|
||||
nouveau_util_py_path = meson.current_source_dir()
|
||||
|
||||
# Only generate Rust bindings for NVK
|
||||
if with_nouveau_vk
|
||||
cl_rs_generated = []
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue