nak: Add real instruction dependencies for Kepler

This commit ports the instruction latency information found in the
codegen emitter. Previously, every instruction was delayed by 16 cycles
even when a shorter delay would have sufficed.
PixMark Piano is highly sensitive to instruction latencies and gets a
2.5x boost; the other benchmarks also see better performance.
The two remaining pieces needed for feature parity with codegen are
functional-unit resource tracking and instruction dual-issue.

Performance measurements on a GTX 770 (at the 0f pstate):
PixMark Piano: 519 -> 14526 pts (rendering issues in both runs!)
FurMark: 3247 -> 5786 pts
The Talos Principle (high settings): 30-33 -> 55-60 FPS
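
An editor's sketch (not part of the commit; OpKind and latency() are
hypothetical, simplified stand-ins), with values taken from the new
instr_latency() table below:

    #[derive(Clone, Copy)]
    enum OpKind {
        Load,     // Ld/ALd
        Tex,      // Tex/Tld/Tld4/Tmml/Txd/Txq
        OtherAlu, // common ALU case
    }

    fn latency(op: OpKind) -> u32 {
        match op {
            OpKind::Load => 24,
            OpKind::Tex => 17,
            OpKind::OtherAlu => 9, // was a flat 16-cycle wait before
        }
    }

    fn main() {
        // A dependent read after an ALU op now waits 9 cycles, not 16;
        // loads and texture ops keep longer waits.
        assert_eq!(latency(OpKind::OtherAlu), 9);
        assert_eq!(latency(OpKind::Load), 24);
        assert_eq!(latency(OpKind::Tex), 17);
    }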

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35821>
Lorenzo Rossi 2025-07-09 23:49:53 +02:00 committed by Marge Bot
parent ba42b916ac
commit c35990c4bc
4 changed files with 257 additions and 143 deletions


@@ -28,6 +28,7 @@ mod qmd;
mod reg_tracker;
mod repair_ssa;
mod sm20;
mod sm30_instr_latencies;
mod sm32;
mod sm50;
mod sm70;


@@ -5,6 +5,10 @@ use crate::ir::*;
use crate::legalize::{
src_is_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers, LegalizeBuilder,
};
use crate::sm30_instr_latencies::{
encode_kepler_shader, instr_exec_latency, instr_latency,
KeplerInstructionEncoder,
};
use bitview::*;
use rustc_hash::FxHashMap;
@@ -57,19 +61,18 @@ impl ShaderModel for ShaderModel20 {
false
}
fn exec_latency(&self, _op: &Op) -> u32 {
1
fn exec_latency(&self, op: &Op) -> u32 {
instr_exec_latency(self.sm, op)
}
fn raw_latency(
&self,
_write: &Op,
_dst_idx: usize,
write: &Op,
dst_idx: usize,
_read: &Op,
_src_idx: usize,
) -> u32 {
// TODO
13
instr_latency(self.sm, write, dst_idx)
}
fn war_latency(
@@ -79,7 +82,6 @@ impl ShaderModel for ShaderModel20 {
_write: &Op,
_dst_idx: usize,
) -> u32 {
// TODO
// We assume the source gets read in the first 4 cycles. We don't know
// how quickly the write will happen. This is all a guess.
4
@@ -87,27 +89,23 @@ impl ShaderModel for ShaderModel20 {
fn waw_latency(
&self,
_a: &Op,
_a_dst_idx: usize,
a: &Op,
a_dst_idx: usize,
_a_has_pred: bool,
_b: &Op,
_b_dst_idx: usize,
) -> u32 {
// We know our latencies are wrong, so assume the write could happen
// anywhere between 0 and instr_latency(a) cycles.
// TODO
13
instr_latency(self.sm, a, a_dst_idx)
}
fn paw_latency(&self, _write: &Op, _dst_idx: usize) -> u32 {
// TODO
13
}
fn worst_latency(&self, _write: &Op, _dst_idx: usize) -> u32 {
// TODO
15
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
instr_latency(self.sm, write, dst_idx)
}
fn max_instr_delay(&self) -> u8 {
@@ -119,7 +117,12 @@ impl ShaderModel for ShaderModel20 {
}
fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32> {
encode_sm20_shader(self, s)
if self.sm >= 30 {
// Kepler adds explicit instruction latency encodings
encode_sm30_shader(self, s)
} else {
encode_sm20_shader(self, s)
}
}
}
@@ -3081,3 +3084,38 @@ fn encode_sm20_shader(sm: &ShaderModel20, s: &Shader<'_>) -> Vec<u32> {
encoded
}
impl KeplerInstructionEncoder for ShaderModel20 {
fn encode_instr(
&self,
instr: &Instr,
labels: &FxHashMap<Label, usize>,
encoded: &mut Vec<u32>,
) {
let mut e = SM20Encoder {
sm: self,
ip: encoded.len() * 4,
labels,
inst: [0_u32; 2],
};
as_sm20_op(&instr.op).encode(&mut e);
e.set_pred(&instr.pred);
encoded.extend(&e.inst[..]);
}
fn prepare_sched_instr<'a>(
&self,
sched_instr: &'a mut [u32; 2],
) -> impl BitMutViewable + 'a {
let mut bv = BitMutView::new(sched_instr);
bv.set_field(0..4, 0b0111);
bv.set_field(60..64, 0b0010);
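// (Editor's note) The two fields above tag this 64-bit word as a
// schedule instruction; the remaining bits 4..60 hold seven 8-bit
// entries, one per instruction in the group, filled in by the caller.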
BitMutView::new_subset(sched_instr, 4..60)
}
}
fn encode_sm30_shader(sm: &ShaderModel20, s: &Shader<'_>) -> Vec<u32> {
assert!(sm.sm >= 30);
encode_kepler_shader(sm, s)
}


@@ -0,0 +1,161 @@
use bitview::{BitMutViewable, BitViewable, SetField};
use rustc_hash::FxHashMap;
use crate::ir::{Instr, InstrDeps, Label, Op, OpNop, Shader};
pub fn instr_latency(_sm: u8, op: &Op, _dst_idx: usize) -> u32 {
if op.is_fp64() {
return 20;
}
match op {
Op::Ipa(_) => 15,
Op::Ld(_) => 24,
Op::ALd(_) => 24,
Op::IMul(_) => 15, // This does not apply to imad, right? right???
Op::Tex(_)
| Op::Tld(_)
| Op::Tld4(_)
| Op::Tmml(_)
| Op::Txd(_)
| Op::Txq(_) => 17,
_ => 9,
}
}
pub fn instr_exec_latency(_sm: u8, op: &Op) -> u32 {
match op {
Op::Tex(_)
| Op::Tld(_)
| Op::Tld4(_)
| Op::Tmml(_)
| Op::Txd(_)
| Op::Txq(_) => 17,
Op::Exit(_) => 15,
_ => 1,
}
}
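// (Editor's interpretation, not from the commit:) instr_latency()
// above is the delay before an instruction's result may be consumed,
// while instr_exec_latency() is the delay charged to the instruction
// itself before the next one may issue.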
fn calc_instr_sched(prev_op: Option<&Op>, op: &Op, deps: &InstrDeps) -> u8 {
// Kepler is the first generation to lift scoreboarding from the
// hardware into the compiler. For each instruction we encode
// the delay but not all the other information necessary for newer
// architectures.
// The hardware still checks for data hazards and, if one is present,
// delays the instruction by 32 cycles.
match op {
Op::TexDepBar(_) => 0xc2,
Op::Sync(_) => 0x00, // Wait 16 cycles
_ => {
// TODO: when we support dual-issue this should check for
// both previous ops
let base = match prev_op {
Some(Op::ASt(_)) => 0x40,
_ => 0x20,
};
let delay = deps.delay;
debug_assert!(delay >= 1 && delay <= 32);
base | (delay - 1)
}
}
// 0x00: wait for 16 cycles
// 0x04: dual-issue with next instruction
// 0xc2: if TEXBAR
// 0x20 | 0x40: suspend for N+1 cycles (N = bitmask 0x1f)
// 0x40 only if prev_op is attribute store
// Unsure:
// 0x80: global memory bit
//
// TODO:
// - Dual issue (0x04)
// - Functional Unit tracking
}
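// Editor's illustration (not part of the commit): worked examples of
// calc_instr_sched(), with deps.delay in 1..=32:
//   delay 1 after a normal op: 0x20 | (1 - 1) = 0x20
//   delay 5 after a normal op: 0x20 | (5 - 1) = 0x24
//   delay 5 after an ASt:      0x40 | (5 - 1) = 0x44
// Op::Sync always encodes as 0x00, i.e. wait 16 cycles.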
pub trait KeplerInstructionEncoder {
/// Encode the instruction and push it into the "encoded" vec
fn encode_instr(
&self,
instr: &Instr,
labels: &FxHashMap<Label, usize>,
encoded: &mut Vec<u32>,
);
/// Prepare the scheduling instruction opcode-field and return a
/// subset where the actual scheduling information will be written
fn prepare_sched_instr<'a>(
&self,
sched_instr: &'a mut [u32; 2],
) -> impl BitMutViewable + 'a;
}
/// Helper function that encodes shaders for both KeplerA and KeplerB.
/// Differences between the encoders are handled by KeplerInstructionEncoder.
pub fn encode_kepler_shader<E>(encoder: &E, s: &Shader<'_>) -> Vec<u32>
where
E: KeplerInstructionEncoder,
{
const INSTR_LEN_BYTES: usize = 8;
assert!(s.functions.len() == 1);
let func = &s.functions[0];
// --- Compute label addresses ---
// We need a schedule instruction every 7 instructions; these don't
// define jump boundaries, so we can have multiple blocks in the same
// 7-instr group.
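// Example (editor's note): a block whose first instruction is real
// instruction 10 (ip = 10) is preceded by two sched instructions (one
// per started group of 7), so num_sched = 10/7 + 1 = 2 and the label's
// byte address is (10 + 2) * 8 = 96.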
let mut ip = 0_usize;
let mut labels = FxHashMap::default();
for b in &func.blocks {
let num_sched = (ip / 7) + 1;
labels.insert(b.label, (ip + num_sched) * INSTR_LEN_BYTES);
ip += b.instrs.len();
}
// --- Real encoding ---
// Create an instruction iterator and consume it in chunks of 7,
// padding the last chunk with nops (they should never be executed).
let mut instr_iter = func
.blocks
.iter()
.flat_map(|b| b.instrs.iter().map(|x| &**x))
.peekable();
let mut filling_instr = Instr {
pred: true.into(),
op: Op::Nop(OpNop { label: None }),
deps: InstrDeps::new(),
};
filling_instr.deps.set_delay(1);
let mut sched_chunk_gen = || {
if instr_iter.peek().is_none() {
return None;
}
Some([0; 7].map(|_| instr_iter.next().unwrap_or(&filling_instr)))
};
let mut encoded = Vec::new();
let mut prev_op = None;
while let Some(sched_chunk) = sched_chunk_gen() {
let sched_i = encoded.len();
let mut sched_instr = [0u32; 2];
encoded.extend(&sched_instr[..]); // Push now, will edit later
let mut bv = encoder.prepare_sched_instr(&mut sched_instr);
// There should be 8 bits for each instr in a scheduling block
debug_assert!(bv.bits() == 8 * 7);
for (i, instr) in sched_chunk.iter().enumerate() {
encoder.encode_instr(&instr, &labels, &mut encoded);
let sched = calc_instr_sched(prev_op, &instr.op, &instr.deps);
bv.set_field(i * 8..(i + 1) * 8, sched);
prev_op = Some(&instr.op);
}
drop(bv);
encoded[sched_i] = sched_instr[0];
encoded[sched_i + 1] = sched_instr[1];
}
encoded
}


@@ -6,6 +6,10 @@ use crate::legalize::{
src_is_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers, LegalizeBuilder,
PadValue,
};
use crate::sm30_instr_latencies::{
encode_kepler_shader, instr_exec_latency, instr_latency,
KeplerInstructionEncoder,
};
use bitview::{
BitMutView, BitMutViewable, BitView, BitViewable, SetBit, SetField,
};
@@ -60,34 +64,17 @@ impl ShaderModel for ShaderModel32 {
}
fn exec_latency(&self, op: &Op) -> u32 {
// TODO
match op {
Op::CCtl(_)
| Op::MemBar(_)
| Op::Bra(_)
| Op::SSy(_)
| Op::Sync(_)
| Op::Brk(_)
| Op::PBk(_)
| Op::Cont(_)
| Op::PCnt(_)
| Op::Exit(_)
| Op::Bar(_)
| Op::Kill(_)
| Op::OutFinal(_) => 13,
_ => 1,
}
instr_exec_latency(self.sm, op)
}
fn raw_latency(
&self,
_write: &Op,
_dst_idx: usize,
write: &Op,
dst_idx: usize,
_read: &Op,
_src_idx: usize,
) -> u32 {
// TODO
13
instr_latency(self.sm, write, dst_idx)
}
fn war_latency(
@@ -97,7 +84,6 @@ impl ShaderModel for ShaderModel32 {
_write: &Op,
_dst_idx: usize,
) -> u32 {
// TODO
// We assume the source gets read in the first 4 cycles. We don't know
// how quickly the write will happen. This is all a guess.
4
@@ -105,27 +91,23 @@ impl ShaderModel for ShaderModel32 {
fn waw_latency(
&self,
_a: &Op,
_a_dst_idx: usize,
a: &Op,
a_dst_idx: usize,
_a_has_pred: bool,
_b: &Op,
_b_dst_idx: usize,
) -> u32 {
// We know our latencies are wrong, so assume the write could happen
// anywhere between 0 and instr_latency(a) cycles.
// TODO
13
instr_latency(self.sm, a, a_dst_idx)
}
fn paw_latency(&self, _write: &Op, _dst_idx: usize) -> u32 {
// TODO
13
}
fn worst_latency(&self, _write: &Op, _dst_idx: usize) -> u32 {
// TODO
15
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
instr_latency(self.sm, write, dst_idx)
}
fn max_instr_delay(&self) -> u8 {
@@ -161,7 +143,6 @@ struct SM32Encoder<'a> {
ip: usize,
labels: &'a FxHashMap<Label, usize>,
inst: [u32; 2],
sched: u8,
}
impl BitViewable for SM32Encoder<'_> {
@@ -323,23 +304,6 @@ impl SM32Encoder<'_> {
},
);
}
fn set_instr_dependency(&mut self, _deps: &InstrDeps) {
// TODO: scheduling
//let mut sched = BitMutView::new(&mut self.sched);
//sched.set_field(0..5, deps.delay);
self.sched = 0x00;
// 0x00: wait for 32 cycles
// 0x04: dual-issue with next instruction
// 0xc2 if TEXBAR
// 0x40 if EXPORT
// 0x20 otherwise(?)
// 0x80: global memory bit
// 0x40: EXPORT(?)
// 0x20: suspend for N cycles (N = bitmask 0x1f)
// 0x10: shared memory?
}
}
//
@@ -3409,86 +3373,36 @@ fn as_sm32_op_mut(op: &mut Op) -> &mut dyn SM32Op {
as_sm50_op_match!(op)
}
fn encode_instr(
instr: &Instr,
sm: &ShaderModel32,
labels: &FxHashMap<Label, usize>,
encoded: &mut Vec<u32>,
) -> u8 {
let mut e = SM32Encoder {
sm: sm,
ip: encoded.len() * 4,
labels,
inst: [0_u32; 2],
sched: 0,
};
impl KeplerInstructionEncoder for ShaderModel32 {
fn encode_instr(
&self,
instr: &Instr,
labels: &FxHashMap<Label, usize>,
encoded: &mut Vec<u32>,
) {
let mut e = SM32Encoder {
sm: self,
ip: encoded.len() * 4,
labels,
inst: [0_u32; 2],
};
as_sm32_op(&instr.op).encode(&mut e);
e.set_pred(&instr.pred);
encoded.extend(&e.inst[..]);
}
as_sm32_op(&instr.op).encode(&mut e);
e.set_pred(&instr.pred);
e.set_instr_dependency(&instr.deps);
fn prepare_sched_instr<'a>(
&self,
sched_instr: &'a mut [u32; 2],
) -> impl BitMutViewable + 'a {
let mut bv = BitMutView::new(sched_instr);
bv.set_field(0..2, 0b00);
bv.set_field(58..64, 0b000010); // 0x08
encoded.extend(&e.inst[..]);
e.sched
BitMutView::new_subset(sched_instr, 2..58)
}
}
fn encode_sm32_shader(sm: &ShaderModel32, s: &Shader<'_>) -> Vec<u32> {
const INSTR_LEN_BYTES: usize = 8;
assert!(s.functions.len() == 1);
let func = &s.functions[0];
// --- Compute label addresses ---
// We need a schedule instruction every 7 instructions, these don't
// define jump boundaries so we can have multible blocks in the same
// 7-instr group.
let mut ip = 0_usize;
let mut labels = FxHashMap::default();
for b in &func.blocks {
let num_sched = (ip / 7) + 1;
labels.insert(b.label, (ip + num_sched) * INSTR_LEN_BYTES);
ip += b.instrs.len();
}
// --- Real encoding ---
// Create an instruction iterator and iterate it in chunks of 7.
// fill the last chunk with a nop (it should never be executed).
let mut instr_iter = func
.blocks
.iter()
.flat_map(|b| b.instrs.iter().map(|x| &**x))
.peekable();
let mut filling_instr = Instr {
pred: true.into(),
op: Op::Nop(OpNop { label: None }),
deps: InstrDeps::new(),
};
filling_instr.deps.set_delay(1);
let mut sched_chunk_gen = || {
if instr_iter.peek().is_none() {
return None;
}
Some([0; 7].map(|_| instr_iter.next().unwrap_or(&filling_instr)))
};
let mut encoded = Vec::new();
while let Some(sched_chunk) = sched_chunk_gen() {
let sched_i = encoded.len();
let mut sched_instr = [0u32; 2];
encoded.extend(&sched_instr[..]); // Push now, will edit later
let mut bv = BitMutView::new(&mut sched_instr);
bv.set_field(0..2, 0b00);
bv.set_field(58..64, 0b000010); // 0x80
let mut bv = bv.subset_mut(2..58);
for (i, instr) in sched_chunk.iter().enumerate() {
let sched = encode_instr(instr, sm, &labels, &mut encoded);
bv.set_field(i * 8..(i + 1) * 8, sched);
}
encoded[sched_i] = sched_instr[0];
encoded[sched_i + 1] = sched_instr[1];
}
encoded
encode_kepler_shader(sm, s)
}