intel/executor: Update bfloat example
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Elaborate on the packed/unpacked restrictions, use ADD(x, 0.0f)
as a workaround for F->BF conversion.

Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34506>
This commit is contained in:
Caio Oliveira 2025-03-27 18:18:28 -07:00 committed by Marge Bot
parent fbe5d559bd
commit fafdd24285

View file

@ -3,30 +3,36 @@ local r = execute {
@id g3
mov(8) g4<1>F g3<1>UD {A@1};
mov(8) g5<1>F g4.2<0,1,0>F {A@1};
mov(8) g5<1>F g4.1<0,1,0>F {A@1};
// Moving from F to BF (packed) doesn't work, but that's not much
// of a problem because BFloat16 is a cropped version of Float32.
//
// So instead of
//
// mov(8) g10<1>BF g4<1>F
//
// use MOV with UW and appropriate offset.
mov(8) g10<1>UW g4.1<2>UW {A@1};
// Converting F to unpacked BF works, but as will be
// illustrated, is not very useful.
mad(8) g11<1>BF g4<1>F g10<1>BF g5<1>F {A@1};
mov(8) g10<2>BF g4<1>F {A@1};
// With the exception of DPAS, instructions need to have at
// least one non-BF operand and the operands must be packed.
mov(8) g11<1>UW g10<2>UW {A@1}; // Pack it!
add(8) g12<1>BF g11<1>BF g4<1>F {A@1};
// For a similar reason as above, instead of
//
// mov(8) g20<1>F g12<1>BF
//
// use a SHL unpacking into UD.
shl(8) g20<1>UD g12<1>UW 16UW {A@1};
// Converting F to packed BF doesn't work, so add the value
// to 0.0f instead. This will preserve the NaN.
mov(8) g21<1>UD g20<1>F {A@1};
@write g3 g21
add(8) g20<1>BF g4<1>F 0F {A@1}; // F -> BF.
// Converting BF to F doesn't work, so for a packed source,
// shift-left the bits to expand it into a UD instead.
shl(8) g30<1>UD g20<1>UW 16UW {A@1}; // BF -> F.
mad(8) g40<1>BF g12<1>BF g20<1>BF g5<1>F {A@1};
add(8) g41<1>BF g40<1>BF g30<1>F {A@1};
shl(8) g42<1>UD g41<1>UW 16UW {A@1}; // BF -> F.
mov(8) g43<1>UD g42<1>F {A@1};
@write g3 g43
@eot
]]