diff --git a/src/intel/executor/examples/bfloat.lua b/src/intel/executor/examples/bfloat.lua index a624b2e850f..b0cc7365711 100644 --- a/src/intel/executor/examples/bfloat.lua +++ b/src/intel/executor/examples/bfloat.lua @@ -3,30 +3,36 @@ local r = execute { @id g3 mov(8) g4<1>F g3<1>UD {A@1}; - mov(8) g5<1>F g4.2<0,1,0>F {A@1}; + mov(8) g5<1>F g4.1<0,1,0>F {A@1}; - // Moving from F to BF (packed) doesn't work, but that's not much - // of a problem because BFloat16 is a cropped version of Float32. - // - // So instead of - // - // mov(8) g10<1>BF g4<1>F - // - // use MOV with UW and appropriate offset. - mov(8) g10<1>UW g4.1<2>UW {A@1}; + // Converting F to unpacked BF works but, as will be + // illustrated, is not very useful. - mad(8) g11<1>BF g4<1>F g10<1>BF g5<1>F {A@1}; + mov(8) g10<2>BF g4<1>F {A@1}; + + // With the exception of DPAS, instructions need to have at + // least one non-BF operand, and the operands must be packed. + + mov(8) g11<1>UW g10<2>UW {A@1}; // Pack it! add(8) g12<1>BF g11<1>BF g4<1>F {A@1}; - // For similar reason as above, instead of - // - // mov(8) g20<1>F g12<1>BF - // - // use a SHL unpacking into UD. - shl(8) g20<1>UD g12<1>UW 16UW {A@1}; + // Converting F to packed BF doesn't work, so add the value + // to 0.0f instead. This will preserve the NaN. - mov(8) g21<1>UD g20<1>F {A@1}; - @write g3 g21 + add(8) g20<1>BF g4<1>F 0F {A@1}; // F -> BF. + + // Converting BF to F doesn't work, so for a packed source, + // shift the bits left to expand it into a UD instead. + + shl(8) g30<1>UD g20<1>UW 16UW {A@1}; // BF -> F. + + mad(8) g40<1>BF g12<1>BF g20<1>BF g5<1>F {A@1}; + add(8) g41<1>BF g40<1>BF g30<1>F {A@1}; + + shl(8) g42<1>UD g41<1>UW 16UW {A@1}; // BF -> F. + + mov(8) g43<1>UD g42<1>F {A@1}; + @write g3 g43 @eot ]]