intel/executor: Update bfloat example

Elaborate on the packed/unpacked restrictions, use ADD(x, 0.0f) as a workaround for F->BF conversion. Reviewed-by: Rohan Garg <rohan.garg@intel.com> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34506>
2026-05-19 20:08:06 +02:00 · 2025-03-27 18:18:28 -07:00 · 2025-03-27 18:18:28 -07:00 · fafdd24285
commit fafdd24285
parent fbe5d559bd
1 changed files with 25 additions and 19 deletions
--- a/src/intel/executor/examples/bfloat.lua
+++ b/src/intel/executor/examples/bfloat.lua
@@ -3,30 +3,36 @@ local r = execute {
    @id      g3

    mov(8)   g4<1>F    g3<1>UD                       {A@1};
-    mov(8)   g5<1>F    g4.2<0,1,0>F                  {A@1};
+    mov(8)   g5<1>F    g4.1<0,1,0>F                  {A@1};

-    // Moving from F to BF (packed) doesn't work, but that's not much
-    // of a problem because BFloat16 is a cropped version of Float32.
-    //
-    // So instead of
-    //
-    //     mov(8)   g10<1>BF  g4<1>F
-    //
-    // use MOV with UW and appropriate offset.
-    mov(8)   g10<1>UW  g4.1<2>UW                     {A@1};
+    // Converting F to unpacked BF works, but as will be
+    // illustrated, is not very useful.

-    mad(8)   g11<1>BF  g4<1>F    g10<1>BF  g5<1>F    {A@1};
+    mov(8)   g10<2>BF  g4<1>F                        {A@1};
+
+    // With the exception of DPAS, instructions need to have at
+    // least one non-BF operand and the operands must be packed.
+
+    mov(8)   g11<1>UW  g10<2>UW                      {A@1};  // Pack it!
    add(8)   g12<1>BF  g11<1>BF  g4<1>F              {A@1};

-    // For similar reason as above, instead of
-    //
-    //     mov(8)   g20<1>F   g12<1>BF
-    //
-    // use a SHL unpacking into UD.
-    shl(8)   g20<1>UD  g12<1>UW  16UW                {A@1};
+    // Converting F to packed BF doesn't work, so add the value
+    // to 0.0f instead.  This will preserve the NaN.

-    mov(8)   g21<1>UD  g20<1>F                       {A@1};
-    @write   g3        g21
+    add(8)   g20<1>BF  g4<1>F    0F                  {A@1}; // F -> BF.
+
+    // Converting BF to F doesn't work, so for a packed source,
+    // shift-left the bits to expand it into a UD instead.
+
+    shl(8)   g30<1>UD  g20<1>UW  16UW                {A@1}; // BF -> F.
+
+    mad(8)   g40<1>BF  g12<1>BF  g20<1>BF  g5<1>F    {A@1};
+    add(8)   g41<1>BF  g40<1>BF  g30<1>F             {A@1};
+
+    shl(8)   g42<1>UD  g41<1>UW  16UW                {A@1}; // BF -> F.
+
+    mov(8)   g43<1>UD  g42<1>F                       {A@1};
+    @write   g3        g43

    @eot
  ]]