intel/brw: Temporarily disable result=float16 matrix configs

Even though the hardware does not natively support these configurations,
there are many potential benefits to advertising them. These
configurations can theoretically use half the memory bandwidth for loads
and stores. For large matrices, that can be the limiting factor in performance.

The current implementation, however, has a number of significant
problems.

The conversion from float16 to float32 is performed in the driver during
conversion from NIR. As a result, many common usage patterns end up
doing back-to-back conversions to and from float16 between matrix
multiplications (when the result of one multiplication is used as the
accumulator for the next).

The float16 version of the matrix wastes half the possible register
space. Each float16 value sits alone in a dword. This is done so that
the per-invocation slice of an 8x8 float16 result matrix and an 8x8
float32 result matrix will have the same number of elements. This makes
it possible to do straightforward implementations of all the unary_op
type conversions in NIR.

It would be possible to perform N:M element type conversions in the
backend using specialized NIR intrinsics. However, per #10961, this
would be very, very painful. My hope is that, once a suitable resolution
for that issue can be found, support for these configs can be restored.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28834>
This commit is contained in:
Ian Romanick 2024-04-04 15:12:19 -07:00
parent 33dd38f9d5
commit ea6e10c0b2
3 changed files with 23 additions and 91 deletions

View file

@ -4348,35 +4348,10 @@ fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
dest = retype(dest, dest_type);
fs_reg src0 = retype(get_nir_src(ntb, instr->src[0]), dest_type);
const fs_reg dest_hf = dest;
fs_builder bld16 = bld.exec_all().group(16, 0);
fs_builder bldn = devinfo->ver >= 20 ? bld16 : bld.exec_all().group(8, 0);
/* DG2 cannot have the destination or source 0 of DPAS be float16. It is
* still advantageous to support these formats for memory and bandwidth
* savings.
*
* The float16 source must be expanded to float32.
*/
if (devinfo->verx10 == 125 && dest_type == BRW_TYPE_HF &&
!s.compiler->lower_dpas) {
dest = bldn.vgrf(BRW_TYPE_F, rcount);
if (src0.file != ARF) {
const fs_reg src0_hf = src0;
src0 = bldn.vgrf(BRW_TYPE_F, rcount);
for (unsigned i = 0; i < 4; i++) {
bld16.MOV(byte_offset(src0, REG_SIZE * i * 2),
byte_offset(src0_hf, REG_SIZE * i));
}
} else {
src0 = retype(src0, BRW_TYPE_F);
}
}
bldn.DPAS(dest,
src0,
retype(get_nir_src(ntb, instr->src[2]), src_type),
@ -4385,14 +4360,6 @@ fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
rcount)
->saturate = nir_intrinsic_saturate(instr);
/* Compact the destination to float16 (from float32). */
if (!dest.equals(dest_hf)) {
for (unsigned i = 0; i < 4; i++) {
bld16.MOV(byte_offset(dest_hf, REG_SIZE * i),
byte_offset(dest, REG_SIZE * i * 2));
}
}
cs_prog_data->uses_systolic = true;
break;
}

View file

@ -148,21 +148,14 @@ get_slice_type_from_desc(const struct lower_cmat_state *state,
const unsigned element_bits = 32;
const unsigned bits = glsl_base_type_get_bit_size(desc.element_type);
unsigned packing_factor = MIN2(elements_per_invocation,
element_bits / bits);
/* Adjust the packing factor so that each row of the matrix fills and
* entire GRF.
*
* The in-register layout of B matrices is different, so those are handled
* more like column major (for row major matrices). See the file comment
* for more details.
/* Each invocation must have at least one dword of data, and that dword
* must be tightly packed with values. No matter the matrix dimensions, a
* matrix of uint8_t data must pack 4 values in each entry.
*/
const unsigned actual_cols = desc.use != GLSL_CMAT_USE_B ? desc.cols : desc.rows;
while ((actual_cols / packing_factor) < 8) {
assert(packing_factor > 1);
packing_factor /= 2;
}
const unsigned packing_factor = element_bits / bits;
assert(elements_per_invocation >= packing_factor);
switch (desc.element_type) {
case GLSL_TYPE_FLOAT:
@ -172,12 +165,12 @@ get_slice_type_from_desc(const struct lower_cmat_state *state,
case GLSL_TYPE_FLOAT16:
case GLSL_TYPE_UINT8:
case GLSL_TYPE_UINT16:
base_type = glsl_get_base_type(glsl_uintN_t_type(packing_factor * bits));
base_type = GLSL_TYPE_UINT;
break;
case GLSL_TYPE_INT:
case GLSL_TYPE_INT8:
case GLSL_TYPE_INT16:
base_type = glsl_get_base_type(glsl_intN_t_type(packing_factor * bits));
base_type = GLSL_TYPE_INT;
break;
default:
unreachable("Invalid cooperative matrix element type.");
@ -422,54 +415,30 @@ lower_cmat_unary_op(nir_builder *b, nir_intrinsic_instr *intrin,
const nir_op op = nir_intrinsic_alu_op(intrin);
/* There are three possible cases:
*
* 1. dst_packing_factor == src_packing_factor. This is the common case,
* and handling it is straightforward.
*
* 2. dst_packing_factor > src_packing_factor. This occurs when converting a
* float32_t matrix slice to a packed float16_t slice. Loop over the size
* of the destination slice, but read multiple entries from the source
* slice on each iteration.
*
* 3. dst_packing_factor < src_packing_factor. This occurs when converting a
* packed int8_t matrix slice to an int32_t slice. Loop over the size of
* the source slice, but write multiple entries to the destination slice
* on each iteration.
*
* Handle all cases by iterating over the total (non-packed) number of
* elements in the slice. When dst_packing_factor values have been
* calculated, store them.
/* With the combinations of formats exposed on all platforms, matrices with
* the same dimensions will always have the same data size. The only real
* type conversion possible is int32 <-> float32. As a result
* dst_packing_factor == src_packing_factor.
*/
assert((dst_packing_factor * glsl_get_vector_elements(dst_slice->type)) ==
(src_packing_factor * glsl_get_vector_elements(src_slice->type)));
assert(dst_packing_factor == src_packing_factor);
/* Stores at most dst_packing_factor partial results. */
nir_def *v[4];
assert(dst_packing_factor <= 4);
for (unsigned i = 0; i < num_components * dst_packing_factor; i++) {
const unsigned dst_chan_index = i % dst_packing_factor;
const unsigned src_chan_index = i % src_packing_factor;
const unsigned dst_index = i / dst_packing_factor;
const unsigned src_index = i / src_packing_factor;
for (unsigned i = 0; i < num_components; i++) {
nir_def *chan = nir_channel(b, nir_load_deref(b, src_slice), i);
nir_def *src =
nir_channel(b,
nir_unpack_bits(b,
nir_channel(b,
nir_load_deref(b, src_slice),
src_index),
src_bits),
src_chan_index);
for (unsigned j = 0; j < dst_packing_factor; j++) {
nir_def *src =
nir_channel(b, nir_unpack_bits(b, chan, src_bits), j);
v[dst_chan_index] = nir_build_alu1(b, op, src);
if (dst_chan_index == (dst_packing_factor - 1)) {
results[dst_index] =
nir_pack_bits(b, nir_vec(b, v, dst_packing_factor),
dst_packing_factor * dst_bits);
v[j] = nir_build_alu1(b, op, src);
}
results[i] =
nir_pack_bits(b, nir_vec(b, v, dst_packing_factor),
dst_packing_factor * dst_bits);
}
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),

View file

@ -620,7 +620,6 @@ static const struct intel_device_info intel_device_info_chv = {
.has_sample_with_hiz = true, \
.has_illegal_ccs_values = true, \
.cooperative_matrix_configurations = { \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
@ -852,7 +851,6 @@ static const struct intel_device_info intel_device_info_cfl_gt3 = {
.num_subslices = _subslices, \
.max_eus_per_subslice = 8, \
.cooperative_matrix_configurations = { \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
@ -985,7 +983,6 @@ static const struct intel_device_info intel_device_info_ehl_2x4 = {
.writecombining = PAT_ENTRY(1, WC, NONE), \
}, \
.cooperative_matrix_configurations = { \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
@ -1117,7 +1114,6 @@ static const struct intel_device_info intel_device_info_sg1 = {
.has_aux_map = false, \
.simulator_id = 29, \
.cooperative_matrix_configurations = { \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
{ INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \