diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 5f0753c75de..ecd4b50cfd7 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4348,35 +4348,10 @@ fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
 
       dest = retype(dest, dest_type);
       fs_reg src0 = retype(get_nir_src(ntb, instr->src[0]), dest_type);
-      const fs_reg dest_hf = dest;
 
       fs_builder bld16 = bld.exec_all().group(16, 0);
       fs_builder bldn = devinfo->ver >= 20 ? bld16 : bld.exec_all().group(8, 0);
 
-      /* DG2 cannot have the destination or source 0 of DPAS be float16. It is
-       * still advantageous to support these formats for memory and bandwidth
-       * savings.
-       *
-       * The float16 source must be expanded to float32.
-       */
-      if (devinfo->verx10 == 125 && dest_type == BRW_TYPE_HF &&
-          !s.compiler->lower_dpas) {
-         dest = bldn.vgrf(BRW_TYPE_F, rcount);
-
-         if (src0.file != ARF) {
-            const fs_reg src0_hf = src0;
-
-            src0 = bldn.vgrf(BRW_TYPE_F, rcount);
-
-            for (unsigned i = 0; i < 4; i++) {
-               bld16.MOV(byte_offset(src0, REG_SIZE * i * 2),
-                         byte_offset(src0_hf, REG_SIZE * i));
-            }
-         } else {
-            src0 = retype(src0, BRW_TYPE_F);
-         }
-      }
-
       bldn.DPAS(dest,
                 src0,
                 retype(get_nir_src(ntb, instr->src[2]), src_type),
@@ -4385,14 +4360,6 @@ fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
                 rcount)
          ->saturate = nir_intrinsic_saturate(instr);
 
-      /* Compact the destination to float16 (from float32). */
-      if (!dest.equals(dest_hf)) {
-         for (unsigned i = 0; i < 4; i++) {
-            bld16.MOV(byte_offset(dest_hf, REG_SIZE * i),
-                      byte_offset(dest, REG_SIZE * i * 2));
-         }
-      }
-
       cs_prog_data->uses_systolic = true;
       break;
    }
diff --git a/src/intel/compiler/brw_nir_lower_cooperative_matrix.c b/src/intel/compiler/brw_nir_lower_cooperative_matrix.c
index 895a1ae2216..e71c76f87ad 100644
--- a/src/intel/compiler/brw_nir_lower_cooperative_matrix.c
+++ b/src/intel/compiler/brw_nir_lower_cooperative_matrix.c
@@ -148,21 +148,14 @@ get_slice_type_from_desc(const struct lower_cmat_state *state,
 
    const unsigned element_bits = 32;
    const unsigned bits = glsl_base_type_get_bit_size(desc.element_type);
-   unsigned packing_factor = MIN2(elements_per_invocation,
-                                  element_bits / bits);
 
-   /* Adjust the packing factor so that each row of the matrix fills and
-    * entire GRF.
-    *
-    * The in-register layout of B matrices is different, so those are handled
-    * more like column major (for row major matrices). See the file comment
-    * for more details.
+   /* Each invocation must have at least one dword of data, and that dword
+    * must be tightly packed with values. No matter the matrix dimensions, a
+    * matrix of uint8_t data must pack 4 values in each entry.
     */
-   const unsigned actual_cols = desc.use != GLSL_CMAT_USE_B ? desc.cols : desc.rows;
-   while ((actual_cols / packing_factor) < 8) {
-      assert(packing_factor > 1);
-      packing_factor /= 2;
-   }
+   const unsigned packing_factor = element_bits / bits;
+
+   assert(elements_per_invocation >= packing_factor);
 
    switch (desc.element_type) {
    case GLSL_TYPE_FLOAT:
@@ -172,12 +165,12 @@ get_slice_type_from_desc(const struct lower_cmat_state *state,
    case GLSL_TYPE_FLOAT16:
    case GLSL_TYPE_UINT8:
    case GLSL_TYPE_UINT16:
-      base_type = glsl_get_base_type(glsl_uintN_t_type(packing_factor * bits));
+      base_type = GLSL_TYPE_UINT;
       break;
    case GLSL_TYPE_INT:
    case GLSL_TYPE_INT8:
    case GLSL_TYPE_INT16:
-      base_type = glsl_get_base_type(glsl_intN_t_type(packing_factor * bits));
+      base_type = GLSL_TYPE_INT;
       break;
    default:
       unreachable("Invalid cooperative matrix element type.");
@@ -422,54 +415,30 @@ lower_cmat_unary_op(nir_builder *b, nir_intrinsic_instr *intrin,
 
    const nir_op op = nir_intrinsic_alu_op(intrin);
 
-   /* There are three possible cases:
-    *
-    * 1. dst_packing_factor == src_packing_factor. This is the common case,
-    *    and handling it is straightforward.
-    *
-    * 2. dst_packing_factor > src_packing_factor. This occurs when converting a
-    *    float32_t matrix slice to a packed float16_t slice. Loop over the size
-    *    of the destination slice, but read multiple entries from the source
-    *    slice on each iteration.
-    *
-    * 3. dst_packing_factor < src_packing_factor. This occurs when converting a
-    *    packed int8_t matrix slice to an int32_t slice. Loop over the size of
-    *    the source slice, but write multiple entries to the destination slice
-    *    on each iteration.
-    *
-    * Handle all cases by iterating over the total (non-packed) number of
-    * elements in the slice. When dst_packing_factor values have been
-    * calculated, store them.
+   /* With the combinations of formats exposed on all platforms, matrices with
+    * the same dimensions will always have the same data size. The only real
+    * type conversion possible is int32 <-> float32. As a result,
+    * dst_packing_factor == src_packing_factor.
     */
-   assert((dst_packing_factor * glsl_get_vector_elements(dst_slice->type)) ==
-          (src_packing_factor * glsl_get_vector_elements(src_slice->type)));
+   assert(dst_packing_factor == src_packing_factor);
 
    /* Stores at most dst_packing_factor partial results. */
    nir_def *v[4];
    assert(dst_packing_factor <= 4);
 
-   for (unsigned i = 0; i < num_components * dst_packing_factor; i++) {
-      const unsigned dst_chan_index = i % dst_packing_factor;
-      const unsigned src_chan_index = i % src_packing_factor;
-      const unsigned dst_index = i / dst_packing_factor;
-      const unsigned src_index = i / src_packing_factor;
+   for (unsigned i = 0; i < num_components; i++) {
+      nir_def *chan = nir_channel(b, nir_load_deref(b, src_slice), i);
 
-      nir_def *src =
-         nir_channel(b,
-                     nir_unpack_bits(b,
-                                     nir_channel(b,
-                                                 nir_load_deref(b, src_slice),
-                                                 src_index),
-                                     src_bits),
-                     src_chan_index);
+      for (unsigned j = 0; j < dst_packing_factor; j++) {
+         nir_def *src =
+            nir_channel(b, nir_unpack_bits(b, chan, src_bits), j);
 
-      v[dst_chan_index] = nir_build_alu1(b, op, src);
-
-      if (dst_chan_index == (dst_packing_factor - 1)) {
-         results[dst_index] =
-            nir_pack_bits(b, nir_vec(b, v, dst_packing_factor),
-                          dst_packing_factor * dst_bits);
+         v[j] = nir_build_alu1(b, op, src);
       }
+
+      results[i] =
+         nir_pack_bits(b, nir_vec(b, v, dst_packing_factor),
+                       dst_packing_factor * dst_bits);
    }
 
    nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
diff --git a/src/intel/dev/intel_device_info.c b/src/intel/dev/intel_device_info.c
index 5b7cd9a5f08..4695bcc4143 100644
--- a/src/intel/dev/intel_device_info.c
+++ b/src/intel/dev/intel_device_info.c
@@ -620,7 +620,6 @@ static const struct intel_device_info intel_device_info_chv = {
    .has_sample_with_hiz = true,                     \
    .has_illegal_ccs_values = true,                                    \
    .cooperative_matrix_configurations = {                             \
-    { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 },       \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 },       \
@@ -852,7 +851,6 @@ static const struct intel_device_info intel_device_info_cfl_gt3 = {
    .num_subslices = _subslices,                       \
    .max_eus_per_subslice = 8,                                         \
    .cooperative_matrix_configurations = {                             \
-    { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 },       \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 },       \
@@ -985,7 +983,6 @@ static const struct intel_device_info intel_device_info_ehl_2x4 = {
          .writecombining = PAT_ENTRY(1, WC, NONE),              \
    },                                                           \
    .cooperative_matrix_configurations = {                       \
-    { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 },       \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 },       \
@@ -1117,7 +1114,6 @@ static const struct intel_device_info intel_device_info_sg1 = {
    .has_aux_map = false,                                        \
    .simulator_id = 29,                                          \
    .cooperative_matrix_configurations = {                       \
-    { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 },       \
     { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 },       \