From 475ec2ade327c83d43b360bb5d36a41ef42f788f Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sat, 1 Nov 2025 19:41:34 +1000
Subject: [PATCH] lavapipe: add support for VK_KHR_cooperative_matrix.

This adds support for cooperative matrix to lavapipe.

It uses 8x8 matrices as the size, and loads the first row
of 8 values into the subgroup.

It stores the B matrix transposed to make the matmul operation
a lot more subgroup friendly.

Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38935>
---
 src/gallium/frontends/lavapipe/lvp_device.c   |  76 ++-
 src/gallium/frontends/lavapipe/lvp_pipeline.c |   2 +
 src/gallium/frontends/lavapipe/meson.build    |   1 +
 src/gallium/frontends/lavapipe/nir/lvp_nir.h  |   2 +
 .../nir/lvp_nir_lower_cooperative_matrix.c    | 504 ++++++++++++++++++
 5 files changed, 583 insertions(+), 2 deletions(-)
 create mode 100644 src/gallium/frontends/lavapipe/nir/lvp_nir_lower_cooperative_matrix.c

diff --git a/src/gallium/frontends/lavapipe/lvp_device.c b/src/gallium/frontends/lavapipe/lvp_device.c
index 02dc2289c6b..80bfc7ea387 100644
--- a/src/gallium/frontends/lavapipe/lvp_device.c
+++ b/src/gallium/frontends/lavapipe/lvp_device.c
@@ -74,6 +74,13 @@
 #define LVP_SAMPLE_COUNTS (VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT | \
                            VK_SAMPLE_COUNT_8_BIT)
 
+extern unsigned lp_native_vector_width;
+
+static bool has_cooperative_matrix(void) {
+   /* coopmat is only exposed when a native vector holds at least 8 32-bit lanes */
+   return lp_native_vector_width >= 8 * 32;
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL lvp_EnumerateInstanceVersion(uint32_t* pApiVersion)
 {
    *pApiVersion = LVP_API_VERSION;
@@ -124,6 +131,7 @@ static const struct vk_device_extension_table lvp_device_extensions_supported =
    .KHR_buffer_device_address             = true,
    .KHR_create_renderpass2                = true,
    .KHR_compute_shader_derivatives        = true,
+   .KHR_cooperative_matrix                = true,
    .KHR_copy_commands2                    = true,
    .KHR_copy_memory_indirect              = true,
    .KHR_dedicated_allocation              = true,
@@ -857,11 +865,13 @@ lvp_get_features(const struct lvp_physical_device *pdevice,
       /* VK_KHR_unified_image_layouts */
       .unifiedImageLayouts = true,
       .unifiedImageLayoutsVideo = true,
+
+      /* VK_KHR_cooperative_matrix */
+      .cooperativeMatrix = has_cooperative_matrix(),
+      .cooperativeMatrixRobustBufferAccess = has_cooperative_matrix(),
    };
 }
 
-extern unsigned lp_native_vector_width;
-
 static VkImageLayout lvp_host_copy_image_layouts[] = {
    VK_IMAGE_LAYOUT_GENERAL,
    VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
@@ -1369,6 +1379,10 @@ lvp_get_properties(const struct lvp_physical_device *device, struct vk_propertie
    /* VK_EXT_mesh_shader */
    p->maxMeshPayloadAndSharedMemorySize = p->maxTaskPayloadSize + p->maxMeshSharedMemorySize; /* 28K min required */
    p->maxMeshPayloadAndOutputMemorySize = p->maxTaskPayloadSize + p->maxMeshOutputMemorySize; /* 47K min required */
+
+   /* VK_KHR_cooperative_matrix */
+   p->cooperativeMatrixSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
+
 }
 
 static VkResult VKAPI_CALL
@@ -2863,3 +2877,61 @@ VKAPI_ATTR void VKAPI_CALL lvp_GetRenderingAreaGranularityKHR(
    VkExtent2D tile_size = {64, 64};
    *pGranularity = tile_size;
 }
+
+VKAPI_ATTR VkResult VKAPI_CALL lvp_GetPhysicalDeviceCooperativeMatrixPropertiesKHR(
+   VkPhysicalDevice physicalDevice,
+   uint32_t *pPropertyCount,
+   VkCooperativeMatrixPropertiesKHR *pProperties)
+{
+   VK_OUTARRAY_MAKE_TYPED(VkCooperativeMatrixPropertiesKHR, out, pProperties, pPropertyCount);
+   const VkComponentTypeKHR fp_accum[2] = { VK_COMPONENT_TYPE_FLOAT16_KHR, VK_COMPONENT_TYPE_FLOAT32_KHR };
+   for (unsigned idx = 0; idx < 2; idx++) { /* fp16 inputs: fp16 accumulator first, then fp32 */
+      vk_outarray_append_typed(VkCooperativeMatrixPropertiesKHR, &out, prop)
+      {
+         *prop = (VkCooperativeMatrixPropertiesKHR){
+            .sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR,
+            .scope = VK_SCOPE_SUBGROUP_KHR,
+            .saturatingAccumulation = false,
+            .AType = VK_COMPONENT_TYPE_FLOAT16_KHR,
+            .BType = VK_COMPONENT_TYPE_FLOAT16_KHR,
+            .CType = fp_accum[idx],
+            .ResultType = fp_accum[idx],
+            .MSize = 8,
+            .NSize = 8,
+            .KSize = 8,
+         };
+      }
+   }
+   /* 8-bit unsigned inputs accumulating into 32-bit unsigned */
+   vk_outarray_append_typed(VkCooperativeMatrixPropertiesKHR, &out, prop)
+   {
+      *prop = (VkCooperativeMatrixPropertiesKHR){
+         .sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR,
+         .scope = VK_SCOPE_SUBGROUP_KHR,
+         .saturatingAccumulation = false,
+         .AType = VK_COMPONENT_TYPE_UINT8_KHR,
+         .BType = VK_COMPONENT_TYPE_UINT8_KHR,
+         .CType = VK_COMPONENT_TYPE_UINT32_KHR,
+         .ResultType = VK_COMPONENT_TYPE_UINT32_KHR,
+         .MSize = 8,
+         .NSize = 8,
+         .KSize = 8,
+      };
+   }
+   vk_outarray_append_typed(VkCooperativeMatrixPropertiesKHR, &out, prop) /* 8-bit signed inputs, 32-bit signed accumulator */
+   {
+      *prop = (VkCooperativeMatrixPropertiesKHR){
+         .sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR,
+         .scope = VK_SCOPE_SUBGROUP_KHR,
+         .saturatingAccumulation = false,
+         .AType = VK_COMPONENT_TYPE_SINT8_KHR,
+         .BType = VK_COMPONENT_TYPE_SINT8_KHR,
+         .CType = VK_COMPONENT_TYPE_SINT32_KHR,
+         .ResultType = VK_COMPONENT_TYPE_SINT32_KHR,
+         .MSize = 8,
+         .NSize = 8,
+         .KSize = 8,
+      };
+   }
+   return vk_outarray_status(&out);
+}
diff --git a/src/gallium/frontends/lavapipe/lvp_pipeline.c b/src/gallium/frontends/lavapipe/lvp_pipeline.c
index 28e4d4cd4e9..ee559ea71d4 100644
--- a/src/gallium/frontends/lavapipe/lvp_pipeline.c
+++ b/src/gallium/frontends/lavapipe/lvp_pipeline.c
@@ -347,6 +347,8 @@ lvp_shader_lower(struct lvp_device *pdevice, nir_shader *nir, struct lvp_pipelin
    NIR_PASS(_, nir, nir_lower_system_values);
    NIR_PASS(_, nir, nir_lower_is_helper_invocation);
 
+   NIR_PASS(_, nir, lvp_nir_lower_cooperative_matrix);
+
    const struct nir_lower_compute_system_values_options compute_system_values = {0};
    NIR_PASS(_, nir, nir_lower_compute_system_values, &compute_system_values);
 
diff --git a/src/gallium/frontends/lavapipe/meson.build b/src/gallium/frontends/lavapipe/meson.build
index 2a0c1bcc563..b87926e8022 100644
--- a/src/gallium/frontends/lavapipe/meson.build
+++ b/src/gallium/frontends/lavapipe/meson.build
@@ -12,6 +12,7 @@ lvp_entrypoints = custom_target(
 )
 
 liblvp_files = files(
+    'nir/lvp_nir_lower_cooperative_matrix.c',
     'nir/lvp_nir_lower_exec_graph.c',
     'nir/lvp_nir_lower_input_attachments.c',
     'nir/lvp_nir_lower_pipeline_layout.c',
diff --git a/src/gallium/frontends/lavapipe/nir/lvp_nir.h b/src/gallium/frontends/lavapipe/nir/lvp_nir.h
index a4b6535b696..fbb76d9b9a1 100644
--- a/src/gallium/frontends/lavapipe/nir/lvp_nir.h
+++ b/src/gallium/frontends/lavapipe/nir/lvp_nir.h
@@ -121,4 +121,6 @@ bool lvp_nir_lower_sparse_residency(struct nir_shader *shader);
 bool lvp_nir_opt_robustness(struct nir_shader *shader, struct lvp_device *device,
                             struct vk_pipeline_robustness_state *robustness);
 
+bool lvp_nir_lower_cooperative_matrix(nir_shader *shader);
+
 #endif
diff --git a/src/gallium/frontends/lavapipe/nir/lvp_nir_lower_cooperative_matrix.c b/src/gallium/frontends/lavapipe/nir/lvp_nir_lower_cooperative_matrix.c
new file mode 100644
index 00000000000..6f08a3813c6
--- /dev/null
+++ b/src/gallium/frontends/lavapipe/nir/lvp_nir_lower_cooperative_matrix.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright © 2025 Red Hat
+ *
+ * SPDX-License-Identifier: MIT
+ */
+#include "lvp_nir.h"
+
+extern unsigned lp_native_vector_width;
+
+#define MAX_CMAT_LEN 16
+#define CMAT_LEN (lp_native_vector_width / 32)
+
+/* This pass lowers cooperative matrices.
+ *
+ * For lavapipe we advertise 8x8 matrices.
+ * This means we can store vec8[8] and get the backend to do the right thing.
+ */
+static unsigned
+get_cmat_size(struct glsl_cmat_description matrix_desc)
+{
+   return matrix_desc.cols * matrix_desc.rows; /* total number of elements in the matrix */
+}
+
+static unsigned
+get_cmat_length(struct glsl_cmat_description matrix_desc)
+{
+   return get_cmat_size(matrix_desc) / CMAT_LEN; /* element count / CMAT_LEN: component count of the lowered vector */
+}
+
+static const struct glsl_type *
+remap_matrix_type(struct hash_table *mapping, const struct glsl_type *orig)
+{
+   /* Types already translated are served from the cache. */
+   struct hash_entry *cached = _mesa_hash_table_search(mapping, orig);
+   if (cached != NULL)
+      return cached->data;
+
+   const struct glsl_type *remapped = orig;
+
+   if (glsl_type_is_cmat(orig)) {
+      /* A matrix turns into a plain vector of get_cmat_length() components. */
+      const struct glsl_cmat_description *desc = glsl_get_cmat_description(orig);
+
+      remapped = glsl_vector_type(desc->element_type, get_cmat_length(*desc));
+   } else if (glsl_type_is_array(orig)) {
+      /* Arrays are rebuilt only when their element type was remapped. */
+      const struct glsl_type *old_elem = glsl_get_array_element(orig);
+      const struct glsl_type *new_elem = remap_matrix_type(mapping, old_elem);
+
+      if (new_elem != old_elem) {
+         remapped = glsl_array_type(new_elem, glsl_get_length(orig),
+                                    glsl_get_explicit_stride(orig));
+      }
+   }
+   _mesa_hash_table_insert(mapping, orig, (void *)remapped);
+   return remapped;
+}
+
+static nir_def *
+load_cmat_deref(nir_builder *b, nir_deref_instr *src)
+{
+   /* The vector shape to load is derived from the cmat type on the deref. */
+   const struct glsl_cmat_description *desc = glsl_get_cmat_description(src->type);
+   const unsigned num_comps = get_cmat_length(*desc);
+   const unsigned bit_size = glsl_base_type_bit_size(desc->element_type);
+
+   return nir_build_load_deref(b, num_comps, bit_size, &src->def, 0);
+}
+
+static ALWAYS_INLINE nir_def *
+load_cmat_src(nir_builder *b, nir_src src)
+{
+   return load_cmat_deref(b, nir_src_as_deref(src)); /* resolve the source to its deref and load the matrix value */
+}
+
+static ALWAYS_INLINE struct glsl_cmat_description
+cmat_src_desc(nir_src src)
+{
+   /* Copy out the cmat description attached to the type of the source's deref. */
+   return *glsl_get_cmat_description(nir_src_as_deref(src)->type);
+}
+
+static void
+store_cmat_deref(nir_builder *b, nir_deref_instr *dst, nir_def *val)
+{
+   /* Sanity check (asserts only): val must have the lowered shape of dst's matrix type. */
+   ASSERTED const struct glsl_cmat_description *desc = glsl_get_cmat_description(dst->type);
+
+   assert(val->bit_size == glsl_base_type_bit_size(desc->element_type));
+   assert(val->num_components == get_cmat_length(*desc));
+
+   nir_store_deref(b, dst, val, ~0);
+}
+
+static ALWAYS_INLINE void
+store_cmat_src(nir_builder *b, nir_src dst_src, nir_def *val)
+{
+   store_cmat_deref(b, nir_src_as_deref(dst_src), val); /* resolve the destination source to its deref, then store val */
+}
+
+static bool
+lower_cmat_copy(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   nir_build_copy_deref(b, intr->src[0].ssa, intr->src[1].ssa); /* replace cmat_copy with a deref copy of the same two sources */
+   nir_instr_remove(&intr->instr); /* the original intrinsic is dropped */
+   return true;
+}
+
+static nir_def *
+convert_base_type(nir_builder *b, nir_def *src, enum glsl_base_type src_type, enum glsl_base_type dst_type)
+{
+   if (src_type == dst_type) /* identical element types need no conversion */
+      return src;
+
+   const nir_alu_type from = nir_get_nir_type_for_glsl_base_type(src_type);
+   const nir_alu_type to = nir_get_nir_type_for_glsl_base_type(dst_type);
+
+   return nir_build_alu1(b, nir_type_conversion_op(from, to, nir_rounding_mode_undef), src);
+}
+
+static bool
+lower_cmat_convert(nir_builder *b,
+                   nir_intrinsic_instr *intr)
+{
+   /* src[0] is the destination matrix deref, src[1] the matrix being converted. */
+   const enum glsl_base_type to_type = cmat_src_desc(intr->src[0]).element_type;
+   const enum glsl_base_type from_type = cmat_src_desc(intr->src[1]).element_type;
+
+   /* The conversion is applied to the whole lowered vector at once. */
+   nir_def *input = load_cmat_src(b, intr->src[1]);
+   nir_def *converted = convert_base_type(b, input, from_type, to_type);
+
+   store_cmat_src(b, intr->src[0], converted);
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_cmat_load_store(nir_builder *b,
+                      struct hash_table *type_mapping,
+                      nir_intrinsic_instr *intr)
+{
+   const bool is_load = intr->intrinsic == nir_intrinsic_cmat_load;
+   const struct glsl_cmat_description desc = cmat_src_desc(intr->src[!is_load]); /* load: matrix is src[0]; store: src[1] */
+   enum glsl_matrix_layout layout = nir_intrinsic_matrix_layout(intr);
+   nir_deref_instr *cmat_deref = nir_src_as_deref(intr->src[!is_load]);
+   nir_deref_instr *deref = nir_src_as_deref(intr->src[is_load]); /* the memory pointer is the other of src[0]/src[1] */
+   nir_def *stride = intr->src[2].ssa;
+   /* Expands the cmat load/store into CMAT_LEN scalar accesses per invocation, addressed by lane id. */
+   nir_def *lane_id = nir_load_subgroup_invocation(b);
+   unsigned type_size_B = glsl_base_type_bit_size(desc.element_type) / 8;
+   const uint32_t ptr_stride = glsl_get_bit_size(deref->type) / 8 * glsl_get_vector_elements(deref->type);
+   deref = nir_build_deref_cast(b, &deref->def, deref->modes, deref->type, ptr_stride);
+   const struct glsl_type *cmat_type = remap_matrix_type(type_mapping, cmat_deref->type);
+   cmat_deref = nir_build_deref_cast(b, &cmat_deref->def, cmat_deref->modes,
+                                     cmat_type, 0);
+
+   /* B matrices are kept transposed, so access their memory with the opposite layout */
+   if (desc.use == GLSL_CMAT_USE_B)
+      layout =
+         layout == GLSL_MATRIX_LAYOUT_COLUMN_MAJOR ? GLSL_MATRIX_LAYOUT_ROW_MAJOR : GLSL_MATRIX_LAYOUT_COLUMN_MAJOR;
+
+   unsigned idx_bits = deref->def.bit_size;
+   nir_def *vars[MAX_CMAT_LEN];
+   /* For a store, split the matrix vector into the scalars written below. */
+   if (!is_load) {
+      nir_def *src = load_cmat_src(b, intr->src[!is_load]);
+      for (unsigned i = 0; i < CMAT_LEN; i++) {
+         vars[i] = nir_channel(b, src, i);
+      }
+   }
+   /* NOTE(review): loops run CMAT_LEN times but the vector has get_cmat_length(desc) components; equal only if rows * cols == CMAT_LEN * CMAT_LEN - confirm */
+   for (unsigned i = 0; i < CMAT_LEN; i++) {
+      nir_def *col_offset = lane_id;
+      nir_def *row_offset = nir_imm_int(b, i);
+      /* col_offset is the index that gets scaled by stride, row_offset the contiguous one; row-major swaps which of lane/i plays each role. */
+      if (layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR) {
+         SWAP(col_offset, row_offset);
+      }
+
+      col_offset = nir_imul(b, col_offset, stride);
+      col_offset = nir_u2uN(b, col_offset, idx_bits);
+      row_offset = nir_u2uN(b, row_offset, idx_bits);
+      /* First step the pointer by the strided index, in units of ptr_stride... */
+      nir_deref_instr *iter_deref = nir_build_deref_ptr_as_array(b, deref, col_offset);
+      /* ...then view it as scalars of the element type to apply the contiguous index. */
+      iter_deref = nir_build_deref_cast(b, &iter_deref->def,
+                                        deref->modes,
+                                        glsl_scalar_type(desc.element_type),
+                                        type_size_B);
+      iter_deref = nir_build_deref_ptr_as_array(b, iter_deref, row_offset);
+
+      if (is_load) {
+         vars[i] = nir_load_deref(b, iter_deref);
+      } else {
+         nir_store_deref(b, iter_deref, vars[i], ~0);
+      }
+   }
+   /* For a load, gather the scalars into the vector stored to the (retyped) matrix deref. */
+   if (is_load) {
+      nir_def *mat = nir_vec(b, vars, CMAT_LEN);
+      nir_store_deref(b, cmat_deref, mat, nir_component_mask(mat->num_components));
+   }
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_cmat_construct(nir_builder *b,
+                     nir_intrinsic_instr *intr)
+{
+   /* Splat the value in src[1] across every component of the destination in src[0]. */
+   nir_deref_instr *out = nir_src_as_deref(intr->src[0]);
+   const unsigned num_comps = get_cmat_length(cmat_src_desc(intr->src[0]));
+   nir_def *splat = nir_replicate(b, intr->src[1].ssa, num_comps);
+
+   nir_store_deref(b, out, splat, nir_component_mask(splat->num_components));
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_cmat_extract(nir_builder *b,
+                   nir_intrinsic_instr *intr)
+{
+   /* The result is the component selected by src[1] from the loaded vector of src[0]. */
+   nir_def *vec = load_cmat_src(b, intr->src[0]);
+   nir_def *picked = nir_vector_extract(b, vec, intr->src[1].ssa);
+   nir_def_replace(&intr->def, picked);
+   return true;
+}
+
+static bool
+lower_cmat_insert(nir_builder *b,
+                  nir_intrinsic_instr *intr)
+{
+   /* Sources: [0] destination matrix, [1] new element, [2] input matrix, [3] index. */
+   nir_def *old_vec = load_cmat_src(b, intr->src[2]);
+   nir_def *new_elem = intr->src[1].ssa;
+   nir_def *position = intr->src[3].ssa;
+   nir_def *updated = nir_vector_insert(b, old_vec, new_elem, position);
+   store_cmat_src(b, intr->src[0], updated);
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_cmat_binary_op(nir_builder *b,
+                     nir_intrinsic_instr *intr)
+{
+   /* Apply the intrinsic's ALU op to the two operand vectors; result goes to src[0]. */
+   const nir_op alu_op = nir_intrinsic_alu_op(intr);
+   nir_def *lhs = load_cmat_src(b, intr->src[1]);
+   nir_def *rhs = load_cmat_src(b, intr->src[2]);
+
+   store_cmat_src(b, intr->src[0], nir_build_alu2(b, alu_op, lhs, rhs));
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_cmat_unary_op(nir_builder *b,
+                    nir_intrinsic_instr *intr)
+{
+   /* Apply the intrinsic's ALU op to the operand vector; result goes to src[0]. */
+   const nir_op alu_op = nir_intrinsic_alu_op(intr);
+   nir_def *operand = load_cmat_src(b, intr->src[1]);
+
+   store_cmat_src(b, intr->src[0], nir_build_alu1(b, alu_op, operand));
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_cmat_scalar_op(nir_builder *b,
+                     nir_intrinsic_instr *intr)
+{
+   /* Combine the matrix vector in src[1] with the operand in src[2]; result goes to src[0]. */
+   const nir_op alu_op = nir_intrinsic_alu_op(intr);
+   nir_def *matrix = load_cmat_src(b, intr->src[1]);
+
+   store_cmat_src(b, intr->src[0], nir_build_alu2(b, alu_op, matrix, intr->src[2].ssa));
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_cmat_length(nir_builder *b,
+                  nir_intrinsic_instr *intr)
+{ /* Replaces the length query with the constant CMAT_LEN. */
+   nir_def_replace(&intr->def, nir_imm_int(b, CMAT_LEN)); /* NOTE(review): not get_cmat_length(); equal only if rows * cols == CMAT_LEN * CMAT_LEN - confirm */
+   return true;
+}
+
+static bool
+lower_cmat_muladd(nir_builder *b,
+                  nir_intrinsic_instr *intr)
+{
+   const struct glsl_cmat_description a_desc = cmat_src_desc(intr->src[1]);
+   const struct glsl_cmat_description b_desc = cmat_src_desc(intr->src[2]);
+   const struct glsl_cmat_description c_desc = cmat_src_desc(intr->src[3]);
+   nir_def *cmat_a = load_cmat_src(b, intr->src[1]);
+   nir_def *cmat_b = load_cmat_src(b, intr->src[2]);
+   nir_def *cmat_c = load_cmat_src(b, intr->src[3]);
+   /* Lowers the multiply-add: src[0] is the result matrix deref, src[1..3] are A, B and C. */
+   unsigned a_length = get_cmat_length(a_desc);
+   unsigned b_length = get_cmat_length(b_desc);
+   unsigned c_length = get_cmat_length(c_desc);
+   nir_def *a_comps[NIR_MAX_VEC_COMPONENTS];
+   nir_def *b_comps[NIR_MAX_VEC_COMPONENTS];
+   nir_def *c_comps[NIR_MAX_VEC_COMPONENTS];
+   nir_def *d_comps[NIR_MAX_VEC_COMPONENTS];
+   const nir_cmat_signed cmat_signed_mask = nir_intrinsic_cmat_signed_mask(intr);
+   /* C's base type with the C-signed flag applied; it selects signed vs unsigned widening below. */
+   enum glsl_base_type c_element_type =
+      glsl_apply_signedness_to_base_type(c_desc.element_type, cmat_signed_mask & NIR_CMAT_C_SIGNED);
+   /* Split each operand vector into scalars so they can be combined per component. */
+   for (unsigned i = 0; i < a_length; i++)
+      a_comps[i] = nir_channel(b, cmat_a, i);
+
+   for (unsigned i = 0; i < b_length; i++)
+      b_comps[i] = nir_channel(b, cmat_b, i);
+
+   for (unsigned i = 0; i < c_length; i++)
+      c_comps[i] = nir_channel(b, cmat_c, i);
+   /* Result component i is assembled lane by lane: lane j keeps the reduction of a[i] * b[j]. */
+   nir_def *lane_id = nir_load_subgroup_invocation(b);
+   int accum_bit_size = glsl_base_type_bit_size(c_desc.element_type);
+   for (unsigned i = 0; i < CMAT_LEN; i++) { /* NOTE(review): assumes a_length, b_length and c_length all equal CMAT_LEN - confirm */
+      nir_def *ref = nir_imm_zero(b, 1, glsl_base_type_bit_size(c_desc.element_type));
+      for (unsigned j = 0; j < CMAT_LEN; j++) {
+         nir_def *outer_else_val = ref;
+         ref = nir_imm_zero(b, 1, glsl_base_type_bit_size(c_desc.element_type));
+         /* Product for this (i, j) pair, with both inputs first widened to the accumulator bit size. */
+         nir_def *a_i = a_comps[i];
+         nir_def *b_j = b_comps[j]; /* B is stored transposed */
+         nir_def *val;
+         if (glsl_base_type_is_integer(c_desc.element_type)) {
+            if (c_element_type == GLSL_TYPE_INT)
+               a_i = nir_i2iN(b, a_i, accum_bit_size);
+            else
+               a_i = nir_u2uN(b, a_i, accum_bit_size);
+            if (c_element_type == GLSL_TYPE_INT)
+               b_j = nir_i2iN(b, b_j, accum_bit_size);
+            else
+               b_j = nir_u2uN(b, b_j, accum_bit_size);
+            /* NOTE(review): A and B are both extended using C's signedness; the A/B bits of cmat_signed_mask are not consulted - confirm */
+            val = nir_imul(b, a_i, b_j);
+            ref = nir_iadd(b, ref, val);
+         } else {
+            a_i = nir_f2fN(b, a_i, accum_bit_size);
+            b_j = nir_f2fN(b, b_j, accum_bit_size);
+            val = nir_fmul(b, a_i, b_j);
+            ref = nir_fadd(b, ref, val);
+         }
+         /* Add the per-invocation products together with a reduction (nir_reduce). */
+         if (glsl_base_type_is_integer(c_desc.element_type)) {
+            ref = nir_reduce(b, ref, .reduction_op = nir_op_iadd);
+         } else {
+            ref = nir_reduce(b, ref, .reduction_op = nir_op_fadd);
+         }
+         /* Only lane j takes this sum; every other lane keeps what earlier iterations selected. */
+         nir_def *lane = nir_ieq_imm(b, lane_id, j);
+         ref = nir_bcsel(b, lane, ref, outer_else_val);
+      }
+      /* Finally add the matching component of C. */
+      if (glsl_base_type_is_integer(c_desc.element_type)) {
+         ref = nir_iadd(b, ref, c_comps[i]);
+      } else {
+         ref = nir_fadd(b, ref, c_comps[i]);
+      }
+      d_comps[i] = ref;
+   }
+   nir_def *ret = nir_vec(b, d_comps, CMAT_LEN);
+   store_cmat_src(b, intr->src[0], ret);
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_cmat_bitcast(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   nir_def *value = load_cmat_src(b, intr->src[1]); /* loaded vector is stored to src[0] unmodified */
+   nir_store_deref(b, nir_src_as_deref(intr->src[0]), value, nir_component_mask(value->num_components));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_impl(nir_function_impl *impl,
+           struct hash_table *type_mapping)
+{
+   bool changed = false;
+
+   /* Retype function-local variables that are (arrays of) cooperative matrices. */
+   nir_foreach_function_temp_variable(var, impl) {
+      const struct glsl_type *lowered = remap_matrix_type(type_mapping, var->type);
+
+      if (lowered != var->type) {
+         var->type = lowered;
+         changed = true;
+      }
+   }
+
+   /* Walk backwards: users of a deref are visited, and lowered with the
+    * original matrix type still visible, before the deref itself is retyped. */
+   nir_builder b = nir_builder_create(impl);
+   nir_foreach_block_reverse_safe (block, impl) {
+      nir_foreach_instr_reverse_safe (instr, block) {
+         b.cursor = nir_before_instr(instr);
+
+         if (instr->type == nir_instr_type_deref) {
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+            const struct glsl_type *lowered = remap_matrix_type(type_mapping, deref->type);
+
+            if (lowered != deref->type) {
+               deref->type = lowered;
+               changed = true;
+            }
+            continue;
+         }
+
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         /* Dispatch each cooperative matrix intrinsic to its lowering helper. */
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         switch (intr->intrinsic) {
+         case nir_intrinsic_cmat_length:
+            changed |= lower_cmat_length(&b, intr);
+            break;
+         case nir_intrinsic_cmat_construct:
+            changed |= lower_cmat_construct(&b, intr);
+            break;
+         case nir_intrinsic_cmat_extract:
+            changed |= lower_cmat_extract(&b, intr);
+            break;
+         case nir_intrinsic_cmat_insert:
+            changed |= lower_cmat_insert(&b, intr);
+            break;
+         case nir_intrinsic_cmat_load:
+         case nir_intrinsic_cmat_store:
+            changed |= lower_cmat_load_store(&b, type_mapping, intr);
+            break;
+         case nir_intrinsic_cmat_binary_op:
+            changed |= lower_cmat_binary_op(&b, intr);
+            break;
+         case nir_intrinsic_cmat_unary_op:
+            changed |= lower_cmat_unary_op(&b, intr);
+            break;
+         case nir_intrinsic_cmat_scalar_op:
+            changed |= lower_cmat_scalar_op(&b, intr);
+            break;
+         case nir_intrinsic_cmat_muladd:
+            changed |= lower_cmat_muladd(&b, intr);
+            break;
+         case nir_intrinsic_cmat_copy:
+            changed |= lower_cmat_copy(&b, intr);
+            break;
+         case nir_intrinsic_cmat_convert:
+            changed |= lower_cmat_convert(&b, intr);
+            break;
+         case nir_intrinsic_cmat_bitcast:
+            changed |= lower_cmat_bitcast(&b, intr);
+            break;
+         default:
+            break;
+         }
+      }
+   }
+
+   return nir_progress(changed, impl, nir_metadata_none);
+}
+
+bool
+lvp_nir_lower_cooperative_matrix(nir_shader *shader)
+{
+   /* info.cs shares storage with the other stages' info, so only consult it
+    * for compute shaders, the only stage cooperative matrices are exposed in. */
+   if (shader->info.stage != MESA_SHADER_COMPUTE || !shader->info.cs.has_cooperative_matrix)
+      return false;
+
+   bool progress = false;
+   struct hash_table *type_mapping = _mesa_pointer_hash_table_create(NULL);
+   /* Retype shader-scope temporaries that are (arrays of) cooperative matrices. */
+   nir_foreach_variable_with_modes(var, shader, nir_var_shader_temp) {
+      const struct glsl_type *new_type = remap_matrix_type(type_mapping, var->type);
+
+      if (new_type != var->type) {
+         var->type = new_type;
+         progress = true;
+      }
+   }
+
+   progress |= lower_impl(nir_shader_get_entrypoint(shader), type_mapping);
+
+   _mesa_hash_table_destroy(type_mapping, NULL);
+   return progress;
+}