diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index 7c20d5e9b4a..f3441316849 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -121,6 +121,9 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
    /* Default to the sampler since that's what we've done since forever */
    compiler->indirect_ubos_use_sampler = true;
 
+   compiler->lower_dpas = devinfo->verx10 < 125 ||
+      debug_get_bool_option("INTEL_LOWER_DPAS", false);
+
    /* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
    for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
       compiler->scalar_stage[i] = devinfo->ver >= 8 ||
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index b6cd0308c9d..bdc403c0d5a 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -128,6 +128,14 @@ struct brw_compiler {
     */
    bool use_bindless_sampler_offset;
 
+   /**
+    * Should DPAS instructions be lowered?
+    *
+    * This will be set for all platforms before Gfx12.5. It may also be set
+    * on platforms that support DPAS for testing purposes.
+    */
+   bool lower_dpas;
+
    /**
     * Calling the ra_allocate function after each register spill can take
     * several minutes. This option speeds up shader compilation by spilling
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index ca2d18639ae..c0937e0357b 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6134,6 +6134,9 @@ fs_visitor::optimize()
 
    validate();
 
+   if (compiler->lower_dpas)
+      OPT(brw_lower_dpas, *this);
+
    OPT(split_virtual_grfs);
 
    /* Before anything else, eliminate dead code.  The results of some NIR
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index c7af424c8fc..a57274250b9 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -626,6 +626,8 @@ void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst
 int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
                                     const brw_stage_prog_data *prog_data);
 
+bool brw_lower_dpas(fs_visitor &v);
+
 void nir_to_brw(fs_visitor *s);
 
 #endif /* BRW_FS_H */
diff --git a/src/intel/compiler/brw_fs_lower_dpas.cpp b/src/intel/compiler/brw_fs_lower_dpas.cpp
new file mode 100644
index 00000000000..306731722af
--- /dev/null
+++ b/src/intel/compiler/brw_fs_lower_dpas.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright 2023 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+static void
+f16_using_mac(const fs_builder &bld, fs_inst *inst)
+{
+   /* We only intend to support configurations where the destination and
+    * accumulator have the same type.
+    */
+   if (!inst->src[0].is_null())
+      assert(inst->dst.type == inst->src[0].type);
+
+   assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
+   assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);
+
+   const brw_reg_type src0_type = inst->dst.type;
+   const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
+   const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;
+
+   const fs_reg dest = inst->dst;
+   fs_reg src0 = inst->src[0];
+   const fs_reg src1 = retype(inst->src[1], src1_type);
+   const fs_reg src2 = retype(inst->src[2], src2_type);
+
+   const unsigned dest_stride = dest.type == BRW_REGISTER_TYPE_HF ?
+      REG_SIZE / 2 : REG_SIZE;
+
+   for (unsigned r = 0; r < inst->rcount; r++) {
+      fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);
+
+      for (unsigned subword = 0; subword < 2; subword++) {
+         for (unsigned s = 0; s < inst->sdepth; s++) {
+            /* The first multiply of the dot-product operation has to
+             * explicitly write the accumulator register. The successive MAC
+             * instructions will implicitly read *and* write the
+             * accumulator. Those MAC instructions can also optionally
+             * explicitly write some other register.
+             *
+             * FINISHME: The accumulator can actually hold 16 HF values. On
+             * Gfx12 there are two accumulators. It should be possible to do
+             * this in SIMD16 or even SIMD32. I was unable to get this to work
+             * properly.
+             */
+            if (s == 0 && subword == 0) {
+               const unsigned acc_width = 8;
+               fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
+                                      inst->group % acc_width);
+
+               if (bld.shader->devinfo->verx10 >= 125) {
+                  acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
+               } else {
+                  acc = retype(acc, BRW_REGISTER_TYPE_HF);
+               }
+
+               bld.MUL(acc,
+                       subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                        BRW_REGISTER_TYPE_UD),
+                                 BRW_REGISTER_TYPE_HF, subword),
+                       component(retype(byte_offset(src2, r * REG_SIZE),
+                                        BRW_REGISTER_TYPE_HF),
+                                 s * 2 + subword))
+                  ->writes_accumulator = true;
+
+            } else {
+               fs_reg result;
+
+               /* As mentioned above, the MAC had an optional, explicit
+                * destination register. Various optimization passes are not
+                * clever enough to understand the intricacies of this
+                * instruction, so only write the result register on the final
+                * MAC in the sequence.
+                */
+               if ((s + 1) == inst->sdepth && subword == 1)
+                  result = temp;
+               else
+                  result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);
+
+               bld.MAC(result,
+                       subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                        BRW_REGISTER_TYPE_UD),
+                                 BRW_REGISTER_TYPE_HF, subword),
+                       component(retype(byte_offset(src2, r * REG_SIZE),
+                                        BRW_REGISTER_TYPE_HF),
+                                 s * 2 + subword))
+                  ->writes_accumulator = true;
+            }
+         }
+      }
+
+      if (!src0.is_null()) {
+         if (src0_type != BRW_REGISTER_TYPE_HF) {
+            fs_reg temp2 = bld.vgrf(src0_type, 1);
+
+            bld.MOV(temp2, temp);
+
+            bld.ADD(byte_offset(dest, r * dest_stride),
+                    temp2,
+                    byte_offset(src0, r * dest_stride));
+         } else {
+            bld.ADD(byte_offset(dest, r * dest_stride),
+                    temp,
+                    byte_offset(src0, r * dest_stride));
+         }
+      } else {
+         bld.MOV(byte_offset(dest, r * dest_stride), temp);
+      }
+   }
+}
+
+static void
+int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
+{
+   /* We only intend to support configurations where the destination and
+    * accumulator have the same type.
+    */
+   if (!inst->src[0].is_null())
+      assert(inst->dst.type == inst->src[0].type);
+
+   assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
+          inst->src[1].type == BRW_REGISTER_TYPE_UB);
+   assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
+          inst->src[2].type == BRW_REGISTER_TYPE_UB);
+
+   const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB ?
+      BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
+
+   const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB ?
+      BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
+
+   fs_reg dest = inst->dst;
+   fs_reg src0 = inst->src[0];
+   const fs_reg src1 = retype(inst->src[1], src1_type);
+   const fs_reg src2 = retype(inst->src[2], src2_type);
+
+   const unsigned dest_stride = REG_SIZE;
+
+   for (unsigned r = 0; r < inst->rcount; r++) {
+      if (!src0.is_null()) {
+         bld.MOV(dest, src0);
+         src0 = byte_offset(src0, dest_stride);
+      } else {
+         bld.MOV(dest, retype(brw_imm_d(0), dest.type));
+      }
+
+      for (unsigned s = 0; s < inst->sdepth; s++) {
+         bld.DP4A(dest,
+                  dest,
+                  byte_offset(src1, s * REG_SIZE),
+                  component(byte_offset(src2, r * REG_SIZE), s))
+            ->saturate = inst->saturate;
+      }
+
+      dest = byte_offset(dest, dest_stride);
+   }
+}
+
+static void
+int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
+{
+   /* We only intend to support configurations where the destination and
+    * accumulator have the same type.
+    */
+   if (!inst->src[0].is_null())
+      assert(inst->dst.type == inst->src[0].type);
+
+   assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
+          inst->src[1].type == BRW_REGISTER_TYPE_UB);
+   assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
+          inst->src[2].type == BRW_REGISTER_TYPE_UB);
+
+   const brw_reg_type src0_type = inst->dst.type;
+
+   const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB ?
+      BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
+
+   const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB ?
+      BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
+
+   fs_reg dest = inst->dst;
+   fs_reg src0 = inst->src[0];
+   const fs_reg src1 = retype(inst->src[1], src1_type);
+   const fs_reg src2 = retype(inst->src[2], src2_type);
+
+   const unsigned dest_stride = REG_SIZE;
+
+   for (unsigned r = 0; r < inst->rcount; r++) {
+      if (!src0.is_null()) {
+         bld.MOV(dest, src0);
+         src0 = byte_offset(src0, dest_stride);
+      } else {
+         bld.MOV(dest, retype(brw_imm_d(0), dest.type));
+      }
+
+      for (unsigned s = 0; s < inst->sdepth; s++) {
+         fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const brw_reg_type temp_type =
+            (inst->src[1].type == BRW_REGISTER_TYPE_B ||
+             inst->src[2].type == BRW_REGISTER_TYPE_B)
+            ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;
+
+         /* Expand 8 dwords of packed bytes into 16 dwords of packed
+          * words.
+          *
+          * FINISHME: Gfx9 should not need this work around. Gfx11
+          * may be able to use integer MAD. Both platforms may be
+          * able to use MAC.
+          */
+         bld.group(32, 0).MOV(retype(temp3, temp_type),
+                              retype(byte_offset(src2, r * REG_SIZE),
+                                     inst->src[2].type));
+
+         bld.MUL(subscript(temp1, temp_type, 0),
+                 subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                  BRW_REGISTER_TYPE_UD),
+                           inst->src[1].type, 0),
+                 subscript(component(retype(temp3,
+                                            BRW_REGISTER_TYPE_UD),
+                                     s * 2),
+                           temp_type, 0));
+
+         bld.MUL(subscript(temp1, temp_type, 1),
+                 subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                  BRW_REGISTER_TYPE_UD),
+                           inst->src[1].type, 1),
+                 subscript(component(retype(temp3,
+                                            BRW_REGISTER_TYPE_UD),
+                                     s * 2),
+                           temp_type, 1));
+
+         bld.MUL(subscript(temp2, temp_type, 0),
+                 subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                  BRW_REGISTER_TYPE_UD),
+                           inst->src[1].type, 2),
+                 subscript(component(retype(temp3,
+                                            BRW_REGISTER_TYPE_UD),
+                                     s * 2 + 1),
+                           temp_type, 0));
+
+         bld.MUL(subscript(temp2, temp_type, 1),
+                 subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                  BRW_REGISTER_TYPE_UD),
+                           inst->src[1].type, 3),
+                 subscript(component(retype(temp3,
+                                            BRW_REGISTER_TYPE_UD),
+                                     s * 2 + 1),
+                           temp_type, 1));
+
+         bld.ADD(subscript(temp1, src0_type, 0),
+                 subscript(temp1, temp_type, 0),
+                 subscript(temp1, temp_type, 1));
+
+         bld.ADD(subscript(temp2, src0_type, 0),
+                 subscript(temp2, temp_type, 0),
+                 subscript(temp2, temp_type, 1));
+
+         bld.ADD(retype(temp1, src0_type),
+                 retype(temp1, src0_type),
+                 retype(temp2, src0_type));
+
+         bld.ADD(dest, dest, retype(temp1, src0_type))
+            ->saturate = inst->saturate;
+      }
+
+      dest = byte_offset(dest, dest_stride);
+   }
+}
+
+bool
+brw_lower_dpas(fs_visitor &v)
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
+      if (inst->opcode != BRW_OPCODE_DPAS)
+         continue;
+
+      const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all();
+
+      if (brw_reg_type_is_floating_point(inst->dst.type)) {
+         f16_using_mac(bld, inst);
+      } else {
+         if (v.devinfo->ver >= 12) {
+            int8_using_dp4a(bld, inst);
+         } else {
+            int8_using_mul_add(bld, inst);
+         }
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+
+   return progress;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 5fd08abd4fb..0372c5aff9f 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -57,6 +57,7 @@ libintel_compiler_files = files(
   'brw_fs.h',
   'brw_fs_live_variables.cpp',
   'brw_fs_live_variables.h',
+  'brw_fs_lower_dpas.cpp',
   'brw_fs_lower_pack.cpp',
   'brw_fs_lower_regioning.cpp',
   'brw_fs_nir.cpp',