diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
index b3159ed57b4..ca81c491de1 100644
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -403,134 +403,6 @@ namespace brw {
          return brw_reg(dst);
       }
 
-      void
-      emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
-                     const brw_reg &tmp,
-                     unsigned left_offset, unsigned left_stride,
-                     unsigned right_offset, unsigned right_stride) const
-      {
-         brw_reg left, right;
-         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
-         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
-         if ((tmp.type == BRW_TYPE_Q || tmp.type == BRW_TYPE_UQ) &&
-             (!shader->devinfo->has_64bit_int || shader->devinfo->ver >= 20)) {
-            switch (opcode) {
-            case BRW_OPCODE_MUL:
-               /* This will get lowered by integer MUL lowering */
-               set_condmod(mod, emit(opcode, right, left, right));
-               break;
-
-            case BRW_OPCODE_SEL: {
-               /* In order for the comparisons to work out right, we need our
-                * comparisons to be strict.
-                */
-               assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
-               if (mod == BRW_CONDITIONAL_GE)
-                  mod = BRW_CONDITIONAL_G;
-
-               /* We treat the bottom 32 bits as unsigned regardless of
-                * whether or not the integer as a whole is signed.
-                */
-               brw_reg right_low = subscript(right, BRW_TYPE_UD, 0);
-               brw_reg left_low = subscript(left, BRW_TYPE_UD, 0);
-
-               /* The upper bits get the same sign as the 64-bit type */
-               brw_reg_type type32 = brw_type_with_size(tmp.type, 32);
-               brw_reg right_high = subscript(right, type32, 1);
-               brw_reg left_high = subscript(left, type32, 1);
-
-               /* Build up our comparison:
-                *
-                *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
-                */
-               CMP(null_reg_ud(), retype(left_low, BRW_TYPE_UD),
-                   retype(right_low, BRW_TYPE_UD), mod);
-               set_predicate(BRW_PREDICATE_NORMAL,
-                             CMP(null_reg_ud(), left_high, right_high,
-                                 BRW_CONDITIONAL_EQ));
-               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
-                                 CMP(null_reg_ud(), left_high, right_high, mod));
-
-               /* We could use selects here or we could use predicated MOVs
-                * because the destination and second source (if it were a SEL)
-                * are the same.
-                */
-               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
-               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
-               break;
-            }
-
-            default:
-               unreachable("Unsupported 64-bit scan op");
-            }
-         } else {
-            set_condmod(mod, emit(opcode, right, left, right));
-         }
-      }
-
-      void
-      emit_scan(enum opcode opcode, const brw_reg &tmp,
-                unsigned cluster_size, brw_conditional_mod mod) const
-      {
-         assert(dispatch_width() >= 8);
-
-         /* The instruction splitting code isn't advanced enough to split
-          * these so we need to handle that ourselves.
-          */
-         if (dispatch_width() * brw_type_size_bytes(tmp.type) > 2 * REG_SIZE) {
-            const unsigned half_width = dispatch_width() / 2;
-            const fs_builder ubld = exec_all().group(half_width, 0);
-            brw_reg left = tmp;
-            brw_reg right = horiz_offset(tmp, half_width);
-            ubld.emit_scan(opcode, left, cluster_size, mod);
-            ubld.emit_scan(opcode, right, cluster_size, mod);
-            if (cluster_size > half_width) {
-               ubld.emit_scan_step(opcode, mod, tmp,
-                                   half_width - 1, 0, half_width, 1);
-            }
-            return;
-         }
-
-         if (cluster_size > 1) {
-            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
-            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
-         }
-
-         if (cluster_size > 2) {
-            if (brw_type_size_bytes(tmp.type) <= 4) {
-               const fs_builder ubld =
-                  exec_all().group(dispatch_width() / 4, 0);
-               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
-               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
-            } else {
-               /* For 64-bit types, we have to do things differently because
-                * the code above would land us with destination strides that
-                * the hardware can't handle. Fortunately, we'll only be
-                * 8-wide in that case and it's the same number of
-                * instructions.
-                */
-               const fs_builder ubld = exec_all().group(2, 0);
-               for (unsigned i = 0; i < dispatch_width(); i += 4)
-                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
-            }
-         }
-
-         for (unsigned i = 4;
-              i < MIN2(cluster_size, dispatch_width());
-              i *= 2) {
-            const fs_builder ubld = exec_all().group(i, 0);
-            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);
-
-            if (dispatch_width() > i * 2)
-               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);
-
-            if (dispatch_width() > i * 4) {
-               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
-               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
-            }
-         }
-      }
-
       fs_inst *
       emit_undef_for_dst(const fs_inst *old_inst) const
       {
diff --git a/src/intel/compiler/brw_lower_subgroup_ops.cpp b/src/intel/compiler/brw_lower_subgroup_ops.cpp
index cedb2e17759..d3d99ef312c 100644
--- a/src/intel/compiler/brw_lower_subgroup_ops.cpp
+++ b/src/intel/compiler/brw_lower_subgroup_ops.cpp
@@ -121,6 +121,135 @@ brw_get_reduction_info(brw_reduce_op red_op, brw_reg_type type)
    return info;
 }
 
+static void
+brw_emit_scan_step(const fs_builder &bld, enum opcode opcode, brw_conditional_mod mod,
+                   const brw_reg &tmp,
+                   unsigned left_offset, unsigned left_stride,
+                   unsigned right_offset, unsigned right_stride)
+{
+   brw_reg left, right;
+   left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
+   right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
+   if ((tmp.type == BRW_TYPE_Q || tmp.type == BRW_TYPE_UQ) &&
+       (!bld.shader->devinfo->has_64bit_int || bld.shader->devinfo->ver >= 20)) {
+      switch (opcode) {
+      case BRW_OPCODE_MUL:
+         /* This will get lowered by integer MUL lowering */
+         set_condmod(mod, bld.emit(opcode, right, left, right));
+         break;
+
+      case BRW_OPCODE_SEL: {
+         /* In order for the comparisons to work out right, we need our
+          * comparisons to be strict.
+          */
+         assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
+         if (mod == BRW_CONDITIONAL_GE)
+            mod = BRW_CONDITIONAL_G;
+
+         /* We treat the bottom 32 bits as unsigned regardless of
+          * whether or not the integer as a whole is signed.
+          */
+         brw_reg right_low = subscript(right, BRW_TYPE_UD, 0);
+         brw_reg left_low = subscript(left, BRW_TYPE_UD, 0);
+
+         /* The upper bits get the same sign as the 64-bit type */
+         brw_reg_type type32 = brw_type_with_size(tmp.type, 32);
+         brw_reg right_high = subscript(right, type32, 1);
+         brw_reg left_high = subscript(left, type32, 1);
+
+         /* Build up our comparison:
+          *
+          *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
+          */
+         bld.CMP(bld.null_reg_ud(), retype(left_low, BRW_TYPE_UD),
+                 retype(right_low, BRW_TYPE_UD), mod);
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.CMP(bld.null_reg_ud(), left_high, right_high,
+                               BRW_CONDITIONAL_EQ));
+         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
+                           bld.CMP(bld.null_reg_ud(), left_high, right_high, mod));
+
+         /* We could use selects here or we could use predicated MOVs
+          * because the destination and second source (if it were a SEL)
+          * are the same.
+          */
+         set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_low, left_low));
+         set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_high, left_high));
+         break;
+      }
+
+      default:
+         unreachable("Unsupported 64-bit scan op");
+      }
+   } else {
+      set_condmod(mod, bld.emit(opcode, right, left, right));
+   }
+}
+
+static void
+brw_emit_scan(const fs_builder &bld, enum opcode opcode, const brw_reg &tmp,
+              unsigned cluster_size, brw_conditional_mod mod)
+{
+   unsigned dispatch_width = bld.dispatch_width();
+   assert(dispatch_width >= 8);
+
+   /* The instruction splitting code isn't advanced enough to split
+    * these so we need to handle that ourselves.
+    */
+   if (dispatch_width * brw_type_size_bytes(tmp.type) > 2 * REG_SIZE) {
+      const unsigned half_width = dispatch_width / 2;
+      const fs_builder ubld = bld.exec_all().group(half_width, 0);
+      brw_reg left = tmp;
+      brw_reg right = horiz_offset(tmp, half_width);
+      brw_emit_scan(ubld, opcode, left, cluster_size, mod);
+      brw_emit_scan(ubld, opcode, right, cluster_size, mod);
+      if (cluster_size > half_width) {
+         brw_emit_scan_step(ubld, opcode, mod, tmp,
+                            half_width - 1, 0, half_width, 1);
+      }
+      return;
+   }
+
+   if (cluster_size > 1) {
+      const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
+      brw_emit_scan_step(ubld, opcode, mod, tmp, 0, 2, 1, 2);
+   }
+
+   if (cluster_size > 2) {
+      if (brw_type_size_bytes(tmp.type) <= 4) {
+         const fs_builder ubld =
+            bld.exec_all().group(dispatch_width / 4, 0);
+         brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 2, 4);
+         brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 3, 4);
+      } else {
+         /* For 64-bit types, we have to do things differently because
+          * the code above would land us with destination strides that
+          * the hardware can't handle. Fortunately, we'll only be
+          * 8-wide in that case and it's the same number of
+          * instructions.
+          */
+         const fs_builder ubld = bld.exec_all().group(2, 0);
+         for (unsigned i = 0; i < dispatch_width; i += 4)
+            brw_emit_scan_step(ubld, opcode, mod, tmp, i + 1, 0, i + 2, 1);
+      }
+   }
+
+   for (unsigned i = 4;
+        i < MIN2(cluster_size, dispatch_width);
+        i *= 2) {
+      const fs_builder ubld = bld.exec_all().group(i, 0);
+      brw_emit_scan_step(ubld, opcode, mod, tmp, i - 1, 0, i, 1);
+
+      if (dispatch_width > i * 2)
+         brw_emit_scan_step(ubld, opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);
+
+      if (dispatch_width > i * 4) {
+         brw_emit_scan_step(ubld, opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
+         brw_emit_scan_step(ubld, opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
+      }
+   }
+}
+
 static bool
 brw_lower_reduce(fs_visitor &s, bblock_t *block, fs_inst *inst)
 {
@@ -147,7 +276,7 @@ brw_lower_reduce(fs_visitor &s, bblock_t *block, fs_inst *inst)
    brw_reg scan = bld.vgrf(src.type);
    bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity);
 
-   bld.emit_scan(info.op, scan, cluster_size, info.cond_mod);
+   brw_emit_scan(bld, info.op, scan, cluster_size, info.cond_mod);
 
    if (cluster_size * brw_type_size_bytes(src.type) >= REG_SIZE * 2) {
       /* In this case, CLUSTER_BROADCAST instruction isn't needed because
@@ -208,7 +337,7 @@ brw_lower_scan(fs_visitor &s, bblock_t *block, fs_inst *inst)
       scan = shifted;
    }
 
-   bld.emit_scan(info.op, scan, s.dispatch_width, info.cond_mod);
+   brw_emit_scan(bld, info.op, scan, s.dispatch_width, info.cond_mod);
 
    bld.MOV(dst, scan);
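The SEL path in brw_emit_scan_step builds a 64-bit MIN/MAX comparison out of 32-bit halves because the lowering targets hardware where 64-bit integer operations are missing or restricted (the has_64bit_int / ver >= 20 check above). The identity it relies on is l < r <=> l_hi < r_hi || (l_hi == r_hi && l_lo < r_lo), with the low halves always compared as unsigned; the GE case is converted to G because the decomposition only holds for strict compares, and since the destination aliases the second source, > and >= select the same value anyway. A minimal host-side sketch of that decomposition (standalone C++, not part of the patch; the function name is made up for illustration):

   #include <cassert>
   #include <cstdint>

   /* Illustrative only: compare two int64_t values using nothing wider than
    * 32-bit compares, mirroring the split in the SEL path above.  The high
    * halves keep the sign of the 64-bit type; the low halves are always
    * compared as unsigned.
    */
   static bool
   less_than_s64_via_32(int64_t l, int64_t r)
   {
      const uint32_t l_lo = (uint32_t)l;        /* bottom 32 bits, unsigned */
      const uint32_t r_lo = (uint32_t)r;
      const int32_t  l_hi = (int32_t)(l >> 32); /* top 32 bits, signed */
      const int32_t  r_hi = (int32_t)(r >> 32);

      /* l < r  <=>  l_hi < r_hi || (l_hi == r_hi && l_lo < r_lo) */
      return l_hi < r_hi || (l_hi == r_hi && l_lo < r_lo);
   }

   int main()
   {
      const int64_t vals[] = { INT64_MIN, -(1ll << 33), -1, 0, 1,
                               (1ll << 32) - 1, 1ll << 32, INT64_MAX };
      for (int64_t a : vals)
         for (int64_t b : vals)
            assert(less_than_s64_via_32(a, b) == (a < b));
      return 0;
   }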
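brw_emit_scan performs the scan in place: each brw_emit_scan_step call reads a strided "left" view and a strided "right" view of the same temporary and does right[j] = op(left[j], right[j]) across the step's channels. For a SIMD8, 32-bit scan with cluster_size == 8, the calls above reduce to four steps: (left offset 0 stride 2, right offset 1 stride 2) over 4 channels, (1,4 -> 2,4) and (1,4 -> 3,4) over 2 channels, then (3,0 -> 4,1) over 4 channels. A small host-side sketch (standalone C++; the step table is hand-derived from the code above rather than produced by it) that replays those steps on an array and checks the result is an inclusive prefix sum:

   #include <cassert>
   #include <cstdint>

   struct scan_step { unsigned channels, l_off, l_stride, r_off, r_stride; };

   /* Step sequence brw_emit_scan() generates for a SIMD8, 32-bit scan with
    * cluster_size == 8 (hand-derived, illustrative only).
    */
   static const scan_step simd8_steps[] = {
      { 4, 0, 2, 1, 2 },  /* cluster_size > 1: t[1] += t[0], t[3] += t[2], ... */
      { 2, 1, 4, 2, 4 },  /* cluster_size > 2: t[2] += t[1], t[6] += t[5] */
      { 2, 1, 4, 3, 4 },  /*                   t[3] += t[1], t[7] += t[5] */
      { 4, 3, 0, 4, 1 },  /* i == 4 loop:      t[4..7] += t[3] */
   };

   int main()
   {
      uint32_t t[8];
      for (unsigned i = 0; i < 8; i++)
         t[i] = 3 * i + 1;                  /* arbitrary per-channel inputs */

      /* Replay each step: right[j] = op(left[j], right[j]), with op = add. */
      for (const scan_step &s : simd8_steps)
         for (unsigned j = 0; j < s.channels; j++)
            t[s.r_off + j * s.r_stride] += t[s.l_off + j * s.l_stride];

      /* The temporary now holds an inclusive prefix sum of the inputs. */
      uint32_t ref = 0;
      for (unsigned i = 0; i < 8; i++) {
         ref += 3 * i + 1;
         assert(t[i] == ref);
      }
      return 0;
   }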