diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index d3aec055a77..7892a69d981 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2132,77 +2132,6 @@ fs_visitor::get_pull_locs(const fs_reg &src, return true; } -/** - * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD - * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. - */ -bool -brw_fs_lower_constant_loads(fs_visitor &s) -{ - unsigned index, pull_index; - bool progress = false; - - foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { - /* Set up the annotation tracking for new generated instructions. */ - const fs_builder ibld(&s, block, inst); - - for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != UNIFORM) - continue; - - /* We'll handle this case later */ - if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) - continue; - - if (!s.get_pull_locs(inst->src[i], &index, &pull_index)) - continue; - - assert(inst->src[i].stride == 0); - - const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ - const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0); - const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); - const unsigned base = pull_index * 4; - - fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; - srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); - srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); - srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); - - - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, - srcs, PULL_UNIFORM_CONSTANT_SRCS); - - /* Rewrite the instruction to use the temporary VGRF. */ - inst->src[i].file = VGRF; - inst->src[i].nr = dst.nr; - inst->src[i].offset = (base & (block_sz - 1)) + - inst->src[i].offset % 4; - - progress = true; - } - - if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && - inst->src[0].file == UNIFORM) { - - if (!s.get_pull_locs(inst->src[0], &index, &pull_index)) - continue; - - s.VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, - brw_imm_ud(index), - fs_reg() /* surface_handle */, - inst->src[1], - pull_index * 4, 4, 1); - inst->remove(block); - - progress = true; - } - } - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - /** * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE * instructions to FS_OPCODE_REP_FB_WRITE. @@ -2276,114 +2205,6 @@ fs_visitor::emit_repclear_shader() brw_fs_lower_scoreboard(*this); } -bool -brw_fs_lower_load_payload(fs_visitor &s) -{ - bool progress = false; - - foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { - if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) - continue; - - assert(inst->dst.file == MRF || inst->dst.file == VGRF); - assert(inst->saturate == false); - fs_reg dst = inst->dst; - - /* Get rid of COMPR4. We'll add it back in if we need it */ - if (dst.file == MRF) - dst.nr = dst.nr & ~BRW_MRF_COMPR4; - - const fs_builder ibld(&s, block, inst); - const fs_builder ubld = ibld.exec_all(); - - for (uint8_t i = 0; i < inst->header_size;) { - /* Number of header GRFs to initialize at once with a single MOV - * instruction. - */ - const unsigned n = - (i + 1 < inst->header_size && inst->src[i].stride == 1 && - inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ? 
- 2 : 1; - - if (inst->src[i].file != BAD_FILE) - ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD), - retype(inst->src[i], BRW_REGISTER_TYPE_UD)); - - dst = byte_offset(dst, n * REG_SIZE); - i += n; - } - - if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && - inst->exec_size > 8) { - /* In this case, the payload portion of the LOAD_PAYLOAD isn't - * a straightforward copy. Instead, the result of the - * LOAD_PAYLOAD is treated as interleaved and the first four - * non-header sources are unpacked as: - * - * m + 0: r0 - * m + 1: g0 - * m + 2: b0 - * m + 3: a0 - * m + 4: r1 - * m + 5: g1 - * m + 6: b1 - * m + 7: a1 - * - * This is used for gen <= 5 fb writes. - */ - assert(inst->exec_size == 16); - assert(inst->header_size + 4 <= inst->sources); - for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { - if (inst->src[i].file != BAD_FILE) { - if (s.devinfo->has_compr4) { - fs_reg compr4_dst = retype(dst, inst->src[i].type); - compr4_dst.nr |= BRW_MRF_COMPR4; - ibld.MOV(compr4_dst, inst->src[i]); - } else { - /* Platform doesn't have COMPR4. We have to fake it */ - fs_reg mov_dst = retype(dst, inst->src[i].type); - ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0)); - mov_dst.nr += 4; - ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1)); - } - } - - dst.nr++; - } - - /* The loop above only ever incremented us through the first set - * of 4 registers. However, thanks to the magic of COMPR4, we - * actually wrote to the first 8 registers, so we need to take - * that into account now. - */ - dst.nr += 4; - - /* The COMPR4 code took care of the first 4 sources. We'll let - * the regular path handle any remaining sources. Yes, we are - * modifying the instruction but we're about to delete it so - * this really doesn't hurt anything. - */ - inst->header_size += 4; - } - - for (uint8_t i = inst->header_size; i < inst->sources; i++) { - dst.type = inst->src[i].type; - if (inst->src[i].file != BAD_FILE) { - ibld.MOV(dst, inst->src[i]); - } - dst = offset(dst, ibld, 1); - } - - inst->remove(block); - progress = true; - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - /** * Factor an unsigned 32-bit integer. * @@ -2879,95 +2700,6 @@ brw_fs_lower_integer_multiplication(fs_visitor &s) return progress; } -bool -brw_fs_lower_sub_sat(fs_visitor &s) -{ - bool progress = false; - - foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { - const fs_builder ibld(&s, block, inst); - - if (inst->opcode == SHADER_OPCODE_USUB_SAT || - inst->opcode == SHADER_OPCODE_ISUB_SAT) { - /* The fundamental problem is the hardware performs source negation - * at the bit width of the source. If the source is 0x80000000D, the - * negation is 0x80000000D. As a result, subtractSaturate(0, - * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There - * are at least three ways to resolve this: - * - * 1. Use the accumulator for the negated source. The accumulator is - * 33 bits, so our source 0x80000000 is sign-extended to - * 0x1800000000. The negation of which is 0x080000000. This - * doesn't help for 64-bit integers (which are already bigger than - * 33 bits). There are also only 8 accumulators, so SIMD16 or - * SIMD32 instructions would have to be split into multiple SIMD8 - * instructions. - * - * 2. Use slightly different math. For any n-bit value x, we know (x - * >> 1) != -(x >> 1). We can use this fact to only do - * subtractions involving (x >> 1). 
subtractSaturate(a, b) == - * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)). - * - * 3. For unsigned sources, it is sufficient to replace the - * subtractSaturate with (a > b) ? a - b : 0. - * - * It may also be possible to use the SUBB instruction. This - * implicitly writes the accumulator, so it could only be used in the - * same situations as #1 above. It is further limited by only - * allowing UD sources. - */ - if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q && - inst->src[0].type != BRW_REGISTER_TYPE_UQ) { - fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type); - - ibld.MOV(acc, inst->src[1]); - fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]); - add->saturate = true; - add->src[0].negate = true; - } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) { - /* tmp = src1 >> 1; - * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp)); - */ - fs_reg tmp1 = ibld.vgrf(inst->src[0].type); - fs_reg tmp2 = ibld.vgrf(inst->src[0].type); - fs_reg tmp3 = ibld.vgrf(inst->src[0].type); - fs_inst *add; - - ibld.SHR(tmp1, inst->src[1], brw_imm_d(1)); - - add = ibld.ADD(tmp2, inst->src[1], tmp1); - add->src[1].negate = true; - - add = ibld.ADD(tmp3, inst->src[0], tmp1); - add->src[1].negate = true; - add->saturate = true; - - add = ibld.ADD(inst->dst, tmp3, tmp2); - add->src[1].negate = true; - add->saturate = true; - } else { - /* a > b ? a - b : 0 */ - ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], - BRW_CONDITIONAL_G); - - fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]); - add->src[1].negate = !add->src[1].negate; - - ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0)) - ->predicate = BRW_PREDICATE_NORMAL; - } - - inst->remove(block); - progress = true; - } - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - /** * Get the mask of SIMD channels enabled during dispatch and not yet disabled * by discard. Due to the layout of the sample mask in the fragment shader @@ -4029,226 +3761,6 @@ brw_fs_lower_simd_width(fs_visitor &s) return progress; } -/** - * Transform barycentric vectors into the interleaved form expected by the PLN - * instruction and returned by the Gfx7+ PI shared function. - * - * For channels 0-15 in SIMD16 mode they are expected to be laid out as - * follows in the register file: - * - * rN+0: X[0-7] - * rN+1: Y[0-7] - * rN+2: X[8-15] - * rN+3: Y[8-15] - * - * There is no need to handle SIMD32 here -- This is expected to be run after - * SIMD lowering, since SIMD lowering relies on vectors having the standard - * component layout. 
- */ -bool -brw_fs_lower_barycentrics(fs_visitor &s) -{ - const intel_device_info *devinfo = s.devinfo; - const bool has_interleaved_layout = devinfo->has_pln || - (devinfo->ver >= 7 && devinfo->ver < 20); - bool progress = false; - - if (s.stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout) - return false; - - foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { - if (inst->exec_size < 16) - continue; - - const fs_builder ibld(&s, block, inst); - const fs_builder ubld = ibld.exec_all().group(8, 0); - - switch (inst->opcode) { - case FS_OPCODE_LINTERP : { - assert(inst->exec_size == 16); - const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2); - fs_reg srcs[4]; - - for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++) - srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2), - 8 * (i / 2)); - - ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs)); - - inst->src[0] = tmp; - progress = true; - break; - } - case FS_OPCODE_INTERPOLATE_AT_SAMPLE: - case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: - case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: { - assert(inst->exec_size == 16); - const fs_reg tmp = ibld.vgrf(inst->dst.type, 2); - - for (unsigned i = 0; i < 2; i++) { - for (unsigned g = 0; g < inst->exec_size / 8; g++) { - fs_inst *mov = ibld.at(block, inst->next).group(8, g) - .MOV(horiz_offset(offset(inst->dst, ibld, i), - 8 * g), - offset(tmp, ubld, 2 * g + i)); - mov->predicate = inst->predicate; - mov->predicate_inverse = inst->predicate_inverse; - mov->flag_subreg = inst->flag_subreg; - } - } - - inst->dst = tmp; - progress = true; - break; - } - default: - break; - } - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - -/** - * Lower a derivative instruction as the floating-point difference of two - * swizzles of the source, specified as \p swz0 and \p swz1. - */ -static bool -lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst, - unsigned swz0, unsigned swz1) -{ - const fs_builder ubld = fs_builder(&s, block, inst).exec_all(); - const fs_reg tmp0 = ubld.vgrf(inst->src[0].type); - const fs_reg tmp1 = ubld.vgrf(inst->src[0].type); - - ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0)); - ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1)); - - inst->resize_sources(2); - inst->src[0] = negate(tmp0); - inst->src[1] = tmp1; - inst->opcode = BRW_OPCODE_ADD; - - return true; -} - -/** - * Lower derivative instructions on platforms where codegen cannot implement - * them efficiently (i.e. XeHP). 
- */ -bool -brw_fs_lower_derivatives(fs_visitor &s) -{ - bool progress = false; - - if (s.devinfo->verx10 < 125) - return false; - - foreach_block_and_inst(block, fs_inst, inst, s.cfg) { - if (inst->opcode == FS_OPCODE_DDX_COARSE) - progress |= lower_derivative(s, block, inst, - BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY); - - else if (inst->opcode == FS_OPCODE_DDX_FINE) - progress |= lower_derivative(s, block, inst, - BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW); - - else if (inst->opcode == FS_OPCODE_DDY_COARSE) - progress |= lower_derivative(s, block, inst, - BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ); - - else if (inst->opcode == FS_OPCODE_DDY_FINE) - progress |= lower_derivative(s, block, inst, - BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW); - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - -bool -brw_fs_lower_find_live_channel(fs_visitor &s) -{ - bool progress = false; - - if (s.devinfo->ver < 8) - return false; - - bool packed_dispatch = - brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons, - s.stage_prog_data); - bool vmask = - s.stage == MESA_SHADER_FRAGMENT && - brw_wm_prog_data(s.stage_prog_data)->uses_vmask; - - foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { - if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL && - inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL) - continue; - - bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL; - - /* Getting the first active channel index is easy on Gfx8: Just find - * the first bit set in the execution mask. The register exists on - * HSW already but it reads back as all ones when the current - * instruction has execution masking disabled, so it's kind of - * useless there. - */ - fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); - - const fs_builder ibld(&s, block, inst); - if (!inst->is_partial_write()) - ibld.emit_undef_for_dst(inst); - - const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0); - - /* ce0 doesn't consider the thread dispatch mask (DMask or VMask), - * so combine the execution and dispatch masks to obtain the true mask. - * - * If we're looking for the first live channel, and we have packed - * dispatch, we can skip this step, as we know all dispatched channels - * will appear at the front of the mask. - */ - if (!(first && packed_dispatch)) { - fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.UNDEF(mask); - ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2)); - - /* Quarter control has the effect of magically shifting the value of - * ce0 so you'll get the first/last active channel relative to the - * specified quarter control as result. - */ - if (inst->group > 0) - ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8))); - - ubld.AND(mask, exec_mask, mask); - exec_mask = mask; - } - - if (first) { - ubld.FBL(inst->dst, exec_mask); - } else { - fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1); - ubld.UNDEF(tmp); - ubld.LZD(tmp, exec_mask); - ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31)); - } - - inst->remove(block); - progress = true; - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - void fs_visitor::dump_instructions_to_file(FILE *file) const { @@ -4559,78 +4071,6 @@ fs_visitor::debug_optimizer(const nir_shader *nir, free(filename); } -/** - * From the Skylake PRM Vol. 2a docs for sends: - * - * "It is required that the second block of GRFs does not overlap with the - * first block." 
- * - * There are plenty of cases where we may accidentally violate this due to - * having, for instance, both sources be the constant 0. This little pass - * just adds a new vgrf for the second payload and copies it over. - */ -bool -brw_fs_lower_sends_overlapping_payload(fs_visitor &s) -{ - bool progress = false; - - foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { - if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && - regions_overlap(inst->src[2], inst->mlen * REG_SIZE, - inst->src[3], inst->ex_mlen * REG_SIZE)) { - fs_reg tmp = fs_reg(VGRF, s.alloc.allocate(inst->ex_mlen), - BRW_REGISTER_TYPE_UD); - /* Sadly, we've lost all notion of channels and bit sizes at this - * point. Just WE_all it. - */ - const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0); - fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD); - fs_reg copy_dst = tmp; - for (unsigned i = 0; i < inst->ex_mlen; i += 2) { - if (inst->ex_mlen == i + 1) { - /* Only one register left; do SIMD8 */ - ibld.group(8, 0).MOV(copy_dst, copy_src); - } else { - ibld.MOV(copy_dst, copy_src); - } - copy_src = offset(copy_src, ibld, 1); - copy_dst = offset(copy_dst, ibld, 1); - } - inst->src[3] = tmp; - progress = true; - } - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - -/** - * Three source instruction must have a GRF/MRF destination register. - * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. - */ -bool -brw_fs_lower_3src_null_dest(fs_visitor &s) -{ - bool progress = false; - - foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { - if (inst->is_3src(s.compiler) && inst->dst.is_null()) { - inst->dst = fs_reg(VGRF, s.alloc.allocate(s.dispatch_width / 8), - inst->dst.type); - progress = true; - } - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | - DEPENDENCY_VARIABLES); - - return progress; -} - static bool needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst) { diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp new file mode 100644 index 00000000000..e573fb21b56 --- /dev/null +++ b/src/intel/compiler/brw_fs_lower.cpp @@ -0,0 +1,608 @@ +/* + * Copyright © 2010 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" + +using namespace brw; + +/** + * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD + * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. + */ +bool +brw_fs_lower_constant_loads(fs_visitor &s) +{ + unsigned index, pull_index; + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { + /* Set up the annotation tracking for new generated instructions. */ + const fs_builder ibld(&s, block, inst); + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != UNIFORM) + continue; + + /* We'll handle this case later */ + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) + continue; + + if (!s.get_pull_locs(inst->src[i], &index, &pull_index)) + continue; + + assert(inst->src[i].stride == 0); + + const unsigned block_sz = 64; /* Fetch one cacheline at a time. 
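For example, a hypothetical pull_index of 19 puts base at byte 76: the message emitted below then fetches the aligned 64-byte block at offset 64, and the source is repointed at byte 12 of the temporary, plus any sub-dword offset.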
*/ + const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0); + const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); + const unsigned base = pull_index * 4; + + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, + srcs, PULL_UNIFORM_CONSTANT_SRCS); + + /* Rewrite the instruction to use the temporary VGRF. */ + inst->src[i].file = VGRF; + inst->src[i].nr = dst.nr; + inst->src[i].offset = (base & (block_sz - 1)) + + inst->src[i].offset % 4; + + progress = true; + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + + if (!s.get_pull_locs(inst->src[0], &index, &pull_index)) + continue; + + s.VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, + brw_imm_ud(index), + fs_reg() /* surface_handle */, + inst->src[1], + pull_index * 4, 4, 1); + inst->remove(block); + + progress = true; + } + } + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +bool +brw_fs_lower_load_payload(fs_visitor &s) +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { + if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + assert(inst->dst.file == MRF || inst->dst.file == VGRF); + assert(inst->saturate == false); + fs_reg dst = inst->dst; + + /* Get rid of COMPR4. We'll add it back in if we need it */ + if (dst.file == MRF) + dst.nr = dst.nr & ~BRW_MRF_COMPR4; + + const fs_builder ibld(&s, block, inst); + const fs_builder ubld = ibld.exec_all(); + + for (uint8_t i = 0; i < inst->header_size;) { + /* Number of header GRFs to initialize at once with a single MOV + * instruction. + */ + const unsigned n = + (i + 1 < inst->header_size && inst->src[i].stride == 1 && + inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ? + 2 : 1; + + if (inst->src[i].file != BAD_FILE) + ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD), + retype(inst->src[i], BRW_REGISTER_TYPE_UD)); + + dst = byte_offset(dst, n * REG_SIZE); + i += n; + } + + if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && + inst->exec_size > 8) { + /* In this case, the payload portion of the LOAD_PAYLOAD isn't + * a straightforward copy. Instead, the result of the + * LOAD_PAYLOAD is treated as interleaved and the first four + * non-header sources are unpacked as: + * + * m + 0: r0 + * m + 1: g0 + * m + 2: b0 + * m + 3: a0 + * m + 4: r1 + * m + 5: g1 + * m + 6: b1 + * m + 7: a1 + * + * This is used for gen <= 5 fb writes. + */ + assert(inst->exec_size == 16); + assert(inst->header_size + 4 <= inst->sources); + for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { + if (inst->src[i].file != BAD_FILE) { + if (s.devinfo->has_compr4) { + fs_reg compr4_dst = retype(dst, inst->src[i].type); + compr4_dst.nr |= BRW_MRF_COMPR4; + ibld.MOV(compr4_dst, inst->src[i]); + } else { + /* Platform doesn't have COMPR4. We have to fake it */ + fs_reg mov_dst = retype(dst, inst->src[i].type); + ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0)); + mov_dst.nr += 4; + ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1)); + } + } + + dst.nr++; + } + + /* The loop above only ever incremented us through the first set + * of 4 registers. However, thanks to the magic of COMPR4, we + * actually wrote to the first 8 registers, so we need to take + * that into account now. 
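+ * As an illustration, take a gen5 SIMD16 FB write with header_size == 2 (assumed numbers, purely to make the accounting concrete): the four color sources land in m+2 through m+9, yet the dst.nr++ steps above only advance dst to m+6, so the += 4 below moves it past the second interleaved half, to m+10.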
+ */ + dst.nr += 4; + + /* The COMPR4 code took care of the first 4 sources. We'll let + * the regular path handle any remaining sources. Yes, we are + * modifying the instruction but we're about to delete it so + * this really doesn't hurt anything. + */ + inst->header_size += 4; + } + + for (uint8_t i = inst->header_size; i < inst->sources; i++) { + dst.type = inst->src[i].type; + if (inst->src[i].file != BAD_FILE) { + ibld.MOV(dst, inst->src[i]); + } + dst = offset(dst, ibld, 1); + } + + inst->remove(block); + progress = true; + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +bool +brw_fs_lower_minmax(fs_visitor &s) +{ + assert(s.devinfo->ver < 6); + + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + const fs_builder ibld(&s, block, inst); + + if (inst->opcode == BRW_OPCODE_SEL && + inst->predicate == BRW_PREDICATE_NONE) { + /* If src1 is an immediate value that is not NaN, then it can't be + * NaN. In that case, emit CMP because it is much better for cmod + * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't + * support HF or DF, so it is not necessary to check for those. + */ + if (inst->src[1].type != BRW_REGISTER_TYPE_F || + (inst->src[1].file == IMM && !isnan(inst->src[1].f))) { + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + } else { + ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + } + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + progress = true; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +bool +brw_fs_lower_sub_sat(fs_visitor &s) +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + const fs_builder ibld(&s, block, inst); + + if (inst->opcode == SHADER_OPCODE_USUB_SAT || + inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* The fundamental problem is the hardware performs source negation + * at the bit width of the source. If the source is 0x80000000D, the + * negation is 0x80000000D. As a result, subtractSaturate(0, + * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There + * are at least three ways to resolve this: + * + * 1. Use the accumulator for the negated source. The accumulator is + * 33 bits, so our source 0x80000000 is sign-extended to + * 0x1800000000. The negation of which is 0x080000000. This + * doesn't help for 64-bit integers (which are already bigger than + * 33 bits). There are also only 8 accumulators, so SIMD16 or + * SIMD32 instructions would have to be split into multiple SIMD8 + * instructions. + * + * 2. Use slightly different math. For any n-bit value x, we know (x + * >> 1) != -(x >> 1). We can use this fact to only do + * subtractions involving (x >> 1). subtractSaturate(a, b) == + * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)). + * + * 3. For unsigned sources, it is sufficient to replace the + * subtractSaturate with (a > b) ? a - b : 0. + * + * It may also be possible to use the SUBB instruction. This + * implicitly writes the accumulator, so it could only be used in the + * same situations as #1 above. It is further limited by only + * allowing UD sources. 
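+ * + * Working the formula in #2 through by hand, assuming the C-style arithmetic shift it implies: for isub_sat(0, 0x80000000), tmp = 0xc0000000, i.e. -2^30, and src1 - tmp = -2^30, so the result is add.sat(add.sat(0, 2^30), 2^30) = 0x7fffffff. Neither negated operand can be 0x80000000, so both negations are exact at the source width.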
+ */ + if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q && + inst->src[0].type != BRW_REGISTER_TYPE_UQ) { + fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type); + + ibld.MOV(acc, inst->src[1]); + fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]); + add->saturate = true; + add->src[0].negate = true; + } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* tmp = src1 >> 1; + * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp)); + */ + fs_reg tmp1 = ibld.vgrf(inst->src[0].type); + fs_reg tmp2 = ibld.vgrf(inst->src[0].type); + fs_reg tmp3 = ibld.vgrf(inst->src[0].type); + fs_inst *add; + + ibld.SHR(tmp1, inst->src[1], brw_imm_d(1)); + + add = ibld.ADD(tmp2, inst->src[1], tmp1); + add->src[1].negate = true; + + add = ibld.ADD(tmp3, inst->src[0], tmp1); + add->src[1].negate = true; + add->saturate = true; + + add = ibld.ADD(inst->dst, tmp3, tmp2); + add->src[1].negate = true; + add->saturate = true; + } else { + /* a > b ? a - b : 0 */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + BRW_CONDITIONAL_G); + + fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]); + add->src[1].negate = !add->src[1].negate; + + ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0)) + ->predicate = BRW_PREDICATE_NORMAL; + } + + inst->remove(block); + progress = true; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Transform barycentric vectors into the interleaved form expected by the PLN + * instruction and returned by the Gfx7+ PI shared function. + * + * For channels 0-15 in SIMD16 mode they are expected to be laid out as + * follows in the register file: + * + * rN+0: X[0-7] + * rN+1: Y[0-7] + * rN+2: X[8-15] + * rN+3: Y[8-15] + * + * There is no need to handle SIMD32 here -- This is expected to be run after + * SIMD lowering, since SIMD lowering relies on vectors having the standard + * component layout. 
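+ * + * For example, the LOAD_PAYLOAD emitted for a SIMD16 LINTERP regroups a barycentric source stored as the standard vector { X[0-15], Y[0-15] }, two GRFs per component, into { X[0-7], Y[0-7], X[8-15], Y[8-15] }, one GRF per payload slot, which is exactly the rN layout above.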
+ */ +bool +brw_fs_lower_barycentrics(fs_visitor &s) +{ + const intel_device_info *devinfo = s.devinfo; + const bool has_interleaved_layout = devinfo->has_pln || + (devinfo->ver >= 7 && devinfo->ver < 20); + bool progress = false; + + if (s.stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout) + return false; + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + if (inst->exec_size < 16) + continue; + + const fs_builder ibld(&s, block, inst); + const fs_builder ubld = ibld.exec_all().group(8, 0); + + switch (inst->opcode) { + case FS_OPCODE_LINTERP : { + assert(inst->exec_size == 16); + const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2); + fs_reg srcs[4]; + + for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++) + srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2), + 8 * (i / 2)); + + ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs)); + + inst->src[0] = tmp; + progress = true; + break; + } + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: { + assert(inst->exec_size == 16); + const fs_reg tmp = ibld.vgrf(inst->dst.type, 2); + + for (unsigned i = 0; i < 2; i++) { + for (unsigned g = 0; g < inst->exec_size / 8; g++) { + fs_inst *mov = ibld.at(block, inst->next).group(8, g) + .MOV(horiz_offset(offset(inst->dst, ibld, i), + 8 * g), + offset(tmp, ubld, 2 * g + i)); + mov->predicate = inst->predicate; + mov->predicate_inverse = inst->predicate_inverse; + mov->flag_subreg = inst->flag_subreg; + } + } + + inst->dst = tmp; + progress = true; + break; + } + default: + break; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Lower a derivative instruction as the floating-point difference of two + * swizzles of the source, specified as \p swz0 and \p swz1. + */ +static bool +lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst, + unsigned swz0, unsigned swz1) +{ + const fs_builder ubld = fs_builder(&s, block, inst).exec_all(); + const fs_reg tmp0 = ubld.vgrf(inst->src[0].type); + const fs_reg tmp1 = ubld.vgrf(inst->src[0].type); + + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0)); + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1)); + + inst->resize_sources(2); + inst->src[0] = negate(tmp0); + inst->src[1] = tmp1; + inst->opcode = BRW_OPCODE_ADD; + + return true; +} + +/** + * Lower derivative instructions on platforms where codegen cannot implement + * them efficiently (i.e. XeHP). 
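+ * + * Each lowered opcode becomes ADD(dst, -swz0(src), swz1(src)). Taking FS_OPCODE_DDX_FINE as an example, the quad channels {a, b, c, d} are swizzled to {a, a, c, c} and {b, b, d, d}, so the ADD yields {b-a, b-a, d-c, d-c}: the horizontal difference within each row of the 2x2 quad.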
+ */ +bool +brw_fs_lower_derivatives(fs_visitor &s) +{ + bool progress = false; + + if (s.devinfo->verx10 < 125) + return false; + + foreach_block_and_inst(block, fs_inst, inst, s.cfg) { + if (inst->opcode == FS_OPCODE_DDX_COARSE) + progress |= lower_derivative(s, block, inst, + BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY); + + else if (inst->opcode == FS_OPCODE_DDX_FINE) + progress |= lower_derivative(s, block, inst, + BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW); + + else if (inst->opcode == FS_OPCODE_DDY_COARSE) + progress |= lower_derivative(s, block, inst, + BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ); + + else if (inst->opcode == FS_OPCODE_DDY_FINE) + progress |= lower_derivative(s, block, inst, + BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW); + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +bool +brw_fs_lower_find_live_channel(fs_visitor &s) +{ + bool progress = false; + + if (s.devinfo->ver < 8) + return false; + + bool packed_dispatch = + brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons, + s.stage_prog_data); + bool vmask = + s.stage == MESA_SHADER_FRAGMENT && + brw_wm_prog_data(s.stage_prog_data)->uses_vmask; + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL && + inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL) + continue; + + bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL; + + /* Getting the first active channel index is easy on Gfx8: Just find + * the first bit set in the execution mask. The register exists on + * HSW already but it reads back as all ones when the current + * instruction has execution masking disabled, so it's kind of + * useless there. + */ + fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); + + const fs_builder ibld(&s, block, inst); + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + + const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0); + + /* ce0 doesn't consider the thread dispatch mask (DMask or VMask), + * so combine the execution and dispatch masks to obtain the true mask. + * + * If we're looking for the first live channel, and we have packed + * dispatch, we can skip this step, as we know all dispatched channels + * will appear at the front of the mask. + */ + if (!(first && packed_dispatch)) { + fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.UNDEF(mask); + ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2)); + + /* Quarter control has the effect of magically shifting the value of + * ce0 so you'll get the first/last active channel relative to the + * specified quarter control as result. + */ + if (inst->group > 0) + ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8))); + + ubld.AND(mask, exec_mask, mask); + exec_mask = mask; + } + + if (first) { + ubld.FBL(inst->dst, exec_mask); + } else { + fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1); + ubld.UNDEF(tmp); + ubld.LZD(tmp, exec_mask); + ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31)); + } + + inst->remove(block); + progress = true; + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * From the Skylake PRM Vol. 2a docs for sends: + * + * "It is required that the second block of GRFs does not overlap with the + * first block." + * + * There are plenty of cases where we may accidentally violate this due to + * having, for instance, both sources be the constant 0. 
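(The trigger condition below is regions_overlap() over the two payload extents, mlen and ex_mlen GRFs respectively.)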
This little pass + * just adds a new vgrf for the second payload and copies it over. + */ +bool +brw_fs_lower_sends_overlapping_payload(fs_visitor &s) +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { + if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && + regions_overlap(inst->src[2], inst->mlen * REG_SIZE, + inst->src[3], inst->ex_mlen * REG_SIZE)) { + fs_reg tmp = fs_reg(VGRF, s.alloc.allocate(inst->ex_mlen), + BRW_REGISTER_TYPE_UD); + /* Sadly, we've lost all notion of channels and bit sizes at this + * point. Just WE_all it. + */ + const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0); + fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD); + fs_reg copy_dst = tmp; + for (unsigned i = 0; i < inst->ex_mlen; i += 2) { + if (inst->ex_mlen == i + 1) { + /* Only one register left; do SIMD8 */ + ibld.group(8, 0).MOV(copy_dst, copy_src); + } else { + ibld.MOV(copy_dst, copy_src); + } + copy_src = offset(copy_src, ibld, 1); + copy_dst = offset(copy_dst, ibld, 1); + } + inst->src[3] = tmp; + progress = true; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Three-source instructions must have a GRF/MRF destination register. + * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. + */ +bool +brw_fs_lower_3src_null_dest(fs_visitor &s) +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { + if (inst->is_3src(s.compiler) && inst->dst.is_null()) { + inst->dst = fs_reg(VGRF, s.alloc.allocate(s.dispatch_width / 8), + inst->dst.type); + progress = true; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | + DEPENDENCY_VARIABLES); + + return progress; +} + diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index ea6382465c1..16cde743976 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -75,6 +75,7 @@ libintel_compiler_brw_files = files( 'brw_fs.h', 'brw_fs_live_variables.cpp', 'brw_fs_live_variables.h', + 'brw_fs_lower.cpp', 'brw_fs_lower_dpas.cpp', 'brw_fs_lower_pack.cpp', 'brw_fs_lower_regioning.cpp',