mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 11:30:11 +01:00
609 lines
20 KiB
C++
609 lines
20 KiB
C++
|
|
/*
|
||
|
|
* Copyright © 2010 Intel Corporation
|
||
|
|
* SPDX-License-Identifier: MIT
|
||
|
|
*/
|
||
|
|
|
||
|
|
#include "brw_fs.h"
|
||
|
|
#include "brw_fs_builder.h"
|
||
|
|
|
||
|
|
using namespace brw;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
|
||
|
|
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
|
||
|
|
*/
|
||
|
|
bool
|
||
|
|
brw_fs_lower_constant_loads(fs_visitor &s)
|
||
|
|
{
|
||
|
|
unsigned index, pull_index;
|
||
|
|
bool progress = false;
|
||
|
|
|
||
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
||
|
|
/* Set up the annotation tracking for new generated instructions. */
|
||
|
|
const fs_builder ibld(&s, block, inst);
|
||
|
|
|
||
|
|
for (int i = 0; i < inst->sources; i++) {
|
||
|
|
if (inst->src[i].file != UNIFORM)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
/* We'll handle this case later */
|
||
|
|
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (!s.get_pull_locs(inst->src[i], &index, &pull_index))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
assert(inst->src[i].stride == 0);
|
||
|
|
|
||
|
|
const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
|
||
|
|
const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
|
||
|
|
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||
|
|
const unsigned base = pull_index * 4;
|
||
|
|
|
||
|
|
fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
|
||
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
|
||
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
|
||
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
|
||
|
|
|
||
|
|
|
||
|
|
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
|
||
|
|
srcs, PULL_UNIFORM_CONSTANT_SRCS);
|
||
|
|
|
||
|
|
/* Rewrite the instruction to use the temporary VGRF. */
|
||
|
|
inst->src[i].file = VGRF;
|
||
|
|
inst->src[i].nr = dst.nr;
|
||
|
|
inst->src[i].offset = (base & (block_sz - 1)) +
|
||
|
|
inst->src[i].offset % 4;
|
||
|
|
|
||
|
|
progress = true;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
|
||
|
|
inst->src[0].file == UNIFORM) {
|
||
|
|
|
||
|
|
if (!s.get_pull_locs(inst->src[0], &index, &pull_index))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
s.VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
|
||
|
|
brw_imm_ud(index),
|
||
|
|
fs_reg() /* surface_handle */,
|
||
|
|
inst->src[1],
|
||
|
|
pull_index * 4, 4, 1);
|
||
|
|
inst->remove(block);
|
||
|
|
|
||
|
|
progress = true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||
|
|
|
||
|
|
return progress;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD virtual instructions into the series of
 * MOVs that actually assemble the destination payload, including the header
 * copies and the interleaved COMPR4 MRF layout used by gen <= 5 fb writes.
 *
 * Returns true if any LOAD_PAYLOAD was lowered (and removed).
 */
bool
brw_fs_lower_load_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == MRF || inst->dst.file == VGRF);
      assert(inst->saturate == false);
      fs_reg dst = inst->dst;

      /* Get rid of COMPR4. We'll add it back in if we need it */
      if (dst.file == MRF)
         dst.nr = dst.nr & ~BRW_MRF_COMPR4;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all();

      /* Copy the header sources first, coalescing adjacent GRFs where
       * possible.
       */
      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.  Two consecutive sources that form one contiguous
          * region can be copied with a single wider MOV.
          */
         const unsigned n =
            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
            2 : 1;

         /* BAD_FILE sources are holes that are left uninitialized. */
         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),
                                     retype(inst->src[i], BRW_REGISTER_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

      if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
          inst->exec_size > 8) {
         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
          * a straightforward copy.  Instead, the result of the
          * LOAD_PAYLOAD is treated as interleaved and the first four
          * non-header sources are unpacked as:
          *
          * m + 0: r0
          * m + 1: g0
          * m + 2: b0
          * m + 3: a0
          * m + 4: r1
          * m + 5: g1
          * m + 6: b1
          * m + 7: a1
          *
          * This is used for gen <= 5 fb writes.
          */
         assert(inst->exec_size == 16);
         assert(inst->header_size + 4 <= inst->sources);
         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
            if (inst->src[i].file != BAD_FILE) {
               if (s.devinfo->has_compr4) {
                  /* A single COMPR4 MOV writes both interleaved halves. */
                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
                  compr4_dst.nr |= BRW_MRF_COMPR4;
                  ibld.MOV(compr4_dst, inst->src[i]);
               } else {
                  /* Platform doesn't have COMPR4.  We have to fake it */
                  fs_reg mov_dst = retype(dst, inst->src[i].type);
                  ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
                  /* Second half lands 4 MRFs further down. */
                  mov_dst.nr += 4;
                  ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
               }
            }

            dst.nr++;
         }

         /* The loop above only ever incremented us through the first set
          * of 4 registers.  However, thanks to the magic of COMPR4, we
          * actually wrote to the first 8 registers, so we need to take
          * that into account now.
          */
         dst.nr += 4;

         /* The COMPR4 code took care of the first 4 sources.  We'll let
          * the regular path handle any remaining sources.  Yes, we are
          * modifying the instruction but we're about to delete it so
          * this really doesn't hurt anything.
          */
         inst->header_size += 4;
      }

      /* Copy the remaining (non-header) sources one register-sized chunk
       * at a time at the original execution size.
       */
      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
||
|
|
|
||
|
|
bool
|
||
|
|
brw_fs_lower_minmax(fs_visitor &s)
|
||
|
|
{
|
||
|
|
assert(s.devinfo->ver < 6);
|
||
|
|
|
||
|
|
bool progress = false;
|
||
|
|
|
||
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
||
|
|
const fs_builder ibld(&s, block, inst);
|
||
|
|
|
||
|
|
if (inst->opcode == BRW_OPCODE_SEL &&
|
||
|
|
inst->predicate == BRW_PREDICATE_NONE) {
|
||
|
|
/* If src1 is an immediate value that is not NaN, then it can't be
|
||
|
|
* NaN. In that case, emit CMP because it is much better for cmod
|
||
|
|
* propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
|
||
|
|
* support HF or DF, so it is not necessary to check for those.
|
||
|
|
*/
|
||
|
|
if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
|
||
|
|
(inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
|
||
|
|
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
||
|
|
inst->conditional_mod);
|
||
|
|
} else {
|
||
|
|
ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
||
|
|
inst->conditional_mod);
|
||
|
|
}
|
||
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
||
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
||
|
|
|
||
|
|
progress = true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (progress)
|
||
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||
|
|
|
||
|
|
return progress;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
 * Lower SHADER_OPCODE_USUB_SAT and SHADER_OPCODE_ISUB_SAT into instruction
 * sequences the hardware can execute correctly, working around the fact
 * that source negation happens at the bit width of the source.
 *
 * Returns true if any subtract-saturate was lowered (and removed).
 */
bool
brw_fs_lower_sub_sat(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is the hardware performs source negation
          * at the bit width of the source.  If the source is 0x80000000, the
          * negation is 0x80000000.  As a result, subtractSaturate(0,
          * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
          * are at least three ways to resolve this:
          *
          * 1. Use the accumulator for the negated source.  The accumulator is
          *    33 bits, so our source 0x80000000 is sign-extended to
          *    0x1800000000.  The negation of which is 0x080000000.  This
          *    doesn't help for 64-bit integers (which are already bigger than
          *    33 bits).  There are also only 8 accumulators, so SIMD16 or
          *    SIMD32 instructions would have to be split into multiple SIMD8
          *    instructions.
          *
          * 2. Use slightly different math.  For any n-bit value x, we know (x
          *    >> 1) != -(x >> 1).  We can use this fact to only do
          *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction.  This
          * implicitly writes the accumulator, so it could only be used in the
          * same situations as #1 above.  It is further limited by only
          * allowing UD sources.
          */
         if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&
             inst->src[0].type != BRW_REGISTER_TYPE_UQ) {
            /* Strategy #1: negate src1 through the wider accumulator. */
            fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* Strategy #2:
             *
             * tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
            fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
            fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
            fs_inst *add;

            /* tmp1 = src1 >> 1 */
            ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));

            /* tmp2 = src1 - tmp1 */
            add = ibld.ADD(tmp2, inst->src[1], tmp1);
            add->src[1].negate = true;

            /* tmp3 = add.sat(src0, -tmp1) */
            add = ibld.ADD(tmp3, inst->src[0], tmp1);
            add->src[1].negate = true;
            add->saturate = true;

            /* dst = add.sat(tmp3, -tmp2) */
            add = ibld.ADD(inst->dst, tmp3, tmp2);
            add->src[1].negate = true;
            add->saturate = true;
         } else {
            /* Strategy #3 (unsigned): a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            /* Select 0 wherever a <= b, using the flag written by CMP. */
            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||
|
|
|
||
|
|
/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
brw_fs_lower_barycentrics(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   /* Only platforms in this range use the interleaved layout. */
   const bool has_interleaved_layout = devinfo->has_pln ||
      (devinfo->ver >= 7 && devinfo->ver < 20);
   bool progress = false;

   if (s.stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case FS_OPCODE_LINTERP : {
         /* Repack the standard-layout barycentric source into an
          * interleaved temporary before it is consumed.
          */
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
         fs_reg srcs[4];

         /* srcs[i] selects component i%2 of the second half (i/2) of the
          * original vector.
          */
         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[0] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         /* These produce an interleaved result; retarget them at a
          * temporary and unpack it back into the standard layout with MOVs
          * inserted after the instruction.
          */
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               /* Propagate the original predication onto each unpack MOV so
                * the copies happen for exactly the same channels.
                */
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Lower a derivative instruction as the floating-point difference of two
|
||
|
|
* swizzles of the source, specified as \p swz0 and \p swz1.
|
||
|
|
*/
|
||
|
|
static bool
|
||
|
|
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
|
||
|
|
unsigned swz0, unsigned swz1)
|
||
|
|
{
|
||
|
|
const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
|
||
|
|
const fs_reg tmp0 = ubld.vgrf(inst->src[0].type);
|
||
|
|
const fs_reg tmp1 = ubld.vgrf(inst->src[0].type);
|
||
|
|
|
||
|
|
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
|
||
|
|
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));
|
||
|
|
|
||
|
|
inst->resize_sources(2);
|
||
|
|
inst->src[0] = negate(tmp0);
|
||
|
|
inst->src[1] = tmp1;
|
||
|
|
inst->opcode = BRW_OPCODE_ADD;
|
||
|
|
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Lower derivative instructions on platforms where codegen cannot implement
|
||
|
|
* them efficiently (i.e. XeHP).
|
||
|
|
*/
|
||
|
|
bool
|
||
|
|
brw_fs_lower_derivatives(fs_visitor &s)
|
||
|
|
{
|
||
|
|
bool progress = false;
|
||
|
|
|
||
|
|
if (s.devinfo->verx10 < 125)
|
||
|
|
return false;
|
||
|
|
|
||
|
|
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
|
||
|
|
if (inst->opcode == FS_OPCODE_DDX_COARSE)
|
||
|
|
progress |= lower_derivative(s, block, inst,
|
||
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);
|
||
|
|
|
||
|
|
else if (inst->opcode == FS_OPCODE_DDX_FINE)
|
||
|
|
progress |= lower_derivative(s, block, inst,
|
||
|
|
BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);
|
||
|
|
|
||
|
|
else if (inst->opcode == FS_OPCODE_DDY_COARSE)
|
||
|
|
progress |= lower_derivative(s, block, inst,
|
||
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);
|
||
|
|
|
||
|
|
else if (inst->opcode == FS_OPCODE_DDY_FINE)
|
||
|
|
progress |= lower_derivative(s, block, inst,
|
||
|
|
BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (progress)
|
||
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||
|
|
|
||
|
|
return progress;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
 * Lower SHADER_OPCODE_FIND_LIVE_CHANNEL and
 * SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL into bit-scan operations on the
 * combined execution/dispatch mask.
 *
 * Returns true if any such instruction was lowered (and removed).
 */
bool
brw_fs_lower_find_live_channel(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->ver < 8)
      return false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.stage_prog_data);
   /* Fragment shaders may need VMask instead of DMask. */
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.stage_prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */
      fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

      const fs_builder ibld(&s, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      /* The result is a single scalar, so operate SIMD1 with all channels
       * enabled.
       */
      const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      if (first) {
         /* First live channel = index of the lowest set bit. */
         ubld.FBL(inst->dst, exec_mask);
      } else {
         /* Last live channel = 31 - number of leading zeros. */
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||
|
|
|
||
|
|
/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 *
 * Returns true if any SEND's second payload was copied to a fresh VGRF.
 */
bool
brw_fs_lower_sends_overlapping_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      /* src[2] is the first payload block, src[3] the second (extended)
       * one; only split sends with a non-empty second block can overlap.
       */
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         fs_reg tmp = fs_reg(VGRF, s.alloc.allocate(inst->ex_mlen),
                             BRW_REGISTER_TYPE_UD);
         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0);
         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
         fs_reg copy_dst = tmp;
         /* Copy two GRFs per SIMD16 MOV; a trailing odd register is copied
          * with a single SIMD8 MOV.
          */
         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
            if (inst->ex_mlen == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         /* Point the send at the non-overlapping copy. */
         inst->src[3] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Three source instruction must have a GRF/MRF destination register.
|
||
|
|
* ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
|
||
|
|
*/
|
||
|
|
bool
|
||
|
|
brw_fs_lower_3src_null_dest(fs_visitor &s)
|
||
|
|
{
|
||
|
|
bool progress = false;
|
||
|
|
|
||
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
||
|
|
if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
|
||
|
|
inst->dst = fs_reg(VGRF, s.alloc.allocate(s.dispatch_width / 8),
|
||
|
|
inst->dst.type);
|
||
|
|
progress = true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (progress)
|
||
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
|
||
|
|
DEPENDENCY_VARIABLES);
|
||
|
|
|
||
|
|
return progress;
|
||
|
|
}
|
||
|
|
|