mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-04 20:38:06 +02:00
intel/brw: Move optimize and small optimizations to brw_fs_opt.cpp
Remaining optimizations in brw_fs.cpp will get their own files. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Acked-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26887>
This commit is contained in:
parent
7451c0f5d6
commit
f3b7f4726a
3 changed files with 505 additions and 491 deletions
|
|
@ -2938,296 +2938,6 @@ brw_fs_opt_algebraic(fs_visitor &s)
|
|||
return progress;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
|
||||
{
|
||||
assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
|
||||
assert(size_read >= lp->header_size * REG_SIZE);
|
||||
|
||||
unsigned i;
|
||||
unsigned size = lp->header_size * REG_SIZE;
|
||||
for (i = lp->header_size; size < size_read && i < lp->sources; i++)
|
||||
size += lp->exec_size * type_sz(lp->src[i].type);
|
||||
|
||||
/* Size read must cover exactly a subset of sources. */
|
||||
assert(size == size_read);
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these
 * instructions instead of reserving a register for it. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 */
bool
brw_fs_opt_zero_samples(fs_visitor &s)
{
   /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
   assert(s.devinfo->ver >= 7);

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      /* The payload must be built by the instruction immediately preceding
       * the SEND; otherwise leave it alone.
       */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload are actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       * "Parameter 0 is required except for the sampleinfo message, which
       * has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      /* Walk the trailing sources backwards, accumulating the byte size of
       * runs that are constant zero (or unset), and stop at the first
       * source that is actually live.
       */
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
      }

      /* Integer division rounds down: only whole registers can be trimmed
       * from the message length.
       */
      const unsigned zero_len = zero_size / (reg_unit(s.devinfo) * REG_SIZE);
      if (zero_len > 0) {
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
|
||||
|
||||
/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try and split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains a x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparitor,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
bool
brw_fs_opt_split_sends(fs_visitor &s)
{
   if (s.devinfo->ver < 9)
      return false;

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      /* Skip non-SENDs, single-register payloads, and already-split SENDs. */
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0)
         continue;

      assert(send->src[2].file == VGRF);

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload does it really need.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      /* Build two smaller LOAD_PAYLOADs covering sources [0, mid) and
       * [mid, end), inserted before the original one.
       */
      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      /* Give each half a freshly allocated VGRF destination. */
      lp1->dst = fs_reg(VGRF, s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = fs_reg(VGRF, s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      /* Point the SEND at the two new payloads and move the second half's
       * length from mlen to ex_mlen.
       */
      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||||
|
||||
/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 * halt        (redundant with the next halt)
 * halt        (useless; jumps to the next instruction)
 * halt-target
 */
bool
brw_fs_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   /* Count HALTs up to the HALT_TARGET (there is at most one), and remember
    * where the target lives so instructions can be removed from its block.
    */
   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   /* No target means there can be no HALTs either; nothing to do. */
   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   /* If no HALTs remain, the target itself serves no purpose. */
   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
||||
|
||||
/**
|
||||
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
|
||||
* flow. We could probably do better here with some form of divergence
|
||||
* analysis.
|
||||
*/
|
||||
bool
|
||||
brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
unsigned depth = 0;
|
||||
|
||||
if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
|
||||
s.stage_prog_data)) {
|
||||
/* The optimization below assumes that channel zero is live on thread
|
||||
* dispatch, which may not be the case if the fixed function dispatches
|
||||
* threads sparsely.
|
||||
*/
|
||||
return false;
|
||||
}
|
||||
|
||||
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
||||
switch (inst->opcode) {
|
||||
case BRW_OPCODE_IF:
|
||||
case BRW_OPCODE_DO:
|
||||
depth++;
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_ENDIF:
|
||||
case BRW_OPCODE_WHILE:
|
||||
depth--;
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_HALT:
|
||||
/* This can potentially make control flow non-uniform until the end
|
||||
* of the program.
|
||||
*/
|
||||
goto out;
|
||||
|
||||
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
||||
if (depth == 0) {
|
||||
inst->opcode = BRW_OPCODE_MOV;
|
||||
inst->src[0] = brw_imm_ud(0u);
|
||||
inst->sources = 1;
|
||||
inst->force_writemask_all = true;
|
||||
progress = true;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (progress)
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
/**
|
||||
* Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
|
||||
* instructions to FS_OPCODE_REP_FB_WRITE.
|
||||
|
|
@ -3301,55 +3011,6 @@ fs_visitor::emit_repclear_shader()
|
|||
brw_fs_lower_scoreboard(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Rounding modes for conversion instructions are included for each
|
||||
* conversion, but right now it is a state. So once it is set,
|
||||
* we don't need to call it again for subsequent calls.
|
||||
*
|
||||
* This is useful for vector/matrices conversions, as setting the
|
||||
* mode once is enough for the full vector/matrix
|
||||
*/
|
||||
bool
|
||||
brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
unsigned execution_mode = s.nir->info.float_controls_execution_mode;
|
||||
|
||||
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
|
||||
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
|
||||
execution_mode)
|
||||
base_mode = BRW_RND_MODE_RTNE;
|
||||
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
|
||||
execution_mode)
|
||||
base_mode = BRW_RND_MODE_RTZ;
|
||||
|
||||
foreach_block (block, s.cfg) {
|
||||
brw_rnd_mode prev_mode = base_mode;
|
||||
|
||||
foreach_inst_in_block_safe (fs_inst, inst, block) {
|
||||
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
|
||||
assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
|
||||
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
|
||||
if (mode == prev_mode) {
|
||||
inst->remove(block);
|
||||
progress = true;
|
||||
} else {
|
||||
prev_mode = mode;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (progress)
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
brw_fs_lower_load_payload(fs_visitor &s)
|
||||
{
|
||||
|
|
@ -5633,158 +5294,6 @@ fs_visitor::debug_optimizer(const nir_shader *nir,
|
|||
free(filename);
|
||||
}
|
||||
|
||||
/**
 * Top-level FS backend optimization/lowering pipeline.
 *
 * Runs the optimization passes to a fixed point, then interleaves the
 * lowering passes (SIMD width, logical sends, payloads, regioning, ...)
 * with targeted re-runs of the optimizers.  Pass order is significant.
 */
void
brw_fs_optimize(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   s.validate();

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

   /* Run a pass, dump the IR when it made progress, validate, and fold the
    * result into the loop-level progress flag.  Evaluates to the pass's own
    * progress so it can be used in conditions.
    */
#define OPT(pass, ...) ({                                            \
      pass_num++;                                                    \
      bool this_progress = pass(s, ##__VA_ARGS__);                   \
                                                                     \
      if (this_progress)                                             \
         s.debug_optimizer(nir, #pass, iteration, pass_num);         \
                                                                     \
      s.validate();                                                  \
                                                                     \
      progress = progress || this_progress;                          \
      this_progress;                                                 \
   })

   s.assign_constant_locations();
   OPT(brw_fs_lower_constant_loads);

   s.validate();

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_fs_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice.  Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_fs_opt_dead_code_eliminate);

   OPT(brw_fs_opt_remove_extra_rounding_modes);

   /* Main optimization loop: iterate until no pass makes progress. */
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_copy_propagation);
      OPT(opt_predicated_break);
      OPT(brw_fs_opt_cmod_propagation);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
      OPT(dead_control_flow_eliminate);
      OPT(brw_fs_opt_saturate_propagation);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_eliminate_find_live_channel);

      OPT(brw_fs_opt_compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(brw_fs_lower_pack)) {
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_simd_width);
   OPT(brw_fs_lower_barycentrics);
   OPT(brw_fs_lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   /* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_fs_opt_zero_samples) && OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   OPT(brw_fs_opt_split_sends);
   OPT(brw_fs_workaround_nomask_control_flow);

   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
   }

   OPT(brw_fs_opt_remove_redundant_halts);

   if (OPT(brw_fs_lower_load_payload)) {
      OPT(brw_fs_opt_split_virtual_grfs);

      /* Lower 64 bit MOVs generated by payload lowering. */
      if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
         OPT(brw_fs_opt_algebraic);

      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_lower_simd_width);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_opt_combine_constants);
   if (OPT(brw_fs_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_fs_lower_integer_multiplication);
   }
   OPT(brw_fs_lower_sub_sat);

   /* Reset progress so the cleanup below only runs if these two lowering
    * passes actually changed anything.
    */
   progress = false;
   OPT(brw_fs_lower_derivatives);
   OPT(brw_fs_lower_regioning);
   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_lower_simd_width);
   }

   OPT(brw_fs_lower_sends_overlapping_payload);

   OPT(brw_fs_lower_uniform_pull_constant_loads);

   OPT(brw_fs_lower_find_live_channel);

   s.validate();
}
|
||||
|
||||
/**
|
||||
* From the Skylake PRM Vol. 2a docs for sends:
|
||||
*
|
||||
|
|
|
|||
504
src/intel/compiler/brw_fs_opt.cpp
Normal file
504
src/intel/compiler/brw_fs_opt.cpp
Normal file
|
|
@ -0,0 +1,504 @@
|
|||
/*
|
||||
* Copyright © 2010 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "brw_dead_control_flow.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_builder.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/**
 * Top-level FS backend optimization/lowering pipeline.
 *
 * Runs the optimization passes to a fixed point, then interleaves the
 * lowering passes (SIMD width, logical sends, payloads, regioning, ...)
 * with targeted re-runs of the optimizers.  Pass order is significant.
 */
void
brw_fs_optimize(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   s.validate();

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

   /* Run a pass, dump the IR when it made progress, validate, and fold the
    * result into the loop-level progress flag.  Evaluates to the pass's own
    * progress so it can be used in conditions.
    */
#define OPT(pass, ...) ({                                            \
      pass_num++;                                                    \
      bool this_progress = pass(s, ##__VA_ARGS__);                   \
                                                                     \
      if (this_progress)                                             \
         s.debug_optimizer(nir, #pass, iteration, pass_num);         \
                                                                     \
      s.validate();                                                  \
                                                                     \
      progress = progress || this_progress;                          \
      this_progress;                                                 \
   })

   s.assign_constant_locations();
   OPT(brw_fs_lower_constant_loads);

   s.validate();

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_fs_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice.  Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_fs_opt_dead_code_eliminate);

   OPT(brw_fs_opt_remove_extra_rounding_modes);

   /* Main optimization loop: iterate until no pass makes progress. */
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_copy_propagation);
      OPT(opt_predicated_break);
      OPT(brw_fs_opt_cmod_propagation);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
      OPT(dead_control_flow_eliminate);
      OPT(brw_fs_opt_saturate_propagation);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_eliminate_find_live_channel);

      OPT(brw_fs_opt_compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(brw_fs_lower_pack)) {
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_lower_simd_width);
   OPT(brw_fs_lower_barycentrics);
   OPT(brw_fs_lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   /* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_fs_opt_zero_samples) && OPT(brw_fs_opt_copy_propagation))
      OPT(brw_fs_opt_algebraic);

   OPT(brw_fs_opt_split_sends);
   OPT(brw_fs_workaround_nomask_control_flow);

   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_fs_opt_cse);
      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_opt_peephole_sel);
   }

   OPT(brw_fs_opt_remove_redundant_halts);

   if (OPT(brw_fs_lower_load_payload)) {
      OPT(brw_fs_opt_split_virtual_grfs);

      /* Lower 64 bit MOVs generated by payload lowering. */
      if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
         OPT(brw_fs_opt_algebraic);

      OPT(brw_fs_opt_register_coalesce);
      OPT(brw_fs_lower_simd_width);
      OPT(brw_fs_opt_dead_code_eliminate);
   }

   OPT(brw_fs_opt_combine_constants);
   if (OPT(brw_fs_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_fs_lower_integer_multiplication);
   }
   OPT(brw_fs_lower_sub_sat);

   /* Reset progress so the cleanup below only runs if these two lowering
    * passes actually changed anything.
    */
   progress = false;
   OPT(brw_fs_lower_derivatives);
   OPT(brw_fs_lower_regioning);
   if (progress) {
      if (OPT(brw_fs_opt_copy_propagation))
         OPT(brw_fs_opt_algebraic);
      OPT(brw_fs_opt_dead_code_eliminate);
      OPT(brw_fs_lower_simd_width);
   }

   OPT(brw_fs_lower_sends_overlapping_payload);

   OPT(brw_fs_lower_uniform_pull_constant_loads);

   OPT(brw_fs_lower_find_live_channel);

   s.validate();
}
|
||||
|
||||
static unsigned
|
||||
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
|
||||
{
|
||||
assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
|
||||
assert(size_read >= lp->header_size * REG_SIZE);
|
||||
|
||||
unsigned i;
|
||||
unsigned size = lp->header_size * REG_SIZE;
|
||||
for (i = lp->header_size; size < size_read && i < lp->sources; i++)
|
||||
size += lp->exec_size * type_sz(lp->src[i].type);
|
||||
|
||||
/* Size read must cover exactly a subset of sources. */
|
||||
assert(size == size_read);
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these
 * instructions instead of reserving a register for it. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 */
bool
brw_fs_opt_zero_samples(fs_visitor &s)
{
   /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
   assert(s.devinfo->ver >= 7);

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      /* The payload must be built by the instruction immediately preceding
       * the SEND; otherwise leave it alone.
       */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload are actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       * "Parameter 0 is required except for the sampleinfo message, which
       * has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      /* Walk the trailing sources backwards, accumulating the byte size of
       * runs that are constant zero (or unset), and stop at the first
       * source that is actually live.
       */
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
      }

      /* Integer division rounds down: only whole registers can be trimmed
       * from the message length.
       */
      const unsigned zero_len = zero_size / (reg_unit(s.devinfo) * REG_SIZE);
      if (zero_len > 0) {
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
|
||||
|
||||
/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try and split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains a x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparitor,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
bool
brw_fs_opt_split_sends(fs_visitor &s)
{
   if (s.devinfo->ver < 9)
      return false;

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      /* Skip non-SENDs, single-register payloads, and already-split SENDs. */
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0)
         continue;

      assert(send->src[2].file == VGRF);

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
       * find out how many sources from the payload does it really need.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      /* Build two smaller LOAD_PAYLOADs covering sources [0, mid) and
       * [mid, end), inserted before the original one.
       */
      const fs_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      /* Give each half a freshly allocated VGRF destination. */
      lp1->dst = fs_reg(VGRF, s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = fs_reg(VGRF, s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      /* Point the SEND at the two new payloads and move the second half's
       * length from mlen to ex_mlen.
       */
      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||||
|
||||
/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 * halt        (redundant with the next halt)
 * halt        (useless; jumps to the next instruction)
 * halt-target
 */
bool
brw_fs_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   /* Count HALTs up to the HALT_TARGET (there is at most one), and remember
    * where the target lives so instructions can be removed from its block.
    */
   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   /* No target means there can be no HALTs either; nothing to do. */
   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   /* If no HALTs remain, the target itself serves no purpose. */
   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
||||
|
||||
/**
|
||||
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
|
||||
* flow. We could probably do better here with some form of divergence
|
||||
* analysis.
|
||||
*/
|
||||
bool
|
||||
brw_fs_opt_eliminate_find_live_channel(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
unsigned depth = 0;
|
||||
|
||||
if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
|
||||
s.stage_prog_data)) {
|
||||
/* The optimization below assumes that channel zero is live on thread
|
||||
* dispatch, which may not be the case if the fixed function dispatches
|
||||
* threads sparsely.
|
||||
*/
|
||||
return false;
|
||||
}
|
||||
|
||||
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
||||
switch (inst->opcode) {
|
||||
case BRW_OPCODE_IF:
|
||||
case BRW_OPCODE_DO:
|
||||
depth++;
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_ENDIF:
|
||||
case BRW_OPCODE_WHILE:
|
||||
depth--;
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_HALT:
|
||||
/* This can potentially make control flow non-uniform until the end
|
||||
* of the program.
|
||||
*/
|
||||
goto out;
|
||||
|
||||
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
||||
if (depth == 0) {
|
||||
inst->opcode = BRW_OPCODE_MOV;
|
||||
inst->src[0] = brw_imm_ud(0u);
|
||||
inst->sources = 1;
|
||||
inst->force_writemask_all = true;
|
||||
progress = true;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (progress)
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rounding modes for conversion instructions are included for each
|
||||
* conversion, but right now it is a state. So once it is set,
|
||||
* we don't need to call it again for subsequent calls.
|
||||
*
|
||||
* This is useful for vector/matrices conversions, as setting the
|
||||
* mode once is enough for the full vector/matrix
|
||||
*/
|
||||
bool
|
||||
brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s)
|
||||
{
|
||||
bool progress = false;
|
||||
unsigned execution_mode = s.nir->info.float_controls_execution_mode;
|
||||
|
||||
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
|
||||
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
|
||||
execution_mode)
|
||||
base_mode = BRW_RND_MODE_RTNE;
|
||||
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
|
||||
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
|
||||
execution_mode)
|
||||
base_mode = BRW_RND_MODE_RTZ;
|
||||
|
||||
foreach_block (block, s.cfg) {
|
||||
brw_rnd_mode prev_mode = base_mode;
|
||||
|
||||
foreach_inst_in_block_safe (fs_inst, inst, block) {
|
||||
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
|
||||
assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
|
||||
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
|
||||
if (mode == prev_mode) {
|
||||
inst->remove(block);
|
||||
progress = true;
|
||||
} else {
|
||||
prev_mode = mode;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (progress)
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
|
|
@ -79,6 +79,7 @@ libintel_compiler_brw_files = files(
|
|||
'brw_fs_lower_pack.cpp',
|
||||
'brw_fs_lower_regioning.cpp',
|
||||
'brw_fs_nir.cpp',
|
||||
'brw_fs_opt.cpp',
|
||||
'brw_fs_reg_allocate.cpp',
|
||||
'brw_fs_register_coalesce.cpp',
|
||||
'brw_fs_saturate_propagation.cpp',
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue