mesa/src/intel/compiler/brw_opt_virtual_grfs.cpp
Kenneth Graunke eb1ec9cf8e brw: Don't assert about MAX_VGRF_SIZE in brw_opt_split_virtual_grfs()
This allows us to create temporary VGRFs that are larger than
MAX_VGRF_SIZE(devinfo), which will be split eventually.  They may not
be split on the initial pass, because we may need LOAD_PAYLOAD lowering,
copy propagation, and so on to occur first.  So we allow registers to
exceed that size initially.

The "Register allocation relies on split_virtual_grfs()" assertion in
brw_reg_allocate.cpp still asserts that all VGRFs which reach the
register allocator have been properly split.

One case where this is useful is for vectorizing convergent block loads.
We create temporaries to splat the SIMD1 values out to SIMD(N), which
can lead to some very large temporaries.  However, copy propagation and
so on ultimately eliminate these and they'll get split down to proper
sizes or elided entirely in the end.

(Note: both this and the prior commits from this merge request are
 needed to close the linked issue.)

Cc: mesa-stable
Reviewed-by: Matt Turner <mattst88@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12324
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34461>
2025-04-11 20:34:51 +00:00


/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_shader.h"
#include "brw_builder.h"

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This pass aggressively splits VGRFs into chunks as small as possible,
 * down to single registers if it can. If no VGRFs can be split, we return
 * false so this pass can safely be used inside an optimization loop. We
 * want to split because virtual GRFs are what we register allocate and
 * spill (due to contiguousness requirements for some instructions), and
 * they're what we naturally generate in the codegen process, but most
 * virtual GRFs don't actually need to be contiguous sets of GRFs. If we
 * split, we'll end up with reduced live intervals and better dead code
 * elimination and coalescing.
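 *
 * As an illustration (a hypothetical case, not taken from a particular
 * shader): a temporary that a SEND-style message writes as one contiguous
 * block has to stay whole, while a temporary whose registers are only ever
 * read and written one at a time can be split into single-register VGRFs.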
 */
bool
brw_opt_split_virtual_grfs(brw_shader &s)
{
   /* Compact the register file so we eliminate dead vgrfs. This
    * only defines split points for live registers, so if we have
    * too large dead registers they will hit assertions later.
    */
   brw_opt_compact_virtual_grfs(s);

   unsigned num_vars = s.alloc.count;

   /* Count the total number of registers */
   unsigned reg_count = 0;
   unsigned *vgrf_to_reg = new unsigned[num_vars];
   for (unsigned i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += s.alloc.sizes[i];
   }

   /* An array of "split points". For each register slot, this indicates
    * if this slot can be separated from the previous slot. Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable. Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool *split_points = new bool[reg_count];
   memset(split_points, 0, reg_count * sizeof(*split_points));
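
   /* An illustrative example, assuming reg_unit() == 1: for a four-slot
    * VGRF whose last two slots are always accessed together by a single
    * two-register read, but never together with the first two slots, the
    * split points end up as { false, true, true, false }, so the pass
    * splits the VGRF into pieces of sizes 1, 1, and 2.
    */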
   /* Mark all used registers as fully splittable following the physical
    * register size.
    */
   const unsigned reg_inc = reg_unit(s.devinfo);
   foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF) {
         unsigned reg = vgrf_to_reg[inst->dst.nr];
         for (unsigned j = reg_inc; j < s.alloc.sizes[inst->dst.nr]; j += reg_inc)
            split_points[reg + j] = true;
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            unsigned reg = vgrf_to_reg[inst->src[i].nr];
            for (unsigned j = reg_inc; j < s.alloc.sizes[inst->src[i].nr]; j += reg_inc)
               split_points[reg + j] = true;
         }
      }
   }
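
   /* Now walk the instructions again and clear the split points inside any
    * multi-register read or write, since those slots have to stay
    * contiguous with the slot that precedes them.
    */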
   foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
      /* We fix up undef instructions later */
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         assert(inst->dst.file == VGRF);
         continue;
      }

      if (inst->dst.file == VGRF) {
         unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         for (unsigned j = 1; j < regs_written(inst); j++)
            split_points[reg + j] = false;
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            for (unsigned j = 1; j < regs_read(s.devinfo, inst, i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   /* Bitset of which registers have been split */
   bool *vgrf_has_split = new bool[num_vars];
   memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));

   unsigned *new_virtual_grf = new unsigned[reg_count];
   unsigned *new_reg_offset = new unsigned[reg_count];
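
   /* The loop below fills new_virtual_grf[] and new_reg_offset[]: for each
    * old register slot, they record which (possibly freshly allocated) VGRF
    * the slot lands in and at which register offset within it.
    */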
   unsigned reg = 0;
   bool has_splits = false;
   for (unsigned i = 0; i < num_vars; i++) {
      /* The first one should always be false as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      unsigned offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < s.alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous `offset` registers.
          */
         if (split_points[reg]) {
            has_splits = true;
            vgrf_has_split[i] = true;

            unsigned grf = brw_allocate_vgrf_units(s, offset).nr;
            for (unsigned k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      s.alloc.sizes[i] = offset;
      for (unsigned k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   bool progress;
   if (!has_splits) {
      progress = false;
      goto cleanup;
   }
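
   /* Rewrite the instruction stream. A SHADER_OPCODE_UNDEF whose VGRF was
    * split is re-emitted as one UNDEF per new piece; every other VGRF use
    * is simply remapped to its new register number and offset.
    */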
   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         assert(inst->dst.file == VGRF);
         if (vgrf_has_split[inst->dst.nr]) {
            const brw_builder ibld(inst);
            assert(inst->size_written % REG_SIZE == 0);
            unsigned reg_offset = inst->dst.offset / REG_SIZE;

            unsigned size_written = 0;
            while (size_written < inst->size_written) {
               reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;

               brw_inst *undef =
                  ibld.UNDEF(
                     byte_offset(brw_vgrf(new_virtual_grf[reg], inst->dst.type),
                                 new_reg_offset[reg] * REG_SIZE));
               undef->size_written =
                  MIN2(inst->size_written - size_written, undef->size_written);
               assert(undef->size_written % REG_SIZE == 0);
               size_written += undef->size_written;
            }
            inst->remove();
         } else {
            reg = vgrf_to_reg[inst->dst.nr];
            assert(new_reg_offset[reg] == 0);
            assert(new_virtual_grf[reg] == inst->dst.nr);
         }
         continue;
      }

      if (inst->dst.file == VGRF) {
         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         if (vgrf_has_split[inst->dst.nr]) {
            inst->dst.nr = new_virtual_grf[reg];
            inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
                               inst->dst.offset % REG_SIZE;
            assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
         } else {
            assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
            assert(new_virtual_grf[reg] == inst->dst.nr);
         }
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != VGRF)
            continue;

         reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
         if (vgrf_has_split[inst->src[i].nr]) {
            inst->src[i].nr = new_virtual_grf[reg];
            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
                                  inst->src[i].offset % REG_SIZE;
            assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
         } else {
            assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
            assert(new_virtual_grf[reg] == inst->src[i].nr);
         }
      }
   }

   s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
                         BRW_DEPENDENCY_VARIABLES);

   progress = true;

cleanup:
   delete[] split_points;
   delete[] vgrf_has_split;
   delete[] new_virtual_grf;
   delete[] new_reg_offset;
   delete[] vgrf_to_reg;

   return progress;
}

/**
 * Remove unused virtual GRFs and compact the vgrf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again. Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs. Compacting them can save a lot of
 * overhead.
 */
bool
brw_opt_compact_virtual_grfs(brw_shader &s)
{
   bool progress = false;

   int *remap_table = new int[s.alloc.count];
   memset(remap_table, -1, s.alloc.count * sizeof(int));
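
   /* remap_table[i] will hold the new index of VGRF i once compaction is
    * done, or stay -1 if the VGRF is never referenced and can be dropped.
    */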
   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const brw_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < s.alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register. This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         s.alloc.sizes[new_index] = s.alloc.sizes[i];
         s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
                               BRW_DEPENDENCY_VARIABLES);
         ++new_index;
      }
   }

   s.alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation. If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(s.delta_xy); i++) {
      if (s.delta_xy[i].file == VGRF) {
         if (remap_table[s.delta_xy[i].nr] != -1) {
            s.delta_xy[i].nr = remap_table[s.delta_xy[i].nr];
         } else {
            s.delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}