/*
 * Copyright © 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* This is a new block-level load instruction scheduler where loads are grouped
 * according to their indirection level within a basic block. An indirection
 * is when a result of one load is used as a source of another load. The result
 * is that disjoint ALU opcode groups and load (texture) opcode groups are
 * created where each next load group is the next level of indirection.
 * It's done by finding the first and last load with the same indirection
 * level, and moving all unrelated instructions between them after the last
 * load except for load sources, which are moved before the first load.
 * It naturally suits hardware that has limits on texture indirections, but
 * other hardware can benefit too. Only texture, image, and SSBO load and
 * atomic instructions are grouped.
 *
 * There is an option to group only those loads that use the same resource
 * variable. This increases the chance to get more cache hits than if the loads
 * were spread out.
 *
 * The increased register usage is offset by the increase in observed memory
 * bandwidth due to more cache hits (dependent on hw behavior) and thus
 * decrease the subgroup lifetime, which allows registers to be deallocated
 * and reused sooner. In some bandwidth-bound cases, low register usage doesn't
 * benefit at all. Doubling the register usage and using those registers to
 * amplify observed bandwidth can improve performance a lot.
 *
 * It's recommended to run a hw-specific instruction scheduler after this to
 * prevent spilling.
 */

#include "nir.h"
#include "util/u_dynarray.h"

typedef struct {
   bool visited;
   uint32_t instr_index;
   uint32_t indirection_level;
} instr_info;

typedef struct opaque_resource opaque_resource;

static opaque_resource *
get_load_resource(nir_instr *instr)
{
   if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);

      for (unsigned i = 0; i < tex->num_srcs; i++) {
         switch (tex->src[i].src_type) {
         case nir_tex_src_texture_deref:
         case nir_tex_src_texture_handle:
            return (opaque_resource*)nir_def_instr(tex->src[i].src.ssa);
         default:
            break;
         }
      }

      /* Some drivers that don't support indirect resource indexing lower
       * derefs to the constant texture_index.
       */
      return (opaque_resource*)(uintptr_t)tex->texture_index;
   }

   if (instr->type == nir_instr_type_intrinsic) {
      /* This is also the list of intrinsics that are grouped. */
      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
      /* Image loads. */
      case nir_intrinsic_image_load:
      case nir_intrinsic_image_deref_load:
      case nir_intrinsic_bindless_image_load:
      case nir_intrinsic_image_heap_load:
      case nir_intrinsic_image_sparse_load:
      case nir_intrinsic_image_deref_sparse_load:
      case nir_intrinsic_bindless_image_sparse_load:
      case nir_intrinsic_image_heap_sparse_load:
      /* Fragment mask loads. (samples_identical also loads it) */
      case nir_intrinsic_image_fragment_mask_load_amd:
      case nir_intrinsic_image_deref_fragment_mask_load_amd:
      case nir_intrinsic_bindless_image_fragment_mask_load_amd:
      case nir_intrinsic_image_heap_fragment_mask_load_amd:
      case nir_intrinsic_image_samples_identical:
      case nir_intrinsic_image_deref_samples_identical:
      case nir_intrinsic_bindless_image_samples_identical:
      case nir_intrinsic_image_heap_samples_identical:
      /* Queries */
      case nir_intrinsic_image_size:
      case nir_intrinsic_image_deref_size:
      case nir_intrinsic_bindless_image_size:
      case nir_intrinsic_image_heap_size:
      case nir_intrinsic_image_samples:
      case nir_intrinsic_image_deref_samples:
      case nir_intrinsic_bindless_image_samples:
      case nir_intrinsic_image_heap_samples:
      case nir_intrinsic_image_levels:
      case nir_intrinsic_image_deref_levels:
      case nir_intrinsic_bindless_image_levels:
      case nir_intrinsic_image_heap_levels:
      /* Other loads. */
      /* load_ubo is ignored because it's usually cheap. */
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_global:
         return (opaque_resource*)nir_def_instr(nir_instr_as_intrinsic(instr)->src[0].ssa);
      default:
         return NULL;
      }
   }

   return NULL;
}

/* Track only those that we want to group. */
static bool
is_grouped_load(nir_instr *instr)
{
   if (instr->type == nir_instr_type_intrinsic &&
       !nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr)))
      return false;

   return get_load_resource(instr) != NULL;
}

static bool
is_part_of_group(nir_instr *instr, nir_instr *first,
                 uint32_t indirection_level, instr_info *infos)
{
   /* Grouping is done by moving everything else out of the first..last
    * instruction range of the load group corresponding to the given
    * indirection level.
    *
    * We can move anything that's not a grouped load because we are not really
    * moving it. What we are doing is that we are moving grouped loads to
    * the same place by moving everything else between the first and last load
    * out of the way. This doesn't change the order of non-reorderable
    * instructions.
    *
    * If "first" is set, compare against its indirection level, else compared
    * against "indirection_level".
    */
   return is_grouped_load(instr) &&
          infos[instr->index].indirection_level ==
          (first ? infos[first->index].indirection_level : indirection_level);
}

struct check_sources_state {
   instr_info *infos;
   nir_block *block;
   uint32_t first_instr_index;
};

static bool
has_only_sources_less_than(nir_src *src, void *data)
{
   struct check_sources_state *state = (struct check_sources_state *)data;

   /* true if nir_foreach_src should keep going */
   return state->block != nir_def_block(src->ssa) ||
             state->infos[nir_def_instr(src->ssa)->index].instr_index <
             state->first_instr_index;
}

static void
group_loads(nir_instr *first, nir_instr *last, instr_info *infos)
{
   assert(is_grouped_load(first));
   assert(is_grouped_load(last));

   /* Walk the instruction range between the first and last backward, and
    * move those that have no uses within the range after the last one.
    */
   for (nir_instr *instr = nir_instr_prev(last); instr != first;
        instr = nir_instr_prev(instr)) {
      if (is_part_of_group(instr, first, 0, infos))
         continue;

      bool all_uses_after_last = true;
      nir_def *def = nir_instr_def(instr);

      if (def) {
         nir_foreach_use(use, def) {
            if (nir_src_parent_instr(use)->block == instr->block &&
                infos[nir_src_parent_instr(use)->index].instr_index <=
                infos[last->index].instr_index) {
               all_uses_after_last = false;
               break;
            }
         }
      }

      if (all_uses_after_last) {
         nir_instr *move_instr = instr;
         /* Set the iterator to the next instruction because we'll move
          * the current one.
          */
         instr = nir_instr_next(instr);

         /* Move the instruction after the last and update its index to
          * indicate that it's after it.
          */
         nir_instr_move(nir_after_instr(last), move_instr);
         infos[move_instr->index].instr_index =
            infos[last->index].instr_index + 1;
      }
   }

   struct check_sources_state state;
   state.infos = infos;
   state.block = first->block;
   state.first_instr_index = infos[first->index].instr_index;

   /* Walk the instruction range between the first and last forward, and move
    * those that have no sources within the range before the first one.
    */
   for (nir_instr *instr = nir_instr_next(first); instr != last;
        instr = nir_instr_next(instr)) {
      /* Only move instructions without side effects. */
      if (is_part_of_group(instr, first, 0, infos))
         continue;

      if (nir_foreach_src(instr, has_only_sources_less_than, &state)) {
         nir_instr *move_instr = instr;
         /* Set the last instruction because we'll delete the current one. */
         instr = nir_instr_prev(instr);

         /* Move the instruction before the first and update its index
          * to indicate that it's before it.
          */
         nir_instr_move(nir_before_instr(first), move_instr);
         infos[move_instr->index].instr_index =
            infos[first->index].instr_index - 1;
      }
   }
}

static bool
is_pseudo_inst(nir_instr *instr)
{
   /* Other instructions do not usually contribute to the shader binary size. */
   return instr->type != nir_instr_type_alu &&
          instr->type != nir_instr_type_call &&
          instr->type != nir_instr_type_cmat_call &&
          instr->type != nir_instr_type_tex &&
          instr->type != nir_instr_type_intrinsic;
}

static void
set_instr_indices(nir_block *block, instr_info *infos)
{
   /* Start with 1 because we'll move instructions before the first one
    * and will want to label it 0.
    */
   unsigned counter = 1;
   nir_instr *last = NULL;

   nir_foreach_instr(instr, block) {
      /* Make sure grouped instructions don't have the same index as pseudo
       * instructions.
       */
      if (last && is_pseudo_inst(last) && is_grouped_load(instr))
         counter++;

      /* Set each instruction's index within the block. */
      infos[instr->index].instr_index = counter;

      /* Only count non-pseudo instructions. */
      if (!is_pseudo_inst(instr))
         counter++;

      last = instr;
   }
}

static void
handle_load_range(nir_instr **first, nir_instr **last, nir_instr *current,
                  unsigned max_distance, instr_info *infos)
{
   assert(!current || !*first ||
          infos[current->index].instr_index >=
          infos[(*first)->index].instr_index);
   if (*first && *last &&
       (!current ||
        infos[current->index].instr_index -
        infos[(*first)->index].instr_index > max_distance)) {
      assert(*first != *last);
      group_loads(*first, *last, infos);
      set_instr_indices((*first)->block, infos);
      *first = NULL;
      *last = NULL;
   }
}

static bool
is_demote(nir_instr *instr)
{
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      if (intr->intrinsic == nir_intrinsic_terminate ||
          intr->intrinsic == nir_intrinsic_terminate_if ||
          intr->intrinsic == nir_intrinsic_demote ||
          intr->intrinsic == nir_intrinsic_demote_if)
         return true;
   }

   return false;
}

struct indirection_state {
   instr_info *infos;
   nir_block *block;
   unsigned indirections;
};

static unsigned
get_num_indirections(nir_instr *instr, instr_info *infos);

static bool
gather_indirections(nir_src *src, void *data)
{
   struct indirection_state *state = (struct indirection_state *)data;
   nir_instr *instr = nir_def_instr(src->ssa);

   /* We only count indirections within the same block. */
   if (instr->block == state->block) {
      unsigned indirections = get_num_indirections(nir_def_instr(src->ssa),
                                                   state->infos);

      if (instr->type == nir_instr_type_tex || is_grouped_load(instr))
         indirections++;

      state->indirections = MAX2(state->indirections, indirections);
   }

   return true; /* whether nir_foreach_src should keep going */
}

/* Return the number of load indirections within the block. */
static unsigned
get_num_indirections(nir_instr *instr, instr_info *infos)
{
   /* Don't traverse phis because we could end up in an infinite recursion
    * if the phi points to the current block (such as a loop body).
    */
   if (instr->type == nir_instr_type_phi)
      return 0;

   if (infos[instr->index].visited)
      return infos[instr->index].instr_index;

   struct indirection_state state;
   state.infos = infos;
   state.block = instr->block;
   state.indirections = 0;

   nir_foreach_src(instr, gather_indirections, &state);

   infos[instr->index].visited = true;
   infos[instr->index].instr_index = state.indirections;
   return state.indirections;
}

static void
process_block(nir_block *block, nir_load_grouping grouping,
              unsigned max_distance, instr_info *infos)
{
   int max_indirection = -1;
   unsigned num_inst_per_level[256] = { 0 };

   for (unsigned i = 0; i < block->end_ip + 1 - block->start_ip; i++) {
      infos[block->start_ip + i].visited = false;
   }

   /* Count the number of load indirections for each load instruction
    * within this block.
    */
   nir_foreach_instr(instr, block) {
      if (is_grouped_load(instr)) {
         unsigned indirections = get_num_indirections(instr, infos);

         num_inst_per_level[indirections]++;
         infos[instr->index].indirection_level = indirections;

         max_indirection = MAX2(max_indirection, (int)indirections);
      }
   }

   /* Each indirection level is grouped. */
   for (int level = 0; level <= max_indirection; level++) {
      if (num_inst_per_level[level] <= 1)
         continue;

      set_instr_indices(block, infos);

      opaque_resource *resource = NULL;
      nir_instr *first_load = NULL, *last_load = NULL;

      /* Find the first and last instruction that use the same
       * resource and are within a certain distance of each other.
       * If found, group them by moving all movable instructions
       * between them out.
       */
      nir_foreach_instr(current, block) {
         /* Don't group across terminate. */
         if (is_demote(current)) {
            /* Group unconditionally.  */
            handle_load_range(&first_load, &last_load, NULL, 0, infos);
            first_load = NULL;
            last_load = NULL;
            continue;
         }

         /* Only group load instructions with the same indirection level. */
         if (is_part_of_group(current, NULL, level, infos)) {
            opaque_resource *current_resource;

            switch (grouping) {
            case nir_group_all:
               if (!first_load)
                  first_load = current;
               else
                  last_load = current;
               break;

            case nir_group_same_resource_only:
               current_resource = get_load_resource(current);

               if (current_resource) {
                  if (!first_load) {
                     first_load = current;
                     resource = current_resource;
                  } else if (current_resource == resource) {
                     last_load = current;
                  }
               }
            }
         }

         /* Group only if we exceeded the maximum distance. */
         handle_load_range(&first_load, &last_load, current, max_distance,
                           infos);
      }

      /* Group unconditionally.  */
      handle_load_range(&first_load, &last_load, NULL, 0, infos);
   }
}

/* max_distance is the maximum distance between the first and last instruction
 * in a group.
 */
bool
nir_opt_group_loads(nir_shader *shader, nir_load_grouping grouping,
                    unsigned max_distance)
{
   /* Temporary space for instruction info. */
   struct util_dynarray infos_scratch = UTIL_DYNARRAY_INIT;

   nir_foreach_function_impl(impl, shader) {
      nir_metadata_require(impl, nir_metadata_instr_index);

      unsigned num_instr =
         nir_impl_last_block(impl)->end_ip + 1; /* we might need 1 more */
      instr_info *infos =
         (instr_info*)util_dynarray_resize(&infos_scratch, instr_info,
                                           num_instr);

      nir_foreach_block(block, impl) {
         process_block(block, grouping, max_distance, infos);
      }

      nir_progress(true, impl,
                   nir_metadata_control_flow | nir_metadata_loop_analysis);
   }

   util_dynarray_fini(&infos_scratch);
   return true;
}