/*
 * Copyright © 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_shader.h"
#include "brw_builder.h"

static unsigned
dest_comps_for_txf(const brw_shader &s, const brw_inst *txf)
{
   if (!txf)
      return 0;

   const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);
   const unsigned per_component_regs =
      DIV_ROUND_UP(brw_type_size_bytes(txf->dst.type) *
                   txf->exec_size, grf_size);
   const unsigned dest_regs = txf->size_written / grf_size;
   const unsigned dest_comps = dest_regs / per_component_regs;
   return dest_comps;
}

static bool
is_def(const brw_def_analysis &defs, const brw_reg &r)
{
   return r.file == IMM || r.file == BAD_FILE || defs.get(r) != NULL;
}

static bool
is_uniform_def(const brw_def_analysis &defs, const brw_reg &r)
{
   return is_def(defs, r) && is_uniform(r);
}

/**
 * Check if two texture instructions have a matching source (either the same
 * immediate value, or both references to the same immutable SSA def and
 * with matching source modifiers and regions).
 */
static bool
sources_match(ASSERTED const brw_def_analysis &defs,
              const brw_inst *a, const brw_inst *b, enum tex_logical_srcs src)
{
   assert(is_def(defs, a->src[src]));
   assert(is_def(defs, b->src[src]));
   return brw_regs_equal(&a->src[src], &b->src[src]);
}

static void
merge_instructions(brw_shader &s, brw_tex_inst **txfs, unsigned count)
{
   const unsigned min_simd = 8 * reg_unit(s.devinfo);
   const unsigned max_simd = 16 * reg_unit(s.devinfo);
   const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);

   for (unsigned curr = 0; curr < count; curr += max_simd) {
      const unsigned lanes = CLAMP(count - curr, min_simd, max_simd);
      const unsigned width = util_next_power_of_two(lanes);
      const brw_builder ubld =
         brw_builder(&s).before(txfs[curr]).exec_all().group(width, 0);
      const brw_builder ubld1 = ubld.group(1, 0);

      enum brw_reg_type coord_type =
         txfs[curr]->src[TEX_LOGICAL_SRC_PAYLOAD0].type;
      brw_reg coord = ubld.vgrf(coord_type);
      brw_reg coord_comps[32];

      for (unsigned i = 0; i < width; i++) {
            /* Our block size might be larger than the number of convergent
             * loads we're combining.  If so, repeat the last component.
             */
         if (txfs[curr+i])
            coord_comps[i] = txfs[curr+i]->src[TEX_LOGICAL_SRC_PAYLOAD0];
         else
            coord_comps[i] = coord_comps[i-1];
         }
      ubld1.VEC(coord, coord_comps, width);

      brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
      srcs[TEX_LOGICAL_SRC_SURFACE] = txfs[0]->src[TEX_LOGICAL_SRC_SURFACE];
      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
      srcs[TEX_LOGICAL_SRC_PAYLOAD0] = coord;
      for (unsigned i = TEX_LOGICAL_SRC_PAYLOAD1; i < txfs[0]->sources; i++)
         srcs[i] = txfs[0]->src[i];

      /* Each of our txf may have a reduced response length if some
       * components are never read.  Use the maximum of the sizes.
       */
      unsigned new_dest_comps = 0;
      for (unsigned i = 0; i < width; i++) {
         const unsigned this_comps = dest_comps_for_txf(s, txfs[curr+i]);
         new_dest_comps = MAX2(new_dest_comps, this_comps);
      }

      /* Emit the new divergent TXF */
      brw_reg div = ubld.vgrf(BRW_TYPE_UD, new_dest_comps);
      brw_tex_inst *div_txf =
         ubld.emit(SHADER_OPCODE_SAMPLER, div, srcs, txfs[0]->sources)->as_tex();
      div_txf->surface_bindless = txfs[0]->surface_bindless;
      div_txf->sampler_opcode = txfs[0]->sampler_opcode;
      div_txf->residency = false;

      /* Update it to also use response length reduction */
      const unsigned per_component_regs =
         DIV_ROUND_UP(brw_type_size_bytes(div.type) * div_txf->exec_size,
                      grf_size);
      div_txf->size_written = new_dest_comps * per_component_regs * grf_size;

      for (unsigned i = 0; i < width; i++) {
         brw_inst *txf = txfs[curr+i];
         if (!txf)
            break;

         const brw_builder ibld = brw_builder(txf);

         /* Replace each of the original TXFs with MOVs from our new one */
         const unsigned dest_comps = dest_comps_for_txf(s, txf);
         assert(dest_comps <= 4);

         brw_reg v[4];
         for (unsigned c = 0; c < dest_comps; c++)
            v[c] = component(offset(div, ubld, c), i);
         ibld.VEC(retype(txf->dst, BRW_TYPE_UD), v, dest_comps);

         txf->remove();
      }
   }
}

/**
 * Look for a series of convergent texture buffer fetches within a basic
 * block and combine them into a single divergent load with one lane for
 * each original fetch.  For example, this series of convergent fetches:
 *
 *   txf(16) %12:UD, coord = 12d, lod = 0u, handle = %1<0>:D
 *   txf(16) %13:UD, coord = 13d, lod = 0u, handle = %1<0>:D
 *   txf(16) %14:UD, coord = 14d, lod = 0u, handle = %1<0>:D
 *   txf(16) %15:UD, coord = 15d, lod = 0u, handle = %1<0>:D
 *   txf(16) %16:UD, coord = 16d, lod = 0u, handle = %1<0>:D
 *   txf(16) %17:UD, coord = 17d, lod = 0u, handle = %1<0>:D
 *   txf(16) %18:UD, coord = 18d, lod = 0u, handle = %1<0>:D
 *   txf(16) %19:UD, coord = 19d, lod = 0u, handle = %1<0>:D
 *
 * can be combined into a single divergent load and scalar-expansion moves
 * (which can easily be copy propagated away):
 *
 *   load_payload(1) %2:D 12d, 13d, 14d, 15d, 16d, 17d, 18d, 19d
 *   txf(8) %3:UD, coord = %2, lod = 0u, handle = %1<0>:D
 *   mov(16) %12:UD, %3+0.0<0>:UD
 *   ...
 *   mov(16) %19:UD, %3+0.28<0>:UD
 *
 * Our sampler hardware doesn't have any special support for convergent
 * loads (like LSC transpose/block loads), and always performs SIMD8/16/32
 * per-channel loads.  But with this trick, we can still combine multiple
 * convergent loads into a single message with fewer round-trips, and much
 * lower register pressure.
 */
bool
brw_opt_combine_convergent_txf(brw_shader &s)
{
   const brw_def_analysis &defs = s.def_analysis.require();

   bool progress = false;

   foreach_block(block, s.cfg) {
      /* Gather a list of convergent TXFs to the same surface in this block */
      brw_tex_inst *txfs_ld[32] = {};
      brw_tex_inst *txfs_ld_lz[32] = {};
      unsigned ld_count = 0;
      unsigned ld_lz_count = 0;

      foreach_inst_in_block(brw_inst, inst, block) {
         brw_tex_inst *tex = inst->as_tex();
         if (tex == NULL)
            continue;

         if (tex->sampler_opcode != BRW_SAMPLER_OPCODE_LD &&
             tex->sampler_opcode != BRW_SAMPLER_OPCODE_LD_LZ)
            continue;

         /* Only handle buffers or single miplevel 1D images for now */
         if (tex->coord_components > 1)
            continue;

         if (tex->residency)
            continue;

         if (tex->predicate || tex->force_writemask_all)
            continue;

         if (!is_uniform_def(defs, tex->src[TEX_LOGICAL_SRC_SURFACE]))
            continue;

         /* Only handle immediates for now: we could check is_uniform(),
          * but we'd need to ensure the coordinate's definition reaches
          * txfs[0] which is where we'll insert the combined coordinate.
          */
         if (tex->src[TEX_LOGICAL_SRC_PAYLOAD0].file != IMM)
            continue;

         brw_tex_inst *tex0 = tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD ?
            txfs_ld[0] : txfs_ld_lz[0];

         if (tex0 != NULL) {
            if (!sources_match(defs, tex, tex0, TEX_LOGICAL_SRC_SURFACE) ||
                tex->surface_bindless != tex0->surface_bindless)
               continue;

            if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD) {
               if (ld_count > 0 &&
                   !sources_match(defs, tex, tex0, TEX_LOGICAL_SRC_PAYLOAD2))
                  continue;
            }
         }

         if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD)
            txfs_ld[ld_count++] = tex;
         if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD_LZ)
            txfs_ld_lz[ld_lz_count++] = tex;

         if (ld_count == ARRAY_SIZE(txfs_ld) ||
             ld_lz_count == ARRAY_SIZE(txfs_ld_lz))
            break;
      }

      /* Emit divergent TXFs and replace the original ones with MOVs */
      if (ld_count >= 2) {
         merge_instructions(s, txfs_ld, ld_count);
         progress = true;
      }
      if (ld_lz_count >= 2) {
         merge_instructions(s, txfs_ld_lz, ld_lz_count);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);

   return progress;
}