/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/* Wa_14015360517
 *
 * The first instruction of any kernel should have a non-zero emask.
 * Make sure this happens by introducing a dummy mov instruction.
 */
bool
brw_fs_workaround_emit_dummy_mov_instruction(fs_visitor &s)
{
   if (!intel_needs_workaround(s.devinfo, 14015360517))
      return false;

   fs_inst *first_inst =
      s.cfg->first_block()->start();

   /* We can skip the WA if the first instruction is marked with
    * force_writemask_all or its exec_size equals dispatch_width.
    */
   if (first_inst->force_writemask_all ||
       first_inst->exec_size == s.dispatch_width)
      return false;

   /* Insert a dummy mov as the first instruction. */
   const fs_builder ubld =
      fs_builder(&s, s.cfg->first_block(), (fs_inst *)first_inst).exec_all().group(8, 0);
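   /* The exec_all() builder marks the MOV as NoMask (force_writemask_all),
    * so it executes with a full execution mask regardless of the dispatch
    * mask, which is what gives the kernel a non-zero emask on its first
    * instruction.
    */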
   ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u));

   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
   return true;
}

static bool
needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
{
   /* This workaround is about making sure that any instruction writing
    * through UGM has completed before we hit EOT.
    */
   if (inst->sfid != GFX12_SFID_UGM)
      return false;

   /* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
    * where the L1-cache override is NOT among {WB, WS, WT}
    */
   enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
   if (lsc_opcode_is_store(opcode)) {
      switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
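      /* These appear to be exactly the L1 overrides among {WB, WS, WT}
       * (plus the default MOCS state), for which the workaround does not
       * require a fence.
       */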
      case LSC_CACHE_STORE_L1STATE_L3MOCS:
      case LSC_CACHE_STORE_L1WB_L3WB:
      case LSC_CACHE_STORE_L1S_L3UC:
      case LSC_CACHE_STORE_L1S_L3WB:
      case LSC_CACHE_STORE_L1WT_L3UC:
      case LSC_CACHE_STORE_L1WT_L3WB:
         return false;

      default:
         return true;
      }
   }

   /* Any UGM Atomic message WITHOUT return value */
   if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
      return true;

   return false;
}

/* Wa_22013689345
 *
 * We need to emit a UGM fence message before EOT if the shader has any UGM
 * write or atomic message.
 *
 * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
 * We probably need a better criterion in needs_dummy_fence().
 */
bool
brw_fs_workaround_memory_fence_before_eot(fs_visitor &s)
{
   bool progress = false;
   bool has_ugm_write_or_atomic = false;

   if (!intel_needs_workaround(s.devinfo, 22013689345))
      return false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (!inst->eot) {
         if (needs_dummy_fence(s.devinfo, inst))
            has_ugm_write_or_atomic = true;
         continue;
      }

      if (!has_ugm_write_or_atomic)
         break;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(1, 0);

      brw_reg dst = ubld.vgrf(BRW_TYPE_UD);
      fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
                                       dst, brw_vec8_grf(0, 0),
                                       /* commit enable */ brw_imm_ud(1),
                                       /* bti */ brw_imm_ud(0));
      dummy_fence->sfid = GFX12_SFID_UGM;
      dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE,
                                             LSC_FLUSH_TYPE_NONE_6, false);
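      /* The scheduling fence sources the memory fence's destination
       * register, so the EOT message below cannot be issued until the UGM
       * fence has written back its commit.
       */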
      ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
      progress = true;
      /* TODO: remove this break if we ever have a shader with multiple EOTs. */
      break;
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
                            DEPENDENCY_VARIABLES);
   }

   return progress;
}

/**
 * Find the first instruction in the program that might start a region of
 * divergent control flow due to a HALT jump. There is no
 * find_halt_control_flow_region_end(); the region of divergence extends
 * until the only SHADER_OPCODE_HALT_TARGET in the program.
 */
static const fs_inst *
find_halt_control_flow_region_start(const fs_visitor *v)
{
   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
      if (inst->opcode == BRW_OPCODE_HALT ||
          inst->opcode == SHADER_OPCODE_HALT_TARGET)
         return inst;
   }

   return NULL;
}

/**
 * Work around the Gfx12 hardware bug filed as Wa_1407528679. EU fusion
 * can cause a BB to be executed with all channels disabled, which will lead
 * to the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down. This may break
 * assumptions of some NoMask SEND messages whose descriptor depends on data
 * generated by live invocations of the shader.
 *
 * This pass avoids the problem by predicating certain instructions on an ANY
 * horizontal predicate that makes sure that their execution is omitted when
 * all channels of the program are disabled.
 */
bool
brw_fs_workaround_nomask_control_flow(fs_visitor &s)
{
   if (s.devinfo->ver != 12)
      return false;

   const brw_predicate pred = s.dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              s.dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
                              BRW_PREDICATE_ALIGN1_ANY8H;
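   /* The ANYnH predicates pass when any channel in an n-wide horizontal
    * group of the flag register is enabled, so the group size is matched
    * to the dispatch width of the shader.
    */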
   const fs_inst *halt_start = find_halt_control_flow_region_start(&s);
   unsigned depth = 0;
   bool progress = false;

   const fs_live_variables &live_vars = s.live_analysis.require();

   /* Scan the program backwards in order to be able to easily determine
    * whether the flag register is live at any point.
    */
   foreach_block_reverse_safe(block, s.cfg) {
      BITSET_WORD flag_liveout = live_vars.block_data[block->num]
                                    .flag_liveout[0];
      STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         if (!inst->predicate && inst->exec_size >= 8)
            flag_liveout &= ~inst->flags_written(s.devinfo);

         switch (inst->opcode) {
         case BRW_OPCODE_DO:
         case BRW_OPCODE_IF:
            /* Note that this doesn't handle BRW_OPCODE_HALT since only
             * the first one in the program closes the region of divergent
             * control flow due to any HALT instructions -- instead this is
             * handled with the halt_start check below.
             */
            depth--;
            break;

         case BRW_OPCODE_WHILE:
         case BRW_OPCODE_ENDIF:
         case SHADER_OPCODE_HALT_TARGET:
            depth++;
            break;

         default:
            /* Note that the vast majority of NoMask SEND instructions in the
             * program are harmless while executed in a block with all
             * channels disabled, since any instructions with side effects
             * that we could hit here should be execution-masked.
             *
             * The main concern is NoMask SEND instructions where the message
             * descriptor or header depends on data generated by live
             * invocations of the shader (RESINFO and
             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
             * computed surface index seem to be the only examples right now
             * where this could easily lead to GPU hangs). Unfortunately we
             * have no straightforward way to detect that currently, so just
             * predicate any NoMask SEND instructions we find under control
             * flow.
             *
             * If this proves to have a measurable performance impact it can
             * be easily extended with a whitelist of messages we know we can
             * safely omit the predication for.
             */
            if (depth && inst->force_writemask_all &&
                is_send(inst) && !inst->predicate &&
                !inst->has_no_mask_send_params) {
               /* We need to load the execution mask into the flag register
                * by using a builder with a channel group matching the whole
                * shader (rather than the default, which is derived from the
                * original instruction), in order to avoid getting a
                * right-shifted value.
                */
               const fs_builder ubld = fs_builder(&s, block, inst)
                                       .exec_all().group(s.dispatch_width, 0);
               const brw_reg flag = retype(brw_flag_reg(0, 0),
                                           BRW_TYPE_UD);

               /* Due to the lack of flag register allocation we need to
                * save and restore the flag register if it's live.
                */
               const bool save_flag = flag_liveout &
                                      brw_fs_flag_mask(flag, s.dispatch_width / 8);
               const brw_reg tmp = ubld.group(8, 0).vgrf(flag.type);

               if (save_flag) {
                  ubld.group(8, 0).UNDEF(tmp);
                  ubld.group(1, 0).MOV(tmp, flag);
               }

               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
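               /* LOAD_LIVE_CHANNELS leaves the mask of live channels in f0,
                * which the ANY predicate applied below tests, so the send
                * is only executed when at least one channel is alive.
                */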
               set_predicate(pred, inst);
               inst->flag_subreg = 0;
               inst->predicate_trivial = true;

               if (save_flag)
                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);

               progress = true;
            }
            break;
         }

         if (inst == halt_start)
            depth--;

         flag_liveout |= inst->flags_read(s.devinfo);
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * flags_read() and flags_written() return flag access with byte granularity,
 * but for the Flag Register the PRM lists "Access Granularity: Word", so we
 * can assume that accessing any part of a word will clear its register
 * dependency.
 */
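/* A quick example: a byte mask of 0x2 (byte 1 only) has its set bit in
 * 0xaaaaaaaa, so the result below is 0x2 | (0x2 >> 1) = 0x3, marking both
 * bytes of word 0 as accessed.
 */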
static unsigned
bytes_bitmask_to_words(unsigned b)
{
   unsigned first_byte_mask = b & 0x55555555;
   unsigned second_byte_mask = b & 0xaaaaaaaa;
   return first_byte_mask |
          (first_byte_mask << 1) |
          second_byte_mask |
          (second_byte_mask >> 1);
}

/**
 * WaClearArfDependenciesBeforeEot
 *
 * Flag register dependencies are not cleared after EOT, so we have to source
 * them before EOT. We can do this with a simple `mov(1) nullUD, f{0,1}UD`.
 *
 * To avoid emitting MOVs when they're not needed, check whether each block
 * reads all the flags it sets. We might falsely determine a register as
 * unread if it's only accessed in a later block, but this should still be
 * good enough.
 */
bool
brw_fs_workaround_source_arf_before_eot(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->ver != 9)
      return false;

   unsigned flags_unread = 0;

   foreach_block(block, s.cfg) {
      unsigned flags_unread_in_block = 0;

      foreach_inst_in_block(fs_inst, inst, block) {
         /* An instruction can read and write the same flag, so the order
          * of these two updates matters.
          */
         flags_unread_in_block &= ~bytes_bitmask_to_words(inst->flags_read(s.devinfo));
         flags_unread_in_block |= bytes_bitmask_to_words(inst->flags_written(s.devinfo));

         /* HALT does not start its own block even though it can leave a
          * dependency.
          */
         if (inst->opcode == BRW_OPCODE_HALT ||
             inst->opcode == SHADER_OPCODE_HALT_TARGET) {
            flags_unread |= flags_unread_in_block;
            flags_unread_in_block = 0;
         }
      }

      flags_unread |= flags_unread_in_block;
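      /* Bits 0-3 of the mask track the bytes of f0 and bits 4-7 track f1;
       * once both flag registers have potentially unread writes there is
       * nothing left to learn, so stop scanning early.
       */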
      if ((flags_unread & 0x0f) && (flags_unread & 0xf0))
         break;
   }

   if (flags_unread) {
      int eot_count = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
         if (!inst->eot)
            continue;

         /* Currently we always emit only one EOT per program; this WA
          * should be updated if that ever changes.
          */
         assert(++eot_count == 1);

         const fs_builder ibld(&s, block, inst);
         const fs_builder ubld = ibld.exec_all().group(1, 0);

         if (flags_unread & 0x0f)
            ubld.MOV(ubld.null_reg_ud(), retype(brw_flag_reg(0, 0), BRW_TYPE_UD));

         if (flags_unread & 0xf0)
            ubld.MOV(ubld.null_reg_ud(), retype(brw_flag_reg(1, 0), BRW_TYPE_UD));
      }

      progress = true;
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   return progress;
}