mesa/src/amd/llvm/ac_llvm_helper.cpp
Antonio Ospite ddf2aa3a4d build: avoid redefining unreachable() which is standard in C23
In the C23 standard unreachable() is now a predefined function-like
macro in <stddef.h>

See https://android.googlesource.com/platform/bionic/+/HEAD/docs/c23.md#is-now-a-predefined-function_like-macro-in

And this causes build errors when building for C23:

-----------------------------------------------------------------------
In file included from ../src/util/log.h:30,
                 from ../src/util/log.c:30:
../src/util/macros.h:123:9: warning: "unreachable" redefined
  123 | #define unreachable(str)    \
      |         ^~~~~~~~~~~
In file included from ../src/util/macros.h:31:
/usr/lib/gcc/x86_64-linux-gnu/14/include/stddef.h:456:9: note: this is the location of the previous definition
  456 | #define unreachable() (__builtin_unreachable ())
      |         ^~~~~~~~~~~
-----------------------------------------------------------------------

So don't redefine it with the same name, but use the name UNREACHABLE()
to also signify it's a macro.

Using a different name also makes sense because the macro's behavior
extended that of __builtin_unreachable() anyway, and its signature
differed too: it accepts one argument, whereas the standard
unreachable() takes none.

This change improves the chances of building mesa with the C23 standard,
which for instance is the default in recent AOSP versions.

All the instances of the macro, including the definition, were updated
with the following command line:

  git grep -l '[^_]unreachable(' -- "src/**" | sort | uniq | \
  while read file; \
  do \
    sed -e 's/\([^_]\)unreachable(/\1UNREACHABLE(/g' -i "$file"; \
  done && \
  sed -e 's/#undef unreachable/#undef UNREACHABLE/g' -i src/intel/isl/isl_aux_info.c

Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36437>
2025-07-31 17:49:42 +00:00

438 lines
13 KiB
C++

/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#include <llvm-c/Core.h>
#include <llvm/Analysis/TargetLibraryInfo.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/MC/MCSubtargetInfo.h>
#include <llvm/Support/CommandLine.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/CodeGen/Passes.h>
#include <llvm/Passes/PassBuilder.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/IPO/SCCP.h>
#include <llvm/Transforms/Scalar/EarlyCSE.h>
#include <llvm/Transforms/Scalar/LICM.h>
#include <llvm/Transforms/Scalar/SROA.h>
#include <llvm/Transforms/Scalar/SimplifyCFG.h>
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include <cstring>
/* DO NOT REORDER THE HEADERS
* The LLVM headers need to all be included before any Mesa header,
* as they use the `restrict` keyword in ways that are incompatible
* with our #define in include/c99_compat.h
*/
#include "ac_binary.h"
#include "ac_llvm_util.h"
#include "ac_llvm_build.h"
#include "util/macros.h"
using namespace llvm;
/* Dummy SDNode subclass whose only purpose is to execute the code paths that
 * declare LLVM's SelectionDAG static variables; see
 * ac_llvm_run_atexit_for_destructors() below for why that matters. */
class RunAtExitForStaticDestructors : public SDNode
{
public:
/* getSDVTList (protected) calls getValueTypeList (private), which contains static variables. */
RunAtExitForStaticDestructors(): SDNode(0, 0, DebugLoc(), getSDVTList(MVT::Other))
{
}
};
/* Make LLVM register its static-variable destructors with atexit() now.
 *
 * LLVM >= 16 registers static variable destructors on the first compile, which
 * gcc implements by calling atexit() at that point. Before that, u_queue
 * registers its own atexit handler to kill all threads. Since exit() runs
 * atexit handlers in reverse registration order, the LLVM destructors would
 * otherwise run first while shader compiler threads may still be alive, which
 * crashes inside LLVM in SelectionDAG.cpp.
 *
 * By executing the code that declares the LLVM static variables here, LLVM's
 * atexit handler is registered before u_queue's, so all u_queue threads are
 * terminated before the LLVM destructors run.
 */
void ac_llvm_run_atexit_for_destructors(void)
{
   /* Constructing (and immediately destroying) this dummy node is enough to
    * trigger the declaration of the static variables inside LLVM. */
   (void)RunAtExitForStaticDestructors();
}
/* Return true if "processor" names a CPU string known to the target machine's
 * subtarget info. */
bool ac_is_llvm_processor_supported(LLVMTargetMachineRef tm, const char *processor)
{
   auto *machine = reinterpret_cast<TargetMachine *>(tm);
   const MCSubtargetInfo *subtarget = machine->getMCSubtargetInfo();
   return subtarget->isCPUStringValid(processor);
}
void ac_reset_llvm_all_options_occurrences()
{
cl::ResetAllOptionOccurrences();
}
/* Mark the function argument "val" as dereferenceable for "bytes" bytes. */
void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes)
{
   Argument *arg = unwrap<Argument>(val);
   auto attr = Attribute::getWithDereferenceableBytes(arg->getContext(), bytes);
   arg->addAttr(attr);
}
/* Attach an alignment attribute of "bytes" bytes to the function argument
 * "val". */
void ac_add_attr_alignment(LLVMValueRef val, uint64_t bytes)
{
   Argument *arg = unwrap<Argument>(val);
   auto attr = Attribute::getWithAlignment(arg->getContext(), Align(bytes));
   arg->addAttr(attr);
}
/* Create an empty module named "mesa-shader" in "ctx", configured with the
 * target triple and data layout of the given target machine. */
LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx)
{
   auto *machine = reinterpret_cast<TargetMachine *>(tm);
   LLVMModuleRef mod = LLVMModuleCreateWithNameInContext("mesa-shader", ctx);
   Module *m = unwrap(mod);

#if LLVM_VERSION_MAJOR >= 21
   /* Newer LLVM takes the Triple object directly. */
   m->setTargetTriple(machine->getTargetTriple());
#else
   m->setTargetTriple(machine->getTargetTriple().getTriple());
#endif
   m->setDataLayout(machine->createDataLayout());
   return mod;
}
/* Create an IR builder in "ctx" whose fast-math flags match "float_mode".
 * Only AC_FLOAT_MODE_DEFAULT_OPENGL sets any flags; the other modes leave
 * the builder at its defaults. */
LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, enum ac_float_mode float_mode)
{
   LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx);

   if (float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
      FastMathFlags fmf;
      /* nsz: the sign of a zero argument or result is insignificant. */
      fmf.setNoSignedZeros();
      /* arcp: the reciprocal of an argument may be used instead of division. */
      fmf.setAllowReciprocal();
      unwrap(builder)->setFastMathFlags(fmf);
   }
   return builder;
}
void ac_enable_signed_zeros(struct ac_llvm_context *ctx)
{
if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
auto *b = unwrap(ctx->builder);
FastMathFlags flags = b->getFastMathFlags();
/* This disables the optimization of (x + 0), which is used
* to convert negative zero to positive zero.
*/
flags.setNoSignedZeros(false);
b->setFastMathFlags(flags);
}
}
void ac_disable_signed_zeros(struct ac_llvm_context *ctx)
{
if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
auto *b = unwrap(ctx->builder);
FastMathFlags flags = b->getFastMathFlags();
flags.setNoSignedZeros();
b->setFastMathFlags(flags);
}
}
/* Implementation of raw_pwrite_stream that works on malloc()ed memory for
 * better compatibility with C code. */
struct raw_memory_ostream : public raw_pwrite_stream {
/* malloc()ed backing store; ownership is handed to the caller by take() */
char *buffer;
/* bytes written so far (== current stream position) */
size_t written;
/* current allocation size of `buffer`, always >= written */
size_t bufsize;
raw_memory_ostream()
{
buffer = NULL;
written = 0;
bufsize = 0;
/* Write straight through to `buffer`; no internal raw_ostream buffering. */
SetUnbuffered();
}
~raw_memory_ostream()
{
/* free(NULL) is a no-op, so this is safe after take(). */
free(buffer);
}
/* Transfer ownership of the accumulated buffer and its size to the caller
 * (who must free() it) and reset the stream to the empty state. */
void take(char *&out_buffer, size_t &out_size)
{
out_buffer = buffer;
out_size = written;
buffer = NULL;
written = 0;
bufsize = 0;
}
/* Flushing an unbuffered stream makes no sense; forbid it at compile time. */
void flush() = delete;
/* Append `size` bytes, growing the allocation (at least 1024 bytes, and by
 * a 4/3 factor) to amortize realloc() cost. Aborts on overflow or OOM. */
void write_impl(const char *ptr, size_t size) override
{
/* Detect size_t wraparound of written + size. */
if (unlikely(written + size < written))
abort();
if (written + size > bufsize) {
bufsize = MAX3(1024, written + size, bufsize / 3 * 4);
buffer = (char *)realloc(buffer, bufsize);
if (!buffer) {
fprintf(stderr, "amd: out of memory allocating ELF buffer\n");
abort();
}
}
memcpy(buffer + written, ptr, size);
written += size;
}
/* Overwrite bytes at `offset`; the asserted precondition is that the whole
 * [offset, offset + size) range was already written. */
void pwrite_impl(const char *ptr, size_t size, uint64_t offset) override
{
assert(offset == (size_t)offset && offset + size >= offset && offset + size <= written);
memcpy(buffer + offset, ptr, size);
}
/* Current stream position == total bytes written so far. */
uint64_t current_pos() const override
{
return written;
}
};
/* The middle-end optimization passes are run using
 * the LLVM's new pass manager infrastructure.
 */
struct ac_midend_optimizer
{
TargetMachine *target_machine;
PassBuilder pass_builder;
TargetLibraryInfoImpl target_library_info;
/* Should be declared in this order only,
 * so that they are destroyed in the correct order
 * due to inter-analysis-manager references.
 */
LoopAnalysisManager loop_am;
FunctionAnalysisManager function_am;
CGSCCAnalysisManager cgscc_am;
ModuleAnalysisManager module_am;
/* Pass Managers */
LoopPassManager loop_pm;
FunctionPassManager function_pm;
ModulePassManager module_pm;
/* Build mesa's fixed middle-end pipeline for the given target machine.
 * arg_check_ir: when true, an IR VerifierPass runs before any optimization. */
ac_midend_optimizer(TargetMachine *arg_target_machine, bool arg_check_ir)
: target_machine(arg_target_machine),
pass_builder(target_machine, PipelineTuningOptions(), {}),
target_library_info(Triple(target_machine->getTargetTriple()))
{
/* Build the pipeline and optimize.
 * Any custom analyses should be registered
 * before LLVM's default analysis sets.
 */
function_am.registerPass(
[&] { return TargetLibraryAnalysis(target_library_info); }
);
pass_builder.registerModuleAnalyses(module_am);
pass_builder.registerCGSCCAnalyses(cgscc_am);
pass_builder.registerFunctionAnalyses(function_am);
pass_builder.registerLoopAnalyses(loop_am);
pass_builder.crossRegisterProxies(loop_am, function_am, cgscc_am, module_am);
if (arg_check_ir)
module_pm.addPass(VerifierPass());
/* Adding inliner pass to the module pass manager directly
 * ensures that the pass is run on all functions first, which makes sure
 * that the following passes are only run on the remaining non-inline
 * function, so it removes useless work done on dead inline functions.
 */
module_pm.addPass(AlwaysInlinerPass());
/* The following set of passes run on an individual function/loop first
 * before proceeding to the next.
 */
function_pm.addPass(SROAPass(SROAOptions::ModifyCFG));
loop_pm.addPass(LICMPass(LICMOptions()));
/* The boolean enables MemorySSA for the loop pipeline (per the LLVM
 * createFunctionToLoopPassAdaptor API). */
function_pm.addPass(createFunctionToLoopPassAdaptor(std::move(loop_pm), true));
function_pm.addPass(SimplifyCFGPass());
/* The boolean selects the MemorySSA-backed EarlyCSE variant. */
function_pm.addPass(EarlyCSEPass(true));
module_pm.addPass(createModuleToFunctionPassAdaptor(std::move(function_pm)));
}
/* Run the whole middle-end pipeline on `module`. May be called for several
 * modules in sequence; stale analysis results are purged after each run. */
void run(Module &module)
{
module_pm.run(module, module_am);
/* After a run(), the results in the analyses managers
 * aren't useful to optimize a subsequent LLVM module.
 * If used, it can lead to unexpected crashes.
 * Hence, the results in the analyses managers
 * need to be invalidated and cleared before
 * running optimizations on a new LLVM module.
 */
module_am.invalidate(module, PreservedAnalyses::none());
module_am.clear();
cgscc_am.clear();
function_am.clear();
loop_am.clear();
}
};
/* The backend passes for optimizations, instruction selection,
 * and code generation in the LLVM compiler still requires the
 * legacy::PassManager. The use of the legacy PM will be
 * deprecated when the new PM can handle backend passes.
 */
struct ac_backend_optimizer
{
raw_memory_ostream ostream; /* ELF shader binary stream */
legacy::PassManager backend_pass_manager; /* for codegen only */
/* Set up the codegen passes that emit an ELF object file into `ostream`.
 * NOTE(review): on addPassesToEmitFile() failure this only prints to stderr;
 * a later run() would then hand back an empty binary with no error
 * indication to the caller — confirm this is the intended behavior. */
ac_backend_optimizer(TargetMachine *arg_target_machine)
{
/* add backend passes */
if (arg_target_machine->addPassesToEmitFile(backend_pass_manager, ostream, nullptr,
CodeGenFileType::ObjectFile)) {
fprintf(stderr, "amd: TargetMachine can't emit a file of this type!\n");
}
}
/* Run codegen on `module` and transfer the resulting ELF buffer (malloc()ed,
 * caller frees) into out_buffer/out_size. */
void run(Module &module, char *&out_buffer, size_t &out_size)
{
backend_pass_manager.run(module);
ostream.take(out_buffer, out_size);
}
};
/* Allocate a middle-end optimizer for the given target machine.
 * check_ir: prepend an IR verifier pass to the pipeline.
 * The result must be freed with ac_destroy_midend_optimiser(). */
ac_midend_optimizer *ac_create_midend_optimizer(LLVMTargetMachineRef tm,
bool check_ir)
{
   auto *machine = reinterpret_cast<TargetMachine *>(tm);
   return new ac_midend_optimizer(machine, check_ir);
}
/* Free an optimizer created by ac_create_midend_optimizer(). NULL is safe.
 * (The "optimiser" spelling is part of the public name and must stay.) */
void ac_destroy_midend_optimiser(ac_midend_optimizer *meo)
{
delete meo;
}
/* Run the middle-end optimization pipeline on `module` (no code generation).
 * Returns false when no optimizer was supplied, true otherwise. */
bool ac_llvm_optimize_module(ac_midend_optimizer *meo, LLVMModuleRef module)
{
   if (meo == nullptr)
      return false;

   Module &m = *unwrap(module);
   meo->run(m);
   return true;
}
/* Allocate a backend (codegen) optimizer for the given target machine.
 * The result must be freed with ac_destroy_backend_optimizer(). */
ac_backend_optimizer *ac_create_backend_optimizer(LLVMTargetMachineRef tm)
{
   auto *machine = reinterpret_cast<TargetMachine *>(tm);
   return new ac_backend_optimizer(machine);
}
/* Free an optimizer created by ac_create_backend_optimizer(). NULL is safe. */
void ac_destroy_backend_optimizer(ac_backend_optimizer *beo)
{
delete beo;
}
/* Run backend optimizations + code generation on `module`, storing the
 * resulting ELF binary (malloc()ed, caller frees) in *pelf_buffer and its
 * size in *pelf_size. Returns false when no optimizer was supplied. */
bool ac_compile_module_to_elf(ac_backend_optimizer *beo, LLVMModuleRef module,
char **pelf_buffer, size_t *pelf_size)
{
   if (beo == nullptr)
      return false;

   beo->run(*unwrap(module), *pelf_buffer, *pelf_size);
   return true;
}
/* Emit an atomicrmw instruction on `ptr` with operand `val`.
 * The ordering is always sequentially consistent, in the named sync scope.
 * The C API enum is translated to the C++ AtomicRMWInst::BinOp because the
 * C API doesn't cover everything we need (e.g. custom sync scopes). */
LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op,
LLVMValueRef ptr, LLVMValueRef val, const char *sync_scope)
{
   AtomicRMWInst::BinOp kind;

   switch (op) {
   case LLVMAtomicRMWBinOpXchg: kind = AtomicRMWInst::Xchg; break;
   case LLVMAtomicRMWBinOpAdd:  kind = AtomicRMWInst::Add;  break;
   case LLVMAtomicRMWBinOpSub:  kind = AtomicRMWInst::Sub;  break;
   case LLVMAtomicRMWBinOpAnd:  kind = AtomicRMWInst::And;  break;
   case LLVMAtomicRMWBinOpNand: kind = AtomicRMWInst::Nand; break;
   case LLVMAtomicRMWBinOpOr:   kind = AtomicRMWInst::Or;   break;
   case LLVMAtomicRMWBinOpXor:  kind = AtomicRMWInst::Xor;  break;
   case LLVMAtomicRMWBinOpMax:  kind = AtomicRMWInst::Max;  break;
   case LLVMAtomicRMWBinOpMin:  kind = AtomicRMWInst::Min;  break;
   case LLVMAtomicRMWBinOpUMax: kind = AtomicRMWInst::UMax; break;
   case LLVMAtomicRMWBinOpUMin: kind = AtomicRMWInst::UMin; break;
   case LLVMAtomicRMWBinOpFAdd: kind = AtomicRMWInst::FAdd; break;
   default:
      UNREACHABLE("invalid LLVMAtomicRMWBinOp");
      break;
   }

   unsigned scope_id = unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope);
   Value *inst = unwrap(ctx->builder)
                    ->CreateAtomicRMW(kind, unwrap(ptr), unwrap(val), MaybeAlign(0),
                                      AtomicOrdering::SequentiallyConsistent, scope_id);
   return wrap(inst);
}
/* Emit a cmpxchg instruction on `ptr` comparing against `cmp` and storing
 * `val`. Both the success and failure orderings are sequentially consistent,
 * in the named sync scope. */
LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr,
LLVMValueRef cmp, LLVMValueRef val, const char *sync_scope)
{
   unsigned scope_id = unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope);
   Value *inst = unwrap(ctx->builder)
                    ->CreateAtomicCmpXchg(unwrap(ptr), unwrap(cmp), unwrap(val),
                                          MaybeAlign(0),
                                          AtomicOrdering::SequentiallyConsistent,
                                          AtomicOrdering::SequentiallyConsistent, scope_id);
   return wrap(inst);
}