//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
// Arguments to kernel and device functions are passed via param space,
// which imposes certain restrictions:
// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
//
// Kernel parameters are read-only and accessible only via ld.param
// instruction, directly or via a pointer. Pointers to kernel
// arguments can't be converted to generic address space.
//
// Device function parameters are directly accessible via
// ld.param/st.param, but taking the address of one returns a pointer
// to a copy created in local space which *can't* be used with
// ld.param/st.param.
//
// Copying a byval struct into local memory in IR allows us to enforce
// the param space restrictions, gives the rest of IR a pointer w/o
// param space restrictions, and gives us an opportunity to eliminate
// the copy.
//
// Pointer arguments to kernel functions need more work to be lowered:
//
// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
//    global address space. This allows later optimizations to emit
//    ld.global.*/st.global.* for accessing these pointer arguments. For
//    example,
//
//    define void @foo(float* %input) {
//      %v = load float, float* %input, align 4
//      ...
//    }
//
//    becomes
//
//    define void @foo(float* %input) {
//      %input2 = addrspacecast float* %input to float addrspace(1)*
//      %input3 = addrspacecast float addrspace(1)* %input2 to float*
//      %v = load float, float* %input3, align 4
//      ...
//    }
//
//    Later, NVPTXInferAddressSpaces will optimize it to
//
//    define void @foo(float* %input) {
//      %input2 = addrspacecast float* %input to float addrspace(1)*
//      %v = load float, float addrspace(1)* %input2, align 4
//      ...
//    }
//
// 2. Convert pointers in a byval kernel parameter to pointers in the global
//    address space. Like #1, it allows NVPTX to emit more ld/st.global. E.g.,
//
//    struct S {
//      int *x;
//      int *y;
//    };
//    __global__ void foo(S s) {
//      int *b = s.y;
//      // use b
//    }
//
//    "b" points to the global address space. In the IR level,
//
//    define void @foo({i32*, i32*}* byval %input) {
//      %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
//      %b = load i32*, i32** %b_ptr
//      ; use %b
//    }
//
//    becomes
//
//    define void @foo({i32*, i32*}* byval %input) {
//      %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
//      %b = load i32*, i32** %b_ptr
//      %b_global = addrspacecast i32* %b to i32 addrspace(1)*
//      %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32*
//      ; use %b_generic
//    }
//
// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
// cancel the addrspacecast pair this pass emits.
//===----------------------------------------------------------------------===//
90311116Sdim
91311116Sdim#include "NVPTX.h"
92321369Sdim#include "NVPTXTargetMachine.h"
93311116Sdim#include "NVPTXUtilities.h"
94353358Sdim#include "MCTargetDesc/NVPTXBaseInfo.h"
95311116Sdim#include "llvm/Analysis/ValueTracking.h"
96311116Sdim#include "llvm/IR/Function.h"
97311116Sdim#include "llvm/IR/Instructions.h"
98311116Sdim#include "llvm/IR/Module.h"
99311116Sdim#include "llvm/IR/Type.h"
100311116Sdim#include "llvm/Pass.h"
101311116Sdim
102311116Sdimusing namespace llvm;
103311116Sdim
104311116Sdimnamespace llvm {
105311116Sdimvoid initializeNVPTXLowerArgsPass(PassRegistry &);
106311116Sdim}
107311116Sdim
namespace {
// Legacy-PM function pass that lowers kernel/device function arguments as
// described in the file header: byval aggregates are copied into a local
// alloca, and (for CUDA kernels) pointer arguments are routed through a
// global<->generic addrspacecast pair.
class NVPTXLowerArgs : public FunctionPass {
  bool runOnFunction(Function &F) override;

  // Kernels and device functions need different lowering; runOnFunction
  // dispatches to the appropriate one.
  bool runOnKernelFunction(Function &F);
  bool runOnDeviceFunction(Function &F);

  // handle byval parameters
  void handleByValParam(Argument *Arg);
  // Knowing Ptr must point to the global address space, this function
  // addrspacecasts Ptr to global and then back to generic. This allows
  // NVPTXInferAddressSpaces to fold the global-to-generic cast into
  // loads/stores that appear later.
  void markPointerAsGlobal(Value *Ptr);

public:
  static char ID; // Pass identification, replacement for typeid
  NVPTXLowerArgs(const NVPTXTargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}
  StringRef getPassName() const override {
    return "Lower pointer arguments of CUDA kernels";
  }

private:
  // Used to query the driver interface (CUDA vs. not). May be null, in which
  // case the CUDA-only pointer-argument lowering is skipped.
  const NVPTXTargetMachine *TM;
};
} // namespace
135311116Sdim
// Static pass ID; its address identifies the pass to the pass manager.
char NVPTXLowerArgs::ID = 1;

// Register the pass with the legacy pass registry under "nvptx-lower-args".
INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
                "Lower arguments (NVPTX)", false, false)
140311116Sdim
141311116Sdim// =============================================================================
142311116Sdim// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
143311116Sdim// then add the following instructions to the first basic block:
144311116Sdim//
145311116Sdim// %temp = alloca %struct.x, align 8
146311116Sdim// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
147311116Sdim// %tv = load %struct.x addrspace(101)* %tempd
148311116Sdim// store %struct.x %tv, %struct.x* %temp, align 8
149311116Sdim//
150311116Sdim// The above code allocates some space in the stack and copies the incoming
151311116Sdim// struct from param space to local space.
152311116Sdim// Then replace all occurrences of %d by %temp.
153311116Sdim// =============================================================================
154311116Sdimvoid NVPTXLowerArgs::handleByValParam(Argument *Arg) {
155311116Sdim  Function *Func = Arg->getParent();
156311116Sdim  Instruction *FirstInst = &(Func->getEntryBlock().front());
157311116Sdim  PointerType *PType = dyn_cast<PointerType>(Arg->getType());
158311116Sdim
159311116Sdim  assert(PType && "Expecting pointer type in handleByValParam");
160311116Sdim
161311116Sdim  Type *StructType = PType->getElementType();
162321369Sdim  unsigned AS = Func->getParent()->getDataLayout().getAllocaAddrSpace();
163321369Sdim  AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
164311116Sdim  // Set the alignment to alignment of the byval parameter. This is because,
165311116Sdim  // later load/stores assume that alignment, and we are going to replace
166311116Sdim  // the use of the byval parameter with this alloca instruction.
167360784Sdim  AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo())));
168311116Sdim  Arg->replaceAllUsesWith(AllocA);
169311116Sdim
170311116Sdim  Value *ArgInParam = new AddrSpaceCastInst(
171311116Sdim      Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
172311116Sdim      FirstInst);
173353358Sdim  LoadInst *LI =
174353358Sdim      new LoadInst(StructType, ArgInParam, Arg->getName(), FirstInst);
175311116Sdim  new StoreInst(LI, AllocA, FirstInst);
176311116Sdim}
177311116Sdim
178311116Sdimvoid NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
179311116Sdim  if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
180311116Sdim    return;
181311116Sdim
182311116Sdim  // Deciding where to emit the addrspacecast pair.
183311116Sdim  BasicBlock::iterator InsertPt;
184311116Sdim  if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
185311116Sdim    // Insert at the functon entry if Ptr is an argument.
186311116Sdim    InsertPt = Arg->getParent()->getEntryBlock().begin();
187311116Sdim  } else {
188311116Sdim    // Insert right after Ptr if Ptr is an instruction.
189311116Sdim    InsertPt = ++cast<Instruction>(Ptr)->getIterator();
190311116Sdim    assert(InsertPt != InsertPt->getParent()->end() &&
191311116Sdim           "We don't call this function with Ptr being a terminator.");
192311116Sdim  }
193311116Sdim
194311116Sdim  Instruction *PtrInGlobal = new AddrSpaceCastInst(
195311116Sdim      Ptr, PointerType::get(Ptr->getType()->getPointerElementType(),
196311116Sdim                            ADDRESS_SPACE_GLOBAL),
197311116Sdim      Ptr->getName(), &*InsertPt);
198311116Sdim  Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
199311116Sdim                                              Ptr->getName(), &*InsertPt);
200311116Sdim  // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
201311116Sdim  Ptr->replaceAllUsesWith(PtrInGeneric);
202311116Sdim  PtrInGlobal->setOperand(0, Ptr);
203311116Sdim}
204311116Sdim
205311116Sdim// =============================================================================
206311116Sdim// Main function for this pass.
207311116Sdim// =============================================================================
208311116Sdimbool NVPTXLowerArgs::runOnKernelFunction(Function &F) {
209311116Sdim  if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
210311116Sdim    // Mark pointers in byval structs as global.
211311116Sdim    for (auto &B : F) {
212311116Sdim      for (auto &I : B) {
213311116Sdim        if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
214311116Sdim          if (LI->getType()->isPointerTy()) {
215311116Sdim            Value *UO = GetUnderlyingObject(LI->getPointerOperand(),
216311116Sdim                                            F.getParent()->getDataLayout());
217311116Sdim            if (Argument *Arg = dyn_cast<Argument>(UO)) {
218311116Sdim              if (Arg->hasByValAttr()) {
219311116Sdim                // LI is a load from a pointer within a byval kernel parameter.
220311116Sdim                markPointerAsGlobal(LI);
221311116Sdim              }
222311116Sdim            }
223311116Sdim          }
224311116Sdim        }
225311116Sdim      }
226311116Sdim    }
227311116Sdim  }
228311116Sdim
229311116Sdim  for (Argument &Arg : F.args()) {
230311116Sdim    if (Arg.getType()->isPointerTy()) {
231311116Sdim      if (Arg.hasByValAttr())
232311116Sdim        handleByValParam(&Arg);
233311116Sdim      else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
234311116Sdim        markPointerAsGlobal(&Arg);
235311116Sdim    }
236311116Sdim  }
237311116Sdim  return true;
238311116Sdim}
239311116Sdim
240311116Sdim// Device functions only need to copy byval args into local memory.
241311116Sdimbool NVPTXLowerArgs::runOnDeviceFunction(Function &F) {
242311116Sdim  for (Argument &Arg : F.args())
243311116Sdim    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
244311116Sdim      handleByValParam(&Arg);
245311116Sdim  return true;
246311116Sdim}
247311116Sdim
248311116Sdimbool NVPTXLowerArgs::runOnFunction(Function &F) {
249311116Sdim  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
250311116Sdim}
251311116Sdim
252311116SdimFunctionPass *
253311116Sdimllvm::createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM) {
254311116Sdim  return new NVPTXLowerArgs(TM);
255311116Sdim}
256