//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to the local address space.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <tuple>
#include <utility>
#include <vector>

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

static cl::opt<bool> DisablePromoteAllocaToVector(
  "disable-promote-alloca-to-vector",
  cl::desc("Disable promote alloca to vector"),
  cl::init(false));

static cl::opt<bool> DisablePromoteAllocaToLDS(
  "disable-promote-alloca-to-lds",
  cl::desc("Disable promote alloca to LDS"),
  cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
  "amdgpu-promote-alloca-to-vector-limit",
  cl::desc("Maximum byte size to consider promote alloca to vector"),
  cl::init(0));

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
  const TargetMachine *TM;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
  unsigned MaxVGPRs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// BaseAlloca is the alloca root the search started from.
  /// Val may be that alloca or a recursive user of it.
  bool collectUsesWithPtrTypes(Value *BaseAlloca,
                               Value *Val,
                               std::vector<Value*> &WorkList) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst,
                                       int OpIdx0, int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  bool handleAlloca(AllocaInst &I, bool SufficientLDS);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};

class AMDGPUPromoteAllocaToVector : public FunctionPass {
private:
  unsigned MaxVGPRs;

public:
  static char ID;

  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "AMDGPU Promote Alloca to vector";
  }

  bool handleAlloca(AllocaInst &I);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;

INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                "AMDGPU promote alloca to vector or LDS", false, false)

INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                "AMDGPU promote alloca to vector", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;

bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();

  return false;
}

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
    TM = &TPC->getTM<TargetMachine>();
  else
    return false;

  const Triple &TT = TM->getTargetTriple();
  IsAMDGCN = TT.getArch() == Triple::amdgcn;
  IsAMDHSA = TT.getOS() == Triple::AMDHSA;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  if (IsAMDGCN) {
    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
  } else {
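    // Non-GCN (r600) targets have no VGPR budget to query; use a fixed
    // default so the vectorization size limit still applies.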
    MaxVGPRs = 128;
  }

  bool SufficientLDS = hasSufficientLocalMem(F);
  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();

  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : EntryBB) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  }

  for (AllocaInst *AI : Allocas) {
    if (handleAlloca(*AI, SufficientLDS))
      Changed = true;
  }

  return Changed;
}

std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
  const Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);

  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));

  MDNode *MD = MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}

Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
  const AMDGPUSubtarget &ST =
      AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    break;
  case 1:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    break;

  case 2:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  ST.makeLIDRangeMetadata(CI);

  return CI;
}

static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
  return FixedVectorType::get(ArrayTy->getElementType(),
                              ArrayTy->getNumElements());
}

static Value *stripBitcasts(Value *V) {
  while (Instruction *I = dyn_cast<Instruction>(V)) {
    if (I->getOpcode() != Instruction::BitCast)
      break;
    V = I->getOperand(0);
  }
  return V;
}

static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
  if (!GEP)
    return nullptr;

  auto I = GEPIdx.find(GEP);
  return I == GEPIdx.end() ? nullptr : I->second;
}

static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME we only support simple cases
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}
// Returns true if Inst is an instruction that the rewriting below knows how
// to turn into a vector operation.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User,
                             const DataLayout &DL) {
  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    // Currently we only handle the case where the pointer operand is a GEP.
    // Volatile and atomic loads cannot be vectorized.
    LoadInst *LI = cast<LoadInst>(Inst);
    if (isa<AllocaInst>(User) &&
        LI->getPointerOperandType() == User->getType() &&
        isa<VectorType>(LI->getType()))
      return true;

    Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
    if (!PtrInst)
      return false;

    return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
            PtrInst->getOpcode() == Instruction::BitCast) &&
           LI->isSimple();
  }
  case Instruction::BitCast:
    return true;
  case Instruction::Store: {
    // The use must be the stored pointer operand, not a stored value, and
    // since the IR should be in canonical form, the User should be a GEP.
    // Volatile and atomic stores cannot be vectorized.
    StoreInst *SI = cast<StoreInst>(Inst);
    if (isa<AllocaInst>(User) &&
        SI->getPointerOperandType() == User->getType() &&
        isa<VectorType>(SI->getValueOperand()->getType()))
      return true;

    Instruction *UserInst = dyn_cast<Instruction>(User);
    if (!UserInst)
      return false;

    return (SI->getPointerOperand() == User) &&
           (UserInst->getOpcode() == Instruction::GetElementPtr ||
            UserInst->getOpcode() == Instruction::BitCast) &&
           SI->isSimple();
  }
  default:
    return false;
  }
}

static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                     unsigned MaxVGPRs) {

  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promotion of alloca to vector is disabled\n");
    return false;
  }

  Type *AllocaTy = Alloca->getAllocatedType();
  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
        ArrayTy->getNumElements() > 0)
      VectorTy = arrayTypeToVecType(ArrayTy);
  }

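  // The limit below is expressed in bits: each VGPR holds 32 bits, and the
  // amdgpu-promote-alloca-to-vector-limit option is given in bytes.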
  // Use up to 1/4 of available register budget for vectorization.
  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                              : (MaxVGPRs * 32);

  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
                      << MaxVGPRs << " registers available\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  // FIXME: We also reject allocas of the form [ 2 x [ 2 x i32 ]] or
  // equivalent. Potentially these could also be promoted, but we don't
  // currently handle this case.
  if (!VectorTy || VectorTy->getNumElements() > 16 ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value *> WorkList;
  SmallVector<User *, 8> Users(Alloca->users());
  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
  Type *VecEltTy = VectorTy->getElementType();
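  // Walk the alloca's users, looking through bitcasts. Every GEP must have a
  // zero leading index so that its second index can act as the vector lane;
  // all other users must pass canVectorizeInst and are queued for rewriting
  // below.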
  while (!Users.empty()) {
    User *AllocaUser = Users.pop_back_val();
    User *UseUser = UseUsers.pop_back_val();
    Instruction *Inst = dyn_cast<Instruction>(AllocaUser);

    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(Inst, UseUser, DL))
        return false;

      if (Inst->getOpcode() == Instruction::BitCast) {
        Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
        Type *ToTy = Inst->getType()->getPointerElementType();
        if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
            DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
          continue;

        for (User *CastUser : Inst->users()) {
          if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
            continue;
          Users.push_back(CastUser);
          UseUsers.push_back(Inst);
        }

        continue;
      }

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
                        << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    Users.append(GEP->user_begin(), GEP->user_end());
    UseUsers.append(GEP->getNumUses(), GEP);
  }

  LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                    << *VectorTy << '\n');

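  // Rewrite each queued access: a load becomes a whole-vector load of the
  // alloca followed by an extractelement at the computed lane, and a store
  // becomes a load/insertelement/store of the whole vector.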
  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
        break;

      Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      if (!Index)
        break;

      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      if (Inst->getType() != VecEltTy)
        ExtractElement =
            Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(Inst);
      if (SI->getValueOperand()->getType() == AllocaTy ||
          SI->getValueOperand()->getType()->isVectorTy())
        break;

      Value *Ptr = SI->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      if (!Index)
        break;

      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *Elt = SI->getValueOperand();
      if (Elt->getType() != VecEltTy)
        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }

    default:
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}

bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
                                                          Value *Val,
                                                          Instruction *Inst,
                                                          int OpIdx0,
                                                          int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (Val == OtherOp)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull>(OtherOp))
    return true;

  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}

bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
  Value *BaseAlloca,
  Value *Val,
  std::vector<Value*> &WorkList) const {

  for (User *User : Val->users()) {
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;

      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the stored value is not the pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
    }

    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
      // Give up if the pointer may be captured.
      if (PointerMayBeCaptured(UseInst, true, true))
        return false;
      // Don't collect the users of this.
      WorkList.push_back(User);
      continue;
    }

    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    // Only promote a select if we know that the other select operand is from
    // another pointer that will also be promoted.
    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    }

    // Repeat for phis.
    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // TODO: Handle more complex cases. We should be able to replace loops
      // over arrays.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    }

    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}

bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  const DataLayout &DL = Mod->getDataLayout();

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;
  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    for (const User *U : GV.users()) {
      const Instruction *Use = dyn_cast<Instruction>(U);
      if (!Use)
        continue;

      if (Use->getParent()->getParent() == &F) {
        Align Alignment =
            DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());

        // FIXME: Try to account for padding here. The padding is currently
        // determined from the inverse order of uses in the function. I'm not
        // sure if the use list order is in any way connected to this, so the
        // total reported size is likely incorrect.
        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
        CurrentLocalMemUsage += AllocSize;
        break;
      }
    }
  }

  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
                                                          F);

  // Restrict local memory usage so that we don't drastically reduce occupancy,
  // unless it is already significantly reduced.

  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors.
  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
  if (OccupancyHint == 0)
    OccupancyHint = 7;

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount
    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program is possibly broken by using more local mem than available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
    return true; // Promoted to vector.

  if (DisablePromoteAllocaToLDS)
    return false;

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
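  // Reserve one copy of the alloca per workitem: the promoted LDS array is
  // sized for the maximum flat workgroup size so it is large enough for any
  // launch of this kernel.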
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // we could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value*> WorkList;

  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage,
      UndefValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(),
      nullptr,
      GlobalVariable::NotThreadLocal,
      AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(MaybeAlign(I.getAlignment()));

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

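  // Linearize the workitem id: TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ +
  // TIdZ, so each workitem addresses its own element of the per-workgroup
  // LDS array created above.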
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  Value *Indices[] = {
    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    TID
  };

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *Src0 = CI->getOperand(0);
        Type *EltTy = Src0->getType()->getPointerElementType();
        PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want
      // to touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(),
                           MemCpy->getRawSource(), MemCpy->getSourceAlign(),
                           MemCpy->getLength(), MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memmove: {
      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(),
                            MemMove->getRawSource(), MemMove->getSourceAlign(),
                            MemMove->getLength(), MemMove->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(
          MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
          MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      Type *SrcTy = Src->getType()->getPointerElementType();
      Function *ObjectSize = Intrinsic::getDeclaration(Mod,
        Intrinsic::objectsize,
        { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
      );

      CallInst *NewCall = Builder.CreateCall(
          ObjectSize,
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
  return true;
}

bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
  if (skipFunction(F) || DisablePromoteAllocaToVector)
    return false;

  const TargetMachine *TM;
  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
    TM = &TPC->getTM<TargetMachine>();
  else
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
  } else {
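    // Non-GCN (r600) targets have no VGPR budget to query; use a fixed
    // default so the vectorization size limit still applies.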
    MaxVGPRs = 128;
  }

  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();

  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : EntryBB) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  }

  for (AllocaInst *AI : Allocas) {
    if (handleAlloca(*AI))
      Changed = true;
  }

  return Changed;
}

bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return false;

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  Module *Mod = I.getParent()->getParent()->getParent();
  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
}

FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}

FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
  return new AMDGPUPromoteAllocaToVector();
}