//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates local data store, LDS, uses from non-kernel functions.
// LDS is contiguous memory allocated per kernel execution.
//
// Background.
//
// The programming model is global variables, or equivalently function local
// static variables, accessible from kernels or other functions. For uses from
// kernels this is straightforward - assign an integer to the kernel for the
// memory required by all the variables combined, allocate them within that.
// For uses from functions there are performance tradeoffs to choose between.
//
// This model means the GPU runtime can specify the amount of memory allocated.
// If this is more than the kernel assumed, the excess can be made available
// using a language specific feature, which IR represents as a variable with
// no initializer. This feature is not yet implemented for non-kernel functions.
// This lowering could be extended to handle that use case, but would probably
// require closer integration with promoteAllocaToLDS.
//
// Consequences of this GPU feature:
// - memory is limited and exceeding it halts compilation
// - a global accessed by one kernel exists independent of other kernels
// - a global exists independent of simultaneous execution of the same kernel
// - the address of the global may be different from different kernels as they
//   do not alias, which permits only allocating variables they use
// - if the address is allowed to differ, functions need help to find it
//
// Uses from kernels are implemented here by grouping them in a per-kernel
// struct instance. This duplicates the variables, accurately modelling their
// aliasing properties relative to a single global representation. It also
// permits control over alignment via padding.
//
// Uses from functions are more complicated and the primary purpose of this
// IR pass. Several different lowerings are chosen between to meet requirements
// to avoid allocating any LDS where it is not necessary, as that impacts
// occupancy and may cause compilation to fail, while not imposing overhead on
// a feature whose primary advantage over global memory is performance. The
// basic design goal is to avoid one kernel imposing overhead on another.
//
// Implementation.
//
// LDS variables with constant annotation or non-undef initializer are passed
// through unchanged for simplification or error diagnostics in later passes.
// Non-undef initializers are not yet implemented for LDS.
//
// LDS variables that are always allocated at the same address can be found
// by lookup at that address. Otherwise runtime information/cost is required.
//
// The simplest strategy possible is to group all LDS variables in a single
// struct and allocate that struct in every kernel such that the original
// variables are always at the same address. LDS is however a limited resource
// so this strategy is unusable in practice. It is not implemented here.
//
// Strategy | Precise allocation | Zero runtime cost | General purpose |
//  --------+--------------------+-------------------+-----------------+
//   Module |                 No |               Yes |             Yes |
//    Table |                Yes |                No |             Yes |
//   Kernel |                Yes |               Yes |              No |
//   Hybrid |                Yes |           Partial |             Yes |
//
// Module spends LDS memory to save cycles. Table spends cycles and global
// memory to save LDS. Kernel is as fast as kernel allocation but only works
// for variables that are known reachable from a single kernel. Hybrid picks
// between all three. When forced to choose between LDS and cycles it minimises
// LDS use.
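//
// As a hypothetical illustration of the hybrid strategy: given @A used
// indirectly by most kernels, @B used indirectly by exactly one kernel and @C
// used indirectly by a different subset of kernels, @A is placed in the module
// struct (free for the first variable), @B is lowered with the kernel strategy
// and @C falls back to the table lookup.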
//
// The "module" lowering implemented here finds LDS variables which are used by
// non-kernel functions and creates a new struct with a field for each of those
// LDS variables. Variables that are only used from kernels are excluded.
// Kernels that do not use this struct are annotated with the attribute
// amdgpu-elide-module-lds which allows the back end to elide the allocation.
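//
// For example (a sketch, variable names are illustrative), function-accessed
// variables @x and @y are collected into a single struct roughly of the form:
//   %llvm.amdgcn.module.lds.t = type { <tyX>, <tyY> }
//   @llvm.amdgcn.module.lds = internal addrspace(3) global
//       %llvm.amdgcn.module.lds.t undef, align <A>
// with their uses rewritten to constant GEPs into that instance.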
//
// The "table" lowering implemented here has three components.
// First, kernels are assigned a unique integer identifier which is available
// in functions they call through the intrinsic amdgcn_lds_kernel_id. The
// integer is passed through a specific SGPR and thus works with indirect
// calls.
// Second, each kernel allocates LDS variables independent of other kernels and
// writes the addresses it chose for each variable into an array in consistent
// order. If the kernel does not allocate a given variable, it writes poison to
// the corresponding array location. These arrays are written to a constant
// table in the order matching the kernel unique integer identifier.
// Third, uses from non-kernel functions are replaced with a table lookup using
// the intrinsic function to find the address of the variable.
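//
// As a rough sketch, for two kernels (k0, k1) and two table lowered variables
// (v0, v1) the table is:
//   @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant
//       [2 x [2 x i32]] [[2 x i32] [i32 <&v0 in k0>, i32 <&v1 in k0>],
//                        [2 x i32] [i32 <&v0 in k1>, i32 <&v1 in k1>]]
// where each entry is the 32 bit address of the variable within that kernel's
// LDS allocation, or poison if that kernel does not allocate it.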
//
// "Kernel" lowering is only applicable for variables that are unambiguously
// reachable from exactly one kernel. For those cases, accesses to the variable
// can be lowered to a ConstantExpr address of a struct instance specific to
// that one kernel. This is zero cost in space and in compute. It will raise a
// fatal error on any variable that might be reachable from multiple kernels
// and is thus most easily used as part of the hybrid lowering strategy.
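//
// For instance (illustrative names), a use of @only_used_via_k0 from a
// function only reachable from kernel k0 becomes roughly:
//   getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t,
//       ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 <field>)
// folded as a constant expression, so no instructions are emitted.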
//
// Hybrid lowering is a mixture of the above. It uses the zero cost kernel
// lowering where it can. It lowers the variable accessed by the greatest
// number of kernels using the module strategy as that is free for the first
// variable. Any further variables that can be lowered with the module strategy
// without incurring LDS memory overhead are. The remaining ones are lowered
// via table.
//
// Consequences
// - No heuristics or user controlled magic numbers, hybrid is the right choice
// - Kernels that don't use functions (or have had them all inlined) are not
//   affected by any lowering for kernels that do.
// - Kernels that don't make indirect function calls are not affected by those
//   that do.
// - Variables which are used by lots of kernels, e.g. those injected by a
//   language runtime in most kernels, are expected to have no overhead
// - Implementations that instantiate templates per-kernel where those templates
//   use LDS are expected to hit the "Kernel" lowering strategy
// - The runtime properties impose a cost in compiler implementation complexity
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

#include <tuple>
#include <vector>

#include <cstdio>

#define DEBUG_TYPE "amdgpu-lower-module-lds"

using namespace llvm;

namespace {

cl::opt<bool> SuperAlignLDSGlobals(
    "amdgpu-super-align-lds-globals",
    cl::desc("Increase alignment of LDS if it is not on align boundary"),
    cl::init(true), cl::Hidden);

enum class LoweringKind { module, table, kernel, hybrid };
cl::opt<LoweringKind> LoweringKindLoc(
    "amdgpu-lower-module-lds-strategy",
    cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
    cl::init(LoweringKind::module),
    cl::values(
        clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
        clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
        clEnumValN(
            LoweringKind::kernel, "kernel",
            "Lower variables reachable from one kernel, otherwise abort"),
        clEnumValN(LoweringKind::hybrid, "hybrid",
                   "Lower via mixture of above strategies")));

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling conv, it instead calls into
  // isModuleEntryFunction which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}

class AMDGPULowerModuleLDS : public ModulePass {

  static void
  removeLocalVarsFromUsedLists(Module &M,
                               const DenseSet<GlobalVariable *> &LocalVars) {
    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
    SmallPtrSet<Constant *, 8> LocalVarsSet;
    for (GlobalVariable *LocalVar : LocalVars)
      LocalVarsSet.insert(cast<Constant>(LocalVar->stripPointerCasts()));

    removeFromUsedLists(
        M, [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(C); });

    for (GlobalVariable *LocalVar : LocalVars)
      LocalVar->removeDeadConstantUsers();
  }

  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
                               GlobalVariable *SGV) {
    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
    // that might call a function which accesses a field within it. This is
    // presently approximated to 'all kernels' if there are any such functions
    // in the module. This implicit use is redefined as an explicit use here so
    // that later passes, specifically PromoteAlloca, account for the required
    // memory without any knowledge of this transform.

    // An operand bundle on llvm.donothing works because the call instruction
    // survives until after the last pass that needs to account for LDS. It is
    // better than inline asm as the latter survives until the end of codegen. A
    // totally robust solution would be a function with the same semantics as
    // llvm.donothing that takes a pointer to the instance and is lowered to a
    // no-op after LDS is allocated, but that is not presently necessary.
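
    // The marker emitted below is roughly of the form (illustrative sketch):
    //   call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3)
    //                                               @llvm.amdgcn.module.lds) ]
    // i.e. an otherwise dead call whose operand bundle keeps the LDS instance
    // visibly used from the kernel until allocation has been accounted for.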

    LLVMContext &Ctx = Func->getContext();

    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());

    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});

    Function *Decl =
        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});

    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};

    Builder.CreateCall(FTy, Decl, {},
                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
                       "");
  }

  static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
    // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
    // global may have uses from multiple different functions as a result.
    // This pass specialises LDS variables with respect to the kernel that
    // allocates them.

    // This is semantically equivalent to:
    // for (auto &F : M.functions())
    //   for (auto &BB : F)
    //     for (auto &I : BB)
    //       for (Use &Op : I.operands())
    //         if (constantExprUsesLDS(Op))
    //           replaceConstantExprInFunction(I, Op);

    bool Changed = false;

    // Find all ConstantExpr that are direct users of an LDS global
    SmallVector<ConstantExpr *> Stack;
    for (auto &GV : M.globals())
      if (AMDGPU::isLDSVariableToLower(GV))
        for (User *U : GV.users())
          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
            Stack.push_back(C);

    // Expand to include constexpr users of direct users
    SetVector<ConstantExpr *> ConstExprUsersOfLDS;
    while (!Stack.empty()) {
      ConstantExpr *V = Stack.pop_back_val();
      if (ConstExprUsersOfLDS.contains(V))
        continue;

      ConstExprUsersOfLDS.insert(V);

      for (auto *Nested : V->users())
        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Nested))
          Stack.push_back(CE);
    }

    // Find all instructions that use any of the ConstExpr users of LDS
    SetVector<Instruction *> InstructionWorklist;
    for (ConstantExpr *CE : ConstExprUsersOfLDS)
      for (User *U : CE->users())
        if (auto *I = dyn_cast<Instruction>(U))
          InstructionWorklist.insert(I);

    // Replace those ConstExpr operands with instructions
    while (!InstructionWorklist.empty()) {
      Instruction *I = InstructionWorklist.pop_back_val();
      for (Use &U : I->operands()) {

        auto *BI = I;
        if (auto *Phi = dyn_cast<PHINode>(I)) {
          BasicBlock *BB = Phi->getIncomingBlock(U);
          BasicBlock::iterator It = BB->getFirstInsertionPt();
          assert(It != BB->end() && "Unexpected empty basic block");
          BI = &(*(It));
        }

        if (ConstantExpr *C = dyn_cast<ConstantExpr>(U.get())) {
          if (ConstExprUsersOfLDS.contains(C)) {
            Changed = true;
            Instruction *NI = C->getAsInstruction(BI);
            InstructionWorklist.insert(NI);
            U.set(NI);
            C->removeDeadConstantUsers();
          }
        }
      }
    }

    return Changed;
  }

public:
  static char ID;

  AMDGPULowerModuleLDS() : ModulePass(ID) {
    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
  }

  using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;

  using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;

  static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
                                     FunctionVariableMap &kernels,
                                     FunctionVariableMap &functions) {

    // Get uses from the current function, excluding uses by called functions
    // Two output variables to avoid walking the globals list twice
    for (auto &GV : M.globals()) {
      if (!AMDGPU::isLDSVariableToLower(GV)) {
        continue;
      }

      for (User *V : GV.users()) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          Function *F = I->getFunction();
          if (isKernelLDS(F)) {
            kernels[F].insert(&GV);
          } else {
            functions[F].insert(&GV);
          }
        }
      }
    }
  }

  struct LDSUsesInfoTy {
    FunctionVariableMap direct_access;
    FunctionVariableMap indirect_access;
  };

  static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {

    FunctionVariableMap direct_map_kernel;
    FunctionVariableMap direct_map_function;
    getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);

    // Collect variables that are used by functions whose address has escaped
    DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
    for (Function &F : M.functions()) {
      if (!isKernelLDS(&F) &&
          F.hasAddressTaken(nullptr,
                            /* IgnoreCallbackUses */ false,
                            /* IgnoreAssumeLikeCalls */ false,
                            /* IgnoreLLVMUsed */ true,
                            /* IgnoreArcAttachedCall */ false)) {
        set_union(VariablesReachableThroughFunctionPointer,
                  direct_map_function[&F]);
      }
    }

    auto functionMakesUnknownCall = [&](const Function *F) -> bool {
      assert(!F->isDeclaration());
      for (CallGraphNode::CallRecord R : *CG[F]) {
        if (!R.second->getFunction()) {
          return true;
        }
      }
      return false;
    };

    // Work out which variables are reachable through function calls
    FunctionVariableMap transitive_map_function = direct_map_function;

    // If the function makes any unknown call, assume the worst case that it can
    // access all variables accessed by functions whose address escaped
    for (Function &F : M.functions()) {
      if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
        if (!isKernelLDS(&F)) {
          set_union(transitive_map_function[&F],
                    VariablesReachableThroughFunctionPointer);
        }
      }
    }

    // Direct implementation of collecting all variables reachable from each
    // function
    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || isKernelLDS(&Func))
        continue;

      DenseSet<Function *> seen; // catches cycles
      SmallVector<Function *, 4> wip{&Func};

      while (!wip.empty()) {
        Function *F = wip.pop_back_val();

        // Can accelerate this by referring to transitive map for functions that
        // have already been computed, with more care than this
        set_union(transitive_map_function[&Func], direct_map_function[F]);

        for (CallGraphNode::CallRecord R : *CG[F]) {
          Function *ith = R.second->getFunction();
          if (ith) {
            if (!seen.contains(ith)) {
              seen.insert(ith);
              wip.push_back(ith);
            }
          }
        }
      }
    }

    // direct_map_kernel lists which variables are used by the kernel
    // find the variables which are used through a function call
    FunctionVariableMap indirect_map_kernel;

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      for (CallGraphNode::CallRecord R : *CG[&Func]) {
        Function *ith = R.second->getFunction();
        if (ith) {
          set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
        } else {
          set_union(indirect_map_kernel[&Func],
                    VariablesReachableThroughFunctionPointer);
        }
      }
    }

    return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
  }

  struct LDSVariableReplacement {
    GlobalVariable *SGV = nullptr;
    DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
  };

  // Each kernel has a map from an LDS global to the constantexpr GEP it was
  // remapped to within that kernel's struct instance. The lookup table built
  // from these maps is an array with one element per kernel recording where
  // the corresponding variable was remapped to.

  static Constant *getAddressesOfVariablesInKernel(
      LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
      DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
    // Create a ConstantArray containing the address of each Variable within the
    // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
    // does not allocate it
    // TODO: Drop the ptrtoint conversion

    Type *I32 = Type::getInt32Ty(Ctx);

    ArrayType *KernelOffsetsType = ArrayType::get(I32, Variables.size());

    SmallVector<Constant *> Elements;
    for (size_t i = 0; i < Variables.size(); i++) {
      GlobalVariable *GV = Variables[i];
      if (LDSVarsToConstantGEP.count(GV) != 0) {
        auto elt = ConstantExpr::getPtrToInt(LDSVarsToConstantGEP[GV], I32);
        Elements.push_back(elt);
      } else {
        Elements.push_back(PoisonValue::get(I32));
      }
    }
    return ConstantArray::get(KernelOffsetsType, Elements);
  }

  static GlobalVariable *buildLookupTable(
      Module &M, ArrayRef<GlobalVariable *> Variables,
      ArrayRef<Function *> kernels,
      DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) {
    if (Variables.empty()) {
      return nullptr;
    }
    LLVMContext &Ctx = M.getContext();

    const size_t NumberVariables = Variables.size();
    const size_t NumberKernels = kernels.size();

    ArrayType *KernelOffsetsType =
        ArrayType::get(Type::getInt32Ty(Ctx), NumberVariables);

    ArrayType *AllKernelsOffsetsType =
        ArrayType::get(KernelOffsetsType, NumberKernels);

    std::vector<Constant *> overallConstantExprElts(NumberKernels);
    for (size_t i = 0; i < NumberKernels; i++) {
      LDSVariableReplacement Replacement = KernelToReplacement[kernels[i]];
      overallConstantExprElts[i] = getAddressesOfVariablesInKernel(
          Ctx, Variables, Replacement.LDSVarsToConstantGEP);
    }

    Constant *init =
        ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);

    return new GlobalVariable(
        M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
        "llvm.amdgcn.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
        AMDGPUAS::CONSTANT_ADDRESS);
  }

  void replaceUsesInInstructionsWithTableLookup(
      Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
      GlobalVariable *LookupTable) {

    LLVMContext &Ctx = M.getContext();
    IRBuilder<> Builder(Ctx);
    Type *I32 = Type::getInt32Ty(Ctx);

    // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
    // lowers to a read from a live in register. Emit it once in the entry
    // block to spare deduplicating it later.
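    //
    // Each replaced use then becomes, roughly (sketch, for variable index N in
    // a K kernel by V variable table):
    //   %id = call i32 @llvm.amdgcn.lds.kernel.id()
    //   %ptr = getelementptr inbounds [K x [V x i32]],
    //              ptr addrspace(4) @llvm.amdgcn.lds.offset.table,
    //              i32 0, i32 %id, i32 N
    //   %offset = load i32, ptr addrspace(4) %ptr
    //   %replacement = inttoptr i32 %offset to ptr addrspace(3)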

    DenseMap<Function *, Value *> tableKernelIndexCache;
    auto getTableKernelIndex = [&](Function *F) -> Value * {
      if (tableKernelIndexCache.count(F) == 0) {
        LLVMContext &Ctx = M.getContext();
        FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
        Function *Decl =
            Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});

        BasicBlock::iterator it =
            F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
        Instruction &i = *it;
        Builder.SetInsertPoint(&i);

        tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
      }

      return tableKernelIndexCache[F];
    };

    for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
      auto *GV = ModuleScopeVariables[Index];

      for (Use &U : make_early_inc_range(GV->uses())) {
        auto *I = dyn_cast<Instruction>(U.getUser());
        if (!I)
          continue;

        Value *tableKernelIndex = getTableKernelIndex(I->getFunction());

        // So if the phi uses this value multiple times, what does this look
        // like?
        if (auto *Phi = dyn_cast<PHINode>(I)) {
          BasicBlock *BB = Phi->getIncomingBlock(U);
          Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
        } else {
          Builder.SetInsertPoint(I);
        }

        Value *GEPIdx[3] = {
            ConstantInt::get(I32, 0),
            tableKernelIndex,
            ConstantInt::get(I32, Index),
        };

        Value *Address = Builder.CreateInBoundsGEP(
            LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());

        Value *loaded = Builder.CreateLoad(I32, Address);

        Value *replacement =
            Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());

        U.set(replacement);
      }
    }
  }

  static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<GlobalVariable *> const &VariableSet) {

    DenseSet<Function *> KernelSet;

    if (VariableSet.empty())
      return KernelSet;

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;
      for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
        if (VariableSet.contains(GV)) {
          KernelSet.insert(&Func);
          break;
        }
      }
    }

    return KernelSet;
  }

  static GlobalVariable *
  chooseBestVariableForModuleStrategy(const DataLayout &DL,
                                      VariableFunctionMap &LDSVars) {
    // Find the global variable with the most indirect uses from kernels

    struct CandidateTy {
      GlobalVariable *GV = nullptr;
      size_t UserCount = 0;
      size_t Size = 0;

      CandidateTy() = default;

      CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize)
          : GV(GV), UserCount(UserCount), Size(AllocSize) {}

      bool operator<(const CandidateTy &Other) const {
        // Fewer users makes module scope variable less attractive
        if (UserCount < Other.UserCount) {
          return true;
        }
        if (UserCount > Other.UserCount) {
          return false;
        }

        // Bigger makes module scope variable less attractive
        if (Size < Other.Size) {
          return false;
        }

        if (Size > Other.Size) {
          return true;
        }

        // Arbitrary but consistent
        return GV->getName() < Other.GV->getName();
      }
    };

    CandidateTy MostUsed;

    for (auto &K : LDSVars) {
      GlobalVariable *GV = K.first;
      if (K.second.size() <= 1) {
        // A variable reachable by only one kernel is best lowered with kernel
        // strategy
        continue;
      }
      CandidateTy Candidate(
          GV, K.second.size(),
          DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
      if (MostUsed < Candidate)
        MostUsed = Candidate;
    }

    return MostUsed.GV;
  }

  bool runOnModule(Module &M) override {
    LLVMContext &Ctx = M.getContext();
    CallGraph CG = CallGraph(M);
    bool Changed = superAlignLDSGlobals(M);

    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);

    Changed = true; // todo: narrow this down

    // For each kernel, what variables does it access directly or through
    // callees
    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);

    // For each variable accessed through callees, which kernels access it
    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
    for (auto &K : LDSUsesInfo.indirect_access) {
      Function *F = K.first;
      assert(isKernelLDS(F));
      for (GlobalVariable *GV : K.second) {
        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
      }
    }

    // Partition variables into the different strategies
    DenseSet<GlobalVariable *> ModuleScopeVariables;
    DenseSet<GlobalVariable *> TableLookupVariables;
    DenseSet<GlobalVariable *> KernelAccessVariables;

    {
      GlobalVariable *HybridModuleRoot =
          LoweringKindLoc != LoweringKind::hybrid
              ? nullptr
              : chooseBestVariableForModuleStrategy(
                    M.getDataLayout(),
                    LDSToKernelsThatNeedToAccessItIndirectly);

      DenseSet<Function *> const EmptySet;
      DenseSet<Function *> const &HybridModuleRootKernels =
          HybridModuleRoot
              ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
              : EmptySet;

      for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
        // Each iteration of this loop assigns exactly one global variable to
        // exactly one of the implementation strategies.

        GlobalVariable *GV = K.first;
        assert(AMDGPU::isLDSVariableToLower(*GV));
        assert(K.second.size() != 0);

        switch (LoweringKindLoc) {
        case LoweringKind::module:
          ModuleScopeVariables.insert(GV);
          break;

        case LoweringKind::table:
          TableLookupVariables.insert(GV);
          break;

        case LoweringKind::kernel:
          if (K.second.size() == 1) {
            KernelAccessVariables.insert(GV);
          } else {
            report_fatal_error(
                "cannot lower LDS '" + GV->getName() +
                "' to kernel access as it is reachable from multiple kernels");
          }
          break;

        case LoweringKind::hybrid: {
          if (GV == HybridModuleRoot) {
            assert(K.second.size() != 1);
            ModuleScopeVariables.insert(GV);
          } else if (K.second.size() == 1) {
            KernelAccessVariables.insert(GV);
          } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
            ModuleScopeVariables.insert(GV);
          } else {
            TableLookupVariables.insert(GV);
          }
          break;
        }
        }
      }

      assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
                 KernelAccessVariables.size() ==
             LDSToKernelsThatNeedToAccessItIndirectly.size());
    } // Variables have now been partitioned into the three lowering strategies.

    // If the kernel accesses a variable that is going to be stored in the
    // module instance through a call then that kernel needs to allocate the
    // module instance
    DenseSet<Function *> KernelsThatAllocateModuleLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        ModuleScopeVariables);
    DenseSet<Function *> KernelsThatAllocateTableLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        TableLookupVariables);

    if (!ModuleScopeVariables.empty()) {
      LDSVariableReplacement ModuleScopeReplacement =
          createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
                                       ModuleScopeVariables);

      appendToCompilerUsed(M,
                           {static_cast<GlobalValue *>(
                               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                                   cast<Constant>(ModuleScopeReplacement.SGV),
                                   Type::getInt8PtrTy(Ctx)))});

      // historic
      removeLocalVarsFromUsedLists(M, ModuleScopeVariables);

      // Replace all uses of module scope variable from non-kernel functions
      replaceLDSVariablesWithStruct(
          M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
            Instruction *I = dyn_cast<Instruction>(U.getUser());
            if (!I) {
              return false;
            }
            Function *F = I->getFunction();
            return !isKernelLDS(F);
          });

      // Replace uses of module scope variable from kernel functions that
      // allocate the module scope variable, otherwise leave them unchanged
      // Record on each kernel whether the module scope global is used by it

      LLVMContext &Ctx = M.getContext();
      IRBuilder<> Builder(Ctx);

      for (Function &Func : M.functions()) {
        if (Func.isDeclaration() || !isKernelLDS(&Func))
          continue;

        if (KernelsThatAllocateModuleLDS.contains(&Func)) {
          replaceLDSVariablesWithStruct(
              M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
                Instruction *I = dyn_cast<Instruction>(U.getUser());
                if (!I) {
                  return false;
                }
                Function *F = I->getFunction();
                return F == &Func;
              });

          markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);

        } else {
          Func.addFnAttr("amdgpu-elide-module-lds");
        }
      }
    }

    // Create a struct for each kernel for the non-module-scope variables
    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      DenseSet<GlobalVariable *> KernelUsedVariables;
      for (auto &v : LDSUsesInfo.direct_access[&Func]) {
        KernelUsedVariables.insert(v);
      }
      for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
        KernelUsedVariables.insert(v);
      }

      // Variables allocated in module lds must all resolve to that struct,
      // not to the per-kernel instance.
      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
        for (GlobalVariable *v : ModuleScopeVariables) {
          KernelUsedVariables.erase(v);
        }
      }

      if (KernelUsedVariables.empty()) {
        // Either used no LDS, or all the LDS it used was also in module
        continue;
      }

      // The association between kernel function and LDS struct is done by
      // symbol name, which only works if the function in question has a name.
      // This is not expected to be a problem in practice as kernels are called
      // by name, making anonymous ones (which are named by the backend)
      // difficult to use. This does mean that llvm test cases need to name the
      // kernels.
      if (!Func.hasName()) {
        report_fatal_error("Anonymous kernels cannot use LDS variables");
      }

      std::string VarName =
          (Twine("llvm.amdgcn.kernel.") + Func.getName() + ".lds").str();

      auto Replacement =
          createLDSVariableReplacement(M, VarName, KernelUsedVariables);

      // Removing from the used lists here preserves existing codegen
      removeLocalVarsFromUsedLists(M, KernelUsedVariables);
      KernelToReplacement[&Func] = Replacement;

      // Rewrite uses within kernel to the new struct
      replaceLDSVariablesWithStruct(
          M, KernelUsedVariables, Replacement, [&Func](Use &U) {
            Instruction *I = dyn_cast<Instruction>(U.getUser());
            return I && I->getFunction() == &Func;
          });
    }

    // Lower zero cost accesses to the kernel instances just created
    for (auto &GV : KernelAccessVariables) {
      auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV];
      assert(funcs.size() == 1); // Only one kernel can access it
      LDSVariableReplacement Replacement =
          KernelToReplacement[*(funcs.begin())];

      DenseSet<GlobalVariable *> Vec;
      Vec.insert(GV);

      replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) {
        return isa<Instruction>(U.getUser());
      });
    }

    if (!KernelsThatAllocateTableLDS.empty()) {
      // Collect the kernels that allocate table lookup LDS
      std::vector<Function *> OrderedKernels;
      {
        for (Function &Func : M.functions()) {
          if (Func.isDeclaration())
            continue;
          if (!isKernelLDS(&Func))
            continue;

          if (KernelsThatAllocateTableLDS.contains(&Func)) {
            assert(Func.hasName()); // else fatal error earlier
            OrderedKernels.push_back(&Func);
          }
        }

        // Put them in an arbitrary but reproducible order
        llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
                   [](const Function *lhs, const Function *rhs) -> bool {
                     return lhs->getName() < rhs->getName();
                   });

        // Annotate the kernels with their order in this vector
        LLVMContext &Ctx = M.getContext();
        IRBuilder<> Builder(Ctx);

        if (OrderedKernels.size() > UINT32_MAX) {
          // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
          report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
        }
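
        // After the loop below each selected kernel carries metadata roughly
        // of the form (sketch):
        //   define amdgpu_kernel void @k0() !llvm.amdgcn.lds.kernel.id !0 ...
        //   !0 = !{i32 0}
        // which later stages use to assign the kernel its table index.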

        for (size_t i = 0; i < OrderedKernels.size(); i++) {
          Metadata *AttrMDArgs[1] = {
              ConstantAsMetadata::get(Builder.getInt32(i)),
          };
          OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
                                         MDNode::get(Ctx, AttrMDArgs));

          markUsedByKernel(Builder, OrderedKernels[i],
                           KernelToReplacement[OrderedKernels[i]].SGV);
        }
      }

      // The order must be consistent between lookup table and accesses to
      // lookup table
      std::vector<GlobalVariable *> TableLookupVariablesOrdered(
          TableLookupVariables.begin(), TableLookupVariables.end());
      llvm::sort(TableLookupVariablesOrdered.begin(),
                 TableLookupVariablesOrdered.end(),
                 [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
                   return lhs->getName() < rhs->getName();
                 });

      GlobalVariable *LookupTable = buildLookupTable(
          M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
      replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
                                               LookupTable);
    }

    for (auto &GV : make_early_inc_range(M.globals()))
      if (AMDGPU::isLDSVariableToLower(GV)) {

        // probably want to remove from used lists
        GV.removeDeadConstantUsers();
        if (GV.use_empty())
          GV.eraseFromParent();
      }

    return Changed;
  }

private:
  // Increase the alignment of LDS globals if necessary to maximise the chance
  // that we can use aligned LDS instructions to access them.
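  // For example, an LDS global of 6 bytes with natural alignment 2 is bumped
  // to align 8 here so that a b64 LDS access remains legal for it (sizes and
  // thresholds as encoded in the branches below).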
  static bool superAlignLDSGlobals(Module &M) {
    const DataLayout &DL = M.getDataLayout();
    bool Changed = false;
    if (!SuperAlignLDSGlobals) {
      return Changed;
    }

    for (auto &GV : M.globals()) {
      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
        // Only changing alignment of LDS variables
        continue;
      }
      if (!GV.hasInitializer()) {
        // cuda/hip extern __shared__ variable, leave alignment alone
        continue;
      }

      Align Alignment = AMDGPU::getAlign(DL, &GV);
      TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());

      if (GVSize > 8) {
        // We might want to use a b96 or b128 load/store
        Alignment = std::max(Alignment, Align(16));
      } else if (GVSize > 4) {
        // We might want to use a b64 load/store
        Alignment = std::max(Alignment, Align(8));
      } else if (GVSize > 2) {
        // We might want to use a b32 load/store
        Alignment = std::max(Alignment, Align(4));
      } else if (GVSize > 1) {
        // We might want to use a b16 load/store
        Alignment = std::max(Alignment, Align(2));
      }

      if (Alignment != AMDGPU::getAlign(DL, &GV)) {
        Changed = true;
        GV.setAlignment(Alignment);
      }
    }
    return Changed;
  }

  static LDSVariableReplacement createLDSVariableReplacement(
      Module &M, std::string VarName,
      DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
    // Create a struct instance containing LDSVarsToTransform and map from those
    // variables to ConstantExprGEP
    // Variables may be introduced to meet alignment requirements. No aliasing
    // metadata is useful for these as they have no uses. Erased before return.
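    //
    // As a hypothetical example, transforming an i16 @a and an i64 @b for
    // kernel k0 yields roughly:
    //   %llvm.amdgcn.kernel.k0.lds.t = type { i64, i16 }
    //   @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global
    //       %llvm.amdgcn.kernel.k0.lds.t undef, align 8
    // with @a mapped to a GEP of field 1 and @b to a GEP of field 0; a padding
    // array field would be inserted if the layout required it.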

    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();
    assert(!LDSVarsToTransform.empty());

    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
    LayoutFields.reserve(LDSVarsToTransform.size());
    {
      // The order of fields in this struct depends on the order of
      // variables in the argument which varies when changing how they
      // are identified, leading to spurious test breakage.
      std::vector<GlobalVariable *> Sorted(LDSVarsToTransform.begin(),
                                           LDSVarsToTransform.end());
      llvm::sort(Sorted.begin(), Sorted.end(),
                 [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
                   return lhs->getName() < rhs->getName();
                 });
      for (GlobalVariable *GV : Sorted) {
        OptimizedStructLayoutField F(GV,
                                     DL.getTypeAllocSize(GV->getValueType()),
                                     AMDGPU::getAlign(DL, GV));
        LayoutFields.emplace_back(F);
      }
    }

    performOptimizedStructLayout(LayoutFields);

    std::vector<GlobalVariable *> LocalVars;
    BitVector IsPaddingField;
    LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
    IsPaddingField.reserve(LDSVarsToTransform.size());
    {
      uint64_t CurrentOffset = 0;
      for (size_t I = 0; I < LayoutFields.size(); I++) {
        GlobalVariable *FGV = static_cast<GlobalVariable *>(
            const_cast<void *>(LayoutFields[I].Id));
        Align DataAlign = LayoutFields[I].Alignment;

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet alignment requested
          // Note (o +      (a - (o % a)) ) % a == 0
          //      (offset + Padding       ) % align == 0

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
              false));
          IsPaddingField.push_back(true);
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        IsPaddingField.push_back(false);
        CurrentOffset += LayoutFields[I].Size;
      }
    }

    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(StructAlign);

    DenseMap<GlobalVariable *, Constant *> Map;
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
      if (IsPaddingField[I]) {
        assert(GV->use_empty());
        GV->eraseFromParent();
      } else {
        Map[GV] = GEP;
      }
    }
    assert(Map.size() == LDSVarsToTransform.size());
    return {SGV, std::move(Map)};
  }

  template <typename PredicateTy>
  void replaceLDSVariablesWithStruct(
      Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
      LDSVariableReplacement Replacement, PredicateTy Predicate) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // A hack... we need to insert the aliasing info in a predictable order for
    // lit tests. Would like to have them in a stable order already, ideally the
    // same order they get allocated, which might mean an ordered set container
    std::vector<GlobalVariable *> LDSVarsToTransform(
        LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end());
    llvm::sort(LDSVarsToTransform.begin(), LDSVarsToTransform.end(),
               [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
                 return lhs->getName() < rhs->getName();
               });

    // Create alias.scope and their lists. Each field in the new structure
    // does not alias with any of the other fields.
    SmallVector<MDNode *> AliasScopes;
    SmallVector<Metadata *> NoAliasList;
    const size_t NumberVars = LDSVarsToTransform.size();
    if (NumberVars > 1) {
      MDBuilder MDB(Ctx);
      AliasScopes.reserve(NumberVars);
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      for (size_t I = 0; I < NumberVars; I++) {
        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
        AliasScopes.push_back(Scope);
      }
      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
    }

    // Replace uses of ith variable with a constantexpr to the corresponding
    // field of the instance that will be allocated by AMDGPUMachineFunction
    for (size_t I = 0; I < NumberVars; I++) {
      GlobalVariable *GV = LDSVarsToTransform[I];
      Constant *GEP = Replacement.LDSVarsToConstantGEP[GV];

      GV->replaceUsesWithIf(GEP, Predicate);

      APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
      GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
      uint64_t Offset = APOff.getZExtValue();

      Align A =
          commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset);

      if (I)
        NoAliasList[I - 1] = AliasScopes[I - 1];
      MDNode *NoAlias =
          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
      MDNode *AliasScope =
          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
    }
  }

  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
                                MDNode *AliasScope, MDNode *NoAlias,
                                unsigned MaxDepth = 5) {
    if (!MaxDepth || (A == 1 && !AliasScope))
      return;

    for (User *U : Ptr->users()) {
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (AliasScope && I->mayReadOrWriteMemory()) {
          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
          AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
                   : AliasScope);
          I->setMetadata(LLVMContext::MD_alias_scope, AS);

          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
          NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
          I->setMetadata(LLVMContext::MD_noalias, NA);
        }
      }

      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
        continue;
      }
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        // None of the atomicrmw operations can work on pointers, but check
        // anyway in case that changes or we are processing a ConstantExpr.
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr) {
          Align GA;
          if (GEP->accumulateConstantOffset(DL, Off))
            GA = commonAlignment(A, Off.getLimitedValue());
          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
        }
        continue;
      }
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
      }
    }
  }
};

} // namespace
char AMDGPULowerModuleLDS::ID = 0;

char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;

INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
                "Lower uses of LDS variables from non-kernel functions", false,
                false)

ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
  return new AMDGPULowerModuleLDS();
}

PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}