1292915Sdim//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===//
2292915Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6292915Sdim//
7292915Sdim//===----------------------------------------------------------------------===//
8292915Sdim//
9292915Sdim/// \file
10292915Sdim/// This pass adds amdgpu.uniform metadata to IR values so this information
11292915Sdim/// can be used during instruction selection.
12292915Sdim//
13292915Sdim//===----------------------------------------------------------------------===//
14292915Sdim
15292915Sdim#include "AMDGPU.h"
16314564Sdim#include "llvm/ADT/SetVector.h"
17344779Sdim#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
18314564Sdim#include "llvm/Analysis/LoopInfo.h"
19314564Sdim#include "llvm/Analysis/MemoryDependenceAnalysis.h"
20321369Sdim#include "llvm/IR/IRBuilder.h"
21292915Sdim#include "llvm/IR/InstVisitor.h"
22360784Sdim#include "llvm/InitializePasses.h"
23292915Sdim#include "llvm/Support/Debug.h"
24292915Sdim#include "llvm/Support/raw_ostream.h"
25292915Sdim
26292915Sdim#define DEBUG_TYPE "amdgpu-annotate-uniform"
27292915Sdim
28292915Sdimusing namespace llvm;
29292915Sdim
30292915Sdimnamespace {
31292915Sdim
32292915Sdimclass AMDGPUAnnotateUniformValues : public FunctionPass,
33292915Sdim                       public InstVisitor<AMDGPUAnnotateUniformValues> {
34344779Sdim  LegacyDivergenceAnalysis *DA;
35314564Sdim  MemoryDependenceResults *MDR;
36314564Sdim  LoopInfo *LI;
37314564Sdim  DenseMap<Value*, GetElementPtrInst*> noClobberClones;
38314564Sdim  bool isKernelFunc;
39292915Sdim
40292915Sdimpublic:
41292915Sdim  static char ID;
42292915Sdim  AMDGPUAnnotateUniformValues() :
43292915Sdim    FunctionPass(ID) { }
44292915Sdim  bool doInitialization(Module &M) override;
45292915Sdim  bool runOnFunction(Function &F) override;
46314564Sdim  StringRef getPassName() const override {
47314564Sdim    return "AMDGPU Annotate Uniform Values";
48314564Sdim  }
49292915Sdim  void getAnalysisUsage(AnalysisUsage &AU) const override {
50344779Sdim    AU.addRequired<LegacyDivergenceAnalysis>();
51314564Sdim    AU.addRequired<MemoryDependenceWrapperPass>();
52314564Sdim    AU.addRequired<LoopInfoWrapperPass>();
53292915Sdim    AU.setPreservesAll();
54292915Sdim }
55292915Sdim
56309124Sdim  void visitBranchInst(BranchInst &I);
57292915Sdim  void visitLoadInst(LoadInst &I);
58314564Sdim  bool isClobberedInFunction(LoadInst * Load);
59292915Sdim};
60292915Sdim
61292915Sdim} // End anonymous namespace
62292915Sdim
63292915SdimINITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
64292915Sdim                      "Add AMDGPU uniform metadata", false, false)
65344779SdimINITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
66314564SdimINITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
67314564SdimINITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
68292915SdimINITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
69292915Sdim                    "Add AMDGPU uniform metadata", false, false)
70292915Sdim
71292915Sdimchar AMDGPUAnnotateUniformValues::ID = 0;
72292915Sdim
73309124Sdimstatic void setUniformMetadata(Instruction *I) {
74309124Sdim  I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
75309124Sdim}
76314564Sdimstatic void setNoClobberMetadata(Instruction *I) {
77314564Sdim  I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
78314564Sdim}
79309124Sdim
80314564Sdimstatic void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
81314564Sdim  for (auto I : predecessors(Root))
82314564Sdim    if (Set.insert(I))
83314564Sdim      DFS(I, Set);
84314564Sdim}
85314564Sdim
86314564Sdimbool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
87314564Sdim  // 1. get Loop for the Load->getparent();
88314564Sdim  // 2. if it exists, collect all the BBs from the most outer
89314564Sdim  // loop and check for the writes. If NOT - start DFS over all preds.
90314564Sdim  // 3. Start DFS over all preds from the most outer loop header.
91314564Sdim  SetVector<BasicBlock *> Checklist;
92314564Sdim  BasicBlock *Start = Load->getParent();
93314564Sdim  Checklist.insert(Start);
94314564Sdim  const Value *Ptr = Load->getPointerOperand();
95314564Sdim  const Loop *L = LI->getLoopFor(Start);
96314564Sdim  if (L) {
97314564Sdim    const Loop *P = L;
98314564Sdim    do {
99314564Sdim      L = P;
100314564Sdim      P = P->getParentLoop();
101314564Sdim    } while (P);
102314564Sdim    Checklist.insert(L->block_begin(), L->block_end());
103314564Sdim    Start = L->getHeader();
104314564Sdim  }
105314564Sdim
106314564Sdim  DFS(Start, Checklist);
107314564Sdim  for (auto &BB : Checklist) {
108321369Sdim    BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ?
109321369Sdim      BasicBlock::iterator(Load) : BB->end();
110321369Sdim    auto Q = MDR->getPointerDependencyFrom(MemoryLocation(Ptr), true,
111321369Sdim                                           StartIt, BB, Load);
112321369Sdim    if (Q.isClobber() || Q.isUnknown())
113321369Sdim      return true;
114314564Sdim  }
115314564Sdim  return false;
116314564Sdim}
117314564Sdim
118309124Sdimvoid AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
119344779Sdim  if (DA->isUniform(&I))
120344779Sdim    setUniformMetadata(I.getParent()->getTerminator());
121309124Sdim}
122309124Sdim
123292915Sdimvoid AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
124292915Sdim  Value *Ptr = I.getPointerOperand();
125292915Sdim  if (!DA->isUniform(Ptr))
126292915Sdim    return;
127321369Sdim  auto isGlobalLoad = [&](LoadInst &Load)->bool {
128344779Sdim    return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
129314564Sdim  };
130314564Sdim  // We're tracking up to the Function boundaries
131314564Sdim  // We cannot go beyond because of FunctionPass restrictions
132314564Sdim  // Thus we can ensure that memory not clobbered for memory
133314564Sdim  // operations that live in kernel only.
134314564Sdim  bool NotClobbered = isKernelFunc &&   !isClobberedInFunction(&I);
135314564Sdim  Instruction *PtrI = dyn_cast<Instruction>(Ptr);
136314564Sdim  if (!PtrI && NotClobbered && isGlobalLoad(I)) {
137314564Sdim    if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
138314564Sdim      // Lookup for the existing GEP
139314564Sdim      if (noClobberClones.count(Ptr)) {
140314564Sdim        PtrI = noClobberClones[Ptr];
141314564Sdim      } else {
142314564Sdim        // Create GEP of the Value
143314564Sdim        Function *F = I.getParent()->getParent();
144314564Sdim        Value *Idx = Constant::getIntegerValue(
145314564Sdim          Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
146314564Sdim        // Insert GEP at the entry to make it dominate all uses
147314564Sdim        PtrI = GetElementPtrInst::Create(
148314564Sdim          Ptr->getType()->getPointerElementType(), Ptr,
149314564Sdim          ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI());
150314564Sdim      }
151314564Sdim      I.replaceUsesOfWith(Ptr, PtrI);
152314564Sdim    }
153314564Sdim  }
154292915Sdim
155314564Sdim  if (PtrI) {
156309124Sdim    setUniformMetadata(PtrI);
157314564Sdim    if (NotClobbered)
158314564Sdim      setNoClobberMetadata(PtrI);
159314564Sdim  }
160292915Sdim}
161292915Sdim
162292915Sdimbool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
163292915Sdim  return false;
164292915Sdim}
165292915Sdim
166292915Sdimbool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
167309124Sdim  if (skipFunction(F))
168309124Sdim    return false;
169309124Sdim
170344779Sdim  DA  = &getAnalysis<LegacyDivergenceAnalysis>();
171314564Sdim  MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
172314564Sdim  LI  = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
173314564Sdim  isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
174314564Sdim
175292915Sdim  visit(F);
176314564Sdim  noClobberClones.clear();
177292915Sdim  return true;
178292915Sdim}
179292915Sdim
180292915SdimFunctionPass *
181292915Sdimllvm::createAMDGPUAnnotateUniformValues() {
182292915Sdim  return new AMDGPUAnnotateUniformValues();
183292915Sdim}
184