//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
17#include "llvm/Analysis/AssumptionCache.h"
18#include "llvm/Analysis/UniformityAnalysis.h"
19#include "llvm/Analysis/ValueTracking.h"
20#include "llvm/CodeGen/TargetPassConfig.h"
21#include "llvm/IR/IRBuilder.h"
22#include "llvm/IR/InstVisitor.h"
23#include "llvm/InitializePasses.h"
24#include "llvm/Support/CommandLine.h"
25#include "llvm/Support/KnownBits.h"
26#include "llvm/Transforms/Utils/Local.h"
27
28#define DEBUG_TYPE "amdgpu-late-codegenprepare"
29
30using namespace llvm;
31
// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to handle scalar sub-dword loads that are only naturally aligned
// rather than DWORD aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned, i.e. its low two
  // bits are known to be zero.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
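  // Subtargets whose scalar unit can natively load sub-dword values do not
  // need this widening.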
  if (ST.hasScalarSubwordLoads())
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

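// Return true if \p LI is a uniform (i.e. scalar), simple, sub-DWORD load from
// the constant address space that is at least naturally aligned, making it a
// candidate for widening to a DWORD load.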
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip loads that are not from the constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA->isUniform(&LI);
}

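// Widen a qualifying sub-DWORD load from the constant address space into a
// DWORD-aligned 32-bit load, then extract the original value from the loaded
// DWORD with a shift and truncate.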
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip loads that are already at least DWORD aligned, as those are handled
  // in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it is not safe to perform the following
  // transformation.
  if (!isDWORDAligned(Base))
    return false;

  // Adjust is the byte offset of the loaded value within its containing DWORD.
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjustment, the load already starts on a DWORD boundary, so
    // simply promote its alignment.
    LI.setAlignment(Align(4));
    return true;
  }

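  // The load does not start on a DWORD boundary. Load the containing DWORD
  // from its aligned address instead, then shift and truncate to recover the
  // original value. For example, a uniform `load i16` at `%base + 6` (with
  // %base DWORD aligned) becomes, roughly:
  //   %wide = load i32, ptr addrspace(4) %dword.ptr, align 4  ; %base + 4
  //   %shifted = lshr i32 %wide, 16
  //   %val = trunc i32 %shifted to i16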
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto *IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
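  // Carry over the original load's metadata, but drop !range: it described the
  // narrow value and does not apply to the wider 32-bit load.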
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift the loaded DWORD right so the original (little-endian) sub-dword
  // value lands in the low bits, then truncate to the original width.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}