NVPTXTargetMachine.cpp revision 288943
//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//
13
14#include "NVPTXTargetMachine.h"
15#include "MCTargetDesc/NVPTXMCAsmInfo.h"
16#include "NVPTX.h"
17#include "NVPTXAllocaHoisting.h"
18#include "NVPTXLowerAggrCopies.h"
19#include "NVPTXTargetObjectFile.h"
20#include "NVPTXTargetTransformInfo.h"
21#include "llvm/Analysis/Passes.h"
22#include "llvm/CodeGen/AsmPrinter.h"
23#include "llvm/CodeGen/MachineFunctionAnalysis.h"
24#include "llvm/CodeGen/MachineModuleInfo.h"
25#include "llvm/CodeGen/Passes.h"
26#include "llvm/IR/DataLayout.h"
27#include "llvm/IR/IRPrintingPasses.h"
28#include "llvm/IR/LegacyPassManager.h"
29#include "llvm/IR/Verifier.h"
30#include "llvm/MC/MCAsmInfo.h"
31#include "llvm/MC/MCInstrInfo.h"
32#include "llvm/MC/MCStreamer.h"
33#include "llvm/MC/MCSubtargetInfo.h"
34#include "llvm/Support/CommandLine.h"
35#include "llvm/Support/Debug.h"
36#include "llvm/Support/FormattedStream.h"
37#include "llvm/Support/TargetRegistry.h"
38#include "llvm/Support/raw_ostream.h"
39#include "llvm/Target/TargetInstrInfo.h"
40#include "llvm/Target/TargetLowering.h"
41#include "llvm/Target/TargetLoweringObjectFile.h"
42#include "llvm/Target/TargetMachine.h"
43#include "llvm/Target/TargetOptions.h"
44#include "llvm/Target/TargetRegisterInfo.h"
45#include "llvm/Target/TargetSubtargetInfo.h"
46#include "llvm/Transforms/Scalar.h"
47
48using namespace llvm;
49
50namespace llvm {
51void initializeNVVMReflectPass(PassRegistry&);
52void initializeGenericToNVVMPass(PassRegistry&);
53void initializeNVPTXAllocaHoistingPass(PassRegistry &);
54void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
55void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
56void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
57void initializeNVPTXLowerAllocaPass(PassRegistry &);
58}
59
60extern "C" void LLVMInitializeNVPTXTarget() {
61  // Register the target.
62  RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
63  RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
64
65  // FIXME: This pass is really intended to be invoked during IR optimization,
66  // but it's very NVPTX-specific.
67  initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
68  initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
69  initializeNVPTXAllocaHoistingPass(*PassRegistry::getPassRegistry());
70  initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
71  initializeNVPTXFavorNonGenericAddrSpacesPass(
72    *PassRegistry::getPassRegistry());
73  initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry());
74  initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry());
75}
76
/// Build the data layout string handed to the LLVMTargetMachine base class.
///
/// The string always starts with "e" and always ends with
/// "-i64:64-v16:16-v32:32-n16:32:64". For the 32-bit target an explicit
/// pointer specification, "-p:32:32", is inserted between the two; the 64-bit
/// target adds no pointer specification.
///
/// \param is64Bit  true when building the layout for the 64-bit target.
/// \returns the assembled data layout string.
static std::string computeDataLayout(bool is64Bit) {
  const char *const PointerSpec = is64Bit ? "" : "-p:32:32";
  return std::string("e") + PointerSpec + "-i64:64-v16:16-v32:32-n16:32:64";
}
87
88NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
89                                       StringRef CPU, StringRef FS,
90                                       const TargetOptions &Options,
91                                       Reloc::Model RM, CodeModel::Model CM,
92                                       CodeGenOpt::Level OL, bool is64bit)
93    : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, RM,
94                        CM, OL),
95      is64bit(is64bit), TLOF(make_unique<NVPTXTargetObjectFile>()),
96      Subtarget(TT, CPU, FS, *this) {
97  if (TT.getOS() == Triple::NVCL)
98    drvInterface = NVPTX::NVCL;
99  else
100    drvInterface = NVPTX::CUDA;
101  initAsmInfo();
102}
103
104NVPTXTargetMachine::~NVPTXTargetMachine() {}
105
106void NVPTXTargetMachine32::anchor() {}
107
108NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
109                                           StringRef CPU, StringRef FS,
110                                           const TargetOptions &Options,
111                                           Reloc::Model RM, CodeModel::Model CM,
112                                           CodeGenOpt::Level OL)
113    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
114
115void NVPTXTargetMachine64::anchor() {}
116
117NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
118                                           StringRef CPU, StringRef FS,
119                                           const TargetOptions &Options,
120                                           Reloc::Model RM, CodeModel::Model CM,
121                                           CodeGenOpt::Level OL)
122    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
123
124namespace {
125class NVPTXPassConfig : public TargetPassConfig {
126public:
127  NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
128      : TargetPassConfig(TM, PM) {}
129
130  NVPTXTargetMachine &getNVPTXTargetMachine() const {
131    return getTM<NVPTXTargetMachine>();
132  }
133
134  void addIRPasses() override;
135  bool addInstSelector() override;
136  void addPostRegAlloc() override;
137  void addMachineSSAOptimization() override;
138
139  FunctionPass *createTargetRegisterAllocator(bool) override;
140  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
141  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
142};
143} // end anonymous namespace
144
145TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
146  NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
147  return PassConfig;
148}
149
150TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
151  return TargetIRAnalysis([this](Function &F) {
152    return TargetTransformInfo(NVPTXTTIImpl(this, F));
153  });
154}
155
156void NVPTXPassConfig::addIRPasses() {
157  // The following passes are known to not play well with virtual regs hanging
158  // around after register allocation (which in our case, is *all* registers).
159  // We explicitly disable them here.  We do, however, need some functionality
160  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
161  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
162  disablePass(&PrologEpilogCodeInserterID);
163  disablePass(&MachineCopyPropagationID);
164  disablePass(&BranchFolderPassID);
165  disablePass(&TailDuplicateID);
166
167  addPass(createNVPTXImageOptimizerPass());
168  TargetPassConfig::addIRPasses();
169  addPass(createNVPTXAssignValidGlobalNamesPass());
170  addPass(createGenericToNVVMPass());
171  addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
172  // NVPTXLowerKernelArgs emits alloca for byval parameters which can often
173  // be eliminated by SROA.
174  addPass(createSROAPass());
175  addPass(createNVPTXLowerAllocaPass());
176  addPass(createNVPTXFavorNonGenericAddrSpacesPass());
177  // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave
178  // them unused. We could remove dead code in an ad-hoc manner, but that
179  // requires manual work and might be error-prone.
180  addPass(createDeadCodeEliminationPass());
181  addPass(createSeparateConstOffsetFromGEPPass());
182  // ReassociateGEPs exposes more opportunites for SLSR. See
183  // the example in reassociate-geps-and-slsr.ll.
184  addPass(createStraightLineStrengthReducePass());
185  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
186  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
187  // for some of our benchmarks.
188  if (getOptLevel() == CodeGenOpt::Aggressive)
189    addPass(createGVNPass());
190  else
191    addPass(createEarlyCSEPass());
192  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
193  addPass(createNaryReassociatePass());
194  // NaryReassociate on GEPs creates redundant common expressions, so run
195  // EarlyCSE after it.
196  addPass(createEarlyCSEPass());
197}
198
199bool NVPTXPassConfig::addInstSelector() {
200  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
201
202  addPass(createLowerAggrCopies());
203  addPass(createAllocaHoisting());
204  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
205
206  if (!ST.hasImageHandles())
207    addPass(createNVPTXReplaceImageHandlesPass());
208
209  return false;
210}
211
212void NVPTXPassConfig::addPostRegAlloc() {
213  addPass(createNVPTXPrologEpilogPass(), false);
214  // NVPTXPrologEpilogPass calculates frame object offset and replace frame
215  // index with VRFrame register. NVPTXPeephole need to be run after that and
216  // will replace VRFrame with VRFrameLocal when possible.
217  addPass(createNVPTXPeephole());
218}
219
220FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
221  return nullptr; // No reg alloc
222}
223
224void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
225  assert(!RegAllocPass && "NVPTX uses no regalloc!");
226  addPass(&PHIEliminationID);
227  addPass(&TwoAddressInstructionPassID);
228}
229
230void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
231  assert(!RegAllocPass && "NVPTX uses no regalloc!");
232
233  addPass(&ProcessImplicitDefsID);
234  addPass(&LiveVariablesID);
235  addPass(&MachineLoopInfoID);
236  addPass(&PHIEliminationID);
237
238  addPass(&TwoAddressInstructionPassID);
239  addPass(&RegisterCoalescerID);
240
241  // PreRA instruction scheduling.
242  if (addPass(&MachineSchedulerID))
243    printAndVerify("After Machine Scheduling");
244
245
246  addPass(&StackSlotColoringID);
247
248  // FIXME: Needs physical registers
249  //addPass(&PostRAMachineLICMID);
250
251  printAndVerify("After StackSlotColoring");
252}
253
254void NVPTXPassConfig::addMachineSSAOptimization() {
255  // Pre-ra tail duplication.
256  if (addPass(&EarlyTailDuplicateID))
257    printAndVerify("After Pre-RegAlloc TailDuplicate");
258
259  // Optimize PHIs before DCE: removing dead PHI cycles may make more
260  // instructions dead.
261  addPass(&OptimizePHIsID);
262
263  // This pass merges large allocas. StackSlotColoring is a different pass
264  // which merges spill slots.
265  addPass(&StackColoringID);
266
267  // If the target requests it, assign local variables to stack slots relative
268  // to one another and simplify frame index references where possible.
269  addPass(&LocalStackSlotAllocationID);
270
271  // With optimization, dead code should already be eliminated. However
272  // there is one known exception: lowered code for arguments that are only
273  // used by tail calls, where the tail calls reuse the incoming stack
274  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
275  addPass(&DeadMachineInstructionElimID);
276  printAndVerify("After codegen DCE pass");
277
278  // Allow targets to insert passes that improve instruction level parallelism,
279  // like if-conversion. Such passes will typically need dominator trees and
280  // loop info, just like LICM and CSE below.
281  if (addILPOpts())
282    printAndVerify("After ILP optimizations");
283
284  addPass(&MachineLICMID);
285  addPass(&MachineCSEID);
286
287  addPass(&MachineSinkingID);
288  printAndVerify("After Machine LICM, CSE and Sinking passes");
289
290  addPass(&PeepholeOptimizerID);
291  printAndVerify("After codegen peephole optimization pass");
292}
293