//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

namespace llvm {

void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeGenericToNVVMPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
}

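// Build the DataLayout string for the module. Generic pointers are 32-bit on
// the 32-bit target; on the 64-bit target, -nvptx-short-ptr keeps pointers
// into address spaces 3, 4, and 5 (shared, const, and local) at 32 bits.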
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, CPU, FS, *this) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

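// NVPTX Code Generator Pass Configuration Options.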
class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignmentFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignmentOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

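// Register NVPTX-specific IR passes with the legacy PassManagerBuilder:
// NVVMReflect and NVVMIntrRange are added at EP_EarlyAsPossible so that the
// rest of the optimization pipeline can take advantage of their results.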
void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
      PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
    });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits allocas for byval parameters, which can often be
  // eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here.  We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing.  But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addStraightLineScalarOptimizationPasses();
  }

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

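// Aggregate copies are lowered and allocas hoisted into the entry block before
// instruction selection runs; on subtargets without image handle support, the
// handles are rewritten afterwards by NVPTXReplaceImageHandles.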
bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass(), false);
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

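// NVPTX operates on virtual registers all the way to PTX emission, so no
// register allocator is created and the register-allocation hooks below only
// schedule the machine passes that would normally precede allocation.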
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  //addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-RA tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However,
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}