1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
/// The AMDGPU target machine contains all of the hardware-specific
/// information needed to emit code for R600 and SI GPUs.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUTargetMachine.h"
16#include "AMDGPU.h"
17#include "AMDGPUAliasAnalysis.h"
18#include "AMDGPUCallLowering.h"
19#include "AMDGPUExportClustering.h"
20#include "AMDGPUInstructionSelector.h"
21#include "AMDGPULegalizerInfo.h"
22#include "AMDGPUMacroFusion.h"
23#include "AMDGPUTargetObjectFile.h"
24#include "AMDGPUTargetTransformInfo.h"
25#include "GCNIterativeScheduler.h"
26#include "GCNSchedStrategy.h"
27#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
28#include "R600MachineScheduler.h"
29#include "SIMachineFunctionInfo.h"
30#include "SIMachineScheduler.h"
31#include "TargetInfo/AMDGPUTargetInfo.h"
32#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
33#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
34#include "llvm/CodeGen/GlobalISel/Legalizer.h"
35#include "llvm/CodeGen/GlobalISel/Localizer.h"
36#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
37#include "llvm/CodeGen/MIRParser/MIParser.h"
38#include "llvm/CodeGen/Passes.h"
39#include "llvm/CodeGen/TargetPassConfig.h"
40#include "llvm/IR/Attributes.h"
41#include "llvm/IR/Function.h"
42#include "llvm/IR/LegacyPassManager.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Support/CommandLine.h"
46#include "llvm/Support/Compiler.h"
47#include "llvm/Support/TargetRegistry.h"
48#include "llvm/Target/TargetLoweringObjectFile.h"
49#include "llvm/Transforms/IPO.h"
50#include "llvm/Transforms/IPO/AlwaysInliner.h"
51#include "llvm/Transforms/IPO/PassManagerBuilder.h"
52#include "llvm/Transforms/Scalar.h"
53#include "llvm/Transforms/Scalar/GVN.h"
54#include "llvm/Transforms/Utils.h"
55#include "llvm/Transforms/Vectorize.h"
56#include <memory>
57
58using namespace llvm;
59
60static cl::opt<bool> EnableR600StructurizeCFG(
61  "r600-ir-structurize",
62  cl::desc("Use StructurizeCFG IR pass"),
63  cl::init(true));
64
65static cl::opt<bool> EnableSROA(
66  "amdgpu-sroa",
67  cl::desc("Run SROA after promote alloca pass"),
68  cl::ReallyHidden,
69  cl::init(true));
70
71static cl::opt<bool>
72EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
73                        cl::desc("Run early if-conversion"),
74                        cl::init(false));
75
76static cl::opt<bool>
77OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
78            cl::desc("Run pre-RA exec mask optimizations"),
79            cl::init(true));
80
81static cl::opt<bool> EnableR600IfConvert(
82  "r600-if-convert",
83  cl::desc("Use if conversion pass"),
84  cl::ReallyHidden,
85  cl::init(true));
86
// Option to disable the load/store vectorizer for tests.
88static cl::opt<bool> EnableLoadStoreVectorizer(
89  "amdgpu-load-store-vectorizer",
90  cl::desc("Enable load store vectorizer"),
91  cl::init(true),
92  cl::Hidden);
93
// Option to control global load scalarization
95static cl::opt<bool> ScalarizeGlobal(
96  "amdgpu-scalarize-global-loads",
97  cl::desc("Enable global load scalarization"),
98  cl::init(true),
99  cl::Hidden);
100
101// Option to run internalize pass.
102static cl::opt<bool> InternalizeSymbols(
103  "amdgpu-internalize-symbols",
104  cl::desc("Enable elimination of non-kernel functions and unused globals"),
105  cl::init(false),
106  cl::Hidden);
107
108// Option to inline all early.
109static cl::opt<bool> EarlyInlineAll(
110  "amdgpu-early-inline-all",
111  cl::desc("Inline all functions early"),
112  cl::init(false),
113  cl::Hidden);
114
115static cl::opt<bool> EnableSDWAPeephole(
116  "amdgpu-sdwa-peephole",
117  cl::desc("Enable SDWA peepholer"),
118  cl::init(true));
119
120static cl::opt<bool> EnableDPPCombine(
121  "amdgpu-dpp-combine",
122  cl::desc("Enable DPP combiner"),
123  cl::init(true));
124
// Enable address-space-based alias analysis
126static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
127  cl::desc("Enable AMDGPU Alias Analysis"),
128  cl::init(true));
129
130// Option to run late CFG structurizer
131static cl::opt<bool, true> LateCFGStructurize(
132  "amdgpu-late-structurize",
133  cl::desc("Enable late CFG structurization"),
134  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
135  cl::Hidden);
136
137static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
138  "amdgpu-function-calls",
139  cl::desc("Enable AMDGPU function call support"),
140  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
141  cl::init(true),
142  cl::Hidden);
143
144static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
145  "amdgpu-fixed-function-abi",
146  cl::desc("Enable all implicit function arguments"),
147  cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
148  cl::init(false),
149  cl::Hidden);
150
// Enable library call simplifications
152static cl::opt<bool> EnableLibCallSimplify(
153  "amdgpu-simplify-libcall",
154  cl::desc("Enable amdgpu library simplifications"),
155  cl::init(true),
156  cl::Hidden);
157
158static cl::opt<bool> EnableLowerKernelArguments(
159  "amdgpu-ir-lower-kernel-arguments",
160  cl::desc("Lower kernel argument loads in IR pass"),
161  cl::init(true),
162  cl::Hidden);
163
164static cl::opt<bool> EnableRegReassign(
165  "amdgpu-reassign-regs",
166  cl::desc("Enable register reassign optimizations on gfx10+"),
167  cl::init(true),
168  cl::Hidden);
169
170// Enable atomic optimization
171static cl::opt<bool> EnableAtomicOptimizations(
172  "amdgpu-atomic-optimizations",
173  cl::desc("Enable atomic optimizations"),
174  cl::init(false),
175  cl::Hidden);
176
177// Enable Mode register optimization
178static cl::opt<bool> EnableSIModeRegisterPass(
179  "amdgpu-mode-register",
180  cl::desc("Enable mode register pass"),
181  cl::init(true),
182  cl::Hidden);
183
// Option used in lit tests to prevent machine DCE from deleting the patterns
// being inspected.
185static cl::opt<bool>
186EnableDCEInRA("amdgpu-dce-in-ra",
187    cl::init(true), cl::Hidden,
188    cl::desc("Enable machine DCE inside regalloc"));
189
190static cl::opt<bool> EnableScalarIRPasses(
191  "amdgpu-scalar-ir-passes",
192  cl::desc("Enable scalar IR passes"),
193  cl::init(true),
194  cl::Hidden);
195
196static cl::opt<bool> EnableStructurizerWorkarounds(
197    "amdgpu-enable-structurizer-workarounds",
198    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
199    cl::Hidden);
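// Note: each of the cl::opt flags above can be toggled on the llc command
// line (e.g. -amdgpu-sdwa-peephole=0 disables the SDWA peephole), which is how
// most of them are exercised in lit tests.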
200
201extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
202  // Register the target
203  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
204  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
205
206  PassRegistry *PR = PassRegistry::getPassRegistry();
207  initializeR600ClauseMergePassPass(*PR);
208  initializeR600ControlFlowFinalizerPass(*PR);
209  initializeR600PacketizerPass(*PR);
210  initializeR600ExpandSpecialInstrsPassPass(*PR);
211  initializeR600VectorRegMergerPass(*PR);
212  initializeGlobalISel(*PR);
213  initializeAMDGPUDAGToDAGISelPass(*PR);
214  initializeGCNDPPCombinePass(*PR);
215  initializeSILowerI1CopiesPass(*PR);
216  initializeSILowerSGPRSpillsPass(*PR);
217  initializeSIFixSGPRCopiesPass(*PR);
218  initializeSIFixVGPRCopiesPass(*PR);
219  initializeSIFixupVectorISelPass(*PR);
220  initializeSIFoldOperandsPass(*PR);
221  initializeSIPeepholeSDWAPass(*PR);
222  initializeSIShrinkInstructionsPass(*PR);
223  initializeSIOptimizeExecMaskingPreRAPass(*PR);
224  initializeSILoadStoreOptimizerPass(*PR);
225  initializeAMDGPUFixFunctionBitcastsPass(*PR);
226  initializeAMDGPUAlwaysInlinePass(*PR);
227  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
228  initializeAMDGPUAnnotateUniformValuesPass(*PR);
229  initializeAMDGPUArgumentUsageInfoPass(*PR);
230  initializeAMDGPUAtomicOptimizerPass(*PR);
231  initializeAMDGPULowerKernelArgumentsPass(*PR);
232  initializeAMDGPULowerKernelAttributesPass(*PR);
233  initializeAMDGPULowerIntrinsicsPass(*PR);
234  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
235  initializeAMDGPUPostLegalizerCombinerPass(*PR);
236  initializeAMDGPUPreLegalizerCombinerPass(*PR);
237  initializeAMDGPUPromoteAllocaPass(*PR);
238  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
239  initializeAMDGPUCodeGenPreparePass(*PR);
240  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
241  initializeAMDGPUPropagateAttributesLatePass(*PR);
242  initializeAMDGPURewriteOutArgumentsPass(*PR);
243  initializeAMDGPUUnifyMetadataPass(*PR);
244  initializeSIAnnotateControlFlowPass(*PR);
245  initializeSIInsertHardClausesPass(*PR);
246  initializeSIInsertWaitcntsPass(*PR);
247  initializeSIModeRegisterPass(*PR);
248  initializeSIWholeQuadModePass(*PR);
249  initializeSILowerControlFlowPass(*PR);
250  initializeSIRemoveShortExecBranchesPass(*PR);
251  initializeSIPreEmitPeepholePass(*PR);
252  initializeSIInsertSkipsPass(*PR);
253  initializeSIMemoryLegalizerPass(*PR);
254  initializeSIOptimizeExecMaskingPass(*PR);
255  initializeSIPreAllocateWWMRegsPass(*PR);
256  initializeSIFormMemoryClausesPass(*PR);
257  initializeSIPostRABundlerPass(*PR);
258  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
259  initializeAMDGPUAAWrapperPassPass(*PR);
260  initializeAMDGPUExternalAAWrapperPass(*PR);
261  initializeAMDGPUUseNativeCallsPass(*PR);
262  initializeAMDGPUSimplifyLibCallsPass(*PR);
263  initializeAMDGPUInlinerPass(*PR);
264  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
265  initializeGCNRegBankReassignPass(*PR);
266  initializeGCNNSAReassignPass(*PR);
267  initializeSIAddIMGInitPass(*PR);
268}
269
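// Both the R600 and GCN targets share the same ELF-based object file lowering.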
270static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
271  return std::make_unique<AMDGPUTargetObjectFile>();
272}
273
274static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
275  return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
276}
277
278static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
279  return new SIScheduleDAGMI(C);
280}
281
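// The default GCN scheduling strategy maximizes occupancy. The DAG mutations
// below keep clusterable loads, stores, and exports adjacent and apply macro
// fusion so paired instructions are scheduled back to back.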
282static ScheduleDAGInstrs *
283createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
284  ScheduleDAGMILive *DAG =
285    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
286  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
287  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
288  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
289  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
290  return DAG;
291}
292
293static ScheduleDAGInstrs *
294createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
295  auto DAG = new GCNIterativeScheduler(C,
296    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
297  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
298  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
299  return DAG;
300}
301
302static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
303  return new GCNIterativeScheduler(C,
304    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
305}
306
307static ScheduleDAGInstrs *
308createIterativeILPMachineScheduler(MachineSchedContext *C) {
309  auto DAG = new GCNIterativeScheduler(C,
310    GCNIterativeScheduler::SCHEDULE_ILP);
311  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
312  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
313  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
314  return DAG;
315}
316
317static MachineSchedRegistry
318R600SchedRegistry("r600", "Run R600's custom scheduler",
319                   createR600MachineScheduler);
320
321static MachineSchedRegistry
322SISchedRegistry("si", "Run SI's custom scheduler",
323                createSIMachineScheduler);
324
325static MachineSchedRegistry
326GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
327                             "Run GCN scheduler to maximize occupancy",
328                             createGCNMaxOccupancyMachineScheduler);
329
330static MachineSchedRegistry
331IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
332  "Run GCN scheduler to maximize occupancy (experimental)",
333  createIterativeGCNMaxOccupancyMachineScheduler);
334
335static MachineSchedRegistry
336GCNMinRegSchedRegistry("gcn-minreg",
337  "Run GCN iterative scheduler for minimal register usage (experimental)",
338  createMinRegScheduler);
339
340static MachineSchedRegistry
341GCNILPSchedRegistry("gcn-ilp",
342  "Run GCN iterative scheduler for ILP scheduling (experimental)",
343  createIterativeILPMachineScheduler);
344
345static StringRef computeDataLayout(const Triple &TT) {
346  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat pointers. Buffer fat pointers (address space 7) are non-integral.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
         "-ni:7";
358}
359
360LLVM_READNONE
361static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
362  if (!GPU.empty())
363    return GPU;
364
365  // Need to default to a target with flat support for HSA.
366  if (TT.getArch() == Triple::amdgcn)
367    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
368
369  return "r600";
370}
371
372static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
373  // The AMDGPU toolchain only supports generating shared objects, so we
374  // must always use PIC.
375  return Reloc::PIC_;
376}
377
378AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
379                                         StringRef CPU, StringRef FS,
380                                         TargetOptions Options,
381                                         Optional<Reloc::Model> RM,
382                                         Optional<CodeModel::Model> CM,
383                                         CodeGenOpt::Level OptLevel)
384    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
385                        FS, Options, getEffectiveRelocModel(RM),
386                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
387      TLOF(createTLOF(getTargetTriple())) {
388  initAsmInfo();
389  if (TT.getArch() == Triple::amdgcn) {
390    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
391      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
392    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
393      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
394  }
395}
396
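// Backing storage for the cl::opt<bool, true> options declared above; the
// command-line parser writes into these statics through cl::location().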
397bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
398bool AMDGPUTargetMachine::EnableFunctionCalls = false;
399bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
400
401AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
402
403StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
404  Attribute GPUAttr = F.getFnAttribute("target-cpu");
405  return GPUAttr.hasAttribute(Attribute::None) ?
406    getTargetCPU() : GPUAttr.getValueAsString();
407}
408
409StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
410  Attribute FSAttr = F.getFnAttribute("target-features");
411
412  return FSAttr.hasAttribute(Attribute::None) ?
413    getTargetFeatureString() :
414    FSAttr.getValueAsString();
415}
416
417/// Predicate for Internalize pass.
418static bool mustPreserveGV(const GlobalValue &GV) {
419  if (const Function *F = dyn_cast<Function>(&GV))
420    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
421
422  return !GV.use_empty();
423}
424
425void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
426  Builder.DivergentTarget = true;
427
428  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
429  bool Internalize = InternalizeSymbols;
430  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
431  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
432  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
433
434  if (EnableFunctionCalls) {
435    delete Builder.Inliner;
436    Builder.Inliner = createAMDGPUFunctionInliningPass();
437  }
438
439  Builder.addExtension(
440    PassManagerBuilder::EP_ModuleOptimizerEarly,
441    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
442                                               legacy::PassManagerBase &PM) {
443      if (AMDGPUAA) {
444        PM.add(createAMDGPUAAWrapperPass());
445        PM.add(createAMDGPUExternalAAWrapperPass());
446      }
447      PM.add(createAMDGPUUnifyMetadataPass());
448      PM.add(createAMDGPUPrintfRuntimeBinding());
449      if (Internalize)
450        PM.add(createInternalizePass(mustPreserveGV));
451      PM.add(createAMDGPUPropagateAttributesLatePass(this));
452      if (Internalize)
453        PM.add(createGlobalDCEPass());
454      if (EarlyInline)
455        PM.add(createAMDGPUAlwaysInlinePass(false));
456  });
457
458  Builder.addExtension(
459    PassManagerBuilder::EP_EarlyAsPossible,
460    [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
461                                      legacy::PassManagerBase &PM) {
462      if (AMDGPUAA) {
463        PM.add(createAMDGPUAAWrapperPass());
464        PM.add(createAMDGPUExternalAAWrapperPass());
465      }
466      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
467      PM.add(llvm::createAMDGPUUseNativeCallsPass());
468      if (LibCallSimplify)
469        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
470  });
471
472  Builder.addExtension(
473    PassManagerBuilder::EP_CGSCCOptimizerLate,
474    [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
475      // Add infer address spaces pass to the opt pipeline after inlining
476      // but before SROA to increase SROA opportunities.
477      PM.add(createInferAddressSpacesPass());
478
479      // This should run after inlining to have any chance of doing anything,
480      // and before other cleanup optimizations.
481      PM.add(createAMDGPULowerKernelAttributesPass());
482
483      // Promote alloca to vector before SROA and loop unroll. If we manage
484      // to eliminate allocas before unroll we may choose to unroll less.
485      if (EnableOpt)
486        PM.add(createAMDGPUPromoteAllocaToVector());
487  });
488}
489
490//===----------------------------------------------------------------------===//
491// R600 Target Machine (R600 -> Cayman)
492//===----------------------------------------------------------------------===//
493
494R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
495                                     StringRef CPU, StringRef FS,
496                                     TargetOptions Options,
497                                     Optional<Reloc::Model> RM,
498                                     Optional<CodeModel::Model> CM,
499                                     CodeGenOpt::Level OL, bool JIT)
500    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
501  setRequiresStructuredCFG(true);
502
503  // Override the default since calls aren't supported for r600.
504  if (EnableFunctionCalls &&
505      EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
506    EnableFunctionCalls = false;
507}
508
509const R600Subtarget *R600TargetMachine::getSubtargetImpl(
510  const Function &F) const {
511  StringRef GPU = getGPUName(F);
512  StringRef FS = getFeatureString(F);
513
514  SmallString<128> SubtargetKey(GPU);
515  SubtargetKey.append(FS);
516
517  auto &I = SubtargetMap[SubtargetKey];
518  if (!I) {
519    // This needs to be done before we create a new subtarget since any
520    // creation will depend on the TM and the code generation flags on the
521    // function that reside in TargetOptions.
522    resetTargetOptions(F);
523    I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
524  }
525
526  return I.get();
527}
528
529TargetTransformInfo
530R600TargetMachine::getTargetTransformInfo(const Function &F) {
531  return TargetTransformInfo(R600TTIImpl(this, F));
532}
533
534//===----------------------------------------------------------------------===//
535// GCN Target Machine (SI+)
536//===----------------------------------------------------------------------===//
537
538GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
539                                   StringRef CPU, StringRef FS,
540                                   TargetOptions Options,
541                                   Optional<Reloc::Model> RM,
542                                   Optional<CodeModel::Model> CM,
543                                   CodeGenOpt::Level OL, bool JIT)
544    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
545
546const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
547  StringRef GPU = getGPUName(F);
548  StringRef FS = getFeatureString(F);
549
550  SmallString<128> SubtargetKey(GPU);
551  SubtargetKey.append(FS);
552
553  auto &I = SubtargetMap[SubtargetKey];
554  if (!I) {
555    // This needs to be done before we create a new subtarget since any
556    // creation will depend on the TM and the code generation flags on the
557    // function that reside in TargetOptions.
558    resetTargetOptions(F);
559    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
560  }
561
562  I->setScalarizeGlobalBehavior(ScalarizeGlobal);
563
564  return I.get();
565}
566
567TargetTransformInfo
568GCNTargetMachine::getTargetTransformInfo(const Function &F) {
569  return TargetTransformInfo(GCNTTIImpl(this, F));
570}
571
572//===----------------------------------------------------------------------===//
573// AMDGPU Pass Setup
574//===----------------------------------------------------------------------===//
575
576namespace {
577
578class AMDGPUPassConfig : public TargetPassConfig {
579public:
580  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
581    : TargetPassConfig(TM, PM) {
582    // Exceptions and StackMaps are not supported, so these passes will never do
583    // anything.
584    disablePass(&StackMapLivenessID);
585    disablePass(&FuncletLayoutID);
586  }
587
588  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
589    return getTM<AMDGPUTargetMachine>();
590  }
591
592  ScheduleDAGInstrs *
593  createMachineScheduler(MachineSchedContext *C) const override {
594    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
595    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
596    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
597    return DAG;
598  }
599
600  void addEarlyCSEOrGVNPass();
601  void addStraightLineScalarOptimizationPasses();
602  void addIRPasses() override;
603  void addCodeGenPrepare() override;
604  bool addPreISel() override;
605  bool addInstSelector() override;
606  bool addGCPasses() override;
607
608  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
609};
610
611std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
612  return getStandardCSEConfigForOpt(TM->getOptLevel());
613}
614
615class R600PassConfig final : public AMDGPUPassConfig {
616public:
617  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
618    : AMDGPUPassConfig(TM, PM) {}
619
620  ScheduleDAGInstrs *createMachineScheduler(
621    MachineSchedContext *C) const override {
622    return createR600MachineScheduler(C);
623  }
624
625  bool addPreISel() override;
626  bool addInstSelector() override;
627  void addPreRegAlloc() override;
628  void addPreSched2() override;
629  void addPreEmitPass() override;
630};
631
632class GCNPassConfig final : public AMDGPUPassConfig {
633public:
634  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
635    : AMDGPUPassConfig(TM, PM) {
636    // It is necessary to know the register usage of the entire call graph.  We
637    // allow calls without EnableAMDGPUFunctionCalls if they are marked
638    // noinline, so this is always required.
639    setRequiresCodeGenSCCOrder(true);
640  }
641
642  GCNTargetMachine &getGCNTargetMachine() const {
643    return getTM<GCNTargetMachine>();
644  }
645
646  ScheduleDAGInstrs *
647  createMachineScheduler(MachineSchedContext *C) const override;
648
649  bool addPreISel() override;
650  void addMachineSSAOptimization() override;
651  bool addILPOpts() override;
652  bool addInstSelector() override;
653  bool addIRTranslator() override;
654  void addPreLegalizeMachineIR() override;
655  bool addLegalizeMachineIR() override;
656  void addPreRegBankSelect() override;
657  bool addRegBankSelect() override;
658  bool addGlobalInstructionSelect() override;
659  void addFastRegAlloc() override;
660  void addOptimizedRegAlloc() override;
661  void addPreRegAlloc() override;
662  bool addPreRewrite() override;
663  void addPostRegAlloc() override;
664  void addPreSched2() override;
665  void addPreEmitPass() override;
666};
667
668} // end anonymous namespace
669
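// GVN is expensive, so only run it at the most aggressive optimization level;
// otherwise fall back to the cheaper EarlyCSE.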
670void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
671  if (getOptLevel() == CodeGenOpt::Aggressive)
672    addPass(createGVNPass());
673  else
674    addPass(createEarlyCSEPass());
675}
676
677void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
678  addPass(createLICMPass());
679  addPass(createSeparateConstOffsetFromGEPPass());
680  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
682  // the example in reassociate-geps-and-slsr.ll.
683  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
685  // EarlyCSE can reuse.
686  addEarlyCSEOrGVNPass();
687  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
688  addPass(createNaryReassociatePass());
689  // NaryReassociate on GEPs creates redundant common expressions, so run
690  // EarlyCSE after it.
691  addPass(createEarlyCSEPass());
692}
693
694void AMDGPUPassConfig::addIRPasses() {
695  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
696
697  // There is no reason to run these.
698  disablePass(&StackMapLivenessID);
699  disablePass(&FuncletLayoutID);
700  disablePass(&PatchableFunctionID);
701
702  addPass(createAMDGPUPrintfRuntimeBinding());
703
704  // This must occur before inlining, as the inliner will not look through
705  // bitcast calls.
706  addPass(createAMDGPUFixFunctionBitcastsPass());
707
  // Run the propagate-attributes pass in the backend in case opt was not run.
709  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
710
  addPass(createAtomicExpandPass());
713
714  addPass(createAMDGPULowerIntrinsicsPass());
715
716  // Function calls are not supported, so make sure we inline everything.
717  addPass(createAMDGPUAlwaysInlinePass());
718  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
724  addPass(createBarrierNoopPass());
725
726  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
727  if (TM.getTargetTriple().getArch() == Triple::r600)
728    addPass(createR600OpenCLImageTypeLoweringPass());
729
730  // Replace OpenCL enqueued block function pointers with global variables.
731  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
732
733  if (TM.getOptLevel() > CodeGenOpt::None) {
734    addPass(createInferAddressSpacesPass());
735    addPass(createAMDGPUPromoteAlloca());
736
737    if (EnableSROA)
738      addPass(createSROAPass());
739
740    if (EnableScalarIRPasses)
741      addStraightLineScalarOptimizationPasses();
742
743    if (EnableAMDGPUAliasAnalysis) {
744      addPass(createAMDGPUAAWrapperPass());
745      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
746                                             AAResults &AAR) {
747        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
748          AAR.addAAResult(WrapperPass->getResult());
749        }));
750    }
751  }
752
753  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
754    // TODO: May want to move later or split into an early and late one.
755    addPass(createAMDGPUCodeGenPreparePass());
756  }
757
758  TargetPassConfig::addIRPasses();
759
760  // EarlyCSE is not always strong enough to clean up what LSR produces. For
761  // example, GVN can combine
762  //
763  //   %0 = add %a, %b
764  //   %1 = add %b, %a
765  //
766  // and
767  //
768  //   %0 = shl nsw %a, 2
769  //   %1 = shl %a, 2
770  //
771  // but EarlyCSE can do neither of them.
772  if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
773    addEarlyCSEOrGVNPass();
774}
775
776void AMDGPUPassConfig::addCodeGenPrepare() {
777  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
778    addPass(createAMDGPUAnnotateKernelFeaturesPass());
779
780  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
781      EnableLowerKernelArguments)
782    addPass(createAMDGPULowerKernelArgumentsPass());
783
784  addPass(&AMDGPUPerfHintAnalysisID);
785
786  TargetPassConfig::addCodeGenPrepare();
787
788  if (EnableLoadStoreVectorizer)
789    addPass(createLoadStoreVectorizerPass());
790
  // The LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it here
  // means those blocks get cleaned up by UnreachableBlockElim, which is
  // inserted next in the pass flow.
795  addPass(createLowerSwitchPass());
796}
797
798bool AMDGPUPassConfig::addPreISel() {
799  addPass(createFlattenCFGPass());
800  return false;
801}
802
803bool AMDGPUPassConfig::addInstSelector() {
804  // Defer the verifier until FinalizeISel.
805  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
806  return false;
807}
808
809bool AMDGPUPassConfig::addGCPasses() {
810  // Do nothing. GC is not supported.
811  return false;
812}
813
814//===----------------------------------------------------------------------===//
815// R600 Pass Setup
816//===----------------------------------------------------------------------===//
817
818bool R600PassConfig::addPreISel() {
819  AMDGPUPassConfig::addPreISel();
820
821  if (EnableR600StructurizeCFG)
822    addPass(createStructurizeCFGPass());
823  return false;
824}
825
826bool R600PassConfig::addInstSelector() {
827  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
828  return false;
829}
830
831void R600PassConfig::addPreRegAlloc() {
832  addPass(createR600VectorRegMerger());
833}
834
835void R600PassConfig::addPreSched2() {
836  addPass(createR600EmitClauseMarkers(), false);
837  if (EnableR600IfConvert)
838    addPass(&IfConverterID, false);
839  addPass(createR600ClauseMergePass(), false);
840}
841
842void R600PassConfig::addPreEmitPass() {
843  addPass(createAMDGPUCFGStructurizerPass(), false);
844  addPass(createR600ExpandSpecialInstrsPass(), false);
845  addPass(&FinalizeMachineBundlesID, false);
846  addPass(createR600Packetizer(), false);
847  addPass(createR600ControlFlowFinalizer(), false);
848}
849
850TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
851  return new R600PassConfig(*this, PM);
852}
853
854//===----------------------------------------------------------------------===//
855// GCN Pass Setup
856//===----------------------------------------------------------------------===//
857
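// Use the SI machine scheduler if the subtarget enables it; otherwise default
// to the GCN max-occupancy scheduler.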
858ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
859  MachineSchedContext *C) const {
860  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
861  if (ST.enableSIScheduler())
862    return createSIMachineScheduler(C);
863  return createGCNMaxOccupancyMachineScheduler(C);
864}
865
866bool GCNPassConfig::addPreISel() {
867  AMDGPUPassConfig::addPreISel();
868
869  if (EnableAtomicOptimizations) {
870    addPass(createAMDGPUAtomicOptimizerPass());
871  }
872
873  // FIXME: We need to run a pass to propagate the attributes when calls are
874  // supported.
875
876  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
877  // regions formed by them.
878  addPass(&AMDGPUUnifyDivergentExitNodesID);
879  if (!LateCFGStructurize) {
880    if (EnableStructurizerWorkarounds) {
881      addPass(createFixIrreduciblePass());
882      addPass(createUnifyLoopExitsPass());
883    }
884    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
885  }
886  addPass(createSinkingPass());
887  addPass(createAMDGPUAnnotateUniformValues());
888  if (!LateCFGStructurize) {
889    addPass(createSIAnnotateControlFlowPass());
890  }
891  addPass(createLCSSAPass());
892
893  return false;
894}
895
896void GCNPassConfig::addMachineSSAOptimization() {
897  TargetPassConfig::addMachineSSAOptimization();
898
899  // We want to fold operands after PeepholeOptimizer has run (or as part of
900  // it), because it will eliminate extra copies making it easier to fold the
901  // real source operand. We want to eliminate dead instructions after, so that
902  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions left over after the operands are folded as well.
904  //
905  // XXX - Can we get away without running DeadMachineInstructionElim again?
906  addPass(&SIFoldOperandsID);
907  if (EnableDPPCombine)
908    addPass(&GCNDPPCombineID);
909  addPass(&DeadMachineInstructionElimID);
910  addPass(&SILoadStoreOptimizerID);
911  if (EnableSDWAPeephole) {
912    addPass(&SIPeepholeSDWAID);
913    addPass(&EarlyMachineLICMID);
914    addPass(&MachineCSEID);
915    addPass(&SIFoldOperandsID);
916    addPass(&DeadMachineInstructionElimID);
917  }
918  addPass(createSIShrinkInstructionsPass());
919}
920
921bool GCNPassConfig::addILPOpts() {
922  if (EnableEarlyIfConversion)
923    addPass(&EarlyIfConverterID);
924
925  TargetPassConfig::addILPOpts();
926  return false;
927}
928
929bool GCNPassConfig::addInstSelector() {
930  AMDGPUPassConfig::addInstSelector();
931  addPass(&SIFixSGPRCopiesID);
932  addPass(createSILowerI1CopiesPass());
  // TODO: We have to add FinalizeISel here to expand V_ADD/SUB_U64_PSEUDO
  // before SIFixupVectorISel, which expects them to already be expanded into
  // V_ADD/V_ADDC (and V_SUB/V_SUBB) pairs. This can be removed as soon as
  // SIFixupVectorISel is changed to work with V_ADD/SUB_U64_PSEUDO instead.
938  addPass(&FinalizeISelID);
939  addPass(createSIFixupVectorISelPass());
940  addPass(createSIAddIMGInitPass());
941  return false;
942}
943
944bool GCNPassConfig::addIRTranslator() {
945  addPass(new IRTranslator());
946  return false;
947}
948
949void GCNPassConfig::addPreLegalizeMachineIR() {
950  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
951  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
952  addPass(new Localizer());
953}
954
955bool GCNPassConfig::addLegalizeMachineIR() {
956  addPass(new Legalizer());
957  return false;
958}
959
960void GCNPassConfig::addPreRegBankSelect() {
961  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
962  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
963}
964
965bool GCNPassConfig::addRegBankSelect() {
966  addPass(new RegBankSelect());
967  return false;
968}
969
970bool GCNPassConfig::addGlobalInstructionSelect() {
971  addPass(new InstructionSelect());
972  return false;
973}
974
975void GCNPassConfig::addPreRegAlloc() {
976  if (LateCFGStructurize) {
977    addPass(createAMDGPUMachineCFGStructurizerPass());
978  }
979  addPass(createSIWholeQuadModePass());
980}
981
982void GCNPassConfig::addFastRegAlloc() {
983  // FIXME: We have to disable the verifier here because of PHIElimination +
984  // TwoAddressInstructions disabling it.
985
986  // This must be run immediately after phi elimination and before
987  // TwoAddressInstructions, otherwise the processing of the tied operand of
988  // SI_ELSE will introduce a copy of the tied operand source after the else.
989  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
990
991  // This must be run just after RegisterCoalescing.
992  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
993
994  TargetPassConfig::addFastRegAlloc();
995}
996
997void GCNPassConfig::addOptimizedRegAlloc() {
998  if (OptExecMaskPreRA)
999    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1000  insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1001
1002  // This must be run immediately after phi elimination and before
1003  // TwoAddressInstructions, otherwise the processing of the tied operand of
1004  // SI_ELSE will introduce a copy of the tied operand source after the else.
1005  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
1006
1007  // This must be run just after RegisterCoalescing.
1008  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
1009
1010  if (EnableDCEInRA)
1011    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1012
1013  TargetPassConfig::addOptimizedRegAlloc();
1014}
1015
1016bool GCNPassConfig::addPreRewrite() {
1017  if (EnableRegReassign) {
1018    addPass(&GCNNSAReassignID);
1019    addPass(&GCNRegBankReassignID);
1020  }
1021  return true;
1022}
1023
1024void GCNPassConfig::addPostRegAlloc() {
1025  addPass(&SIFixVGPRCopiesID);
1026  if (getOptLevel() > CodeGenOpt::None)
1027    addPass(&SIOptimizeExecMaskingID);
1028  TargetPassConfig::addPostRegAlloc();
1029
1030  // Equivalent of PEI for SGPRs.
1031  addPass(&SILowerSGPRSpillsID);
1032}
1033
1034void GCNPassConfig::addPreSched2() {
1035  addPass(&SIPostRABundlerID);
1036}
1037
1038void GCNPassConfig::addPreEmitPass() {
1039  addPass(createSIMemoryLegalizerPass());
1040  addPass(createSIInsertWaitcntsPass());
1041  addPass(createSIShrinkInstructionsPass());
1042  addPass(createSIModeRegisterPass());
1043
  // The hazard recognizer that runs as part of the post-RA scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because, if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
1049  //
1050  // Here we add a stand-alone hazard recognizer pass which can handle all
1051  // cases.
1052  //
  // FIXME: This stand-alone pass will emit individual S_NOP 0 instructions as
  // needed. It would be better for it to emit S_NOP <N> when possible.
1055  addPass(&PostRAHazardRecognizerID);
1056  if (getOptLevel() > CodeGenOpt::None)
1057    addPass(&SIInsertHardClausesID);
1058
1059  addPass(&SIRemoveShortExecBranchesID);
1060  addPass(&SIInsertSkipsPassID);
1061  addPass(&SIPreEmitPeepholeID);
1062  addPass(&BranchRelaxationPassID);
1063}
1064
1065TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1066  return new GCNPassConfig(*this, PM);
1067}
1068
1069yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1070  return new yaml::SIMachineFunctionInfo();
1071}
1072
1073yaml::MachineFunctionInfo *
1074GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1075  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1076  return new yaml::SIMachineFunctionInfo(*MFI,
1077                                         *MF.getSubtarget().getRegisterInfo());
1078}
1079
1080bool GCNTargetMachine::parseMachineFunctionInfo(
1081    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1082    SMDiagnostic &Error, SMRange &SourceRange) const {
1083  const yaml::SIMachineFunctionInfo &YamlMFI =
1084      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1085  MachineFunction &MF = PFS.MF;
1086  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1087
1088  MFI->initializeBaseYamlFields(YamlMFI);
1089
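  // Resolve a named register reference from the YAML into a physical register,
  // recording the source range for diagnostics on failure.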
1090  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1091    Register TempReg;
1092    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1093      SourceRange = RegName.SourceRange;
1094      return true;
1095    }
1096    RegVal = TempReg;
1097
1098    return false;
1099  };
1100
1101  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
1103    const MemoryBuffer &Buffer =
1104        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1105    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1106                         RegName.Value.size(), SourceMgr::DK_Error,
1107                         "incorrect register class for field", RegName.Value,
1108                         None, None);
1109    SourceRange = RegName.SourceRange;
1110    return true;
1111  };
1112
1113  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1114      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1115      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1116    return true;
1117
1118  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1119      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1120    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1121  }
1122
1123  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1124      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1125    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1126  }
1127
1128  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1129      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1130    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1131  }
1132
1133  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
1134                                   const TargetRegisterClass &RC,
1135                                   ArgDescriptor &Arg, unsigned UserSGPRs,
1136                                   unsigned SystemSGPRs) {
1137    // Skip parsing if it's not present.
1138    if (!A)
1139      return false;
1140
1141    if (A->IsRegister) {
1142      Register Reg;
1143      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1144        SourceRange = A->RegisterName.SourceRange;
1145        return true;
1146      }
1147      if (!RC.contains(Reg))
1148        return diagnoseRegisterClass(A->RegisterName);
1149      Arg = ArgDescriptor::createRegister(Reg);
1150    } else
1151      Arg = ArgDescriptor::createStack(A->StackOffset);
1152    // Check and apply the optional mask.
1153    if (A->Mask)
1154      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());
1155
1156    MFI->NumUserSGPRs += UserSGPRs;
1157    MFI->NumSystemSGPRs += SystemSGPRs;
1158    return false;
1159  };
1160
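  // Parse each optional ABI argument in turn, giving up on the first failure.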
1161  if (YamlMFI.ArgInfo &&
1162      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1163                             AMDGPU::SGPR_128RegClass,
1164                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1165       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1166                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1167                             2, 0) ||
1168       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1169                             MFI->ArgInfo.QueuePtr, 2, 0) ||
1170       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1171                             AMDGPU::SReg_64RegClass,
1172                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1173       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1174                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1175                             2, 0) ||
1176       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1177                             AMDGPU::SReg_64RegClass,
1178                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1179       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1180                             AMDGPU::SGPR_32RegClass,
1181                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1182       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1183                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1184                             0, 1) ||
1185       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1186                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1187                             0, 1) ||
1188       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1189                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1190                             0, 1) ||
1191       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1192                             AMDGPU::SGPR_32RegClass,
1193                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1194       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1195                             AMDGPU::SGPR_32RegClass,
1196                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1197       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1198                             AMDGPU::SReg_64RegClass,
1199                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1200       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1201                             AMDGPU::SReg_64RegClass,
1202                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1203       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1204                             AMDGPU::VGPR_32RegClass,
1205                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1206       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1207                             AMDGPU::VGPR_32RegClass,
1208                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1209       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1210                             AMDGPU::VGPR_32RegClass,
1211                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1212    return true;
1213
1214  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1215  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1216  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
1217  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
1218  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
1219  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;
1220
1221  return false;
1222}
1223