AArch64Subtarget.cpp revision 363496
//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64-specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64CallLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64LegalizerInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

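// Pull in the TableGen-generated parts of AArch64GenSubtargetInfo: the
// generated subtarget constructor (GET_SUBTARGETINFO_CTOR) and
// ParseSubtargetFeatures (GET_SUBTARGETINFO_TARGET_DESC).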
#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI (top-byte ignore), use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
                                                  StringRef CPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

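  // ParseSubtargetFeatures is TableGen-generated; it sets the subtarget's
  // feature bits from the CPU name and the feature string.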
  ParseSubtargetFeatures(CPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
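  // Note that the Pref*LogAlignment fields are log2 values, so e.g.
  // PrefFunctionLogAlignment = 4 requests 16-byte function alignment.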
  switch (ARMProcFamily) {
  case Others:
    break;
  case CortexA35:
    break;
  case CortexA53:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA55:
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
    PrefFunctionLogAlignment = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
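    // Software prefetch tuning (consumed by the loop data prefetch pass):
    // PrefetchDistance is in instructions, MinPrefetchStride is in bytes, and
    // MaxPrefetchIterationsAhead caps how many iterations ahead to prefetch.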
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian)
    : AArch64GenSubtargetInfo(TT, CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      TargetTriple(TT), FrameLowering(),
      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
      TLInfo(TM, *this) {
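  // Some platforms (e.g. Darwin and Windows) treat X18 as a fixed platform
  // register; reserve it by default there so it is never allocated.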
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

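  // References that may resolve outside this linkage unit have to be
  // indirected through the GOT (with extra import/stub flags on Windows).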
  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // The MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
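  // (Loading the callee's address from the GOT binds it eagerly at load time,
  // avoiding the lazy-binding stub a direct call would otherwise go through.)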
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures; on the other hand
  // it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

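  // TBI is architectural in ARMv8; what matters is whether the OS ABI
  // guarantees that the top byte of pointers stays unused. Only iOS 8 and
  // later are assumed to provide that guarantee here.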
  if (TargetTriple.isiOS()) {
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
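  // balanceFPOps() is set by the subtarget's BalanceFPOps feature (e.g. on
  // Cortex-A57); A57ChainingConstraint steers the PBQP allocator toward a
  // balanced mix of odd and even FP registers in accumulator chains.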
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}
319