//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64-specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);
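
// For example, the early if-converter can be disabled from the command line
// with: llc -mtriple=aarch64 -aarch64-early-ifcvt=false <input.ll>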

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

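// When set, a call to a function carrying the "nonlazybind" attribute (and
// not known to be DSO-local) is emitted as a load of the callee's GOT slot
// followed by an indirect call (e.g. adrp/ldr then blr) instead of a direct
// bl that the linker may route through a lazy-binding stub.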
static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMax(
    "aarch64-sve-vector-bits-max",
    cl::desc("Assume SVE vector registers are at most this big, "
             "with zero meaning no maximum size is assumed."),
    cl::init(0), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMin(
    "aarch64-sve-vector-bits-min",
    cl::desc("Assume SVE vector registers are at least this big, "
             "with zero meaning no minimum size is assumed."),
    cl::init(0), cl::Hidden);
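
// For example, to let codegen assume a fixed 256-bit SVE implementation:
//   llc -mattr=+sve -aarch64-sve-vector-bits-min=256 \
//       -aarch64-sve-vector-bits-max=256 <input.ll>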

AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
                                                  StringRef CPUString) {
  // Determine default and user-specified characteristics.

  if (CPUString.empty())
    CPUString = "generic";

  ParseSubtargetFeatures(CPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU-specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
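  // Note that the Pref*LogAlignment fields are log2 values in bytes, e.g.
  // PrefFunctionLogAlignment = 4 requests 2^4 = 16-byte function alignment.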
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA55:
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 5;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian)
    : AArch64GenSubtargetInfo(TT, CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      TargetTriple(TT), FrameLowering(),
      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

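  // The subtarget takes ownership of RBI here; the instruction selector
  // created above holds a reference to the same object.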
  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
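///
/// For example, a dllimport'ed global on Windows is classified as
/// MO_GOT | MO_DLLIMPORT, so its address is loaded from the import address
/// table instead of being computed directly.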
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a PC-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures, but on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

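  // On iOS, assume TBI is available from iOS 8.0 onwards.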
  if (TargetTriple.isiOS()) {
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute the max call frame size after ISel. Do the computation
  // now if the .mir file didn't specify it. Note that this will probably give
  // you bogus values after PEI has eliminated the callframe setup/destroy
  // pseudo instructions, so specify it explicitly if you need it to be
  // correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

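// Both getters below round the requested size down to a multiple of the
// 128-bit SVE granule via (Bits / 128) * 128. For example, with
// -aarch64-sve-vector-bits-min=256 and -aarch64-sve-vector-bits-max=512,
// getMinSVEVectorSizeInBits() returns 256 and getMaxSVEVectorSizeInBits()
// returns 512.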
unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMax % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return 0;
  return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}

unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMin % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return (SVEVectorBitsMin / 128) * 128;
  return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}