1//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of TargetFrameLowering class.
10//
11// On AArch64, stack frames are structured as follows:
12//
13// The stack grows downward.
14//
15// All of the individual frame areas on the frame below are optional, i.e. it's
16// possible to create a function so that the particular area isn't present
17// in the frame.
18//
19// At function entry, the "frame" looks as follows:
20//
21// |                                   | Higher address
22// |-----------------------------------|
23// |                                   |
24// | arguments passed on the stack     |
25// |                                   |
26// |-----------------------------------| <- sp
27// |                                   | Lower address
28//
29//
30// After the prologue has run, the frame has the following general structure.
31// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until the main
// function body runs, after the prologue. However, it's depicted here
34// for completeness.
35//
36// |                                   | Higher address
37// |-----------------------------------|
38// |                                   |
39// | arguments passed on the stack     |
40// |                                   |
41// |-----------------------------------|
42// |                                   |
43// | (Win64 only) varargs from reg     |
44// |                                   |
45// |-----------------------------------|
46// |                                   |
47// | callee-saved gpr registers        | <--.
48// |                                   |    | On Darwin platforms these
49// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
50// | prev_lr                           |    | (frame record first)
51// | prev_fp                           | <--'
52// | async context if needed           |
53// | (a.k.a. "frame record")           |
54// |-----------------------------------| <- fp(=x29)
55// |                                   |
56// | callee-saved fp/simd/SVE regs     |
57// |                                   |
58// |-----------------------------------|
59// |                                   |
60// |        SVE stack objects          |
61// |                                   |
62// |-----------------------------------|
63// |.empty.space.to.make.part.below....|
64// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
65// |.the.standard.16-byte.alignment....|  compile time; if present)
66// |-----------------------------------|
67// |                                   |
68// | local variables of fixed size     |
69// | including spill slots             |
70// |-----------------------------------| <- bp(not defined by ABI,
71// |.variable-sized.local.variables....|       LLVM chooses X19)
72// |.(VLAs)............................| (size of this area is unknown at
73// |...................................|  compile time)
74// |-----------------------------------| <- sp
75// |                                   | Lower address
76//
77//
// To access data in a frame, a constant offset from one of the pointers
// (fp, bp, sp) to that data must be computable at compile time. The size
80// of the areas with a dotted background cannot be computed at compile-time
81// if they are present, making it required to have all three of fp, bp and
82// sp to be set up to be able to access all contents in the frame areas,
83// assuming all of the frame areas are non-empty.
84//
85// For most functions, some of the frame areas are empty. For those functions,
86// it may not be necessary to set up fp or bp:
87// * A base pointer is definitely needed when there are both VLAs and local
88//   variables with more-than-default alignment requirements.
89// * A frame pointer is definitely needed when there are local variables with
90//   more-than-default alignment requirements.
91//
92// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
93// callee-saved area, since the unwind encoding does not allow for encoding
94// this dynamically and existing tools depend on this layout. For other
95// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
96// area to allow SVE stack objects (allocated directly below the callee-saves,
97// if available) to be accessed directly from the framepointer.
98// The SVE spill/fill instructions have VL-scaled addressing modes such
99// as:
100//    ldr z8, [fp, #-7 mul vl]
101// For SVE the size of the vector length (VL) is not known at compile-time, so
102// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
103// layout, we don't need to add an unscaled offset to the framepointer before
104// accessing the SVE object in the frame.
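// (If the frame record were instead placed at the top of the callee-saves, an
// extra unscaled "sub" from fp would typically be needed before the VL-scaled
// access could be used.)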
105//
106// In some cases when a base pointer is not strictly needed, it is generated
107// anyway when offsets from the frame pointer to access local variables become
108// so large that the offset can't be encoded in the immediate fields of loads
109// or stores.
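// (For reference: unscaled ldur/stur immediates only cover [-256, 255], and
// scaled ldr/str immediates are unsigned multiples of the access size up to
// 4095, so the directly reachable range is fairly limited.)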
110//
111// Outgoing function arguments must be at the bottom of the stack frame when
112// calling another function. If we do not have variable-sized stack objects, we
113// can allocate a "reserved call frame" area at the bottom of the local
114// variable area, large enough for all outgoing calls. If we do have VLAs, then
115// the stack pointer must be decremented and incremented around each call to
116// make space for the arguments below the VLAs.
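//
// For example (illustrative only), with a reserved call frame the prologue can
// fold the outgoing-argument space into a single adjustment:
//     sub  sp, sp, #48        // locals + largest outgoing call frame
// whereas with VLAs present each call site is bracketed individually:
//     sub  sp, sp, #16        // make room for this call's arguments
//     bl   _callee
//     add  sp, sp, #16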
117//
118// FIXME: also explain the redzone concept.
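// In short: the red zone is a small area (getRedZoneSize() bytes, normally
// 128) directly below sp that a leaf function may use without adjusting sp.
// It is only used when the "aarch64-redzone" option is enabled; see
// canUseRedZone() below.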
119//
120// An example of the prologue:
121//
122//     .globl __foo
123//     .align 2
124//  __foo:
125// Ltmp0:
126//     .cfi_startproc
127//     .cfi_personality 155, ___gxx_personality_v0
128// Leh_func_begin:
129//     .cfi_lsda 16, Lexception33
130//
//     stp  xa, xb, [sp, #-offset]!
132//     ...
133//     stp  x28, x27, [sp, #offset-32]
134//     stp  fp, lr, [sp, #offset-16]
135//     add  fp, sp, #offset - 16
136//     sub  sp, sp, #1360
137//
138// The Stack:
139//       +-------------------------------------------+
140// 10000 | ........ | ........ | ........ | ........ |
141// 10004 | ........ | ........ | ........ | ........ |
142//       +-------------------------------------------+
143// 10008 | ........ | ........ | ........ | ........ |
144// 1000c | ........ | ........ | ........ | ........ |
145//       +===========================================+
146// 10010 |                X28 Register               |
147// 10014 |                X28 Register               |
148//       +-------------------------------------------+
149// 10018 |                X27 Register               |
150// 1001c |                X27 Register               |
151//       +===========================================+
152// 10020 |                Frame Pointer              |
153// 10024 |                Frame Pointer              |
154//       +-------------------------------------------+
155// 10028 |                Link Register              |
156// 1002c |                Link Register              |
157//       +===========================================+
158// 10030 | ........ | ........ | ........ | ........ |
159// 10034 | ........ | ........ | ........ | ........ |
160//       +-------------------------------------------+
161// 10038 | ........ | ........ | ........ | ........ |
162// 1003c | ........ | ........ | ........ | ........ |
163//       +-------------------------------------------+
164//
165//     [sp] = 10030        ::    >>initial value<<
166//     sp = 10020          ::  stp fp, lr, [sp, #-16]!
167//     fp = sp == 10020    ::  mov fp, sp
168//     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
169//     sp == 10010         ::    >>final value<<
170//
171// The frame pointer (w29) points to address 10020. If we use an offset of
172// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
173// for w27, and -32 for w28:
174//
175//  Ltmp1:
176//     .cfi_def_cfa w29, 16
177//  Ltmp2:
178//     .cfi_offset w30, -8
179//  Ltmp3:
180//     .cfi_offset w29, -16
181//  Ltmp4:
182//     .cfi_offset w27, -24
183//  Ltmp5:
184//     .cfi_offset w28, -32
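//
// (As a cross-check: the CFA is w29 + 16 == 10030, so w30 saved at 10028 is
// CFA-8, w29 at 10020 is CFA-16, w27 at 10018 is CFA-24, and w28 at 10010 is
// CFA-32, matching the directives above.)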
185//
186//===----------------------------------------------------------------------===//
187
188#include "AArch64FrameLowering.h"
189#include "AArch64InstrInfo.h"
190#include "AArch64MachineFunctionInfo.h"
191#include "AArch64RegisterInfo.h"
192#include "AArch64Subtarget.h"
193#include "AArch64TargetMachine.h"
194#include "MCTargetDesc/AArch64AddressingModes.h"
195#include "MCTargetDesc/AArch64MCTargetDesc.h"
196#include "llvm/ADT/ScopeExit.h"
197#include "llvm/ADT/SmallVector.h"
198#include "llvm/ADT/Statistic.h"
199#include "llvm/CodeGen/LivePhysRegs.h"
200#include "llvm/CodeGen/MachineBasicBlock.h"
201#include "llvm/CodeGen/MachineFrameInfo.h"
202#include "llvm/CodeGen/MachineFunction.h"
203#include "llvm/CodeGen/MachineInstr.h"
204#include "llvm/CodeGen/MachineInstrBuilder.h"
205#include "llvm/CodeGen/MachineMemOperand.h"
206#include "llvm/CodeGen/MachineModuleInfo.h"
207#include "llvm/CodeGen/MachineOperand.h"
208#include "llvm/CodeGen/MachineRegisterInfo.h"
209#include "llvm/CodeGen/RegisterScavenging.h"
210#include "llvm/CodeGen/TargetInstrInfo.h"
211#include "llvm/CodeGen/TargetRegisterInfo.h"
212#include "llvm/CodeGen/TargetSubtargetInfo.h"
213#include "llvm/CodeGen/WinEHFuncInfo.h"
214#include "llvm/IR/Attributes.h"
215#include "llvm/IR/CallingConv.h"
216#include "llvm/IR/DataLayout.h"
217#include "llvm/IR/DebugLoc.h"
218#include "llvm/IR/Function.h"
219#include "llvm/MC/MCAsmInfo.h"
220#include "llvm/MC/MCDwarf.h"
221#include "llvm/Support/CommandLine.h"
222#include "llvm/Support/Debug.h"
223#include "llvm/Support/ErrorHandling.h"
224#include "llvm/Support/MathExtras.h"
225#include "llvm/Support/raw_ostream.h"
226#include "llvm/Target/TargetMachine.h"
227#include "llvm/Target/TargetOptions.h"
228#include <cassert>
229#include <cstdint>
230#include <iterator>
231#include <optional>
232#include <vector>
233
234using namespace llvm;
235
236#define DEBUG_TYPE "frame-info"
237
238static cl::opt<bool> EnableRedZone("aarch64-redzone",
239                                   cl::desc("enable use of redzone on AArch64"),
240                                   cl::init(false), cl::Hidden);
241
242static cl::opt<bool>
243    ReverseCSRRestoreSeq("reverse-csr-restore-seq",
244                         cl::desc("reverse the CSR restore sequence"),
245                         cl::init(false), cl::Hidden);
246
247static cl::opt<bool> StackTaggingMergeSetTag(
248    "stack-tagging-merge-settag",
249    cl::desc("merge settag instruction in function epilog"), cl::init(true),
250    cl::Hidden);
251
252static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
253                                       cl::desc("sort stack allocations"),
254                                       cl::init(true), cl::Hidden);
255
256cl::opt<bool> EnableHomogeneousPrologEpilog(
257    "homogeneous-prolog-epilog", cl::Hidden,
258    cl::desc("Emit homogeneous prologue and epilogue for the size "
259             "optimization (default = off)"));
260
261STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
262
263/// Returns how much of the incoming argument stack area (in bytes) we should
264/// clean up in an epilogue. For the C calling convention this will be 0, for
265/// guaranteed tail call conventions it can be positive (a normal return or a
266/// tail call to a function that uses less stack space for arguments) or
267/// negative (for a tail call to a function that needs more stack space than us
268/// for arguments).
269static int64_t getArgumentStackToRestore(MachineFunction &MF,
270                                         MachineBasicBlock &MBB) {
271  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
272  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
273  bool IsTailCallReturn = (MBB.end() != MBBI)
274                              ? AArch64InstrInfo::isTailCallReturnInst(*MBBI)
275                              : false;
276
277  int64_t ArgumentPopSize = 0;
278  if (IsTailCallReturn) {
279    MachineOperand &StackAdjust = MBBI->getOperand(1);
280
281    // For a tail-call in a callee-pops-arguments environment, some or all of
    // the stack may actually be in use for the call's arguments; this is
    // calculated during LowerCall and consumed here...
284    ArgumentPopSize = StackAdjust.getImm();
285  } else {
286    // ... otherwise the amount to pop is *all* of the argument space,
287    // conveniently stored in the MachineFunctionInfo by
288    // LowerFormalArguments. This will, of course, be zero for the C calling
289    // convention.
290    ArgumentPopSize = AFI->getArgumentStackToRestore();
291  }
292
293  return ArgumentPopSize;
294}
295
296static bool produceCompactUnwindFrame(MachineFunction &MF);
297static bool needsWinCFI(const MachineFunction &MF);
298static StackOffset getSVEStackSize(const MachineFunction &MF);
299static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB);
300
/// Returns true if homogeneous prolog or epilog code can be emitted
/// for the size optimization. If possible, a frame helper call is injected.
/// When an Exit block is given, this check is for the epilog.
304bool AArch64FrameLowering::homogeneousPrologEpilog(
305    MachineFunction &MF, MachineBasicBlock *Exit) const {
306  if (!MF.getFunction().hasMinSize())
307    return false;
308  if (!EnableHomogeneousPrologEpilog)
309    return false;
310  if (ReverseCSRRestoreSeq)
311    return false;
312  if (EnableRedZone)
313    return false;
314
  // TODO: Windows is not supported yet.
316  if (needsWinCFI(MF))
317    return false;
318  // TODO: SVE is not supported yet.
319  if (getSVEStackSize(MF))
320    return false;
321
322  // Bail on stack adjustment needed on return for simplicity.
323  const MachineFrameInfo &MFI = MF.getFrameInfo();
324  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
325  if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
326    return false;
327  if (Exit && getArgumentStackToRestore(MF, *Exit))
328    return false;
329
330  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
331  if (AFI->hasSwiftAsyncContext())
332    return false;
333
334  // If there are an odd number of GPRs before LR and FP in the CSRs list,
335  // they will not be paired into one RegPairInfo, which is incompatible with
336  // the assumption made by the homogeneous prolog epilog pass.
337  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
338  unsigned NumGPRs = 0;
339  for (unsigned I = 0; CSRegs[I]; ++I) {
340    Register Reg = CSRegs[I];
341    if (Reg == AArch64::LR) {
342      assert(CSRegs[I + 1] == AArch64::FP);
343      if (NumGPRs % 2 != 0)
344        return false;
345      break;
346    }
347    if (AArch64::GPR64RegClass.contains(Reg))
348      ++NumGPRs;
349  }
350
351  return true;
352}
353
354/// Returns true if CSRs should be paired.
355bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
356  return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
357}
358
359/// This is the biggest offset to the stack pointer we can encode in aarch64
360/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode any
362/// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
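/// (255 is the upper end of the 9-bit signed immediate range [-256, 255]
/// accepted by the unscaled LDUR/STUR addressing forms.)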
363static const unsigned DefaultSafeSPDisplacement = 255;
364
365/// Look at each instruction that references stack frames and return the stack
366/// size limit beyond which some of these instructions will require a scratch
367/// register during their expansion later.
368static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
369  // FIXME: For now, just conservatively guestimate based on unscaled indexing
370  // range. We'll end up allocating an unnecessary spill slot a lot, but
371  // realistically that's not a big deal at this stage of the game.
372  for (MachineBasicBlock &MBB : MF) {
373    for (MachineInstr &MI : MBB) {
374      if (MI.isDebugInstr() || MI.isPseudo() ||
375          MI.getOpcode() == AArch64::ADDXri ||
376          MI.getOpcode() == AArch64::ADDSXri)
377        continue;
378
379      for (const MachineOperand &MO : MI.operands()) {
380        if (!MO.isFI())
381          continue;
382
383        StackOffset Offset;
384        if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
385            AArch64FrameOffsetCannotUpdate)
386          return 0;
387      }
388    }
389  }
390  return DefaultSafeSPDisplacement;
391}
392
393TargetStackID::Value
394AArch64FrameLowering::getStackIDForScalableVectors() const {
395  return TargetStackID::ScalableVector;
396}
397
/// Returns the size of the fixed object area (allocated next to sp on entry).
399/// On Win64 this may include a var args area and an UnwindHelp object for EH.
400static unsigned getFixedObjectSize(const MachineFunction &MF,
401                                   const AArch64FunctionInfo *AFI, bool IsWin64,
402                                   bool IsFunclet) {
403  if (!IsWin64 || IsFunclet) {
404    return AFI->getTailCallReservedStack();
405  } else {
406    if (AFI->getTailCallReservedStack() != 0)
407      report_fatal_error("cannot generate ABI-changing tail call for Win64");
408    // Var args are stored here in the primary function.
409    const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
410    // To support EH funclets we allocate an UnwindHelp object
411    const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
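    // E.g. (illustrative): 32 bytes of spilled GPR varargs plus the 8-byte
    // UnwindHelp object rounds up to alignTo(40, 16) == 48 bytes.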
412    return alignTo(VarArgsArea + UnwindHelpObject, 16);
413  }
414}
415
416/// Returns the size of the entire SVE stackframe (calleesaves + spills).
417static StackOffset getSVEStackSize(const MachineFunction &MF) {
418  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
419  return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
420}
421
422bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
423  if (!EnableRedZone)
424    return false;
425
426  // Don't use the red zone if the function explicitly asks us not to.
427  // This is typically used for kernel code.
428  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
429  const unsigned RedZoneSize =
430      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
431  if (!RedZoneSize)
432    return false;
433
434  const MachineFrameInfo &MFI = MF.getFrameInfo();
435  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
436  uint64_t NumBytes = AFI->getLocalStackSize();
437
438  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
439           getSVEStackSize(MF));
440}
441
442/// hasFP - Return true if the specified function should have a dedicated frame
443/// pointer register.
444bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
445  const MachineFrameInfo &MFI = MF.getFrameInfo();
446  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
447
448  // Win64 EH requires a frame pointer if funclets are present, as the locals
449  // are accessed off the frame pointer in both the parent function and the
450  // funclets.
451  if (MF.hasEHFunclets())
452    return true;
453  // Retain behavior of always omitting the FP for leaf functions when possible.
454  if (MF.getTarget().Options.DisableFramePointerElim(MF))
455    return true;
456  if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
457      MFI.hasStackMap() || MFI.hasPatchPoint() ||
458      RegInfo->hasStackRealignment(MF))
459    return true;
  // With large call frames around we may need to use FP to access the
  // scavenging emergency spill slot.
462  //
463  // Unfortunately some calls to hasFP() like machine verifier ->
464  // getReservedReg() -> hasFP in the middle of global isel are too early
465  // to know the max call frame size. Hopefully conservatively returning "true"
466  // in those cases is fine.
467  // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
468  if (!MFI.isMaxCallFrameSizeComputed() ||
469      MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
470    return true;
471
472  return false;
473}
474
475/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
476/// not required, we reserve argument space for call sites in the function
477/// immediately on entry to the current function.  This eliminates the need for
478/// add/sub sp brackets around call sites.  Returns true if the call frame is
479/// included as part of the stack frame.
480bool
481AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
482  // The stack probing code for the dynamically allocated outgoing arguments
483  // area assumes that the stack is probed at the top - either by the prologue
  // code, which issues a probe if `hasVarSizedObjects` returns true, or by the
485  // most recent variable-sized object allocation. Changing the condition here
486  // may need to be followed up by changes to the probe issuing logic.
487  return !MF.getFrameInfo().hasVarSizedObjects();
488}
489
490MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
491    MachineFunction &MF, MachineBasicBlock &MBB,
492    MachineBasicBlock::iterator I) const {
493  const AArch64InstrInfo *TII =
494      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
495  const AArch64TargetLowering *TLI =
496      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
497  [[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo();
498  DebugLoc DL = I->getDebugLoc();
499  unsigned Opc = I->getOpcode();
500  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
501  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
502
503  if (!hasReservedCallFrame(MF)) {
504    int64_t Amount = I->getOperand(0).getImm();
505    Amount = alignTo(Amount, getStackAlign());
506    if (!IsDestroy)
507      Amount = -Amount;
508
509    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
510    // doesn't have to pop anything), then the first operand will be zero too so
511    // this adjustment is a no-op.
512    if (CalleePopAmount == 0) {
513      // FIXME: in-function stack adjustment for calls is limited to 24-bits
514      // because there's no guaranteed temporary register available.
515      //
516      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
      // 1) For an offset that fits in 12 bits, we use a single instruction with
      //    LSL #0.
      // 2) For an offset that needs between 12 and 24 bits, we use two
      //    instructions: one with LSL #0 and the other with LSL #12.
520      //
521      // Most call frames will be allocated at the start of a function so
522      // this is OK, but it is a limitation that needs dealing with.
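      // E.g. (illustrative) adjusting by 0x12340 bytes would be expanded to:
      //     sub sp, sp, #0x12, lsl #12
      //     sub sp, sp, #0x340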
523      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
524
525      if (TLI->hasInlineStackProbe(MF) &&
526          -Amount >= AArch64::StackProbeMaxUnprobedStack) {
527        // When stack probing is enabled, the decrement of SP may need to be
528        // probed. We only need to do this if the call site needs 1024 bytes of
529        // space or more, because a region smaller than that is allowed to be
530        // unprobed at an ABI boundary. We rely on the fact that SP has been
531        // probed exactly at this point, either by the prologue or most recent
532        // dynamic allocation.
533        assert(MFI.hasVarSizedObjects() &&
534               "non-reserved call frame without var sized objects?");
535        Register ScratchReg =
536            MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
537        inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
538      } else {
539        emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
540                        StackOffset::getFixed(Amount), TII);
541      }
542    }
543  } else if (CalleePopAmount != 0) {
544    // If the calling convention demands that the callee pops arguments from the
545    // stack, we want to add it back if we have a reserved call frame.
546    assert(CalleePopAmount < 0xffffff && "call frame too large");
547    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
548                    StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
549  }
550  return MBB.erase(I);
551}
552
553void AArch64FrameLowering::emitCalleeSavedGPRLocations(
554    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
555  MachineFunction &MF = *MBB.getParent();
556  MachineFrameInfo &MFI = MF.getFrameInfo();
557
558  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
559  if (CSI.empty())
560    return;
561
562  const TargetSubtargetInfo &STI = MF.getSubtarget();
563  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
564  const TargetInstrInfo &TII = *STI.getInstrInfo();
565  DebugLoc DL = MBB.findDebugLoc(MBBI);
566
567  for (const auto &Info : CSI) {
568    if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
569      continue;
570
571    assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
572    unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
573
574    int64_t Offset =
575        MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
576    unsigned CFIIndex = MF.addFrameInst(
577        MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
578    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
579        .addCFIIndex(CFIIndex)
580        .setMIFlags(MachineInstr::FrameSetup);
581  }
582}
583
584void AArch64FrameLowering::emitCalleeSavedSVELocations(
585    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
586  MachineFunction &MF = *MBB.getParent();
587  MachineFrameInfo &MFI = MF.getFrameInfo();
588
589  // Add callee saved registers to move list.
590  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
591  if (CSI.empty())
592    return;
593
594  const TargetSubtargetInfo &STI = MF.getSubtarget();
595  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
596  const TargetInstrInfo &TII = *STI.getInstrInfo();
597  DebugLoc DL = MBB.findDebugLoc(MBBI);
598  AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
599
600  for (const auto &Info : CSI) {
601    if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
602      continue;
603
604    // Not all unwinders may know about SVE registers, so assume the lowest
    // common denominator.
606    assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
607    unsigned Reg = Info.getReg();
608    if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
609      continue;
610
611    StackOffset Offset =
612        StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
613        StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
614
615    unsigned CFIIndex = MF.addFrameInst(createCFAOffset(TRI, Reg, Offset));
616    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
617        .addCFIIndex(CFIIndex)
618        .setMIFlags(MachineInstr::FrameSetup);
619  }
620}
621
622static void insertCFISameValue(const MCInstrDesc &Desc, MachineFunction &MF,
623                               MachineBasicBlock &MBB,
624                               MachineBasicBlock::iterator InsertPt,
625                               unsigned DwarfReg) {
626  unsigned CFIIndex =
627      MF.addFrameInst(MCCFIInstruction::createSameValue(nullptr, DwarfReg));
628  BuildMI(MBB, InsertPt, DebugLoc(), Desc).addCFIIndex(CFIIndex);
629}
630
631void AArch64FrameLowering::resetCFIToInitialState(
632    MachineBasicBlock &MBB) const {
633
634  MachineFunction &MF = *MBB.getParent();
635  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
636  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
637  const auto &TRI =
638      static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo());
639  const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
640
641  const MCInstrDesc &CFIDesc = TII.get(TargetOpcode::CFI_INSTRUCTION);
642  DebugLoc DL;
643
644  // Reset the CFA to `SP + 0`.
645  MachineBasicBlock::iterator InsertPt = MBB.begin();
646  unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
647      nullptr, TRI.getDwarfRegNum(AArch64::SP, true), 0));
648  BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
649
650  // Flip the RA sign state.
651  if (MFI.shouldSignReturnAddress(MF)) {
652    CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
653    BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
654  }
655
656  // Shadow call stack uses X18, reset it.
657  if (MFI.needsShadowCallStackPrologueEpilogue(MF))
658    insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
659                       TRI.getDwarfRegNum(AArch64::X18, true));
660
661  // Emit .cfi_same_value for callee-saved registers.
662  const std::vector<CalleeSavedInfo> &CSI =
663      MF.getFrameInfo().getCalleeSavedInfo();
664  for (const auto &Info : CSI) {
665    unsigned Reg = Info.getReg();
666    if (!TRI.regNeedsCFI(Reg, Reg))
667      continue;
668    insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
669                       TRI.getDwarfRegNum(Reg, true));
670  }
671}
672
673static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
674                                    MachineBasicBlock::iterator MBBI,
675                                    bool SVE) {
676  MachineFunction &MF = *MBB.getParent();
677  MachineFrameInfo &MFI = MF.getFrameInfo();
678
679  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
680  if (CSI.empty())
681    return;
682
683  const TargetSubtargetInfo &STI = MF.getSubtarget();
684  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
685  const TargetInstrInfo &TII = *STI.getInstrInfo();
686  DebugLoc DL = MBB.findDebugLoc(MBBI);
687
688  for (const auto &Info : CSI) {
689    if (SVE !=
690        (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
691      continue;
692
693    unsigned Reg = Info.getReg();
694    if (SVE &&
695        !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
696      continue;
697
698    unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
699        nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
700    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
701        .addCFIIndex(CFIIndex)
702        .setMIFlags(MachineInstr::FrameDestroy);
703  }
704}
705
706void AArch64FrameLowering::emitCalleeSavedGPRRestores(
707    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
708  emitCalleeSavedRestores(MBB, MBBI, false);
709}
710
711void AArch64FrameLowering::emitCalleeSavedSVERestores(
712    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
713  emitCalleeSavedRestores(MBB, MBBI, true);
714}
715
716// Return the maximum possible number of bytes for `Size` due to the
// architectural limit on the size of an SVE register.
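// (The architectural maximum is 2048 bits, i.e. 16 times the 128-bit granule
// in which scalable offsets are expressed, hence MAX_BYTES_PER_SCALABLE_BYTE
// below.)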
718static int64_t upperBound(StackOffset Size) {
719  static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
720  return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed();
721}
722
723void AArch64FrameLowering::allocateStackSpace(
724    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
725    int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI,
726    bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset,
727    bool FollowupAllocs) const {
728
729  if (!AllocSize)
730    return;
731
732  DebugLoc DL;
733  MachineFunction &MF = *MBB.getParent();
734  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
735  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
736  AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
737  const MachineFrameInfo &MFI = MF.getFrameInfo();
738
739  const int64_t MaxAlign = MFI.getMaxAlign().value();
740  const uint64_t AndMask = ~(MaxAlign - 1);
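  // E.g. a maximum alignment of 32 bytes gives AndMask == ~0x1f, which clears
  // the low 5 bits of the pointer being realigned.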
741
742  if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) {
743    Register TargetReg = RealignmentPadding
744                             ? findScratchNonCalleeSaveRegister(&MBB)
745                             : AArch64::SP;
746    // SUB Xd/SP, SP, AllocSize
747    emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
748                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
749                    EmitCFI, InitialOffset);
750
751    if (RealignmentPadding) {
752      // AND SP, X9, 0b11111...0000
753      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
754          .addReg(TargetReg, RegState::Kill)
755          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
756          .setMIFlags(MachineInstr::FrameSetup);
757      AFI.setStackRealigned(true);
758
759      // No need for SEH instructions here; if we're realigning the stack,
760      // we've set a frame pointer and already finished the SEH prologue.
761      assert(!NeedsWinCFI);
762    }
763    return;
764  }
765
766  //
767  // Stack probing allocation.
768  //
769
770  // Fixed length allocation. If we don't need to re-align the stack and don't
771  // have SVE objects, we can use a more efficient sequence for stack probing.
772  if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) {
773    Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
774    assert(ScratchReg != AArch64::NoRegister);
775    BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC))
776        .addDef(ScratchReg)
777        .addImm(AllocSize.getFixed())
778        .addImm(InitialOffset.getFixed())
779        .addImm(InitialOffset.getScalable());
780    // The fixed allocation may leave unprobed bytes at the top of the
    // stack. If we have a subsequent allocation (e.g. if we have variable-sized
782    // objects), we need to issue an extra probe, so these allocations start in
783    // a known state.
784    if (FollowupAllocs) {
785      // STR XZR, [SP]
786      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
787          .addReg(AArch64::XZR)
788          .addReg(AArch64::SP)
789          .addImm(0)
790          .setMIFlags(MachineInstr::FrameSetup);
791    }
792
793    return;
794  }
795
796  // Variable length allocation.
797
798  // If the (unknown) allocation size cannot exceed the probe size, decrement
799  // the stack pointer right away.
800  int64_t ProbeSize = AFI.getStackProbeSize();
801  if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) {
802    Register ScratchReg = RealignmentPadding
803                              ? findScratchNonCalleeSaveRegister(&MBB)
804                              : AArch64::SP;
805    assert(ScratchReg != AArch64::NoRegister);
806    // SUB Xd, SP, AllocSize
807    emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII,
808                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
809                    EmitCFI, InitialOffset);
810    if (RealignmentPadding) {
811      // AND SP, Xn, 0b11111...0000
812      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
813          .addReg(ScratchReg, RegState::Kill)
814          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
815          .setMIFlags(MachineInstr::FrameSetup);
816      AFI.setStackRealigned(true);
817    }
818    if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding >
819                              AArch64::StackProbeMaxUnprobedStack) {
820      // STR XZR, [SP]
821      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
822          .addReg(AArch64::XZR)
823          .addReg(AArch64::SP)
824          .addImm(0)
825          .setMIFlags(MachineInstr::FrameSetup);
826    }
827    return;
828  }
829
830  // Emit a variable-length allocation probing loop.
831  // TODO: As an optimisation, the loop can be "unrolled" into a few parts,
832  // each of them guaranteed to adjust the stack by less than the probe size.
833  Register TargetReg = findScratchNonCalleeSaveRegister(&MBB);
834  assert(TargetReg != AArch64::NoRegister);
835  // SUB Xd, SP, AllocSize
836  emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
837                  MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
838                  EmitCFI, InitialOffset);
839  if (RealignmentPadding) {
840    // AND Xn, Xn, 0b11111...0000
841    BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg)
842        .addReg(TargetReg, RegState::Kill)
843        .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
844        .setMIFlags(MachineInstr::FrameSetup);
845  }
846
847  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR))
848      .addReg(TargetReg);
849  if (EmitCFI) {
850    // Set the CFA register back to SP.
851    unsigned Reg =
852        Subtarget.getRegisterInfo()->getDwarfRegNum(AArch64::SP, true);
853    unsigned CFIIndex =
854        MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
855    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
856        .addCFIIndex(CFIIndex)
857        .setMIFlags(MachineInstr::FrameSetup);
858  }
859  if (RealignmentPadding)
860    AFI.setStackRealigned(true);
861}
862
863static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
864  switch (Reg.id()) {
865  default:
866    // The called routine is expected to preserve r19-r28
867    // r29 and r30 are used as frame pointer and link register resp.
868    return 0;
869
870    // GPRs
871#define CASE(n)                                                                \
872  case AArch64::W##n:                                                          \
873  case AArch64::X##n:                                                          \
874    return AArch64::X##n
875  CASE(0);
876  CASE(1);
877  CASE(2);
878  CASE(3);
879  CASE(4);
880  CASE(5);
881  CASE(6);
882  CASE(7);
883  CASE(8);
884  CASE(9);
885  CASE(10);
886  CASE(11);
887  CASE(12);
888  CASE(13);
889  CASE(14);
890  CASE(15);
891  CASE(16);
892  CASE(17);
893  CASE(18);
894#undef CASE
895
896    // FPRs
897#define CASE(n)                                                                \
898  case AArch64::B##n:                                                          \
899  case AArch64::H##n:                                                          \
900  case AArch64::S##n:                                                          \
901  case AArch64::D##n:                                                          \
902  case AArch64::Q##n:                                                          \
903    return HasSVE ? AArch64::Z##n : AArch64::Q##n
904  CASE(0);
905  CASE(1);
906  CASE(2);
907  CASE(3);
908  CASE(4);
909  CASE(5);
910  CASE(6);
911  CASE(7);
912  CASE(8);
913  CASE(9);
914  CASE(10);
915  CASE(11);
916  CASE(12);
917  CASE(13);
918  CASE(14);
919  CASE(15);
920  CASE(16);
921  CASE(17);
922  CASE(18);
923  CASE(19);
924  CASE(20);
925  CASE(21);
926  CASE(22);
927  CASE(23);
928  CASE(24);
929  CASE(25);
930  CASE(26);
931  CASE(27);
932  CASE(28);
933  CASE(29);
934  CASE(30);
935  CASE(31);
936#undef CASE
937  }
938}
939
940void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
941                                                MachineBasicBlock &MBB) const {
942  // Insertion point.
943  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
944
945  // Fake a debug loc.
946  DebugLoc DL;
947  if (MBBI != MBB.end())
948    DL = MBBI->getDebugLoc();
949
950  const MachineFunction &MF = *MBB.getParent();
951  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
952  const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
953
954  BitVector GPRsToZero(TRI.getNumRegs());
955  BitVector FPRsToZero(TRI.getNumRegs());
956  bool HasSVE = STI.hasSVE();
957  for (MCRegister Reg : RegsToZero.set_bits()) {
958    if (TRI.isGeneralPurposeRegister(MF, Reg)) {
959      // For GPRs, we only care to clear out the 64-bit register.
960      if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
961        GPRsToZero.set(XReg);
962    } else if (AArch64::FPR128RegClass.contains(Reg) ||
963               AArch64::FPR64RegClass.contains(Reg) ||
964               AArch64::FPR32RegClass.contains(Reg) ||
965               AArch64::FPR16RegClass.contains(Reg) ||
966               AArch64::FPR8RegClass.contains(Reg)) {
      // For FPRs, clear the whole Q register (or the Z register when SVE is
      // available).
968      if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
969        FPRsToZero.set(XReg);
970    }
971  }
972
973  const AArch64InstrInfo &TII = *STI.getInstrInfo();
974
975  // Zero out GPRs.
976  for (MCRegister Reg : GPRsToZero.set_bits())
977    TII.buildClearRegister(Reg, MBB, MBBI, DL);
978
979  // Zero out FP/vector registers.
980  for (MCRegister Reg : FPRsToZero.set_bits())
981    TII.buildClearRegister(Reg, MBB, MBBI, DL);
982
983  if (HasSVE) {
984    for (MCRegister PReg :
985         {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4,
986          AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9,
987          AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14,
988          AArch64::P15}) {
989      if (RegsToZero[PReg])
990        BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg);
991    }
992  }
993}
994
995static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
996                                   const MachineBasicBlock &MBB) {
997  const MachineFunction *MF = MBB.getParent();
998  LiveRegs.addLiveIns(MBB);
999  // Mark callee saved registers as used so we will not choose them.
1000  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
1001  for (unsigned i = 0; CSRegs[i]; ++i)
1002    LiveRegs.addReg(CSRegs[i]);
1003}
1004
1005// Find a scratch register that we can use at the start of the prologue to
1006// re-align the stack pointer.  We avoid using callee-save registers since they
1007// may appear to be free when this is called from canUseAsPrologue (during
1008// shrink wrapping), but then no longer be free when this is called from
1009// emitPrologue.
1010//
1011// FIXME: This is a bit conservative, since in the above case we could use one
1012// of the callee-save registers as a scratch temp to re-align the stack pointer,
1013// but we would then have to make sure that we were in fact saving at least one
1014// callee-save register in the prologue, which is additional complexity that
1015// doesn't seem worth the benefit.
1016static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
1017  MachineFunction *MF = MBB->getParent();
1018
1019  // If MBB is an entry block, use X9 as the scratch register
1020  if (&MF->front() == MBB)
1021    return AArch64::X9;
1022
1023  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
1024  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
1025  LivePhysRegs LiveRegs(TRI);
1026  getLiveRegsForEntryMBB(LiveRegs, *MBB);
1027
1028  // Prefer X9 since it was historically used for the prologue scratch reg.
1029  const MachineRegisterInfo &MRI = MF->getRegInfo();
1030  if (LiveRegs.available(MRI, AArch64::X9))
1031    return AArch64::X9;
1032
1033  for (unsigned Reg : AArch64::GPR64RegClass) {
1034    if (LiveRegs.available(MRI, Reg))
1035      return Reg;
1036  }
1037  return AArch64::NoRegister;
1038}
1039
1040bool AArch64FrameLowering::canUseAsPrologue(
1041    const MachineBasicBlock &MBB) const {
1042  const MachineFunction *MF = MBB.getParent();
1043  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
1044  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
1045  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1046  const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
1047  const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>();
1048
1049  if (AFI->hasSwiftAsyncContext()) {
1050    const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
1051    const MachineRegisterInfo &MRI = MF->getRegInfo();
1052    LivePhysRegs LiveRegs(TRI);
1053    getLiveRegsForEntryMBB(LiveRegs, MBB);
1054    // The StoreSwiftAsyncContext clobbers X16 and X17. Make sure they are
1055    // available.
1056    if (!LiveRegs.available(MRI, AArch64::X16) ||
1057        !LiveRegs.available(MRI, AArch64::X17))
1058      return false;
1059  }
1060
1061  // Don't need a scratch register if we're not going to re-align the stack or
1062  // emit stack probes.
  if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF))
1064    return true;
1065  // Otherwise, we can use any block as long as it has a scratch register
1066  // available.
1067  return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
1068}
1069
1070static bool windowsRequiresStackProbe(MachineFunction &MF,
1071                                      uint64_t StackSizeInBytes) {
1072  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1073  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
1074  // TODO: When implementing stack protectors, take that into account
1075  // for the probe threshold.
1076  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
1077         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
1078}
1079
1080static bool needsWinCFI(const MachineFunction &MF) {
1081  const Function &F = MF.getFunction();
1082  return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1083         F.needsUnwindTableEntry();
1084}
1085
1086bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
1087    MachineFunction &MF, uint64_t StackBumpBytes) const {
1088  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1089  const MachineFrameInfo &MFI = MF.getFrameInfo();
1090  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1091  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1092  if (homogeneousPrologEpilog(MF))
1093    return false;
1094
1095  if (AFI->getLocalStackSize() == 0)
1096    return false;
1097
1098  // For WinCFI, if optimizing for size, prefer to not combine the stack bump
1099  // (to force a stp with predecrement) to match the packed unwind format,
1100  // provided that there actually are any callee saved registers to merge the
1101  // decrement with.
1102  // This is potentially marginally slower, but allows using the packed
1103  // unwind format for functions that both have a local area and callee saved
1104  // registers. Using the packed unwind format notably reduces the size of
1105  // the unwind info.
1106  if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
1107      MF.getFunction().hasOptSize())
1108    return false;
1109
1110  // 512 is the maximum immediate for stp/ldp that will be used for
1111  // callee-save save/restores
1112  if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
1113    return false;
1114
1115  if (MFI.hasVarSizedObjects())
1116    return false;
1117
1118  if (RegInfo->hasStackRealignment(MF))
1119    return false;
1120
1121  // This isn't strictly necessary, but it simplifies things a bit since the
1122  // current RedZone handling code assumes the SP is adjusted by the
1123  // callee-save save/restore code.
1124  if (canUseRedZone(MF))
1125    return false;
1126
1127  // When there is an SVE area on the stack, always allocate the
1128  // callee-saves and spills/locals separately.
1129  if (getSVEStackSize(MF))
1130    return false;
1131
1132  return true;
1133}
1134
1135bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
1136    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
1137  if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
1138    return false;
1139
1140  if (MBB.empty())
1141    return true;
1142
1143  // Disable combined SP bump if the last instruction is an MTE tag store. It
1144  // is almost always better to merge SP adjustment into those instructions.
1145  MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
1146  MachineBasicBlock::iterator Begin = MBB.begin();
1147  while (LastI != Begin) {
1148    --LastI;
1149    if (LastI->isTransient())
1150      continue;
1151    if (!LastI->getFlag(MachineInstr::FrameDestroy))
1152      break;
1153  }
1154  switch (LastI->getOpcode()) {
1155  case AArch64::STGloop:
1156  case AArch64::STZGloop:
1157  case AArch64::STGi:
1158  case AArch64::STZGi:
1159  case AArch64::ST2Gi:
1160  case AArch64::STZ2Gi:
1161    return false;
1162  default:
1163    return true;
1164  }
1165  llvm_unreachable("unreachable");
1166}
1167
1168// Given a load or a store instruction, generate an appropriate unwinding SEH
1169// code on Windows.
1170static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
1171                                             const TargetInstrInfo &TII,
1172                                             MachineInstr::MIFlag Flag) {
1173  unsigned Opc = MBBI->getOpcode();
1174  MachineBasicBlock *MBB = MBBI->getParent();
1175  MachineFunction &MF = *MBB->getParent();
1176  DebugLoc DL = MBBI->getDebugLoc();
1177  unsigned ImmIdx = MBBI->getNumOperands() - 1;
1178  int Imm = MBBI->getOperand(ImmIdx).getImm();
1179  MachineInstrBuilder MIB;
1180  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1181  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1182
1183  switch (Opc) {
1184  default:
1185    llvm_unreachable("No SEH Opcode for this instruction");
1186  case AArch64::LDPDpost:
1187    Imm = -Imm;
1188    [[fallthrough]];
1189  case AArch64::STPDpre: {
1190    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1191    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
1192    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
1193              .addImm(Reg0)
1194              .addImm(Reg1)
1195              .addImm(Imm * 8)
1196              .setMIFlag(Flag);
1197    break;
1198  }
1199  case AArch64::LDPXpost:
1200    Imm = -Imm;
1201    [[fallthrough]];
1202  case AArch64::STPXpre: {
1203    Register Reg0 = MBBI->getOperand(1).getReg();
1204    Register Reg1 = MBBI->getOperand(2).getReg();
1205    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1206      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
1207                .addImm(Imm * 8)
1208                .setMIFlag(Flag);
1209    else
1210      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
1211                .addImm(RegInfo->getSEHRegNum(Reg0))
1212                .addImm(RegInfo->getSEHRegNum(Reg1))
1213                .addImm(Imm * 8)
1214                .setMIFlag(Flag);
1215    break;
1216  }
1217  case AArch64::LDRDpost:
1218    Imm = -Imm;
1219    [[fallthrough]];
1220  case AArch64::STRDpre: {
1221    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1222    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
1223              .addImm(Reg)
1224              .addImm(Imm)
1225              .setMIFlag(Flag);
1226    break;
1227  }
1228  case AArch64::LDRXpost:
1229    Imm = -Imm;
1230    [[fallthrough]];
1231  case AArch64::STRXpre: {
1232    unsigned Reg =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1233    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
1234              .addImm(Reg)
1235              .addImm(Imm)
1236              .setMIFlag(Flag);
1237    break;
1238  }
1239  case AArch64::STPDi:
1240  case AArch64::LDPDi: {
1241    unsigned Reg0 =  RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1242    unsigned Reg1 =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1243    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
1244              .addImm(Reg0)
1245              .addImm(Reg1)
1246              .addImm(Imm * 8)
1247              .setMIFlag(Flag);
1248    break;
1249  }
1250  case AArch64::STPXi:
1251  case AArch64::LDPXi: {
1252    Register Reg0 = MBBI->getOperand(0).getReg();
1253    Register Reg1 = MBBI->getOperand(1).getReg();
1254    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1255      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
1256                .addImm(Imm * 8)
1257                .setMIFlag(Flag);
1258    else
1259      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
1260                .addImm(RegInfo->getSEHRegNum(Reg0))
1261                .addImm(RegInfo->getSEHRegNum(Reg1))
1262                .addImm(Imm * 8)
1263                .setMIFlag(Flag);
1264    break;
1265  }
1266  case AArch64::STRXui:
1267  case AArch64::LDRXui: {
1268    int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1269    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
1270              .addImm(Reg)
1271              .addImm(Imm * 8)
1272              .setMIFlag(Flag);
1273    break;
1274  }
1275  case AArch64::STRDui:
1276  case AArch64::LDRDui: {
1277    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1278    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
1279              .addImm(Reg)
1280              .addImm(Imm * 8)
1281              .setMIFlag(Flag);
1282    break;
1283  }
1284  case AArch64::STPQi:
1285  case AArch64::LDPQi: {
1286    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1287    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1288    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQP))
1289              .addImm(Reg0)
1290              .addImm(Reg1)
1291              .addImm(Imm * 16)
1292              .setMIFlag(Flag);
1293    break;
1294  }
1295  case AArch64::LDPQpost:
1296    Imm = -Imm;
    [[fallthrough]];
1298  case AArch64::STPQpre: {
1299    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1300    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
1301    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQPX))
1302              .addImm(Reg0)
1303              .addImm(Reg1)
1304              .addImm(Imm * 16)
1305              .setMIFlag(Flag);
1306    break;
1307  }
1308  }
1309  auto I = MBB->insertAfter(MBBI, MIB);
1310  return I;
1311}
1312
1313// Fix up the SEH opcode associated with the save/restore instruction.
1314static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
1315                           unsigned LocalStackSize) {
1316  MachineOperand *ImmOpnd = nullptr;
1317  unsigned ImmIdx = MBBI->getNumOperands() - 1;
1318  switch (MBBI->getOpcode()) {
1319  default:
1320    llvm_unreachable("Fix the offset in the SEH instruction");
1321  case AArch64::SEH_SaveFPLR:
1322  case AArch64::SEH_SaveRegP:
1323  case AArch64::SEH_SaveReg:
1324  case AArch64::SEH_SaveFRegP:
1325  case AArch64::SEH_SaveFReg:
1326  case AArch64::SEH_SaveAnyRegQP:
1327  case AArch64::SEH_SaveAnyRegQPX:
1328    ImmOpnd = &MBBI->getOperand(ImmIdx);
1329    break;
1330  }
1331  if (ImmOpnd)
1332    ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
1333}
1334
1335// Convert callee-save register save/restore instruction to do stack pointer
1336// decrement/increment to allocate/deallocate the callee-save stack area by
1337// converting store/load to use pre/post increment version.
1338static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
1339    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
1340    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
1341    bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
1342    MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
1343    int CFAOffset = 0) {
1344  unsigned NewOpc;
1345  switch (MBBI->getOpcode()) {
1346  default:
1347    llvm_unreachable("Unexpected callee-save save/restore opcode!");
1348  case AArch64::STPXi:
1349    NewOpc = AArch64::STPXpre;
1350    break;
1351  case AArch64::STPDi:
1352    NewOpc = AArch64::STPDpre;
1353    break;
1354  case AArch64::STPQi:
1355    NewOpc = AArch64::STPQpre;
1356    break;
1357  case AArch64::STRXui:
1358    NewOpc = AArch64::STRXpre;
1359    break;
1360  case AArch64::STRDui:
1361    NewOpc = AArch64::STRDpre;
1362    break;
1363  case AArch64::STRQui:
1364    NewOpc = AArch64::STRQpre;
1365    break;
1366  case AArch64::LDPXi:
1367    NewOpc = AArch64::LDPXpost;
1368    break;
1369  case AArch64::LDPDi:
1370    NewOpc = AArch64::LDPDpost;
1371    break;
1372  case AArch64::LDPQi:
1373    NewOpc = AArch64::LDPQpost;
1374    break;
1375  case AArch64::LDRXui:
1376    NewOpc = AArch64::LDRXpost;
1377    break;
1378  case AArch64::LDRDui:
1379    NewOpc = AArch64::LDRDpost;
1380    break;
1381  case AArch64::LDRQui:
1382    NewOpc = AArch64::LDRQpost;
1383    break;
1384  }
1385  // Get rid of the SEH code associated with the old instruction.
1386  if (NeedsWinCFI) {
1387    auto SEH = std::next(MBBI);
1388    if (AArch64InstrInfo::isSEHInstruction(*SEH))
1389      SEH->eraseFromParent();
1390  }
1391
1392  TypeSize Scale = TypeSize::getFixed(1), Width = TypeSize::getFixed(0);
1393  int64_t MinOffset, MaxOffset;
1394  bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
1395      NewOpc, Scale, Width, MinOffset, MaxOffset);
1396  (void)Success;
1397  assert(Success && "unknown load/store opcode");
1398
1399  // If the first store isn't right where we want SP, then we can't fold the
1400  // update in, so create a normal arithmetic instruction instead.
1401  MachineFunction &MF = *MBB.getParent();
1402  if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
1403      CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
1404    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1405                    StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
1406                    false, false, nullptr, EmitCFI,
1407                    StackOffset::getFixed(CFAOffset));
1408
1409    return std::prev(MBBI);
1410  }
1411
1412  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
1413  MIB.addReg(AArch64::SP, RegState::Define);
1414
1415  // Copy all operands other than the immediate offset.
1416  unsigned OpndIdx = 0;
1417  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
1418       ++OpndIdx)
1419    MIB.add(MBBI->getOperand(OpndIdx));
1420
1421  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
1422         "Unexpected immediate offset in first/last callee-save save/restore "
1423         "instruction!");
1424  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
1425         "Unexpected base register in callee-save save/restore instruction!");
1426  assert(CSStackSizeInc % Scale == 0);
1427  MIB.addImm(CSStackSizeInc / (int)Scale);
1428
1429  MIB.setMIFlags(MBBI->getFlags());
1430  MIB.setMemRefs(MBBI->memoperands());
1431
1432  // Generate a new SEH code that corresponds to the new instruction.
1433  if (NeedsWinCFI) {
1434    *HasWinCFI = true;
1435    InsertSEH(*MIB, *TII, FrameFlag);
1436  }
1437
1438  if (EmitCFI) {
1439    unsigned CFIIndex = MF.addFrameInst(
1440        MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset - CSStackSizeInc));
1441    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1442        .addCFIIndex(CFIIndex)
1443        .setMIFlags(FrameFlag);
1444  }
1445
1446  return std::prev(MBB.erase(MBBI));
1447}
1448
1449// Fix up callee-save register save/restore instructions to account for a
1450// combined SP bump by adding the local stack size to their stack offsets.
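// For example, if a 32-byte local area is folded into the initial SP bump, a
// save such as 'stp x19, x20, [sp, #16]' must become
// 'stp x19, x20, [sp, #48]'.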
1451static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
1452                                              uint64_t LocalStackSize,
1453                                              bool NeedsWinCFI,
1454                                              bool *HasWinCFI) {
1455  if (AArch64InstrInfo::isSEHInstruction(MI))
1456    return;
1457
1458  unsigned Opc = MI.getOpcode();
1459  unsigned Scale;
1460  switch (Opc) {
1461  case AArch64::STPXi:
1462  case AArch64::STRXui:
1463  case AArch64::STPDi:
1464  case AArch64::STRDui:
1465  case AArch64::LDPXi:
1466  case AArch64::LDRXui:
1467  case AArch64::LDPDi:
1468  case AArch64::LDRDui:
1469    Scale = 8;
1470    break;
1471  case AArch64::STPQi:
1472  case AArch64::STRQui:
1473  case AArch64::LDPQi:
1474  case AArch64::LDRQui:
1475    Scale = 16;
1476    break;
1477  default:
1478    llvm_unreachable("Unexpected callee-save save/restore opcode!");
1479  }
1480
1481  unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
1482  assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
1483         "Unexpected base register in callee-save save/restore instruction!");
1484  // Last operand is immediate offset that needs fixing.
1485  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
1486  // All generated opcodes have scaled offsets.
1487  assert(LocalStackSize % Scale == 0);
1488  OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
1489
1490  if (NeedsWinCFI) {
1491    *HasWinCFI = true;
1492    auto MBBI = std::next(MachineBasicBlock::iterator(MI));
1493    assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
1494    assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
1495           "Expecting a SEH instruction");
1496    fixupSEHOpcode(MBBI, LocalStackSize);
1497  }
1498}
1499
1500static bool isTargetWindows(const MachineFunction &MF) {
1501  return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
1502}
1503
1504// Convenience function to determine whether I is an SVE callee save.
1505static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1506  switch (I->getOpcode()) {
1507  default:
1508    return false;
1509  case AArch64::STR_ZXI:
1510  case AArch64::STR_PXI:
1511  case AArch64::LDR_ZXI:
1512  case AArch64::LDR_PXI:
1513    return I->getFlag(MachineInstr::FrameSetup) ||
1514           I->getFlag(MachineInstr::FrameDestroy);
1515  }
1516}
1517
1518static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
1519                                        MachineFunction &MF,
1520                                        MachineBasicBlock &MBB,
1521                                        MachineBasicBlock::iterator MBBI,
1522                                        const DebugLoc &DL, bool NeedsWinCFI,
1523                                        bool NeedsUnwindInfo) {
1524  // Shadow call stack prolog: str x30, [x18], #8
1525  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
1526      .addReg(AArch64::X18, RegState::Define)
1527      .addReg(AArch64::LR)
1528      .addReg(AArch64::X18)
1529      .addImm(8)
1530      .setMIFlag(MachineInstr::FrameSetup);
1531
1532  // This instruction also makes x18 live-in to the entry block.
1533  MBB.addLiveIn(AArch64::X18);
1534
1535  if (NeedsWinCFI)
1536    BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
1537        .setMIFlag(MachineInstr::FrameSetup);
1538
1539  if (NeedsUnwindInfo) {
1540    // Emit a CFI instruction that causes 8 to be subtracted from the value of
1541    // x18 when unwinding past this frame.
1542    static const char CFIInst[] = {
1543        dwarf::DW_CFA_val_expression,
1544        18, // register
1545        2,  // length
1546        static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
1547        static_cast<char>(-8) & 0x7f, // addend (sleb128)
1548    };
1549    unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
1550        nullptr, StringRef(CFIInst, sizeof(CFIInst))));
1551    BuildMI(MBB, MBBI, DL, TII.get(AArch64::CFI_INSTRUCTION))
1552        .addCFIIndex(CFIIndex)
1553        .setMIFlag(MachineInstr::FrameSetup);
1554  }
1555}
1556
1557static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
1558                                        MachineFunction &MF,
1559                                        MachineBasicBlock &MBB,
1560                                        MachineBasicBlock::iterator MBBI,
1561                                        const DebugLoc &DL) {
1562  // Shadow call stack epilog: ldr x30, [x18, #-8]!
1563  BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
1564      .addReg(AArch64::X18, RegState::Define)
1565      .addReg(AArch64::LR, RegState::Define)
1566      .addReg(AArch64::X18)
1567      .addImm(-8)
1568      .setMIFlag(MachineInstr::FrameDestroy);
1569
1570  if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF)) {
1571    unsigned CFIIndex =
1572        MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, 18));
1573    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
1574        .addCFIIndex(CFIIndex)
1575        .setMIFlags(MachineInstr::FrameDestroy);
1576  }
1577}
1578
1579// Define the current CFA rule to use the provided FP.
1580static void emitDefineCFAWithFP(MachineFunction &MF, MachineBasicBlock &MBB,
1581                                MachineBasicBlock::iterator MBBI,
1582                                const DebugLoc &DL, unsigned FixedObject) {
1583  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
1584  const AArch64RegisterInfo *TRI = STI.getRegisterInfo();
1585  const TargetInstrInfo *TII = STI.getInstrInfo();
1586  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1587
1588  const int OffsetToFirstCalleeSaveFromFP =
1589      AFI->getCalleeSaveBaseToFrameRecordOffset() -
1590      AFI->getCalleeSavedStackSize();
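  // The CFA is the incoming SP. Measured from FP, that is the distance from
  // the frame record to the top of the callee-save area plus the fixed-object
  // area that sits above it.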
1591  Register FramePtr = TRI->getFrameRegister(MF);
1592  unsigned Reg = TRI->getDwarfRegNum(FramePtr, true);
1593  unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
1594      nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1595  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1596      .addCFIIndex(CFIIndex)
1597      .setMIFlags(MachineInstr::FrameSetup);
1598}
1599
1600#ifndef NDEBUG
1601/// Collect live registers from the end of \p MI's parent up to (and
1602/// including) \p MI in \p LiveRegs.
1603static void getLivePhysRegsUpTo(MachineInstr &MI, const TargetRegisterInfo &TRI,
1604                                LivePhysRegs &LiveRegs) {
1605
1606  MachineBasicBlock &MBB = *MI.getParent();
1607  LiveRegs.addLiveOuts(MBB);
1608  for (const MachineInstr &MI :
1609       reverse(make_range(MI.getIterator(), MBB.instr_end())))
1610    LiveRegs.stepBackward(MI);
1611}
1612#endif
1613
1614void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1615                                        MachineBasicBlock &MBB) const {
1616  MachineBasicBlock::iterator MBBI = MBB.begin();
1617  const MachineFrameInfo &MFI = MF.getFrameInfo();
1618  const Function &F = MF.getFunction();
1619  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1620  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1621  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1622
1623  MachineModuleInfo &MMI = MF.getMMI();
1624  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1625  bool EmitCFI = AFI->needsDwarfUnwindInfo(MF);
1626  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
1627  bool HasFP = hasFP(MF);
1628  bool NeedsWinCFI = needsWinCFI(MF);
1629  bool HasWinCFI = false;
1630  auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
1631
1632  MachineBasicBlock::iterator End = MBB.end();
1633#ifndef NDEBUG
1634  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1635  // Collect live registers from the end of MBB up to the start of the
1636  // existing frame setup instructions.
1637  MachineBasicBlock::iterator NonFrameStart = MBB.begin();
1638  while (NonFrameStart != End &&
1639         NonFrameStart->getFlag(MachineInstr::FrameSetup))
1640    ++NonFrameStart;
1641
1642  LivePhysRegs LiveRegs(*TRI);
1643  if (NonFrameStart != MBB.end()) {
1644    getLivePhysRegsUpTo(*NonFrameStart, *TRI, LiveRegs);
1645    // Ignore registers used for stack management for now.
1646    LiveRegs.removeReg(AArch64::SP);
1647    LiveRegs.removeReg(AArch64::X19);
1648    LiveRegs.removeReg(AArch64::FP);
1649    LiveRegs.removeReg(AArch64::LR);
1650  }
1651
1652  auto VerifyClobberOnExit = make_scope_exit([&]() {
1653    if (NonFrameStart == MBB.end())
1654      return;
1655    // Check whether any newly inserted instructions clobber live registers.
1656    for (MachineInstr &MI :
1657         make_range(MBB.instr_begin(), NonFrameStart->getIterator())) {
1658      for (auto &Op : MI.operands())
1659        if (Op.isReg() && Op.isDef())
1660          assert(!LiveRegs.contains(Op.getReg()) &&
1661                 "live register clobbered by inserted prologue instructions");
1662    }
1663  });
1664#endif
1665
1666  bool IsFunclet = MBB.isEHFuncletEntry();
1667
1668  // At this point, we're going to decide whether or not the function uses a
1669  // redzone. In most cases, the function doesn't have a redzone so let's
1670  // assume that's false and set it to true in the case that there's a redzone.
1671  AFI->setHasRedZone(false);
1672
1673  // Debug location must be unknown since the first debug location is used
1674  // to determine the end of the prologue.
1675  DebugLoc DL;
1676
1677  const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1678  if (MFnI.needsShadowCallStackPrologueEpilogue(MF))
1679    emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
1680                                MFnI.needsDwarfUnwindInfo(MF));
1681
1682  if (MFnI.shouldSignReturnAddress(MF)) {
1683    BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE))
1684        .setMIFlag(MachineInstr::FrameSetup);
1685    if (NeedsWinCFI)
1686      HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR
1687  }
1688
1689  if (EmitCFI && MFnI.isMTETagged()) {
1690    BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
1691        .setMIFlag(MachineInstr::FrameSetup);
1692  }
1693
1694  // We signal the presence of a Swift extended frame to external tools by
1695  // storing FP with 0b0001 in bits 63:60. In normal userland operation a
1696  // simple ORR is sufficient; a Swift kernel is assumed to initialize the
1697  // TBI bits so that this still holds.
1698  if (HasFP && AFI->hasSwiftAsyncContext()) {
1699    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
1700    case SwiftAsyncFramePointerMode::DeploymentBased:
1701      if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
1702        // The special symbol below is absolute and has a *value* that can be
1703        // combined with the frame pointer to signal an extended frame.
1704        BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
1705            .addExternalSymbol("swift_async_extendedFramePointerFlags",
1706                               AArch64II::MO_GOT);
1707        if (NeedsWinCFI) {
1708          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1709              .setMIFlags(MachineInstr::FrameSetup);
1710          HasWinCFI = true;
1711        }
1712        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
1713            .addUse(AArch64::FP)
1714            .addUse(AArch64::X16)
1715            .addImm(Subtarget.isTargetILP32() ? 32 : 0);
1716        if (NeedsWinCFI) {
1717          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1718              .setMIFlags(MachineInstr::FrameSetup);
1719          HasWinCFI = true;
1720        }
1721        break;
1722      }
1723      [[fallthrough]];
1724
1725    case SwiftAsyncFramePointerMode::Always:
1726      // ORR x29, x29, #0x1000_0000_0000_0000
1727      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
1728          .addUse(AArch64::FP)
1729          .addImm(0x1100)
1730          .setMIFlag(MachineInstr::FrameSetup);
1731      if (NeedsWinCFI) {
1732        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1733            .setMIFlags(MachineInstr::FrameSetup);
1734        HasWinCFI = true;
1735      }
1736      break;
1737
1738    case SwiftAsyncFramePointerMode::Never:
1739      break;
1740    }
1741  }
1742
1743  // All calls are tail calls in GHC calling conv, and functions have no
1744  // prologue/epilogue.
1745  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1746    return;
1747
1748  // Set the tagged base pointer to the requested stack slot.
1749  // Ideally it should match the SP value after the prologue.
1750  std::optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1751  if (TBPI)
1752    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
1753  else
1754    AFI->setTaggedBasePointerOffset(MFI.getStackSize());
1755
1756  const StackOffset &SVEStackSize = getSVEStackSize(MF);
1757
1758  // getStackSize() includes all the locals in its size calculation. We don't
1759  // include these locals when computing the stack size of a funclet, as they
1760  // are allocated in the parent's stack frame and accessed via the frame
1761  // pointer from the funclet.  We only save the callee saved registers in the
1762  // funclet, which are really the callee saved registers of the parent
1763  // function, including the funclet.
1764  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1765                               : MFI.getStackSize();
1766  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
1767    assert(!HasFP && "unexpected function without stack frame but with FP");
1768    assert(!SVEStackSize &&
1769           "unexpected function without stack frame but with SVE objects");
1770    // All of the stack allocation is for locals.
1771    AFI->setLocalStackSize(NumBytes);
1772    if (!NumBytes)
1773      return;
1774    // REDZONE: If the stack size is less than 128 bytes, we don't need
1775    // to actually allocate.
1776    if (canUseRedZone(MF)) {
1777      AFI->setHasRedZone(true);
1778      ++NumRedZoneFunctions;
1779    } else {
1780      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1781                      StackOffset::getFixed(-NumBytes), TII,
1782                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1783      if (EmitCFI) {
1784        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1785        MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
1786        // Encode the stack size of the leaf function.
1787        unsigned CFIIndex = MF.addFrameInst(
1788            MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
1789        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1790            .addCFIIndex(CFIIndex)
1791            .setMIFlags(MachineInstr::FrameSetup);
1792      }
1793    }
1794
1795    if (NeedsWinCFI) {
1796      HasWinCFI = true;
1797      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1798          .setMIFlag(MachineInstr::FrameSetup);
1799    }
1800
1801    return;
1802  }
1803
1804  bool IsWin64 =
1805      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1806  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1807
1808  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1809  // All of the remaining stack allocations are for locals.
1810  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1811  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
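  // When the frame is small enough, a single SP decrement can cover both the
  // callee-save area and the locals; the callee-save stores are then
  // re-offset by the local stack size in the loop further down.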
1812  bool HomPrologEpilog = homogeneousPrologEpilog(MF);
1813  if (CombineSPBump) {
1814    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1815    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1816                    StackOffset::getFixed(-NumBytes), TII,
1817                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
1818                    EmitAsyncCFI);
1819    NumBytes = 0;
1820  } else if (HomPrologEpilog) {
1821    // Stack has been already adjusted.
1822    NumBytes -= PrologueSaveSize;
1823  } else if (PrologueSaveSize != 0) {
1824    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1825        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
1826        EmitAsyncCFI);
1827    NumBytes -= PrologueSaveSize;
1828  }
1829  assert(NumBytes >= 0 && "Negative stack allocation size!?");
1830
1831  // Move past the saves of the callee-saved registers, fixing up their
1832  // offsets if we decided above to combine the callee-save and local stack
1833  // pointer bumps.
1834  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
1835         !IsSVECalleeSave(MBBI)) {
1836    if (CombineSPBump)
1837      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
1838                                        NeedsWinCFI, &HasWinCFI);
1839    ++MBBI;
1840  }
1841
1842  // For funclets the FP belongs to the containing function.
1843  if (!IsFunclet && HasFP) {
1844    // Only set up FP if we actually need to.
1845    int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1846
1847    if (CombineSPBump)
1848      FPOffset += AFI->getLocalStackSize();
1849
1850    if (AFI->hasSwiftAsyncContext()) {
1851      // Before we update the live FP we have to ensure there's a valid (or
1852      // null) asynchronous context in its slot just before FP in the frame
1853      // record, so store it now.
1854      const auto &Attrs = MF.getFunction().getAttributes();
1855      bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
1856      if (HaveInitialContext)
1857        MBB.addLiveIn(AArch64::X22);
1858      Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR;
1859      BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
1860          .addUse(Reg)
1861          .addUse(AArch64::SP)
1862          .addImm(FPOffset - 8)
1863          .setMIFlags(MachineInstr::FrameSetup);
1864      if (NeedsWinCFI) {
1865        // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded
1866        // to multiple instructions, should be mutually-exclusive.
1867        assert(Subtarget.getTargetTriple().getArchName() != "arm64e");
1868        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1869            .setMIFlags(MachineInstr::FrameSetup);
1870        HasWinCFI = true;
1871      }
1872    }
1873
1874    if (HomPrologEpilog) {
1875      auto Prolog = MBBI;
1876      --Prolog;
1877      assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
1878      Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
1879    } else {
1880      // Issue    sub fp, sp, FPOffset or
1881      //          mov fp,sp          when FPOffset is zero.
1882      // Note: All stores of callee-saved registers are marked as "FrameSetup".
1883      // This code marks the instruction(s) that set the FP also.
1884      emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
1885                      StackOffset::getFixed(FPOffset), TII,
1886                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1887      if (NeedsWinCFI && HasWinCFI) {
1888        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1889            .setMIFlag(MachineInstr::FrameSetup);
1890        // After setting up the FP, the rest of the prolog doesn't need to be
1891        // included in the SEH unwind info.
1892        NeedsWinCFI = false;
1893      }
1894    }
1895    if (EmitAsyncCFI)
1896      emitDefineCFAWithFP(MF, MBB, MBBI, DL, FixedObject);
1897  }
1898
1899  // Now emit the moves for whatever callee-saved regs we have (including FP
1900  // and LR if those are saved). Frame instructions for SVE registers are
1901  // emitted later, after the instructions which actually save the SVE regs.
1902  if (EmitAsyncCFI)
1903    emitCalleeSavedGPRLocations(MBB, MBBI);
1904
1905  // Alignment is required for the parent frame, not the funclet
1906  const bool NeedsRealignment =
1907      NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
1908  const int64_t RealignmentPadding =
1909      (NeedsRealignment && MFI.getMaxAlign() > Align(16))
1910          ? MFI.getMaxAlign().value() - 16
1911          : 0;
1912
1913  if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
1914    uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
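    // The Windows stack-probe helper takes the allocation size in x15,
    // expressed in 16-byte units (hence the shift by 4 above).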
1915    if (NeedsWinCFI) {
1916      HasWinCFI = true;
1917      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
1918      // exceed this amount.  We need to move at most 2^24 - 1 into x15.
1919      // This is at most two instructions, MOVZ followed by MOVK.
1920      // TODO: Fix to use multiple stack alloc unwind codes for stacks
1921      // exceeding 256MB in size.
1922      if (NumBytes >= (1 << 28))
1923        report_fatal_error("Stack size cannot exceed 256MB for stack "
1924                            "unwinding purposes");
1925
1926      uint32_t LowNumWords = NumWords & 0xFFFF;
1927      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
1928          .addImm(LowNumWords)
1929          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1930          .setMIFlag(MachineInstr::FrameSetup);
1931      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1932          .setMIFlag(MachineInstr::FrameSetup);
1933      if ((NumWords & 0xFFFF0000) != 0) {
1934        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
1935            .addReg(AArch64::X15)
1936            .addImm((NumWords & 0xFFFF0000) >> 16) // High half
1937            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
1938            .setMIFlag(MachineInstr::FrameSetup);
1939        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1940            .setMIFlag(MachineInstr::FrameSetup);
1941      }
1942    } else {
1943      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
1944          .addImm(NumWords)
1945          .setMIFlags(MachineInstr::FrameSetup);
1946    }
1947
1948    const char* ChkStk = Subtarget.getChkStkName();
1949    switch (MF.getTarget().getCodeModel()) {
1950    case CodeModel::Tiny:
1951    case CodeModel::Small:
1952    case CodeModel::Medium:
1953    case CodeModel::Kernel:
1954      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
1955          .addExternalSymbol(ChkStk)
1956          .addReg(AArch64::X15, RegState::Implicit)
1957          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1958          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1959          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1960          .setMIFlags(MachineInstr::FrameSetup);
1961      if (NeedsWinCFI) {
1962        HasWinCFI = true;
1963        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1964            .setMIFlag(MachineInstr::FrameSetup);
1965      }
1966      break;
1967    case CodeModel::Large:
1968      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
1969          .addReg(AArch64::X16, RegState::Define)
1970          .addExternalSymbol(ChkStk)
1971          .addExternalSymbol(ChkStk)
1972          .setMIFlags(MachineInstr::FrameSetup);
1973      if (NeedsWinCFI) {
1974        HasWinCFI = true;
1975        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1976            .setMIFlag(MachineInstr::FrameSetup);
1977      }
1978
1979      BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
1980          .addReg(AArch64::X16, RegState::Kill)
1981          .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
1982          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1983          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1984          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1985          .setMIFlags(MachineInstr::FrameSetup);
1986      if (NeedsWinCFI) {
1987        HasWinCFI = true;
1988        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1989            .setMIFlag(MachineInstr::FrameSetup);
1990      }
1991      break;
1992    }
1993
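    // Allocate the probed space: subtract x15 (the 16-byte unit count) from
    // SP, shifted left by 4 to convert it back to bytes.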
1994    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
1995        .addReg(AArch64::SP, RegState::Kill)
1996        .addReg(AArch64::X15, RegState::Kill)
1997        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
1998        .setMIFlags(MachineInstr::FrameSetup);
1999    if (NeedsWinCFI) {
2000      HasWinCFI = true;
2001      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
2002          .addImm(NumBytes)
2003          .setMIFlag(MachineInstr::FrameSetup);
2004    }
2005    NumBytes = 0;
2006
2007    if (RealignmentPadding > 0) {
2008      if (RealignmentPadding >= 4096) {
2009        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm))
2010            .addReg(AArch64::X16, RegState::Define)
2011            .addImm(RealignmentPadding)
2012            .setMIFlags(MachineInstr::FrameSetup);
2013        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXrx64), AArch64::X15)
2014            .addReg(AArch64::SP)
2015            .addReg(AArch64::X16, RegState::Kill)
2016            .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
2017            .setMIFlag(MachineInstr::FrameSetup);
2018      } else {
2019        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X15)
2020            .addReg(AArch64::SP)
2021            .addImm(RealignmentPadding)
2022            .addImm(0)
2023            .setMIFlag(MachineInstr::FrameSetup);
2024      }
2025
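      // x15 now holds SP plus the realignment padding; mask it down to the
      // requested alignment and install the result as the new SP.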
2026      uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1);
2027      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
2028          .addReg(AArch64::X15, RegState::Kill)
2029          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64));
2030      AFI->setStackRealigned(true);
2031
2032      // No need for SEH instructions here; if we're realigning the stack,
2033      // we've set a frame pointer and already finished the SEH prologue.
2034      assert(!NeedsWinCFI);
2035    }
2036  }
2037
2038  StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
2039  MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
2040
2041  // Process the SVE callee-saves to determine what space needs to be
2042  // allocated.
2043  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2044    LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
2045                      << "\n");
2046    // Find callee save instructions in frame.
2047    CalleeSavesBegin = MBBI;
2048    assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
2049    while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
2050      ++MBBI;
2051    CalleeSavesEnd = MBBI;
2052
2053    SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize);
2054    SVELocalsSize = SVEStackSize - SVECalleeSavesSize;
2055  }
2056
2057  // Allocate space for the callee saves (if any).
2058  StackOffset CFAOffset =
2059      StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
2060  StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
2061  allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
2062                     nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
2063                     MFI.hasVarSizedObjects() || LocalsSize);
2064  CFAOffset += SVECalleeSavesSize;
2065
2066  if (EmitAsyncCFI)
2067    emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
2068
2069  // Allocate space for the rest of the frame including SVE locals. Align the
2070  // stack as necessary.
2071  assert(!(canUseRedZone(MF) && NeedsRealignment) &&
2072         "Cannot use redzone with stack realignment");
2073  if (!canUseRedZone(MF)) {
2074    // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
2075    // the correct value here, as NumBytes also includes padding bytes,
2076    // which shouldn't be counted here.
2077    allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
2078                       SVELocalsSize + StackOffset::getFixed(NumBytes),
2079                       NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
2080                       CFAOffset, MFI.hasVarSizedObjects());
2081  }
2082
2083  // If we need a base pointer, set it up here. It's whatever the value of the
2084  // stack pointer is at this point. Any variable size objects will be allocated
2085  // after this, so we can still use the base pointer to reference locals.
2086  //
2087  // FIXME: Clarify FrameSetup flags here.
2088  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
2089  // needed.
2090  // For funclets the BP belongs to the containing function.
2091  if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
2092    TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
2093                     false);
2094    if (NeedsWinCFI) {
2095      HasWinCFI = true;
2096      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
2097          .setMIFlag(MachineInstr::FrameSetup);
2098    }
2099  }
2100
2101  // The very last FrameSetup instruction indicates the end of the prologue.
2102  // Emit a SEH opcode indicating the prologue end.
2103  if (NeedsWinCFI && HasWinCFI) {
2104    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
2105        .setMIFlag(MachineInstr::FrameSetup);
2106  }
2107
2108  // SEH funclets are passed the frame pointer in X1.  If the parent
2109  // function uses the base register, then the base register is used
2110  // directly, and is not retrieved from X1.
2111  if (IsFunclet && F.hasPersonalityFn()) {
2112    EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
2113    if (isAsynchronousEHPersonality(Per)) {
2114      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
2115          .addReg(AArch64::X1)
2116          .setMIFlag(MachineInstr::FrameSetup);
2117      MBB.addLiveIn(AArch64::X1);
2118    }
2119  }
2120
2121  if (EmitCFI && !EmitAsyncCFI) {
2122    if (HasFP) {
2123      emitDefineCFAWithFP(MF, MBB, MBBI, DL, FixedObject);
2124    } else {
2125      StackOffset TotalSize =
2126          SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
2127      unsigned CFIIndex = MF.addFrameInst(createDefCFA(
2128          *RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP, TotalSize,
2129          /*LastAdjustmentWasScalable=*/false));
2130      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2131          .addCFIIndex(CFIIndex)
2132          .setMIFlags(MachineInstr::FrameSetup);
2133    }
2134    emitCalleeSavedGPRLocations(MBB, MBBI);
2135    emitCalleeSavedSVELocations(MBB, MBBI);
2136  }
2137}
2138
2139static bool isFuncletReturnInstr(const MachineInstr &MI) {
2140  switch (MI.getOpcode()) {
2141  default:
2142    return false;
2143  case AArch64::CATCHRET:
2144  case AArch64::CLEANUPRET:
2145    return true;
2146  }
2147}
2148
2149void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
2150                                        MachineBasicBlock &MBB) const {
2151  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
2152  MachineFrameInfo &MFI = MF.getFrameInfo();
2153  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2154  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2155  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
2156  DebugLoc DL;
2157  bool NeedsWinCFI = needsWinCFI(MF);
2158  bool EmitCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
2159  bool HasWinCFI = false;
2160  bool IsFunclet = false;
2161
2162  if (MBB.end() != MBBI) {
2163    DL = MBBI->getDebugLoc();
2164    IsFunclet = isFuncletReturnInstr(*MBBI);
2165  }
2166
2167  MachineBasicBlock::iterator EpilogStartI = MBB.end();
2168
2169  auto FinishingTouches = make_scope_exit([&]() {
2170    if (AFI->shouldSignReturnAddress(MF)) {
2171      BuildMI(MBB, MBB.getFirstTerminator(), DL,
2172              TII->get(AArch64::PAUTH_EPILOGUE))
2173          .setMIFlag(MachineInstr::FrameDestroy);
2174      if (NeedsWinCFI)
2175        HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR
2176    }
2177    if (AFI->needsShadowCallStackPrologueEpilogue(MF))
2178      emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL);
2179    if (EmitCFI)
2180      emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
2181    if (HasWinCFI) {
2182      BuildMI(MBB, MBB.getFirstTerminator(), DL,
2183              TII->get(AArch64::SEH_EpilogEnd))
2184          .setMIFlag(MachineInstr::FrameDestroy);
2185      if (!MF.hasWinCFI())
2186        MF.setHasWinCFI(true);
2187    }
2188    if (NeedsWinCFI) {
2189      assert(EpilogStartI != MBB.end());
2190      if (!HasWinCFI)
2191        MBB.erase(EpilogStartI);
2192    }
2193  });
2194
2195  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
2196                               : MFI.getStackSize();
2197
2198  // All calls are tail calls in GHC calling conv, and functions have no
2199  // prologue/epilogue.
2200  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2201    return;
2202
2203  // How much of the stack used by incoming arguments this function is expected
2204  // to restore in this particular epilogue.
2205  int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
2206  bool IsWin64 =
2207      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
2208  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
2209
2210  int64_t AfterCSRPopSize = ArgumentStackToRestore;
2211  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
2212  // We cannot rely on the local stack size set in emitPrologue if the function
2213  // has funclets, as funclets have different local stack size requirements, and
2214  // the current value set in emitPrologue may be that of the containing
2215  // function.
2216  if (MF.hasEHFunclets())
2217    AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
2218  if (homogeneousPrologEpilog(MF, &MBB)) {
2219    assert(!NeedsWinCFI);
2220    auto LastPopI = MBB.getFirstTerminator();
2221    if (LastPopI != MBB.begin()) {
2222      auto HomogeneousEpilog = std::prev(LastPopI);
2223      if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
2224        LastPopI = HomogeneousEpilog;
2225    }
2226
2227    // Adjust local stack
2228    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2229                    StackOffset::getFixed(AFI->getLocalStackSize()), TII,
2230                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
2231
2232    // SP has already been adjusted while restoring the callee-saved regs.
2233    // We have already bailed out of the case that adjusts SP for arguments.
2234    assert(AfterCSRPopSize == 0);
2235    return;
2236  }
2237  bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
2238  // Assume we can't combine the last pop with the sp restore.
2240  bool CombineAfterCSRBump = false;
2241  if (!CombineSPBump && PrologueSaveSize != 0) {
2242    MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
2243    while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
2244           AArch64InstrInfo::isSEHInstruction(*Pop))
2245      Pop = std::prev(Pop);
2246    // Converting the last ldp to a post-index ldp is valid only if the last
2247    // ldp's offset is 0.
2248    const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
2249    // If the offset is 0 and the AfterCSR pop is not actually trying to
2250    // allocate more stack for arguments (in space that an untimely interrupt
2251    // may clobber), convert it to a post-index ldp.
2252    if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
2253      convertCalleeSaveRestoreToSPPrePostIncDec(
2254          MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI,
2255          MachineInstr::FrameDestroy, PrologueSaveSize);
2256    } else {
2257      // If not, make sure to emit an add after the last ldp.
2258    // We're doing this by transferring the size to be restored from the
2259      // adjustment *before* the CSR pops to the adjustment *after* the CSR
2260      // pops.
2261      AfterCSRPopSize += PrologueSaveSize;
2262      CombineAfterCSRBump = true;
2263    }
2264  }
2265
2266  // Move past the restores of the callee-saved registers.
2267  // If we plan on combining the sp bump of the local stack size and the callee
2268  // save stack size, we might need to adjust the CSR save and restore offsets.
2269  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
2270  MachineBasicBlock::iterator Begin = MBB.begin();
2271  while (LastPopI != Begin) {
2272    --LastPopI;
2273    if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
2274        IsSVECalleeSave(LastPopI)) {
2275      ++LastPopI;
2276      break;
2277    } else if (CombineSPBump)
2278      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
2279                                        NeedsWinCFI, &HasWinCFI);
2280  }
2281
2282  if (NeedsWinCFI) {
2283    // Note that there are cases where we insert SEH opcodes in the
2284    // epilogue when we had no SEH opcodes in the prologue. For
2285    // example, when there is no stack frame but there are stack
2286    // arguments. Insert the SEH_EpilogStart and remove it later if
2287    // we didn't emit any SEH opcodes, to avoid generating WinCFI for
2288    // functions that don't need it.
2289    BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
2290        .setMIFlag(MachineInstr::FrameDestroy);
2291    EpilogStartI = LastPopI;
2292    --EpilogStartI;
2293  }
2294
2295  if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
2296    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
2297    case SwiftAsyncFramePointerMode::DeploymentBased:
2298      // Avoid the reload as it is GOT relative, and instead fall back to the
2299      // hardcoded value below.  This allows a mismatch between the OS and
2300      // application without immediately terminating on the difference.
2301      [[fallthrough]];
2302    case SwiftAsyncFramePointerMode::Always:
2303      // We need to reset FP to its untagged state on return. Bit 60 is
2304      // currently used to show the presence of an extended frame.
2305
2306      // BIC x29, x29, #0x1000_0000_0000_0000
2307      BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
2308              AArch64::FP)
2309          .addUse(AArch64::FP)
2310          .addImm(0x10fe)
2311          .setMIFlag(MachineInstr::FrameDestroy);
2312      if (NeedsWinCFI) {
2313        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
2314            .setMIFlags(MachineInstr::FrameDestroy);
2315        HasWinCFI = true;
2316      }
2317      break;
2318
2319    case SwiftAsyncFramePointerMode::Never:
2320      break;
2321    }
2322  }
2323
2324  const StackOffset &SVEStackSize = getSVEStackSize(MF);
2325
2326  // If there is a single SP update, insert it before the ret and we're done.
2327  if (CombineSPBump) {
2328    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
2329
2330    // When we are about to restore the CSRs, the CFA register is SP again.
2331    if (EmitCFI && hasFP(MF)) {
2332      const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2333      unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2334      unsigned CFIIndex =
2335          MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, NumBytes));
2336      BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2337          .addCFIIndex(CFIIndex)
2338          .setMIFlags(MachineInstr::FrameDestroy);
2339    }
2340
2341    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2342                    StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
2343                    TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
2344                    &HasWinCFI, EmitCFI, StackOffset::getFixed(NumBytes));
2345    return;
2346  }
2347
2348  NumBytes -= PrologueSaveSize;
2349  assert(NumBytes >= 0 && "Negative stack allocation size!?");
2350
2351  // Process the SVE callee-saves to determine what space needs to be
2352  // deallocated.
2353  StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
2354  MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
2355  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2356    RestoreBegin = std::prev(RestoreEnd);
2357    while (RestoreBegin != MBB.begin() &&
2358           IsSVECalleeSave(std::prev(RestoreBegin)))
2359      --RestoreBegin;
2360
2361    assert(IsSVECalleeSave(RestoreBegin) &&
2362           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
2363
2364    StackOffset CalleeSavedSizeAsOffset =
2365        StackOffset::getScalable(CalleeSavedSize);
2366    DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
2367    DeallocateAfter = CalleeSavedSizeAsOffset;
2368  }
2369
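  // DeallocateBefore covers the SVE locals, freed before the SVE callee-save
  // reloads; DeallocateAfter covers the SVE callee-save area itself, freed
  // after them.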
2370  // Deallocate the SVE area.
2371  if (SVEStackSize) {
2372    // If we have stack realignment or variable sized objects on the stack,
2373    // restore the stack pointer from the frame pointer prior to SVE CSR
2374    // restoration.
2375    if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) {
2376      if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2377        // Set SP to the start of the SVE callee-save area, from which the
2378        // callee saves can be reloaded. The code below will deallocate the
2379        // stack space by moving FP -> SP.
2380        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
2381                        StackOffset::getScalable(-CalleeSavedSize), TII,
2382                        MachineInstr::FrameDestroy);
2383      }
2384    } else {
2385      if (AFI->getSVECalleeSavedStackSize()) {
2386        // Deallocate the non-SVE locals first before we can deallocate (and
2387        // restore callee saves) from the SVE area.
2388        emitFrameOffset(
2389            MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2390            StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy,
2391            false, false, nullptr, EmitCFI && !hasFP(MF),
2392            SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize));
2393        NumBytes = 0;
2394      }
2395
2396      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2397                      DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
2398                      false, nullptr, EmitCFI && !hasFP(MF),
2399                      SVEStackSize +
2400                          StackOffset::getFixed(NumBytes + PrologueSaveSize));
2401
2402      emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
2403                      DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
2404                      false, nullptr, EmitCFI && !hasFP(MF),
2405                      DeallocateAfter +
2406                          StackOffset::getFixed(NumBytes + PrologueSaveSize));
2407    }
2408    if (EmitCFI)
2409      emitCalleeSavedSVERestores(MBB, RestoreEnd);
2410  }
2411
2412  if (!hasFP(MF)) {
2413    bool RedZone = canUseRedZone(MF);
2414    // If this was a redzone leaf function, we don't need to restore the
2415    // stack pointer (but we may need to pop stack args for fastcc).
2416    if (RedZone && AfterCSRPopSize == 0)
2417      return;
2418
2419    // Pop the local variables off the stack. If there are no callee-saved
2420    // registers, we are actually positioned at the terminator and can combine
2421    // the stack increment for the locals with the stack increment for the
2422    // callee-popped arguments into (possibly) a single instruction.
2423    bool NoCalleeSaveRestore = PrologueSaveSize == 0;
2424    int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
2425    if (NoCalleeSaveRestore)
2426      StackRestoreBytes += AfterCSRPopSize;
2427
2428    emitFrameOffset(
2429        MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2430        StackOffset::getFixed(StackRestoreBytes), TII,
2431        MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2432        StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize));
2433
2434    // If we were able to combine the local stack pop with the argument pop,
2435    // then we're done.
2436    if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
2437      return;
2438    }
2439
2440    NumBytes = 0;
2441  }
2442
2443  // Restore the original stack pointer.
2444  // FIXME: Rather than doing the math here, we should instead just use
2445  // non-post-indexed loads for the restores if we aren't actually going to
2446  // be able to save any instructions.
2447  if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
2448    emitFrameOffset(
2449        MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
2450        StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
2451        TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
2452  } else if (NumBytes)
2453    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2454                    StackOffset::getFixed(NumBytes), TII,
2455                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
2456
2457  // When we are about to restore the CSRs, the CFA register is SP again.
2458  if (EmitCFI && hasFP(MF)) {
2459    const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2460    unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2461    unsigned CFIIndex = MF.addFrameInst(
2462        MCCFIInstruction::cfiDefCfa(nullptr, Reg, PrologueSaveSize));
2463    BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2464        .addCFIIndex(CFIIndex)
2465        .setMIFlags(MachineInstr::FrameDestroy);
2466  }
2467
2468  // This must be placed after the callee-save restore code because that code
2469  // assumes the SP is at the same location as it was after the callee-save save
2470  // code in the prologue.
2471  if (AfterCSRPopSize) {
2472    assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
2473                                  "interrupt may have clobbered");
2474
2475    emitFrameOffset(
2476        MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2477        StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
2478        false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2479        StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
2480  }
2481}
2482
2483bool AArch64FrameLowering::enableCFIFixup(MachineFunction &MF) const {
2484  return TargetFrameLowering::enableCFIFixup(MF) &&
2485         MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF);
2486}
2487
2488/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
2489/// debug info.  It's the same as what we use for resolving the code-gen
2490/// references for now.  FIXME: This can go wrong when references are
2491/// SP-relative and simple call frames aren't used.
2492StackOffset
2493AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
2494                                             Register &FrameReg) const {
2495  return resolveFrameIndexReference(
2496      MF, FI, FrameReg,
2497      /*PreferFP=*/
2498      MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
2499      /*ForSimm=*/false);
2500}
2501
2502StackOffset
2503AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
2504                                                     int FI) const {
2505  return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
2506}
2507
2508static StackOffset getFPOffset(const MachineFunction &MF,
2509                               int64_t ObjectOffset) {
2510  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2511  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2512  bool IsWin64 =
2513      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
2514  unsigned FixedObject =
2515      getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
2516  int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
2517  int64_t FPAdjust =
2518      CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
2519  return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
2520}
2521
2522static StackOffset getStackOffset(const MachineFunction &MF,
2523                                  int64_t ObjectOffset) {
2524  const auto &MFI = MF.getFrameInfo();
2525  return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
2526}
2527
2528// TODO: This function currently does not work for scalable vectors.
2529int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
2530                                                 int FI) const {
2531  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2532      MF.getSubtarget().getRegisterInfo());
2533  int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
2534  return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
2535             ? getFPOffset(MF, ObjectOffset).getFixed()
2536             : getStackOffset(MF, ObjectOffset).getFixed();
2537}
2538
2539StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2540    const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2541    bool ForSimm) const {
2542  const auto &MFI = MF.getFrameInfo();
2543  int64_t ObjectOffset = MFI.getObjectOffset(FI);
2544  bool isFixed = MFI.isFixedObjectIndex(FI);
2545  bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2546  return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2547                                     PreferFP, ForSimm);
2548}
2549
2550StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2551    const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2552    Register &FrameReg, bool PreferFP, bool ForSimm) const {
2553  const auto &MFI = MF.getFrameInfo();
2554  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2555      MF.getSubtarget().getRegisterInfo());
2556  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2557  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2558
2559  int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2560  int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2561  bool isCSR =
2562      !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2563
2564  const StackOffset &SVEStackSize = getSVEStackSize(MF);
2565
2566  // Use frame pointer to reference fixed objects. Use it for locals if
2567  // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2568  // reliable as a base). Make sure useFPForScavengingIndex() does the
2569  // right thing for the emergency spill slot.
2570  bool UseFP = false;
2571  if (AFI->hasStackFrame() && !isSVE) {
2572    // We shouldn't prefer using the FP to access fixed-sized stack objects when
2573    // there are scalable (SVE) objects in between the FP and the fixed-sized
2574    // objects.
2575    PreferFP &= !SVEStackSize;
2576
2577    // Note: Keeping the following as multiple 'if' statements rather than
2578    // merging to a single expression for readability.
2579    //
2580    // Argument access should always use the FP.
2581    if (isFixed) {
2582      UseFP = hasFP(MF);
2583    } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2584      // References to the CSR area must use FP if we're re-aligning the stack
2585      // since the dynamically-sized alignment padding is between the SP/BP and
2586      // the CSR area.
2587      assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2588      UseFP = true;
2589    } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2590      // If the FPOffset is negative and we're producing a signed immediate, we
2591      // have to keep in mind that the available offset range for negative
2592      // offsets is smaller than for positive ones. If an offset is available
2593      // via the FP and the SP, use whichever is closest.
2594      bool FPOffsetFits = !ForSimm || FPOffset >= -256;
2595      PreferFP |= Offset > -FPOffset && !SVEStackSize;
2596
2597      if (MFI.hasVarSizedObjects()) {
2598        // If we have variable sized objects, we can use either FP or BP, as the
2599        // SP offset is unknown. We can use the base pointer if we have one and
2600        // FP is not preferred. If not, we're stuck with using FP.
2601        bool CanUseBP = RegInfo->hasBasePointer(MF);
2602        if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2603          UseFP = PreferFP;
2604        else if (!CanUseBP) // Can't use BP. Forced to use FP.
2605          UseFP = true;
2606        // else we can use BP and FP, but the offset from FP won't fit.
2607        // That will make us scavenge registers which we can probably avoid by
2608        // using BP. If it won't fit for BP either, we'll scavenge anyway.
2609      } else if (FPOffset >= 0) {
2610        // Use SP or FP, whichever gives us the best chance of the offset
2611        // being in range for direct access. If the FPOffset is positive,
2612        // that'll always be best, as the SP will be even further away.
2613        UseFP = true;
2614      } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2615        // Funclets access the locals contained in the parent's stack frame
2616        // via the frame pointer, so we have to use the FP in the parent
2617        // function.
2618        (void) Subtarget;
2619        assert(
2620            Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
2621            "Funclets should only be present on Win64");
2622        UseFP = true;
2623      } else {
2624        // We have the choice between FP and (SP or BP).
2625        if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2626          UseFP = true;
2627      }
2628    }
2629  }
2630
2631  assert(
2632      ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2633      "In the presence of dynamic stack pointer realignment, "
2634      "non-argument/CSR objects cannot be accessed through the frame pointer");
2635
2636  if (isSVE) {
2637    StackOffset FPOffset =
2638        StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
2639    StackOffset SPOffset =
2640        SVEStackSize +
2641        StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2642                         ObjectOffset);
2643    // Always use the FP for SVE spills if available and beneficial.
2644    if (hasFP(MF) && (SPOffset.getFixed() ||
2645                      FPOffset.getScalable() < SPOffset.getScalable() ||
2646                      RegInfo->hasStackRealignment(MF))) {
2647      FrameReg = RegInfo->getFrameRegister(MF);
2648      return FPOffset;
2649    }
2650
2651    FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2652                                           : (unsigned)AArch64::SP;
2653    return SPOffset;
2654  }
2655
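  // The SVE area sits between the frame record/CSRs (reached from the FP) and
  // the fixed-size locals (reached from the SP or BP), so crossing it needs a
  // scalable adjustment: subtract the SVE size when using the FP for a local,
  // and add it when using the SP/BP for a fixed or CSR object.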
2656  StackOffset ScalableOffset = {};
2657  if (UseFP && !(isFixed || isCSR))
2658    ScalableOffset = -SVEStackSize;
2659  if (!UseFP && (isFixed || isCSR))
2660    ScalableOffset = SVEStackSize;
2661
2662  if (UseFP) {
2663    FrameReg = RegInfo->getFrameRegister(MF);
2664    return StackOffset::getFixed(FPOffset) + ScalableOffset;
2665  }
2666
2667  // Use the base pointer if we have one.
2668  if (RegInfo->hasBasePointer(MF))
2669    FrameReg = RegInfo->getBaseRegister();
2670  else {
2671    assert(!MFI.hasVarSizedObjects() &&
2672           "Can't use SP when we have var sized objects.");
2673    FrameReg = AArch64::SP;
2674    // If we're using the red zone for this function, the SP won't actually
2675    // be adjusted, so the offsets will be negative. They're also all
2676    // within range of the signed 9-bit immediate instructions.
2677    if (canUseRedZone(MF))
2678      Offset -= AFI->getLocalStackSize();
2679  }
2680
2681  return StackOffset::getFixed(Offset) + ScalableOffset;
2682}
2683
2684static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2685  // Do not set a kill flag on values that are also marked as live-in. This
2686  // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2687  // callee saved registers.
2688  // Omitting the kill flags is conservatively correct even if the live-in
2689  // is not used after all.
2690  bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2691  return getKillRegState(!IsLiveIn);
2692}
2693
2694static bool produceCompactUnwindFrame(MachineFunction &MF) {
2695  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2696  AttributeList Attrs = MF.getFunction().getAttributes();
2697  return Subtarget.isTargetMachO() &&
2698         !(Subtarget.getTargetLowering()->supportSwiftError() &&
2699           Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2700         MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
2701}
2702
2703static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2704                                             bool NeedsWinCFI, bool IsFirst,
2705                                             const TargetRegisterInfo *TRI) {
2706  // If we are generating register pairs for a Windows function that requires
2707  // EH support, then pair consecutive registers only.  There are no unwind
2708  // opcodes for saves/restores of non-consecutive register pairs.
2709  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2710  // save_lrpair.
2711  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2712
2713  if (Reg2 == AArch64::FP)
2714    return true;
2715  if (!NeedsWinCFI)
2716    return false;
2717  if (TRI->getEncodingValue(Reg2) == TRI->getEncodingValue(Reg1) + 1)
2718    return false;
2719  // If pairing a GPR with LR, the pair can be described by the save_lrpair
2720  // opcode. If this is the first register pair, it would end up with a
2721  // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2722  // if LR is paired with something other than the first register.
2723  // The save_lrpair opcode requires the first register to be an odd one.
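  // For example, (x21, lr) can be described by save_lrpair (x21 is an odd
  // register and not in the first pair), whereas (x20, lr) cannot.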
2724  if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2725      (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2726    return false;
2727  return true;
2728}
2729
2730/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2731/// WindowsCFI requires that only consecutive registers can be paired.
2732/// LR and FP need to be allocated together when the frame needs to save
2733/// the frame-record. This means any other register pairing with LR is invalid.
2734static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2735                                      bool UsesWinAAPCS, bool NeedsWinCFI,
2736                                      bool NeedsFrameRecord, bool IsFirst,
2737                                      const TargetRegisterInfo *TRI) {
2738  if (UsesWinAAPCS)
2739    return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst,
2740                                            TRI);
2741
2742  // If we need to store the frame record, don't pair any register
2743  // with LR other than FP.
2744  if (NeedsFrameRecord)
2745    return Reg2 == AArch64::LR;
2746
2747  return false;
2748}
2749
2750namespace {
2751
2752struct RegPairInfo {
2753  unsigned Reg1 = AArch64::NoRegister;
2754  unsigned Reg2 = AArch64::NoRegister;
2755  int FrameIdx;
2756  int Offset;
2757  enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2758
2759  RegPairInfo() = default;
2760
2761  bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2762
2763  unsigned getScale() const {
2764    switch (Type) {
2765    case PPR:
2766      return 2;
2767    case GPR:
2768    case FPR64:
2769      return 8;
2770    case ZPR:
2771    case FPR128:
2772      return 16;
2773    }
2774    llvm_unreachable("Unsupported type");
2775  }
2776
2777  bool isScalable() const { return Type == PPR || Type == ZPR; }
2778};
2779
2780} // end anonymous namespace
2781
2782static void computeCalleeSaveRegisterPairs(
2783    MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2784    const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2785    bool NeedsFrameRecord) {
2786
2787  if (CSI.empty())
2788    return;
2789
2790  bool IsWindows = isTargetWindows(MF);
2791  bool NeedsWinCFI = needsWinCFI(MF);
2792  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2793  MachineFrameInfo &MFI = MF.getFrameInfo();
2794  CallingConv::ID CC = MF.getFunction().getCallingConv();
2795  unsigned Count = CSI.size();
2796  (void)CC;
2797  // MachO's compact unwind format relies on all registers being stored in
2798  // pairs.
2799  assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
2800          CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
2801          CC == CallingConv::Win64 || (Count & 1) == 0) &&
2802         "Odd number of callee-saved regs to spill!");
2803  int ByteOffset = AFI->getCalleeSavedStackSize();
2804  int StackFillDir = -1;
2805  int RegInc = 1;
2806  unsigned FirstReg = 0;
2807  if (NeedsWinCFI) {
2808    // For WinCFI, fill the stack from the bottom up.
2809    ByteOffset = 0;
2810    StackFillDir = 1;
2811    // As the CSI array is reversed to match PrologEpilogInserter, iterate
2812    // backwards to pair up registers starting from lower-numbered registers.
2813    RegInc = -1;
2814    FirstReg = Count - 1;
2815  }
2816  int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2817  bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
2818
2819  // When iterating backwards, the loop condition relies on unsigned wraparound.
2820  for (unsigned i = FirstReg; i < Count; i += RegInc) {
2821    RegPairInfo RPI;
2822    RPI.Reg1 = CSI[i].getReg();
2823
2824    if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2825      RPI.Type = RegPairInfo::GPR;
2826    else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2827      RPI.Type = RegPairInfo::FPR64;
2828    else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2829      RPI.Type = RegPairInfo::FPR128;
2830    else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2831      RPI.Type = RegPairInfo::ZPR;
2832    else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2833      RPI.Type = RegPairInfo::PPR;
2834    else
2835      llvm_unreachable("Unsupported register class.");
2836
2837    // Add the next reg to the pair if it is in the same register class.
2838    if (unsigned(i + RegInc) < Count) {
2839      Register NextReg = CSI[i + RegInc].getReg();
2840      bool IsFirst = i == FirstReg;
2841      switch (RPI.Type) {
2842      case RegPairInfo::GPR:
2843        if (AArch64::GPR64RegClass.contains(NextReg) &&
2844            !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2845                                       NeedsWinCFI, NeedsFrameRecord, IsFirst,
2846                                       TRI))
2847          RPI.Reg2 = NextReg;
2848        break;
2849      case RegPairInfo::FPR64:
2850        if (AArch64::FPR64RegClass.contains(NextReg) &&
2851            !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2852                                              IsFirst, TRI))
2853          RPI.Reg2 = NextReg;
2854        break;
2855      case RegPairInfo::FPR128:
2856        if (AArch64::FPR128RegClass.contains(NextReg))
2857          RPI.Reg2 = NextReg;
2858        break;
2859      case RegPairInfo::PPR:
2860      case RegPairInfo::ZPR:
2861        break;
2862      }
2863    }
2864
2865    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2866    // list to come in sorted by frame index so that we can issue the store
2867    // pair instructions directly. Assert if we see them out of order.
2868    //
2869    // The order of the registers in the list is controlled by
2870    // getCalleeSavedRegs(), so they will always be in-order, as well.
2871    assert((!RPI.isPaired() ||
2872            (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2873           "Out of order callee saved regs!");
2874
2875    assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2876            RPI.Reg1 == AArch64::LR) &&
2877           "FrameRecord must be allocated together with LR");
2878
2879    // Windows AAPCS has FP and LR reversed.
2880    assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2881            RPI.Reg2 == AArch64::LR) &&
2882           "FrameRecord must be allocated together with LR");
2883
2884    // MachO's compact unwind format relies on all registers being stored in
2885    // adjacent register pairs.
2886    assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
2887            CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
2888            CC == CallingConv::Win64 ||
2889            (RPI.isPaired() &&
2890             ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2891              RPI.Reg1 + 1 == RPI.Reg2))) &&
2892           "Callee-save registers not saved as adjacent register pair!");
2893
2894    RPI.FrameIdx = CSI[i].getFrameIdx();
2895    if (NeedsWinCFI &&
2896        RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2897      RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2898
2899    int Scale = RPI.getScale();
2900
2901    int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2902    assert(OffsetPre % Scale == 0);
2903
2904    if (RPI.isScalable())
2905      ScalableByteOffset += StackFillDir * Scale;
2906    else
2907      ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2908
2909    // Swift's async context is directly before FP, so allocate an extra
2910    // 8 bytes for it.
2911    if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2912        ((!IsWindows && RPI.Reg2 == AArch64::FP) ||
2913         (IsWindows && RPI.Reg2 == AArch64::LR)))
2914      ByteOffset += StackFillDir * 8;
2915
2916    assert(!(RPI.isScalable() && RPI.isPaired()) &&
2917           "Paired spill/fill instructions don't exist for SVE vectors");
2918
2919    // Round up size of non-pair to pair size if we need to pad the
2920    // callee-save area to ensure 16-byte alignment.
2921    if (NeedGapToAlignStack && !NeedsWinCFI &&
2922        !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2923        !RPI.isPaired() && ByteOffset % 16 != 0) {
2924      ByteOffset += 8 * StackFillDir;
2925      assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2926      // A stack frame with a gap looks like this, bottom up:
2927      // d9, d8. x21, gap, x20, x19.
2928      // Set extra alignment on the x21 object to create the gap above it.
2929      MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2930      NeedGapToAlignStack = false;
2931    }
2932
2933    int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2934    assert(OffsetPost % Scale == 0);
2935    // If filling top down (default), we want the offset after incrementing it.
2936    // If filling bottom up (WinCFI) we need the original offset.
2937    int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2938
2939    // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
2940    // Swift context can directly precede FP.
2941    if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2942        ((!IsWindows && RPI.Reg2 == AArch64::FP) ||
2943         (IsWindows && RPI.Reg2 == AArch64::LR)))
2944      Offset += 8;
2945    RPI.Offset = Offset / Scale;
2946
2947    assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2948            (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2949           "Offset out of bounds for LDP/STP immediate");
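    // RPI.Offset is the immediate in units of Scale; e.g. a GPR pair placed
    // 32 bytes into the area (Scale == 8) gets RPI.Offset == 4, i.e.
    // "stp xA, xB, [sp, #32]".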
2950
2951    // Save the offset to frame record so that the FP register can point to the
2952    // innermost frame record (spilled FP and LR registers).
2953    if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2954                              RPI.Reg2 == AArch64::FP) ||
2955                             (IsWindows && RPI.Reg1 == AArch64::FP &&
2956                              RPI.Reg2 == AArch64::LR)))
2957      AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2958
2959    RegPairs.push_back(RPI);
2960    if (RPI.isPaired())
2961      i += RegInc;
2962  }
2963  if (NeedsWinCFI) {
2964    // If we need an alignment gap in the stack, align the topmost stack
2965    // object. A stack frame with a gap looks like this, bottom up:
2966    // x19, d8. d9, gap.
2967    // Set extra alignment on the topmost stack object (the first element in
2968    // CSI, which goes top down), to create the gap above it.
2969    if (AFI->hasCalleeSaveStackFreeSpace())
2970      MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2971    // We iterated bottom up over the registers; flip RegPairs back to top
2972    // down order.
2973    std::reverse(RegPairs.begin(), RegPairs.end());
2974  }
2975}
2976
2977bool AArch64FrameLowering::spillCalleeSavedRegisters(
2978    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2979    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2980  MachineFunction &MF = *MBB.getParent();
2981  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2982  bool NeedsWinCFI = needsWinCFI(MF);
2983  DebugLoc DL;
2984  SmallVector<RegPairInfo, 8> RegPairs;
2985
2986  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
2987
2988  const MachineRegisterInfo &MRI = MF.getRegInfo();
2989  if (homogeneousPrologEpilog(MF)) {
2990    auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
2991                   .setMIFlag(MachineInstr::FrameSetup);
2992
2993    for (auto &RPI : RegPairs) {
2994      MIB.addReg(RPI.Reg1);
2995      MIB.addReg(RPI.Reg2);
2996
2997      // Update register live in.
2998      if (!MRI.isReserved(RPI.Reg1))
2999        MBB.addLiveIn(RPI.Reg1);
3000      if (RPI.isPaired() && !MRI.isReserved(RPI.Reg2))
3001        MBB.addLiveIn(RPI.Reg2);
3002    }
3003    return true;
3004  }
3005  for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
3006    unsigned Reg1 = RPI.Reg1;
3007    unsigned Reg2 = RPI.Reg2;
3008    unsigned StrOpc;
3009
3010    // Issue sequence of spills for cs regs.  The first spill may be converted
3011    // to a pre-decrement store later by emitPrologue if the callee-save stack
3012    // area allocation can't be combined with the local stack area allocation.
3013    // For example:
3014    //    stp     x22, x21, [sp, #0]     // addImm(+0)
3015    //    stp     x20, x19, [sp, #16]    // addImm(+2)
3016    //    stp     fp, lr, [sp, #32]      // addImm(+4)
3017    // Rationale: This sequence saves uop updates compared to a sequence of
3018    // pre-increment spills like stp xi,xj,[sp,#-16]!
3019    // Note: Similar rationale and sequence for restores in epilog.
3020    unsigned Size;
3021    Align Alignment;
3022    switch (RPI.Type) {
3023    case RegPairInfo::GPR:
3024       StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
3025       Size = 8;
3026       Alignment = Align(8);
3027       break;
3028    case RegPairInfo::FPR64:
3029       StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
3030       Size = 8;
3031       Alignment = Align(8);
3032       break;
3033    case RegPairInfo::FPR128:
3034       StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
3035       Size = 16;
3036       Alignment = Align(16);
3037       break;
3038    case RegPairInfo::ZPR:
3039       StrOpc = AArch64::STR_ZXI;
3040       Size = 16;
3041       Alignment = Align(16);
3042       break;
3043    case RegPairInfo::PPR:
3044       StrOpc = AArch64::STR_PXI;
3045       Size = 2;
3046       Alignment = Align(2);
3047       break;
3048    }
3049    LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
3050               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
3051               dbgs() << ") -> fi#(" << RPI.FrameIdx;
3052               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
3053               dbgs() << ")\n");
3054
3055    assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
3056           "Windows unwinding requires a consecutive (FP,LR) pair");
3057    // Windows unwind codes require consecutive registers if registers are
3058    // paired.  Make the switch here, so that the code below will save (x,x+1)
3059    // and not (x+1,x).
3060    unsigned FrameIdxReg1 = RPI.FrameIdx;
3061    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
3062    if (NeedsWinCFI && RPI.isPaired()) {
3063      std::swap(Reg1, Reg2);
3064      std::swap(FrameIdxReg1, FrameIdxReg2);
3065    }
3066    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
3067    if (!MRI.isReserved(Reg1))
3068      MBB.addLiveIn(Reg1);
3069    if (RPI.isPaired()) {
3070      if (!MRI.isReserved(Reg2))
3071        MBB.addLiveIn(Reg2);
3072      MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
3073      MIB.addMemOperand(MF.getMachineMemOperand(
3074          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
3075          MachineMemOperand::MOStore, Size, Alignment));
3076    }
3077    MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
3078        .addReg(AArch64::SP)
3079        .addImm(RPI.Offset) // [sp, #offset*scale],
3080                            // where factor*scale is implicit
3081        .setMIFlag(MachineInstr::FrameSetup);
3082    MIB.addMemOperand(MF.getMachineMemOperand(
3083        MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
3084        MachineMemOperand::MOStore, Size, Alignment));
3085    if (NeedsWinCFI)
3086      InsertSEH(MIB, TII, MachineInstr::FrameSetup);
3087
3088    // Update the StackIDs of the SVE stack slots.
3089    MachineFrameInfo &MFI = MF.getFrameInfo();
3090    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
3091      MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
3092
3093  }
3094  return true;
3095}
3096
3097bool AArch64FrameLowering::restoreCalleeSavedRegisters(
3098    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
3099    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
3100  MachineFunction &MF = *MBB.getParent();
3101  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3102  DebugLoc DL;
3103  SmallVector<RegPairInfo, 8> RegPairs;
3104  bool NeedsWinCFI = needsWinCFI(MF);
3105
3106  if (MBBI != MBB.end())
3107    DL = MBBI->getDebugLoc();
3108
3109  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
3110
3111  auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
3112    unsigned Reg1 = RPI.Reg1;
3113    unsigned Reg2 = RPI.Reg2;
3114
3115    // Issue sequence of restores for cs regs. The last restore may be converted
3116    // to a post-increment load later by emitEpilogue if the callee-save stack
3117    // area allocation can't be combined with the local stack area allocation.
3118    // For example:
3119    //    ldp     fp, lr, [sp, #32]       // addImm(+4)
3120    //    ldp     x20, x19, [sp, #16]     // addImm(+2)
3121    //    ldp     x22, x21, [sp, #0]      // addImm(+0)
3122    // Note: see comment in spillCalleeSavedRegisters()
3123    unsigned LdrOpc;
3124    unsigned Size;
3125    Align Alignment;
3126    switch (RPI.Type) {
3127    case RegPairInfo::GPR:
3128       LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
3129       Size = 8;
3130       Alignment = Align(8);
3131       break;
3132    case RegPairInfo::FPR64:
3133       LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
3134       Size = 8;
3135       Alignment = Align(8);
3136       break;
3137    case RegPairInfo::FPR128:
3138       LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
3139       Size = 16;
3140       Alignment = Align(16);
3141       break;
3142    case RegPairInfo::ZPR:
3143       LdrOpc = AArch64::LDR_ZXI;
3144       Size = 16;
3145       Alignment = Align(16);
3146       break;
3147    case RegPairInfo::PPR:
3148       LdrOpc = AArch64::LDR_PXI;
3149       Size = 2;
3150       Alignment = Align(2);
3151       break;
3152    }
3153    LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
3154               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
3155               dbgs() << ") -> fi#(" << RPI.FrameIdx;
3156               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
3157               dbgs() << ")\n");
3158
3159    // Windows unwind codes require consecutive registers if registers are
3160    // paired.  Make the switch here, so that the code below will restore (x,x+1)
3161    // and not (x+1,x).
3162    unsigned FrameIdxReg1 = RPI.FrameIdx;
3163    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
3164    if (NeedsWinCFI && RPI.isPaired()) {
3165      std::swap(Reg1, Reg2);
3166      std::swap(FrameIdxReg1, FrameIdxReg2);
3167    }
3168    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
3169    if (RPI.isPaired()) {
3170      MIB.addReg(Reg2, getDefRegState(true));
3171      MIB.addMemOperand(MF.getMachineMemOperand(
3172          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
3173          MachineMemOperand::MOLoad, Size, Alignment));
3174    }
3175    MIB.addReg(Reg1, getDefRegState(true))
3176        .addReg(AArch64::SP)
3177        .addImm(RPI.Offset) // [sp, #offset*scale]
3178                            // where factor*scale is implicit
3179        .setMIFlag(MachineInstr::FrameDestroy);
3180    MIB.addMemOperand(MF.getMachineMemOperand(
3181        MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
3182        MachineMemOperand::MOLoad, Size, Alignment));
3183    if (NeedsWinCFI)
3184      InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
3185
3186    return MIB->getIterator();
3187  };
3188
3189  // SVE objects are always restored in reverse order.
3190  for (const RegPairInfo &RPI : reverse(RegPairs))
3191    if (RPI.isScalable())
3192      EmitMI(RPI);
3193
3194  if (homogeneousPrologEpilog(MF, &MBB)) {
3195    auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
3196                   .setMIFlag(MachineInstr::FrameDestroy);
3197    for (auto &RPI : RegPairs) {
3198      MIB.addReg(RPI.Reg1, RegState::Define);
3199      MIB.addReg(RPI.Reg2, RegState::Define);
3200    }
3201    return true;
3202  }
3203
3204  if (ReverseCSRRestoreSeq) {
3205    MachineBasicBlock::iterator First = MBB.end();
3206    for (const RegPairInfo &RPI : reverse(RegPairs)) {
3207      if (RPI.isScalable())
3208        continue;
3209      MachineBasicBlock::iterator It = EmitMI(RPI);
3210      if (First == MBB.end())
3211        First = It;
3212    }
3213    if (First != MBB.end())
3214      MBB.splice(MBBI, &MBB, First);
3215  } else {
3216    for (const RegPairInfo &RPI : RegPairs) {
3217      if (RPI.isScalable())
3218        continue;
3219      (void)EmitMI(RPI);
3220    }
3221  }
3222
3223  return true;
3224}
3225
3226void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
3227                                                BitVector &SavedRegs,
3228                                                RegScavenger *RS) const {
3229  // All calls are tail calls in GHC calling conv, and functions have no
3230  // prologue/epilogue.
3231  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
3232    return;
3233
3234  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
3235  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
3236      MF.getSubtarget().getRegisterInfo());
3237  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
3238  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3239  unsigned UnspilledCSGPR = AArch64::NoRegister;
3240  unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
3241
3242  MachineFrameInfo &MFI = MF.getFrameInfo();
3243  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
3244
3245  unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
3246                                ? RegInfo->getBaseRegister()
3247                                : (unsigned)AArch64::NoRegister;
3248
3249  unsigned ExtraCSSpill = 0;
3250  bool HasUnpairedGPR64 = false;
3251  // Figure out which callee-saved registers to save/restore.
3252  for (unsigned i = 0; CSRegs[i]; ++i) {
3253    const unsigned Reg = CSRegs[i];
3254
3255    // Add the base pointer register to SavedRegs if it is callee-save.
3256    if (Reg == BasePointerReg)
3257      SavedRegs.set(Reg);
3258
3259    bool RegUsed = SavedRegs.test(Reg);
3260    unsigned PairedReg = AArch64::NoRegister;
3261    const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg);
3262    if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) ||
3263        AArch64::FPR128RegClass.contains(Reg)) {
3264      // Compensate for odd numbers of GP CSRs.
3265      // For now, all the known cases of an odd number of CSRs involve GPRs.
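      // With an even count, slot i simply pairs with slot (i ^ 1), e.g. slot 4
      // with slot 5 and vice versa; once an unpaired GPR has been seen, the
      // pairing below is shifted by one slot to compensate.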
3266      if (HasUnpairedGPR64)
3267        PairedReg = CSRegs[i % 2 == 0 ? i - 1 : i + 1];
3268      else
3269        PairedReg = CSRegs[i ^ 1];
3270    }
3271
3272    // If the function requires saving all of the GP registers (SavedRegs),
3273    // and there is an odd number of GP CSRs at the same time (CSRegs),
3274    // PairedReg could be in a different register class from Reg, which would
3275    // lead to an FPR (usually D8) accidentally being marked as saved.
3276    if (RegIsGPR64 && !AArch64::GPR64RegClass.contains(PairedReg)) {
3277      PairedReg = AArch64::NoRegister;
3278      HasUnpairedGPR64 = true;
3279    }
3280    assert(PairedReg == AArch64::NoRegister ||
3281           AArch64::GPR64RegClass.contains(Reg, PairedReg) ||
3282           AArch64::FPR64RegClass.contains(Reg, PairedReg) ||
3283           AArch64::FPR128RegClass.contains(Reg, PairedReg));
3284
3285    if (!RegUsed) {
3286      if (AArch64::GPR64RegClass.contains(Reg) &&
3287          !RegInfo->isReservedReg(MF, Reg)) {
3288        UnspilledCSGPR = Reg;
3289        UnspilledCSGPRPaired = PairedReg;
3290      }
3291      continue;
3292    }
3293
3294    // MachO's compact unwind format relies on all registers being stored in
3295    // pairs.
3296    // FIXME: the usual format is actually better if unwinding isn't needed.
3297    if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
3298        !SavedRegs.test(PairedReg)) {
3299      SavedRegs.set(PairedReg);
3300      if (AArch64::GPR64RegClass.contains(PairedReg) &&
3301          !RegInfo->isReservedReg(MF, PairedReg))
3302        ExtraCSSpill = PairedReg;
3303    }
3304  }
3305
3306  if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
3307      !Subtarget.isTargetWindows()) {
3308    // For the Windows calling convention on a non-Windows OS, where X18 is
3309    // treated as reserved, back up X18 when entering non-Windows code (marked
3310    // with the Windows calling convention) and restore it when returning,
3311    // regardless of whether the individual function uses it - it might call
3312    // other functions that clobber it.
3313    SavedRegs.set(AArch64::X18);
3314  }
3315
3316  // Calculate the callee-saved stack size.
3317  unsigned CSStackSize = 0;
3318  unsigned SVECSStackSize = 0;
3319  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3320  const MachineRegisterInfo &MRI = MF.getRegInfo();
3321  for (unsigned Reg : SavedRegs.set_bits()) {
3322    auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
3323    if (AArch64::PPRRegClass.contains(Reg) ||
3324        AArch64::ZPRRegClass.contains(Reg))
3325      SVECSStackSize += RegSize;
3326    else
3327      CSStackSize += RegSize;
3328  }
3329
3330  // Save the number of saved regs, so we can easily update CSStackSize later.
3331  unsigned NumSavedRegs = SavedRegs.count();
3332
3333  // The frame record needs to be created by saving the appropriate registers.
3334  uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
3335  if (hasFP(MF) ||
3336      windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
3337    SavedRegs.set(AArch64::FP);
3338    SavedRegs.set(AArch64::LR);
3339  }
3340
3341  LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
3342             for (unsigned Reg
3343                  : SavedRegs.set_bits()) dbgs()
3344             << ' ' << printReg(Reg, RegInfo);
3345             dbgs() << "\n";);
3346
3347  // If any callee-saved registers are used, the frame cannot be eliminated.
3348  int64_t SVEStackSize =
3349      alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
3350  bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
3351
3352  // The CSR spill slots have not been allocated yet, so estimateStackSize
3353  // won't include them.
3354  unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
3355
3356  // We may address some of the stack above the canonical frame address, either
3357  // for our own arguments or during a call. Include that in calculating whether
3358  // we have complicated addressing concerns.
3359  int64_t CalleeStackUsed = 0;
3360  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) {
3361    int64_t FixedOff = MFI.getObjectOffset(I);
3362    if (FixedOff > CalleeStackUsed) CalleeStackUsed = FixedOff;
3363  }
3364
3365  // Conservatively always assume BigStack when there are SVE spills.
3366  bool BigStack = SVEStackSize || (EstimatedStackSize + CSStackSize +
3367                                   CalleeStackUsed) > EstimatedStackSizeLimit;
3368  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
3369    AFI->setHasStackFrame(true);
3370
3371  // Estimate if we might need to scavenge a register at some point in order
3372  // to materialize a stack offset. If so, either spill one additional
3373  // callee-saved register or reserve a special spill slot to facilitate
3374  // register scavenging. If we already spilled an extra callee-saved register
3375  // above to keep the number of spills even, we don't need to do anything else
3376  // here.
3377  if (BigStack) {
3378    if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
3379      LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
3380                        << " to get a scratch register.\n");
3381      SavedRegs.set(UnspilledCSGPR);
3382      ExtraCSSpill = UnspilledCSGPR;
3383
3384      // MachO's compact unwind format relies on all registers being stored in
3385      // pairs, so if we need to spill one extra for BigStack, then we need to
3386      // store the pair.
3387      if (producePairRegisters(MF)) {
3388        if (UnspilledCSGPRPaired == AArch64::NoRegister) {
3389          // Failed to make a pair for compact unwind format, revert spilling.
3390          if (produceCompactUnwindFrame(MF)) {
3391            SavedRegs.reset(UnspilledCSGPR);
3392            ExtraCSSpill = AArch64::NoRegister;
3393          }
3394        } else
3395          SavedRegs.set(UnspilledCSGPRPaired);
3396      }
3397    }
3398
3399    // If we didn't find an extra callee-saved register to spill, create
3400    // an emergency spill slot.
3401    if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
3402      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3403      const TargetRegisterClass &RC = AArch64::GPR64RegClass;
3404      unsigned Size = TRI->getSpillSize(RC);
3405      Align Alignment = TRI->getSpillAlign(RC);
3406      int FI = MFI.CreateStackObject(Size, Alignment, false);
3407      RS->addScavengingFrameIndex(FI);
3408      LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
3409                        << " as the emergency spill slot.\n");
3410    }
3411  }
3412
3413  // Add the size of any additional 64-bit GPR saves.
3414  CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
3415
3416  // A Swift asynchronous context extends the frame record with a pointer
3417  // directly before FP.
3418  if (hasFP(MF) && AFI->hasSwiftAsyncContext())
3419    CSStackSize += 8;
3420
3421  uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
3422  LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
3423               << EstimatedStackSize + AlignedCSStackSize
3424               << " bytes.\n");
3425
3426  assert((!MFI.isCalleeSavedInfoValid() ||
3427          AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
3428         "Should not invalidate callee saved info");
3429
3430  // Round up to register pair alignment to avoid additional SP adjustment
3431  // instructions.
3432  AFI->setCalleeSavedStackSize(AlignedCSStackSize);
3433  AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
3434  AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
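  // For example, an odd number of 8-byte GPR saves leaves CSStackSize 8 bytes
  // short of a 16-byte multiple; the flag set above records that gap so the
  // register-pair layout and stack-slot scavenging can make use of it.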
3435}
3436
3437bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
3438    MachineFunction &MF, const TargetRegisterInfo *RegInfo,
3439    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
3440    unsigned &MaxCSFrameIndex) const {
3441  bool NeedsWinCFI = needsWinCFI(MF);
3442  // To match the canonical windows frame layout, reverse the list of
3443  // callee saved registers to get them laid out by PrologEpilogInserter
3444  // in the right order. (PrologEpilogInserter allocates stack objects top
3445  // down. Windows canonical prologs store higher numbered registers at
3446  // the top, thus have the CSI array start from the highest registers.)
3447  if (NeedsWinCFI)
3448    std::reverse(CSI.begin(), CSI.end());
3449
3450  if (CSI.empty())
3451    return true; // Early exit if no callee saved registers are modified!
3452
3453  // Now that we know which registers need to be saved and restored, allocate
3454  // stack slots for them.
3455  MachineFrameInfo &MFI = MF.getFrameInfo();
3456  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
3457
3458  bool UsesWinAAPCS = isTargetWindows(MF);
3459  if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) {
3460    int FrameIdx = MFI.CreateStackObject(8, Align(16), true);
3461    AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
3462    if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3463    if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3464  }
3465
3466  for (auto &CS : CSI) {
3467    Register Reg = CS.getReg();
3468    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
3469
3470    unsigned Size = RegInfo->getSpillSize(*RC);
3471    Align Alignment(RegInfo->getSpillAlign(*RC));
3472    int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
3473    CS.setFrameIdx(FrameIdx);
3474
3475    if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3476    if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3477
3478    // Grab 8 bytes below FP for the extended asynchronous frame info.
3479    if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS &&
3480        Reg == AArch64::FP) {
3481      FrameIdx = MFI.CreateStackObject(8, Alignment, true);
3482      AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
3483      if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3484      if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3485    }
3486  }
3487  return true;
3488}
3489
3490bool AArch64FrameLowering::enableStackSlotScavenging(
3491    const MachineFunction &MF) const {
3492  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3493  // If the function has streaming-mode changes, don't scavenge a
3494  // spillslot in the callee-save area, as that might require an
3495  // 'addvl' in the streaming-mode-changing call-sequence when the
3496  // function doesn't use a FP.
3497  if (AFI->hasStreamingModeChanges() && !hasFP(MF))
3498    return false;
3499  return AFI->hasCalleeSaveStackFreeSpace();
3500}
3501
3502/// Returns true if there are any SVE callee saves.
3503static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
3504                                      int &Min, int &Max) {
3505  Min = std::numeric_limits<int>::max();
3506  Max = std::numeric_limits<int>::min();
3507
3508  if (!MFI.isCalleeSavedInfoValid())
3509    return false;
3510
3511  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
3512  for (auto &CS : CSI) {
3513    if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
3514        AArch64::PPRRegClass.contains(CS.getReg())) {
3515      assert((Max == std::numeric_limits<int>::min() ||
3516              Max + 1 == CS.getFrameIdx()) &&
3517             "SVE CalleeSaves are not consecutive");
3518
3519      Min = std::min(Min, CS.getFrameIdx());
3520      Max = std::max(Max, CS.getFrameIdx());
3521    }
3522  }
3523  return Min != std::numeric_limits<int>::max();
3524}
3525
3526// Process all the SVE stack objects and determine offsets for each
3527// object. If AssignOffsets is true, the offsets get assigned.
3528// Fills in the first and last callee-saved frame indices into
3529// Min/MaxCSFrameIndex, respectively.
3530// Returns the size of the SVE stack area.
3531static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
3532                                              int &MinCSFrameIndex,
3533                                              int &MaxCSFrameIndex,
3534                                              bool AssignOffsets) {
3535#ifndef NDEBUG
3536  // First process all fixed stack objects.
3537  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
3538    assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
3539           "SVE vectors should never be passed on the stack by value, only by "
3540           "reference.");
3541#endif
3542
3543  auto Assign = [&MFI](int FI, int64_t Offset) {
3544    LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
3545    MFI.setObjectOffset(FI, Offset);
3546  };
3547
3548  int64_t Offset = 0;
3549
3550  // Then process all callee saved slots.
3551  if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
3552    // Assign offsets to the callee save slots.
3553    for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
3554      Offset += MFI.getObjectSize(I);
3555      Offset = alignTo(Offset, MFI.getObjectAlign(I));
3556      if (AssignOffsets)
3557        Assign(I, -Offset);
3558    }
3559  }
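  // At this point Offset is the running size of the SVE callee-save area;
  // e.g. two 16-byte ZPR save slots end up at offsets -16 and -32, leaving
  // Offset == 32.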
3560
3561  // Ensure that the callee-save area is aligned to 16 bytes.
3562  Offset = alignTo(Offset, Align(16U));
3563
3564  // Create a buffer of SVE objects to allocate and sort it.
3565  SmallVector<int, 8> ObjectsToAllocate;
3566  // If we have a stack protector, and we've previously decided that, because
3567  // there are SVE objects on the stack, it needs to go in the SVE stack area,
3568  // then it needs to be allocated first.
3569  int StackProtectorFI = -1;
3570  if (MFI.hasStackProtectorIndex()) {
3571    StackProtectorFI = MFI.getStackProtectorIndex();
3572    if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector)
3573      ObjectsToAllocate.push_back(StackProtectorFI);
3574  }
3575  for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
3576    unsigned StackID = MFI.getStackID(I);
3577    if (StackID != TargetStackID::ScalableVector)
3578      continue;
3579    if (I == StackProtectorFI)
3580      continue;
3581    if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
3582      continue;
3583    if (MFI.isDeadObjectIndex(I))
3584      continue;
3585
3586    ObjectsToAllocate.push_back(I);
3587  }
3588
3589  // Allocate all SVE locals and spills
3590  for (unsigned FI : ObjectsToAllocate) {
3591    Align Alignment = MFI.getObjectAlign(FI);
3592    // FIXME: Given that the length of SVE vectors is not necessarily a power of
3593    // two, we'd need to align every object dynamically at runtime if the
3594    // alignment is larger than 16. This is not yet supported.
3595    if (Alignment > Align(16))
3596      report_fatal_error(
3597          "Alignment of scalable vectors > 16 bytes is not yet supported");
3598
3599    Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
3600    if (AssignOffsets)
3601      Assign(FI, -Offset);
3602  }
3603
3604  return Offset;
3605}
3606
3607int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
3608    MachineFrameInfo &MFI) const {
3609  int MinCSFrameIndex, MaxCSFrameIndex;
3610  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
3611}
3612
3613int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
3614    MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
3615  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
3616                                        true);
3617}
3618
3619void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
3620    MachineFunction &MF, RegScavenger *RS) const {
3621  MachineFrameInfo &MFI = MF.getFrameInfo();
3622
3623  assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
3624         "Upwards growing stack unsupported");
3625
3626  int MinCSFrameIndex, MaxCSFrameIndex;
3627  int64_t SVEStackSize =
3628      assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
3629
3630  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3631  AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
3632  AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
3633
3634  // If this function isn't doing Win64-style C++ EH, we don't need to do
3635  // anything.
3636  if (!MF.hasEHFunclets())
3637    return;
3638  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3639  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
3640
3641  MachineBasicBlock &MBB = MF.front();
3642  auto MBBI = MBB.begin();
3643  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
3644    ++MBBI;
3645
3646  // Create an UnwindHelp object.
3647  // The UnwindHelp object is allocated at the start of the fixed object area.
3648  int64_t FixedObject =
3649      getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
3650  int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
3651                                           /*SPOffset*/ -FixedObject,
3652                                           /*IsImmutable=*/false);
3653  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
3654
3655  // We need to store -2 into the UnwindHelp object at the start of the
3656  // function.
3657  DebugLoc DL;
3658  RS->enterBasicBlockEnd(MBB);
3659  RS->backward(MBBI);
3660  Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
3661  assert(DstReg && "There must be a free register after frame setup");
3662  BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
3663  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
3664      .addReg(DstReg, getKillRegState(true))
3665      .addFrameIndex(UnwindHelpFI)
3666      .addImm(0);
3667}
3668
3669namespace {
3670struct TagStoreInstr {
3671  MachineInstr *MI;
3672  int64_t Offset, Size;
3673  explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
3674      : MI(MI), Offset(Offset), Size(Size) {}
3675};
3676
3677class TagStoreEdit {
3678  MachineFunction *MF;
3679  MachineBasicBlock *MBB;
3680  MachineRegisterInfo *MRI;
3681  // Tag store instructions that are being replaced.
3682  SmallVector<TagStoreInstr, 8> TagStores;
3683  // Combined memref arguments of the above instructions.
3684  SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
3685
3686  // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
3687  // FrameRegOffset + Size) with the address tag of SP.
3688  Register FrameReg;
3689  StackOffset FrameRegOffset;
3690  int64_t Size;
3691  // If not std::nullopt, move FrameReg to (FrameReg + FrameRegUpdate) at the
3692  // end.
3693  std::optional<int64_t> FrameRegUpdate;
3694  // MIFlags for any FrameReg updating instructions.
3695  unsigned FrameRegUpdateFlags;
3696
3697  // Use zeroing instruction variants.
3698  bool ZeroData;
3699  DebugLoc DL;
3700
3701  void emitUnrolled(MachineBasicBlock::iterator InsertI);
3702  void emitLoop(MachineBasicBlock::iterator InsertI);
3703
3704public:
3705  TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
3706      : MBB(MBB), ZeroData(ZeroData) {
3707    MF = MBB->getParent();
3708    MRI = &MF->getRegInfo();
3709  }
3710  // Add an instruction to be replaced. Instructions must be added in
3711  // ascending order of Offset, and have to be adjacent.
3712  void addInstruction(TagStoreInstr I) {
3713    assert((TagStores.empty() ||
3714            TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
3715           "Non-adjacent tag store instructions.");
3716    TagStores.push_back(I);
3717  }
3718  void clear() { TagStores.clear(); }
3719  // Emit equivalent code at the given location, and erase the current set of
3720  // instructions. May skip if the replacement is not profitable. May invalidate
3721  // the input iterator and replace it with a valid one.
3722  void emitCode(MachineBasicBlock::iterator &InsertI,
3723                const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
3724};
3725
3726void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
3727  const AArch64InstrInfo *TII =
3728      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3729
3730  const int64_t kMinOffset = -256 * 16;
3731  const int64_t kMaxOffset = 255 * 16;
3732
3733  Register BaseReg = FrameReg;
3734  int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
3735  if (BaseRegOffsetBytes < kMinOffset ||
3736      BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset ||
3737      // BaseReg can be FP, which is not necessarily aligned to 16-bytes. In
3738      // that case, BaseRegOffsetBytes will not be aligned to 16 bytes, which
3739      // is required for the offset of ST2G.
3740      BaseRegOffsetBytes % 16 != 0) {
3741    Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3742    emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
3743                    StackOffset::getFixed(BaseRegOffsetBytes), TII);
3744    BaseReg = ScratchReg;
3745    BaseRegOffsetBytes = 0;
3746  }
3747
3748  MachineInstr *LastI = nullptr;
3749  while (Size) {
3750    int64_t InstrSize = (Size > 16) ? 32 : 16;
3751    unsigned Opcode =
3752        InstrSize == 16
3753            ? (ZeroData ? AArch64::STZGi : AArch64::STGi)
3754            : (ZeroData ? AArch64::STZ2Gi : AArch64::ST2Gi);
3755    assert(BaseRegOffsetBytes % 16 == 0);
3756    MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
3757                          .addReg(AArch64::SP)
3758                          .addReg(BaseReg)
3759                          .addImm(BaseRegOffsetBytes / 16)
3760                          .setMemRefs(CombinedMemRefs);
3761    // A store to [BaseReg, #0] should go last for an opportunity to fold the
3762    // final SP adjustment in the epilogue.
3763    if (BaseRegOffsetBytes == 0)
3764      LastI = I;
3765    BaseRegOffsetBytes += InstrSize;
3766    Size -= InstrSize;
3767  }
3768
3769  if (LastI)
3770    MBB->splice(InsertI, MBB, LastI);
3771}
3772
3773void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
3774  const AArch64InstrInfo *TII =
3775      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3776
3777  Register BaseReg = FrameRegUpdate
3778                         ? FrameReg
3779                         : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3780  Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3781
3782  emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
3783
3784  int64_t LoopSize = Size;
3785  // If the loop size is not a multiple of 32, split off one 16-byte store at
3786  // the end to fold BaseReg update into.
3787  if (FrameRegUpdate && *FrameRegUpdate)
3788    LoopSize -= LoopSize % 32;
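  // E.g. Size == 80 gives LoopSize == 64 here, leaving a single trailing
  // 16-byte store (emitted below) to carry the post-index update of BaseReg.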
3789  MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
3790                                TII->get(ZeroData ? AArch64::STZGloop_wback
3791                                                  : AArch64::STGloop_wback))
3792                            .addDef(SizeReg)
3793                            .addDef(BaseReg)
3794                            .addImm(LoopSize)
3795                            .addReg(BaseReg)
3796                            .setMemRefs(CombinedMemRefs);
3797  if (FrameRegUpdate)
3798    LoopI->setFlags(FrameRegUpdateFlags);
3799
3800  int64_t ExtraBaseRegUpdate =
3801      FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
3802  if (LoopSize < Size) {
3803    assert(FrameRegUpdate);
3804    assert(Size - LoopSize == 16);
3805    // Tag 16 more bytes at BaseReg and update BaseReg.
3806    BuildMI(*MBB, InsertI, DL,
3807            TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
3808        .addDef(BaseReg)
3809        .addReg(BaseReg)
3810        .addReg(BaseReg)
3811        .addImm(1 + ExtraBaseRegUpdate / 16)
3812        .setMemRefs(CombinedMemRefs)
3813        .setMIFlags(FrameRegUpdateFlags);
3814  } else if (ExtraBaseRegUpdate) {
3815    // Update BaseReg.
3816    BuildMI(
3817        *MBB, InsertI, DL,
3818        TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
3819        .addDef(BaseReg)
3820        .addReg(BaseReg)
3821        .addImm(std::abs(ExtraBaseRegUpdate))
3822        .addImm(0)
3823        .setMIFlags(FrameRegUpdateFlags);
3824  }
3825}
3826
3827// Check if *II is a register update that can be merged into the STGloop that
3828// ends at (Reg + Size). If it can, *TotalOffset is set to the total offset of
3829// the update instruction so that the caller can fold it into the loop.
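// For example, an "add sp, sp, #48" (with Reg == SP) following a loop that
// tags 32 bytes leaves a residual adjustment of 16, which is 16-byte aligned
// and within the unshifted ADDXri range, so the update can be folded and
// *TotalOffset set to 48.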
3830bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
3831                       int64_t Size, int64_t *TotalOffset) {
3832  MachineInstr &MI = *II;
3833  if ((MI.getOpcode() == AArch64::ADDXri ||
3834       MI.getOpcode() == AArch64::SUBXri) &&
3835      MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
3836    unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
3837    int64_t Offset = MI.getOperand(2).getImm() << Shift;
3838    if (MI.getOpcode() == AArch64::SUBXri)
3839      Offset = -Offset;
3840    int64_t AbsPostOffset = std::abs(Offset - Size);
3841    const int64_t kMaxOffset =
3842        0xFFF; // Max encoding for unshifted ADDXri / SUBXri
3843    if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
3844      *TotalOffset = Offset;
3845      return true;
3846    }
3847  }
3848  return false;
3849}
3850
3851void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
3852                  SmallVectorImpl<MachineMemOperand *> &MemRefs) {
3853  MemRefs.clear();
3854  for (auto &TS : TSE) {
3855    MachineInstr *MI = TS.MI;
3856    // An instruction without memory operands may access anything. Be
3857    // conservative and return an empty list.
3858    if (MI->memoperands_empty()) {
3859      MemRefs.clear();
3860      return;
3861    }
3862    MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
3863  }
3864}
3865
3866void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
3867                            const AArch64FrameLowering *TFI,
3868                            bool TryMergeSPUpdate) {
3869  if (TagStores.empty())
3870    return;
3871  TagStoreInstr &FirstTagStore = TagStores[0];
3872  TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
3873  Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
3874  DL = TagStores[0].MI->getDebugLoc();
3875
3876  Register Reg;
3877  FrameRegOffset = TFI->resolveFrameOffsetReference(
3878      *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
3879      /*PreferFP=*/false, /*ForSimm=*/true);
3880  FrameReg = Reg;
3881  FrameRegUpdate = std::nullopt;
3882
3883  mergeMemRefs(TagStores, CombinedMemRefs);
3884
3885  LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
3886             for (const auto &Instr
3887                  : TagStores) { dbgs() << "  " << *Instr.MI; });
3888
3889  // Size threshold where a loop becomes shorter than a linear sequence of
3890  // tagging instructions.
3891  const int kSetTagLoopThreshold = 176;
3892  if (Size < kSetTagLoopThreshold) {
3893    if (TagStores.size() < 2)
3894      return;
3895    emitUnrolled(InsertI);
3896  } else {
3897    MachineInstr *UpdateInstr = nullptr;
3898    int64_t TotalOffset = 0;
3899    if (TryMergeSPUpdate) {
3900      // See if we can merge base register update into the STGloop.
3901      // This is done in AArch64LoadStoreOptimizer for "normal" stores,
3902      // but STGloop is way too unusual for that, and also it only
3903      // realistically happens in function epilogue. Also, STGloop is expanded
3904      // before that pass.
3905      if (InsertI != MBB->end() &&
3906          canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
3907                            &TotalOffset)) {
3908        UpdateInstr = &*InsertI++;
3909        LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
3910                          << *UpdateInstr);
3911      }
3912    }
3913
3914    if (!UpdateInstr && TagStores.size() < 2)
3915      return;
3916
3917    if (UpdateInstr) {
3918      FrameRegUpdate = TotalOffset;
3919      FrameRegUpdateFlags = UpdateInstr->getFlags();
3920    }
3921    emitLoop(InsertI);
3922    if (UpdateInstr)
3923      UpdateInstr->eraseFromParent();
3924  }
3925
3926  for (auto &TS : TagStores)
3927    TS.MI->eraseFromParent();
3928}
3929
3930bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
3931                                        int64_t &Size, bool &ZeroData) {
3932  MachineFunction &MF = *MI.getParent()->getParent();
3933  const MachineFrameInfo &MFI = MF.getFrameInfo();
3934
3935  unsigned Opcode = MI.getOpcode();
3936  ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGi ||
3937              Opcode == AArch64::STZ2Gi);
3938
3939  if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
3940    if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
3941      return false;
3942    if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
3943      return false;
3944    Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
3945    Size = MI.getOperand(2).getImm();
3946    return true;
3947  }
3948
3949  if (Opcode == AArch64::STGi || Opcode == AArch64::STZGi)
3950    Size = 16;
3951  else if (Opcode == AArch64::ST2Gi || Opcode == AArch64::STZ2Gi)
3952    Size = 32;
3953  else
3954    return false;
3955
3956  if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
3957    return false;
3958
3959  Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
3960           16 * MI.getOperand(2).getImm();
3961  return true;
3962}
3963
3964// Detect a run of memory tagging instructions for adjacent stack frame slots,
3965// and replace them with a shorter instruction sequence:
3966// * replace STG + STG with ST2G
3967// * replace STGloop + STGloop with STGloop
3968// This code needs to run when stack slot offsets are already known, but before
3969// FrameIndex operands in STG instructions are eliminated.
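//
// For example (illustrative): two 16-byte STG stores to adjacent stack slots
// can be combined into one ST2G covering both slots, and a longer contiguous
// run of tag stores can be covered by a single STGloop.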
3970MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
3971                                                const AArch64FrameLowering *TFI,
3972                                                RegScavenger *RS) {
3973  bool FirstZeroData;
3974  int64_t Size, Offset;
3975  MachineInstr &MI = *II;
3976  MachineBasicBlock *MBB = MI.getParent();
3977  MachineBasicBlock::iterator NextI = ++II;
3978  if (&MI == &MBB->instr_back())
3979    return II;
3980  if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
3981    return II;
3982
3983  SmallVector<TagStoreInstr, 4> Instrs;
3984  Instrs.emplace_back(&MI, Offset, Size);
3985
3986  constexpr int kScanLimit = 10;
3987  int Count = 0;
3988  for (MachineBasicBlock::iterator E = MBB->end();
3989       NextI != E && Count < kScanLimit; ++NextI) {
3990    MachineInstr &MI = *NextI;
3991    bool ZeroData;
3992    int64_t Size, Offset;
3993    // Collect instructions that update memory tags with a FrameIndex operand
3994    // and (when applicable) constant size, and whose output registers are dead
3995    // (the latter is almost always the case in practice). Since these
3996    // instructions effectively have no inputs or outputs, we are free to skip
3997    // any non-aliasing instructions in between without tracking used registers.
3998    if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
3999      if (ZeroData != FirstZeroData)
4000        break;
4001      Instrs.emplace_back(&MI, Offset, Size);
4002      continue;
4003    }
4004
4005    // Only count non-transient, non-tagging instructions toward the scan
4006    // limit.
4007    if (!MI.isTransient())
4008      ++Count;
4009
    // Just in case, stop before prologue or epilogue code starts.
4011    if (MI.getFlag(MachineInstr::FrameSetup) ||
4012        MI.getFlag(MachineInstr::FrameDestroy))
4013      break;
4014
4015    // Reject anything that may alias the collected instructions.
4016    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
4017      break;
4018  }
4019
4020  // New code will be inserted after the last tagging instruction we've found.
4021  MachineBasicBlock::iterator InsertI = Instrs.back().MI;
4022
  // All the gathered stack tag instructions are merged and placed after the
  // last tag store in the list. Before inserting there, check whether the
  // NZCV flag is live at that point; if it is, bail out, because an STG loop
  // emitted by the merge could clobber it.

  // FIXME: This bail-out is conservative: the liveness check is performed
  // even when the merged sequence would contain no STG loops, in which case
  // it is unnecessary.
4031  LivePhysRegs LiveRegs(*(MBB->getParent()->getSubtarget().getRegisterInfo()));
4032  LiveRegs.addLiveOuts(*MBB);
4033  for (auto I = MBB->rbegin();; ++I) {
4034    MachineInstr &MI = *I;
4035    if (MI == InsertI)
4036      break;
4037    LiveRegs.stepBackward(*I);
4038  }
4039  InsertI++;
4040  if (LiveRegs.contains(AArch64::NZCV))
4041    return InsertI;
4042
4043  llvm::stable_sort(Instrs,
4044                    [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
4045                      return Left.Offset < Right.Offset;
4046                    });
4047
4048  // Make sure that we don't have any overlapping stores.
4049  int64_t CurOffset = Instrs[0].Offset;
4050  for (auto &Instr : Instrs) {
4051    if (CurOffset > Instr.Offset)
4052      return NextI;
4053    CurOffset = Instr.Offset + Instr.Size;
4054  }
4055
  // Find contiguous runs of tagged memory and emit shorter instruction
  // sequences for them when possible.
4058  TagStoreEdit TSE(MBB, FirstZeroData);
4059  std::optional<int64_t> EndOffset;
4060  for (auto &Instr : Instrs) {
4061    if (EndOffset && *EndOffset != Instr.Offset) {
4062      // Found a gap.
4063      TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false);
4064      TSE.clear();
4065    }
4066
4067    TSE.addInstruction(Instr);
4068    EndOffset = Instr.Offset + Instr.Size;
4069  }
4070
4071  const MachineFunction *MF = MBB->getParent();
4072  // Multiple FP/SP updates in a loop cannot be described by CFI instructions.
4073  TSE.emitCode(
4074      InsertI, TFI, /*TryMergeSPUpdate = */
4075      !MF->getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(*MF));
4076
4077  return InsertI;
4078}
4079} // namespace
4080
4081void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
4082    MachineFunction &MF, RegScavenger *RS = nullptr) const {
4083  if (StackTaggingMergeSetTag)
4084    for (auto &BB : MF)
4085      for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
4086        II = tryMergeAdjacentSTG(II, this, RS);
4087}
4088
4089/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
4090/// before the update.  This is easily retrieved as it is exactly the offset
4091/// that is set in processFunctionBeforeFrameFinalized.
4092StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
4093    const MachineFunction &MF, int FI, Register &FrameReg,
4094    bool IgnoreSPUpdates) const {
4095  const MachineFrameInfo &MFI = MF.getFrameInfo();
4096  if (IgnoreSPUpdates) {
4097    LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
4098                      << MFI.getObjectOffset(FI) << "\n");
4099    FrameReg = AArch64::SP;
4100    return StackOffset::getFixed(MFI.getObjectOffset(FI));
4101  }
4102
4103  // Go to common code if we cannot provide sp + offset.
4104  if (MFI.hasVarSizedObjects() ||
4105      MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() ||
4106      MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
4107    return getFrameIndexReference(MF, FI, FrameReg);
4108
4109  FrameReg = AArch64::SP;
4110  return getStackOffset(MF, MFI.getObjectOffset(FI));
4111}
4112
/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
/// the parent's frame pointer.
4115unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
4116    const MachineFunction &MF) const {
4117  return 0;
4118}
4119
4120/// Funclets only need to account for space for the callee saved registers,
4121/// as the locals are accounted for in the parent's stack frame.
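///
/// For example (illustrative): with 48 bytes of pushed callee saves, a
/// 24-byte maximum call frame, and 16-byte stack alignment, the funclet frame
/// size is alignTo(72, 16) = 80 bytes.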
4122unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
4123    const MachineFunction &MF) const {
4124  // This is the size of the pushed CSRs.
4125  unsigned CSSize =
4126      MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
4127  // This is the amount of stack a funclet needs to allocate.
4128  return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
4129                 getStackAlign());
4130}
4131
4132namespace {
4133struct FrameObject {
4134  bool IsValid = false;
4135  // Index of the object in MFI.
4136  int ObjectIndex = 0;
4137  // Group ID this object belongs to.
4138  int GroupIndex = -1;
4139  // This object should be placed first (closest to SP).
4140  bool ObjectFirst = false;
4141  // This object's group (which always contains the object with
4142  // ObjectFirst==true) should be placed first.
4143  bool GroupFirst = false;
4144};
4145
4146class GroupBuilder {
4147  SmallVector<int, 8> CurrentMembers;
4148  int NextGroupIndex = 0;
4149  std::vector<FrameObject> &Objects;
4150
4151public:
4152  GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
4153  void AddMember(int Index) { CurrentMembers.push_back(Index); }
4154  void EndCurrentGroup() {
4155    if (CurrentMembers.size() > 1) {
      // Create a new group with the current member list. This might remove
      // them from their pre-existing groups; that's OK, because dealing with
      // overlapping groups is too hard and unlikely to make a difference.
4159      LLVM_DEBUG(dbgs() << "group:");
4160      for (int Index : CurrentMembers) {
4161        Objects[Index].GroupIndex = NextGroupIndex;
4162        LLVM_DEBUG(dbgs() << " " << Index);
4163      }
4164      LLVM_DEBUG(dbgs() << "\n");
4165      NextGroupIndex++;
4166    }
4167    CurrentMembers.clear();
4168  }
4169};
4170
4171bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
4172  // Objects at a lower index are closer to FP; objects at a higher index are
4173  // closer to SP.
4174  //
4175  // For consistency in our comparison, all invalid objects are placed
4176  // at the end. This also allows us to stop walking when we hit the
4177  // first invalid item after it's all sorted.
4178  //
4179  // The "first" object goes first (closest to SP), followed by the members of
4180  // the "first" group.
4181  //
4182  // The rest are sorted by the group index to keep the groups together.
4183  // Higher numbered groups are more likely to be around longer (i.e. untagged
4184  // in the function epilogue and not at some earlier point). Place them closer
4185  // to SP.
4186  //
4187  // If all else equal, sort by the object index to keep the objects in the
4188  // original order.
4189  return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
4190                         A.ObjectIndex) <
4191         std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
4192                         B.ObjectIndex);
4193}
4194} // namespace
4195
4196void AArch64FrameLowering::orderFrameObjects(
4197    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
4198  if (!OrderFrameObjects || ObjectsToAllocate.empty())
4199    return;
4200
4201  const MachineFrameInfo &MFI = MF.getFrameInfo();
4202  std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
4203  for (auto &Obj : ObjectsToAllocate) {
4204    FrameObjects[Obj].IsValid = true;
4205    FrameObjects[Obj].ObjectIndex = Obj;
4206  }
4207
4208  // Identify stack slots that are tagged at the same time.
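  // For example (illustrative): a straight-line run of STG instructions
  // tagging frame indices 1 and 2 places those two slots in one group; the
  // next non-tagging instruction (debug instructions aside), or the end of
  // the block, closes the group.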
4209  GroupBuilder GB(FrameObjects);
4210  for (auto &MBB : MF) {
4211    for (auto &MI : MBB) {
4212      if (MI.isDebugInstr())
4213        continue;
4214      int OpIndex;
4215      switch (MI.getOpcode()) {
4216      case AArch64::STGloop:
4217      case AArch64::STZGloop:
4218        OpIndex = 3;
4219        break;
4220      case AArch64::STGi:
4221      case AArch64::STZGi:
4222      case AArch64::ST2Gi:
4223      case AArch64::STZ2Gi:
4224        OpIndex = 1;
4225        break;
4226      default:
4227        OpIndex = -1;
4228      }
4229
4230      int TaggedFI = -1;
4231      if (OpIndex >= 0) {
4232        const MachineOperand &MO = MI.getOperand(OpIndex);
4233        if (MO.isFI()) {
4234          int FI = MO.getIndex();
4235          if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
4236              FrameObjects[FI].IsValid)
4237            TaggedFI = FI;
4238        }
4239      }
4240
      // If this is a stack tagging instruction for a valid slot, add the slot
      // to the current group; any other instruction ends the current group.
4243      if (TaggedFI >= 0)
4244        GB.AddMember(TaggedFI);
4245      else
4246        GB.EndCurrentGroup();
4247    }
4248    // Groups should never span multiple basic blocks.
4249    GB.EndCurrentGroup();
4250  }
4251
4252  // If the function's tagged base pointer is pinned to a stack slot, we want to
4253  // put that slot first when possible. This will likely place it at SP + 0,
4254  // and save one instruction when generating the base pointer because IRG does
4255  // not allow an immediate offset.
4256  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
4257  std::optional<int> TBPI = AFI.getTaggedBasePointerIndex();
4258  if (TBPI) {
4259    FrameObjects[*TBPI].ObjectFirst = true;
4260    FrameObjects[*TBPI].GroupFirst = true;
4261    int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
4262    if (FirstGroupIndex >= 0)
4263      for (FrameObject &Object : FrameObjects)
4264        if (Object.GroupIndex == FirstGroupIndex)
4265          Object.GroupFirst = true;
4266  }
4267
4268  llvm::stable_sort(FrameObjects, FrameObjectCompare);
4269
4270  int i = 0;
4271  for (auto &Obj : FrameObjects) {
4272    // All invalid items are sorted at the end, so it's safe to stop.
4273    if (!Obj.IsValid)
4274      break;
4275    ObjectsToAllocate[i++] = Obj.ObjectIndex;
4276  }
4277
4278  LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
4279                                                    : FrameObjects) {
4280    if (!Obj.IsValid)
4281      break;
4282    dbgs() << "  " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
4283    if (Obj.ObjectFirst)
4284      dbgs() << ", first";
4285    if (Obj.GroupFirst)
4286      dbgs() << ", group-first";
4287    dbgs() << "\n";
4288  });
4289}
4290
/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
/// least every ProbeSize bytes. Returns an iterator to the first instruction
/// after the loop. The difference between SP and TargetReg must be an exact
/// multiple of ProbeSize.
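///
/// The emitted loop is roughly:
///   LoopMBB:
///     sub  sp, sp, #ProbeSize
///     str  xzr, [sp]
///     cmp  sp, TargetReg
///     b.ne LoopMBB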
4295MachineBasicBlock::iterator
4296AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
4297    MachineBasicBlock::iterator MBBI, int64_t ProbeSize,
4298    Register TargetReg) const {
4299  MachineBasicBlock &MBB = *MBBI->getParent();
4300  MachineFunction &MF = *MBB.getParent();
4301  const AArch64InstrInfo *TII =
4302      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
4303  DebugLoc DL = MBB.findDebugLoc(MBBI);
4304
4305  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
4306  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
4307  MF.insert(MBBInsertPoint, LoopMBB);
4308  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
4309  MF.insert(MBBInsertPoint, ExitMBB);
4310
4311  // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable
4312  // in SUB).
4313  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
4314                  StackOffset::getFixed(-ProbeSize), TII,
4315                  MachineInstr::FrameSetup);
4316  // STR XZR, [SP]
4317  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
4318      .addReg(AArch64::XZR)
4319      .addReg(AArch64::SP)
4320      .addImm(0)
4321      .setMIFlags(MachineInstr::FrameSetup);
4322  // CMP SP, TargetReg
4323  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
4324          AArch64::XZR)
4325      .addReg(AArch64::SP)
4326      .addReg(TargetReg)
4327      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
4328      .setMIFlags(MachineInstr::FrameSetup);
4329  // B.CC Loop
4330  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
4331      .addImm(AArch64CC::NE)
4332      .addMBB(LoopMBB)
4333      .setMIFlags(MachineInstr::FrameSetup);
4334
4335  LoopMBB->addSuccessor(ExitMBB);
4336  LoopMBB->addSuccessor(LoopMBB);
4337  // Synthesize the exit MBB.
4338  ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end());
4339  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
4340  MBB.addSuccessor(LoopMBB);
4341  // Update liveins.
4342  bool anyChange = false;
4343  do {
4344    anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB);
4345  } while (anyChange);
4346
4347  return ExitMBB->begin();
4348}
4349
4350void AArch64FrameLowering::inlineStackProbeFixed(
4351    MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
4352    StackOffset CFAOffset) const {
4353  MachineBasicBlock *MBB = MBBI->getParent();
4354  MachineFunction &MF = *MBB->getParent();
4355  const AArch64InstrInfo *TII =
4356      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
4357  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
4358  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
4359  bool HasFP = hasFP(MF);
4360
4361  DebugLoc DL;
4362  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
4363  int64_t NumBlocks = FrameSize / ProbeSize;
4364  int64_t ResidualSize = FrameSize % ProbeSize;
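  // For example (illustrative): with a 4096-byte probe size, a 10000-byte
  // frame splits into NumBlocks = 2 and ResidualSize = 1808.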
4365
4366  LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
4367                    << NumBlocks << " blocks of " << ProbeSize
4368                    << " bytes, plus " << ResidualSize << " bytes\n");
4369
4370  // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or
4371  // ordinary loop.
4372  if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
4373    for (int i = 0; i < NumBlocks; ++i) {
4374      // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
4375      // encodable in a SUB).
4376      emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
4377                      StackOffset::getFixed(-ProbeSize), TII,
4378                      MachineInstr::FrameSetup, false, false, nullptr,
4379                      EmitAsyncCFI && !HasFP, CFAOffset);
4380      CFAOffset += StackOffset::getFixed(ProbeSize);
4381      // STR XZR, [SP]
4382      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
4383          .addReg(AArch64::XZR)
4384          .addReg(AArch64::SP)
4385          .addImm(0)
4386          .setMIFlags(MachineInstr::FrameSetup);
4387    }
4388  } else if (NumBlocks != 0) {
    // SUB ScratchReg, SP, #(NumBlocks * ProbeSize) (or equivalent if the
    // offset is not encodable in a single SUB). ScratchReg may temporarily
    // become the CFA register.
4391    emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP,
4392                    StackOffset::getFixed(-ProbeSize * NumBlocks), TII,
4393                    MachineInstr::FrameSetup, false, false, nullptr,
4394                    EmitAsyncCFI && !HasFP, CFAOffset);
4395    CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks);
4396    MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg);
4397    MBB = MBBI->getParent();
4398    if (EmitAsyncCFI && !HasFP) {
4399      // Set the CFA register back to SP.
4400      const AArch64RegisterInfo &RegInfo =
4401          *MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
4402      unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
4403      unsigned CFIIndex =
4404          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
4405      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
4406          .addCFIIndex(CFIIndex)
4407          .setMIFlags(MachineInstr::FrameSetup);
4408    }
4409  }
4410
4411  if (ResidualSize != 0) {
4412    // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable
4413    // in SUB).
4414    emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
4415                    StackOffset::getFixed(-ResidualSize), TII,
4416                    MachineInstr::FrameSetup, false, false, nullptr,
4417                    EmitAsyncCFI && !HasFP, CFAOffset);
4418    if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
4419      // STR XZR, [SP]
4420      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
4421          .addReg(AArch64::XZR)
4422          .addReg(AArch64::SP)
4423          .addImm(0)
4424          .setMIFlags(MachineInstr::FrameSetup);
4425    }
4426  }
4427}
4428
4429void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
4430                                            MachineBasicBlock &MBB) const {
4431  // Get the instructions that need to be replaced. We emit at most two of
4432  // these. Remember them in order to avoid complications coming from the need
4433  // to traverse the block while potentially creating more blocks.
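  // PROBED_STACKALLOC carries a fixed frame size and is expanded by
  // inlineStackProbeFixed; PROBED_STACKALLOC_VAR probes down to the address
  // in its target register via TII->probedStackAlloc.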
4434  SmallVector<MachineInstr *, 4> ToReplace;
4435  for (MachineInstr &MI : MBB)
4436    if (MI.getOpcode() == AArch64::PROBED_STACKALLOC ||
4437        MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR)
4438      ToReplace.push_back(&MI);
4439
4440  for (MachineInstr *MI : ToReplace) {
4441    if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) {
4442      Register ScratchReg = MI->getOperand(0).getReg();
4443      int64_t FrameSize = MI->getOperand(1).getImm();
4444      StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(),
4445                                               MI->getOperand(3).getImm());
4446      inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize,
4447                            CFAOffset);
4448    } else {
4449      assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR &&
4450             "Stack probe pseudo-instruction expected");
4451      const AArch64InstrInfo *TII =
4452          MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
4453      Register TargetReg = MI->getOperand(0).getReg();
4454      (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true);
4455    }
4456    MI->eraseFromParent();
4457  }
4458}
4459