1//===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file implements the lowering of LLVM calls to DAG nodes.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86.h"
15#include "X86CallingConv.h"
16#include "X86FrameLowering.h"
17#include "X86ISelLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86MachineFunctionInfo.h"
20#include "X86TargetMachine.h"
21#include "X86TargetObjectFile.h"
22#include "llvm/ADT/Statistic.h"
23#include "llvm/Analysis/ObjCARCUtil.h"
24#include "llvm/CodeGen/MachineJumpTableInfo.h"
25#include "llvm/CodeGen/MachineModuleInfo.h"
26#include "llvm/CodeGen/WinEHFuncInfo.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IRBuilder.h"
29
30#define DEBUG_TYPE "x86-isel"
31
32using namespace llvm;
33
34STATISTIC(NumTailCalls, "Number of tail calls");
35
36/// Call this when the user attempts to do something unsupported, like
37/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
38/// report_fatal_error, so calling code should attempt to recover without
39/// crashing.
40static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
41                             const char *Msg) {
42  MachineFunction &MF = DAG.getMachineFunction();
43  DAG.getContext()->diagnose(
44      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
45}
46
47/// Returns true if a CC can dynamically exclude a register from the list of
48/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
49/// the return registers.
50static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
51  switch (CC) {
52  default:
53    return false;
54  case CallingConv::X86_RegCall:
55  case CallingConv::PreserveMost:
56  case CallingConv::PreserveAll:
57    return true;
58  }
59}
60
61/// Returns true if a CC can dynamically exclude a register from the list of
62/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
63/// the parameters.
64static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
65  return CC == CallingConv::X86_RegCall;
66}
67
68static std::pair<MVT, unsigned>
69handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
70                                 const X86Subtarget &Subtarget) {
71  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
72  // convention is one that uses k registers.
73  if (NumElts == 2)
74    return {MVT::v2i64, 1};
75  if (NumElts == 4)
76    return {MVT::v4i32, 1};
77  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
78      CC != CallingConv::Intel_OCL_BI)
79    return {MVT::v8i16, 1};
80  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
81      CC != CallingConv::Intel_OCL_BI)
82    return {MVT::v16i8, 1};
83  // v32i1 passes in ymm unless we have BWI and the calling convention is
84  // regcall.
85  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
86    return {MVT::v32i8, 1};
87  // Split v64i1 vectors if we don't have v64i8 available.
88  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
89    if (Subtarget.useAVX512Regs())
90      return {MVT::v64i8, 1};
91    return {MVT::v32i8, 2};
92  }
93
94  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
95  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
96      NumElts > 64)
97    return {MVT::i8, NumElts};
98
99  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
100}
101
102MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
103                                                     CallingConv::ID CC,
104                                                     EVT VT) const {
105  if (VT.isVector()) {
106    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
107      unsigned NumElts = VT.getVectorNumElements();
108
109      MVT RegisterVT;
110      unsigned NumRegisters;
111      std::tie(RegisterVT, NumRegisters) =
112          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
113      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
114        return RegisterVT;
115    }
116
117    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
118      return MVT::v8f16;
119  }
120
121  // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
122  if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
123      !Subtarget.hasX87())
124    return MVT::i32;
125
126  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
127    return getRegisterTypeForCallingConv(Context, CC,
128                                         VT.changeVectorElementType(MVT::f16));
129
130  if (VT == MVT::bf16)
131    return MVT::f16;
132
133  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
134}
135
136unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
137                                                          CallingConv::ID CC,
138                                                          EVT VT) const {
139  if (VT.isVector()) {
140    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
141      unsigned NumElts = VT.getVectorNumElements();
142
143      MVT RegisterVT;
144      unsigned NumRegisters;
145      std::tie(RegisterVT, NumRegisters) =
146          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
147      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
148        return NumRegisters;
149    }
150
151    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
152      return 1;
153  }
154
155  // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
156  // x87 is disabled.
157  if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
158    if (VT == MVT::f64)
159      return 2;
160    if (VT == MVT::f80)
161      return 3;
162  }
163
164  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
165    return getNumRegistersForCallingConv(Context, CC,
166                                         VT.changeVectorElementType(MVT::f16));
167
168  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
169}
170
171unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
172    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
173    unsigned &NumIntermediates, MVT &RegisterVT) const {
174  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
175  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
176      Subtarget.hasAVX512() &&
177      (!isPowerOf2_32(VT.getVectorNumElements()) ||
178       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
179       VT.getVectorNumElements() > 64)) {
180    RegisterVT = MVT::i8;
181    IntermediateVT = MVT::i1;
182    NumIntermediates = VT.getVectorNumElements();
183    return NumIntermediates;
184  }
185
186  // Split v64i1 vectors if we don't have v64i8 available.
187  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
188      CC != CallingConv::X86_RegCall) {
189    RegisterVT = MVT::v32i8;
190    IntermediateVT = MVT::v32i1;
191    NumIntermediates = 2;
192    return 2;
193  }
194
195  // Split vNbf16 vectors according to vNf16.
196  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
197    VT = VT.changeVectorElementType(MVT::f16);
198
199  return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
200                                              NumIntermediates, RegisterVT);
201}
202
203EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
204                                          LLVMContext& Context,
205                                          EVT VT) const {
206  if (!VT.isVector())
207    return MVT::i8;
208
209  if (Subtarget.hasAVX512()) {
210    // Figure out what this type will be legalized to.
211    EVT LegalVT = VT;
212    while (getTypeAction(Context, LegalVT) != TypeLegal)
213      LegalVT = getTypeToTransformTo(Context, LegalVT);
214
215    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
216    if (LegalVT.getSimpleVT().is512BitVector())
217      return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
218
219    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
220      // If we legalized to less than a 512-bit vector, then we will use a vXi1
221      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
222      // vXi16/vXi8.
223      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
224      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
225        return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
226    }
227  }
228
229  return VT.changeVectorElementTypeToInteger();
230}
231
232/// Helper for getByValTypeAlignment to determine
233/// the desired ByVal argument alignment.
234static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
235  if (MaxAlign == 16)
236    return;
237  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
238    if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
239      MaxAlign = Align(16);
240  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
241    Align EltAlign;
242    getMaxByValAlign(ATy->getElementType(), EltAlign);
243    if (EltAlign > MaxAlign)
244      MaxAlign = EltAlign;
245  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
246    for (auto *EltTy : STy->elements()) {
247      Align EltAlign;
248      getMaxByValAlign(EltTy, EltAlign);
249      if (EltAlign > MaxAlign)
250        MaxAlign = EltAlign;
251      if (MaxAlign == 16)
252        break;
253    }
254  }
255}
256
257/// Return the desired alignment for ByVal aggregate
258/// function arguments in the caller parameter area. For X86, aggregates
259/// that contain SSE vectors are placed at 16-byte boundaries while the rest
260/// are at 4-byte boundaries.
261uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
262                                                  const DataLayout &DL) const {
263  if (Subtarget.is64Bit()) {
264    // Max of 8 and alignment of type.
265    Align TyAlign = DL.getABITypeAlign(Ty);
266    if (TyAlign > 8)
267      return TyAlign.value();
268    return 8;
269  }
270
271  Align Alignment(4);
272  if (Subtarget.hasSSE1())
273    getMaxByValAlign(Ty, Alignment);
274  return Alignment.value();
275}
276
277/// It returns EVT::Other if the type should be determined using generic
278/// target-independent logic.
279/// For vector ops we check that the overall size isn't larger than our
280/// preferred vector width.
281EVT X86TargetLowering::getOptimalMemOpType(
282    const MemOp &Op, const AttributeList &FuncAttributes) const {
283  if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
284    if (Op.size() >= 16 &&
285        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
286      // FIXME: Check if unaligned 64-byte accesses are slow.
287      if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
288          (Subtarget.getPreferVectorWidth() >= 512)) {
289        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
290      }
291      // FIXME: Check if unaligned 32-byte accesses are slow.
292      if (Op.size() >= 32 && Subtarget.hasAVX() &&
293          Subtarget.useLight256BitInstructions()) {
294        // Although this isn't a well-supported type for AVX1, we'll let
295        // legalization and shuffle lowering produce the optimal codegen. If we
296        // choose an optimal type with a vector element larger than a byte,
297        // getMemsetStores() may create an intermediate splat (using an integer
298        // multiply) before we splat as a vector.
299        return MVT::v32i8;
300      }
301      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
302        return MVT::v16i8;
303      // TODO: Can SSE1 handle a byte vector?
304      // If we have SSE1 registers we should be able to use them.
305      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
306          (Subtarget.getPreferVectorWidth() >= 128))
307        return MVT::v4f32;
308    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
309               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
310      // Do not use f64 to lower memcpy if source is string constant. It's
311      // better to use i32 to avoid the loads.
312      // Also, do not use f64 to lower memset unless this is a memset of zeros.
313      // The gymnastics of splatting a byte value into an XMM register and then
314      // only using 8-byte stores (because this is a CPU with slow unaligned
315      // 16-byte accesses) makes that a loser.
316      return MVT::f64;
317    }
318  }
319  // This is a compromise. If we reach here, unaligned accesses may be slow on
320  // this target. However, creating smaller, aligned accesses could be even
321  // slower and would certainly be a lot more code.
322  if (Subtarget.is64Bit() && Op.size() >= 8)
323    return MVT::i64;
324  return MVT::i32;
325}
326
327bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
328  if (VT == MVT::f32)
329    return Subtarget.hasSSE1();
330  if (VT == MVT::f64)
331    return Subtarget.hasSSE2();
332  return true;
333}
334
335static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
336  return (8 * Alignment.value()) % SizeInBits == 0;
337}
338
339bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
340  if (isBitAligned(Alignment, VT.getSizeInBits()))
341    return true;
342  switch (VT.getSizeInBits()) {
343  default:
344    // 8-byte and under are always assumed to be fast.
345    return true;
346  case 128:
347    return !Subtarget.isUnalignedMem16Slow();
348  case 256:
349    return !Subtarget.isUnalignedMem32Slow();
350    // TODO: What about AVX-512 (512-bit) accesses?
351  }
352}
353
354bool X86TargetLowering::allowsMisalignedMemoryAccesses(
355    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
356    unsigned *Fast) const {
357  if (Fast)
358    *Fast = isMemoryAccessFast(VT, Alignment);
359  // NonTemporal vector memory ops must be aligned.
360  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
361    // NT loads can only be vector aligned, so if its less aligned than the
362    // minimum vector size (which we can split the vector down to), we might as
363    // well use a regular unaligned vector load.
364    // We don't have any NT loads pre-SSE41.
365    if (!!(Flags & MachineMemOperand::MOLoad))
366      return (Alignment < 16 || !Subtarget.hasSSE41());
367    return false;
368  }
369  // Misaligned accesses of any size are always allowed.
370  return true;
371}
372
373bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
374                                           const DataLayout &DL, EVT VT,
375                                           unsigned AddrSpace, Align Alignment,
376                                           MachineMemOperand::Flags Flags,
377                                           unsigned *Fast) const {
378  if (Fast)
379    *Fast = isMemoryAccessFast(VT, Alignment);
380  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
381    if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
382                                       /*Fast=*/nullptr))
383      return true;
384    // NonTemporal vector memory ops are special, and must be aligned.
385    if (!isBitAligned(Alignment, VT.getSizeInBits()))
386      return false;
387    switch (VT.getSizeInBits()) {
388    case 128:
389      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
390        return true;
391      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
392        return true;
393      return false;
394    case 256:
395      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
396        return true;
397      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
398        return true;
399      return false;
400    case 512:
401      if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
402        return true;
403      return false;
404    default:
405      return false; // Don't have NonTemporal vector memory ops of this size.
406    }
407  }
408  return true;
409}
410
411/// Return the entry encoding for a jump table in the
412/// current function.  The returned value is a member of the
413/// MachineJumpTableInfo::JTEntryKind enum.
414unsigned X86TargetLowering::getJumpTableEncoding() const {
415  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
416  // symbol.
417  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
418    return MachineJumpTableInfo::EK_Custom32;
419  if (isPositionIndependent() &&
420      getTargetMachine().getCodeModel() == CodeModel::Large)
421    return MachineJumpTableInfo::EK_LabelDifference64;
422
423  // Otherwise, use the normal jump table encoding heuristics.
424  return TargetLowering::getJumpTableEncoding();
425}
426
427bool X86TargetLowering::useSoftFloat() const {
428  return Subtarget.useSoftFloat();
429}
430
431void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
432                                              ArgListTy &Args) const {
433
434  // Only relabel X86-32 for C / Stdcall CCs.
435  if (Subtarget.is64Bit())
436    return;
437  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
438    return;
439  unsigned ParamRegs = 0;
440  if (auto *M = MF->getFunction().getParent())
441    ParamRegs = M->getNumberRegisterParameters();
442
443  // Mark the first N int arguments as having reg
444  for (auto &Arg : Args) {
445    Type *T = Arg.Ty;
446    if (T->isIntOrPtrTy())
447      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
448        unsigned numRegs = 1;
449        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
450          numRegs = 2;
451        if (ParamRegs < numRegs)
452          return;
453        ParamRegs -= numRegs;
454        Arg.IsInReg = true;
455      }
456  }
457}
458
459const MCExpr *
460X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
461                                             const MachineBasicBlock *MBB,
462                                             unsigned uid,MCContext &Ctx) const{
463  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
464  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
465  // entries.
466  return MCSymbolRefExpr::create(MBB->getSymbol(),
467                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
468}
469
470/// Returns relocation base for the given PIC jumptable.
471SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
472                                                    SelectionDAG &DAG) const {
473  if (!Subtarget.is64Bit())
474    // This doesn't have SDLoc associated with it, but is not really the
475    // same as a Register.
476    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
477                       getPointerTy(DAG.getDataLayout()));
478  return Table;
479}
480
481/// This returns the relocation base for the given PIC jumptable,
482/// the same as getPICJumpTableRelocBase, but as an MCExpr.
483const MCExpr *X86TargetLowering::
484getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
485                             MCContext &Ctx) const {
486  // X86-64 uses RIP relative addressing based on the jump table label.
487  if (Subtarget.isPICStyleRIPRel() ||
488      (Subtarget.is64Bit() &&
489       getTargetMachine().getCodeModel() == CodeModel::Large))
490    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
491
492  // Otherwise, the reference is relative to the PIC base.
493  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
494}
495
496std::pair<const TargetRegisterClass *, uint8_t>
497X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
498                                           MVT VT) const {
499  const TargetRegisterClass *RRC = nullptr;
500  uint8_t Cost = 1;
501  switch (VT.SimpleTy) {
502  default:
503    return TargetLowering::findRepresentativeClass(TRI, VT);
504  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
505    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
506    break;
507  case MVT::x86mmx:
508    RRC = &X86::VR64RegClass;
509    break;
510  case MVT::f32: case MVT::f64:
511  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
512  case MVT::v4f32: case MVT::v2f64:
513  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
514  case MVT::v8f32: case MVT::v4f64:
515  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
516  case MVT::v16f32: case MVT::v8f64:
517    RRC = &X86::VR128XRegClass;
518    break;
519  }
520  return std::make_pair(RRC, Cost);
521}
522
523unsigned X86TargetLowering::getAddressSpace() const {
524  if (Subtarget.is64Bit())
525    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
526  return 256;
527}
528
529static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
530  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
531         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
532}
533
534static Constant* SegmentOffset(IRBuilderBase &IRB,
535                               int Offset, unsigned AddressSpace) {
536  return ConstantExpr::getIntToPtr(
537      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
538      IRB.getPtrTy(AddressSpace));
539}
540
541Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
542  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
543  // tcbhead_t; use it instead of the usual global variable (see
544  // sysdeps/{i386,x86_64}/nptl/tls.h)
545  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
546    unsigned AddressSpace = getAddressSpace();
547
548    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
549    if (Subtarget.isTargetFuchsia())
550      return SegmentOffset(IRB, 0x10, AddressSpace);
551
552    Module *M = IRB.GetInsertBlock()->getParent()->getParent();
553    // Specially, some users may customize the base reg and offset.
554    int Offset = M->getStackProtectorGuardOffset();
555    // If we don't set -stack-protector-guard-offset value:
556    // %fs:0x28, unless we're using a Kernel code model, in which case
557    // it's %gs:0x28.  gs:0x14 on i386.
558    if (Offset == INT_MAX)
559      Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
560
561    StringRef GuardReg = M->getStackProtectorGuardReg();
562    if (GuardReg == "fs")
563      AddressSpace = X86AS::FS;
564    else if (GuardReg == "gs")
565      AddressSpace = X86AS::GS;
566
567    // Use symbol guard if user specify.
568    StringRef GuardSymb = M->getStackProtectorGuardSymbol();
569    if (!GuardSymb.empty()) {
570      GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
571      if (!GV) {
572        Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
573                                       : Type::getInt32Ty(M->getContext());
574        GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
575                                nullptr, GuardSymb, nullptr,
576                                GlobalValue::NotThreadLocal, AddressSpace);
577        if (!Subtarget.isTargetDarwin())
578          GV->setDSOLocal(M->getDirectAccessExternalData());
579      }
580      return GV;
581    }
582
583    return SegmentOffset(IRB, Offset, AddressSpace);
584  }
585  return TargetLowering::getIRStackGuard(IRB);
586}
587
588void X86TargetLowering::insertSSPDeclarations(Module &M) const {
589  // MSVC CRT provides functionalities for stack protection.
590  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
591      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
592    // MSVC CRT has a global variable holding security cookie.
593    M.getOrInsertGlobal("__security_cookie",
594                        PointerType::getUnqual(M.getContext()));
595
596    // MSVC CRT has a function to validate security cookie.
597    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
598        "__security_check_cookie", Type::getVoidTy(M.getContext()),
599        PointerType::getUnqual(M.getContext()));
600    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
601      F->setCallingConv(CallingConv::X86_FastCall);
602      F->addParamAttr(0, Attribute::AttrKind::InReg);
603    }
604    return;
605  }
606
607  StringRef GuardMode = M.getStackProtectorGuard();
608
609  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
610  if ((GuardMode == "tls" || GuardMode.empty()) &&
611      hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
612    return;
613  TargetLowering::insertSSPDeclarations(M);
614}
615
616Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
617  // MSVC CRT has a global variable holding security cookie.
618  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
619      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
620    return M.getGlobalVariable("__security_cookie");
621  }
622  return TargetLowering::getSDagStackGuard(M);
623}
624
625Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
626  // MSVC CRT has a function to validate security cookie.
627  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
628      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
629    return M.getFunction("__security_check_cookie");
630  }
631  return TargetLowering::getSSPStackGuardCheck(M);
632}
633
634Value *
635X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
636  // Android provides a fixed TLS slot for the SafeStack pointer. See the
637  // definition of TLS_SLOT_SAFESTACK in
638  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
639  if (Subtarget.isTargetAndroid()) {
640    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
641    // %gs:0x24 on i386
642    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
643    return SegmentOffset(IRB, Offset, getAddressSpace());
644  }
645
646  // Fuchsia is similar.
647  if (Subtarget.isTargetFuchsia()) {
648    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
649    return SegmentOffset(IRB, 0x18, getAddressSpace());
650  }
651
652  return TargetLowering::getSafeStackPointerLocation(IRB);
653}
654
655//===----------------------------------------------------------------------===//
656//               Return Value Calling Convention Implementation
657//===----------------------------------------------------------------------===//
658
659bool X86TargetLowering::CanLowerReturn(
660    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
661    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
662  SmallVector<CCValAssign, 16> RVLocs;
663  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
664  return CCInfo.CheckReturn(Outs, RetCC_X86);
665}
666
667const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
668  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
669  return ScratchRegs;
670}
671
672ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
673  // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
674  // tests at the moment, which is not what we expected.
675  static const MCPhysReg RCRegs[] = {X86::MXCSR};
676  return RCRegs;
677}
678
679/// Lowers masks values (v*i1) to the local register values
680/// \returns DAG node after lowering to register type
681static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
682                               const SDLoc &DL, SelectionDAG &DAG) {
683  EVT ValVT = ValArg.getValueType();
684
685  if (ValVT == MVT::v1i1)
686    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
687                       DAG.getIntPtrConstant(0, DL));
688
689  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
690      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
691    // Two stage lowering might be required
692    // bitcast:   v8i1 -> i8 / v16i1 -> i16
693    // anyextend: i8   -> i32 / i16   -> i32
694    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
695    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
696    if (ValLoc == MVT::i32)
697      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
698    return ValToCopy;
699  }
700
701  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
702      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
703    // One stage lowering is required
704    // bitcast:   v32i1 -> i32 / v64i1 -> i64
705    return DAG.getBitcast(ValLoc, ValArg);
706  }
707
708  return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
709}
710
711/// Breaks v64i1 value into two registers and adds the new node to the DAG
712static void Passv64i1ArgInRegs(
713    const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
714    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
715    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
716  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
717  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
718  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
719  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
720         "The value should reside in two registers");
721
722  // Before splitting the value we cast it to i64
723  Arg = DAG.getBitcast(MVT::i64, Arg);
724
725  // Splitting the value into two i32 types
726  SDValue Lo, Hi;
727  std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);
728
729  // Attach the two i32 types into corresponding registers
730  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
731  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
732}
733
734SDValue
735X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
736                               bool isVarArg,
737                               const SmallVectorImpl<ISD::OutputArg> &Outs,
738                               const SmallVectorImpl<SDValue> &OutVals,
739                               const SDLoc &dl, SelectionDAG &DAG) const {
740  MachineFunction &MF = DAG.getMachineFunction();
741  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
742
743  // In some cases we need to disable registers from the default CSR list.
744  // For example, when they are used as return registers (preserve_* and X86's
745  // regcall) or for argument passing (X86's regcall).
746  bool ShouldDisableCalleeSavedRegister =
747      shouldDisableRetRegFromCSR(CallConv) ||
748      MF.getFunction().hasFnAttribute("no_caller_saved_registers");
749
750  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
751    report_fatal_error("X86 interrupts may not return any value");
752
753  SmallVector<CCValAssign, 16> RVLocs;
754  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
755  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
756
757  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
758  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
759       ++I, ++OutsIndex) {
760    CCValAssign &VA = RVLocs[I];
761    assert(VA.isRegLoc() && "Can only return in registers!");
762
763    // Add the register to the CalleeSaveDisableRegs list.
764    if (ShouldDisableCalleeSavedRegister)
765      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
766
767    SDValue ValToCopy = OutVals[OutsIndex];
768    EVT ValVT = ValToCopy.getValueType();
769
770    // Promote values to the appropriate types.
771    if (VA.getLocInfo() == CCValAssign::SExt)
772      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
773    else if (VA.getLocInfo() == CCValAssign::ZExt)
774      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
775    else if (VA.getLocInfo() == CCValAssign::AExt) {
776      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
777        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
778      else
779        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
780    }
781    else if (VA.getLocInfo() == CCValAssign::BCvt)
782      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
783
784    assert(VA.getLocInfo() != CCValAssign::FPExt &&
785           "Unexpected FP-extend for return value.");
786
787    // Report an error if we have attempted to return a value via an XMM
788    // register and SSE was disabled.
789    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
790      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
791      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
792    } else if (!Subtarget.hasSSE2() &&
793               X86::FR64XRegClass.contains(VA.getLocReg()) &&
794               ValVT == MVT::f64) {
795      // When returning a double via an XMM register, report an error if SSE2 is
796      // not enabled.
797      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
798      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
799    }
800
801    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
802    // the RET instruction and handled by the FP Stackifier.
803    if (VA.getLocReg() == X86::FP0 ||
804        VA.getLocReg() == X86::FP1) {
805      // If this is a copy from an xmm register to ST(0), use an FPExtend to
806      // change the value to the FP stack register class.
807      if (isScalarFPTypeInSSEReg(VA.getValVT()))
808        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
809      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
810      // Don't emit a copytoreg.
811      continue;
812    }
813
814    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
815    // which is returned in RAX / RDX.
816    if (Subtarget.is64Bit()) {
817      if (ValVT == MVT::x86mmx) {
818        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
819          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
820          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
821                                  ValToCopy);
822          // If we don't have SSE2 available, convert to v4f32 so the generated
823          // register is legal.
824          if (!Subtarget.hasSSE2())
825            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
826        }
827      }
828    }
829
830    if (VA.needsCustom()) {
831      assert(VA.getValVT() == MVT::v64i1 &&
832             "Currently the only custom case is when we split v64i1 to 2 regs");
833
834      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
835                         Subtarget);
836
837      // Add the second register to the CalleeSaveDisableRegs list.
838      if (ShouldDisableCalleeSavedRegister)
839        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
840    } else {
841      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
842    }
843  }
844
845  SDValue Glue;
846  SmallVector<SDValue, 6> RetOps;
847  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
848  // Operand #1 = Bytes To Pop
849  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
850                   MVT::i32));
851
852  // Copy the result values into the output registers.
853  for (auto &RetVal : RetVals) {
854    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
855      RetOps.push_back(RetVal.second);
856      continue; // Don't emit a copytoreg.
857    }
858
859    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
860    Glue = Chain.getValue(1);
861    RetOps.push_back(
862        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
863  }
864
865  // Swift calling convention does not require we copy the sret argument
866  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
867
868  // All x86 ABIs require that for returning structs by value we copy
869  // the sret argument into %rax/%eax (depending on ABI) for the return.
870  // We saved the argument into a virtual register in the entry block,
871  // so now we copy the value out and into %rax/%eax.
872  //
873  // Checking Function.hasStructRetAttr() here is insufficient because the IR
874  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
875  // false, then an sret argument may be implicitly inserted in the SelDAG. In
876  // either case FuncInfo->setSRetReturnReg() will have been called.
877  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
878    // When we have both sret and another return value, we should use the
879    // original Chain stored in RetOps[0], instead of the current Chain updated
880    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
881
882    // For the case of sret and another return value, we have
883    //   Chain_0 at the function entry
884    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
885    // If we use Chain_1 in getCopyFromReg, we will have
886    //   Val = getCopyFromReg(Chain_1)
887    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
888
889    // getCopyToReg(Chain_0) will be glued together with
890    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
891    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
892    //   Data dependency from Unit B to Unit A due to usage of Val in
893    //     getCopyToReg(Chain_1, Val)
894    //   Chain dependency from Unit A to Unit B
895
896    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
897    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
898                                     getPointerTy(MF.getDataLayout()));
899
900    Register RetValReg
901        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
902          X86::RAX : X86::EAX;
903    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
904    Glue = Chain.getValue(1);
905
906    // RAX/EAX now acts like a return value.
907    RetOps.push_back(
908        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
909
910    // Add the returned register to the CalleeSaveDisableRegs list. Don't do
911    // this however for preserve_most/preserve_all to minimize the number of
912    // callee-saved registers for these CCs.
913    if (ShouldDisableCalleeSavedRegister &&
914        CallConv != CallingConv::PreserveAll &&
915        CallConv != CallingConv::PreserveMost)
916      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
917  }
918
919  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
920  const MCPhysReg *I =
921      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
922  if (I) {
923    for (; *I; ++I) {
924      if (X86::GR64RegClass.contains(*I))
925        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
926      else
927        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
928    }
929  }
930
931  RetOps[0] = Chain;  // Update chain.
932
933  // Add the glue if we have it.
934  if (Glue.getNode())
935    RetOps.push_back(Glue);
936
937  X86ISD::NodeType opcode = X86ISD::RET_GLUE;
938  if (CallConv == CallingConv::X86_INTR)
939    opcode = X86ISD::IRET;
940  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
941}
942
943bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
944  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
945    return false;
946
947  SDValue TCChain = Chain;
948  SDNode *Copy = *N->use_begin();
949  if (Copy->getOpcode() == ISD::CopyToReg) {
950    // If the copy has a glue operand, we conservatively assume it isn't safe to
951    // perform a tail call.
952    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
953      return false;
954    TCChain = Copy->getOperand(0);
955  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
956    return false;
957
958  bool HasRet = false;
959  for (const SDNode *U : Copy->uses()) {
960    if (U->getOpcode() != X86ISD::RET_GLUE)
961      return false;
962    // If we are returning more than one value, we can definitely
963    // not make a tail call see PR19530
964    if (U->getNumOperands() > 4)
965      return false;
966    if (U->getNumOperands() == 4 &&
967        U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
968      return false;
969    HasRet = true;
970  }
971
972  if (!HasRet)
973    return false;
974
975  Chain = TCChain;
976  return true;
977}
978
979EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
980                                           ISD::NodeType ExtendKind) const {
981  MVT ReturnMVT = MVT::i32;
982
983  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
984  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
985    // The ABI does not require i1, i8 or i16 to be extended.
986    //
987    // On Darwin, there is code in the wild relying on Clang's old behaviour of
988    // always extending i8/i16 return values, so keep doing that for now.
989    // (PR26665).
990    ReturnMVT = MVT::i8;
991  }
992
993  EVT MinVT = getRegisterType(Context, ReturnMVT);
994  return VT.bitsLT(MinVT) ? MinVT : VT;
995}
996
997/// Reads two 32 bit registers and creates a 64 bit mask value.
998/// \param VA The current 32 bit value that need to be assigned.
999/// \param NextVA The next 32 bit value that need to be assigned.
1000/// \param Root The parent DAG node.
1001/// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
1002///                        glue purposes. In the case the DAG is already using
1003///                        physical register instead of virtual, we should glue
1004///                        our new SDValue to InGlue SDvalue.
1005/// \return a new SDvalue of size 64bit.
1006static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
1007                                SDValue &Root, SelectionDAG &DAG,
1008                                const SDLoc &DL, const X86Subtarget &Subtarget,
1009                                SDValue *InGlue = nullptr) {
1010  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
1011  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
1012  assert(VA.getValVT() == MVT::v64i1 &&
1013         "Expecting first location of 64 bit width type");
1014  assert(NextVA.getValVT() == VA.getValVT() &&
1015         "The locations should have the same type");
1016  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
1017         "The values should reside in two registers");
1018
1019  SDValue Lo, Hi;
1020  SDValue ArgValueLo, ArgValueHi;
1021
1022  MachineFunction &MF = DAG.getMachineFunction();
1023  const TargetRegisterClass *RC = &X86::GR32RegClass;
1024
1025  // Read a 32 bit value from the registers.
1026  if (nullptr == InGlue) {
1027    // When no physical register is present,
1028    // create an intermediate virtual register.
1029    Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1030    ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1031    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
1032    ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1033  } else {
1034    // When a physical register is available read the value from it and glue
1035    // the reads together.
1036    ArgValueLo =
1037      DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
1038    *InGlue = ArgValueLo.getValue(2);
1039    ArgValueHi =
1040      DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
1041    *InGlue = ArgValueHi.getValue(2);
1042  }
1043
1044  // Convert the i32 type into v32i1 type.
1045  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
1046
1047  // Convert the i32 type into v32i1 type.
1048  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
1049
1050  // Concatenate the two values together.
1051  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
1052}
1053
1054/// The function will lower a register of various sizes (8/16/32/64)
1055/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
1056/// \returns a DAG node contains the operand after lowering to mask type.
1057static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
1058                               const EVT &ValLoc, const SDLoc &DL,
1059                               SelectionDAG &DAG) {
1060  SDValue ValReturned = ValArg;
1061
1062  if (ValVT == MVT::v1i1)
1063    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);
1064
1065  if (ValVT == MVT::v64i1) {
1066    // In 32 bit machine, this case is handled by getv64i1Argument
1067    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
1068    // In 64 bit machine, There is no need to truncate the value only bitcast
1069  } else {
1070    MVT MaskLenVT;
1071    switch (ValVT.getSimpleVT().SimpleTy) {
1072    case MVT::v8i1:
1073      MaskLenVT = MVT::i8;
1074      break;
1075    case MVT::v16i1:
1076      MaskLenVT = MVT::i16;
1077      break;
1078    case MVT::v32i1:
1079      MaskLenVT = MVT::i32;
1080      break;
1081    default:
1082      llvm_unreachable("Expecting a vector of i1 types");
1083    }
1084
1085    ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
1086  }
1087  return DAG.getBitcast(ValVT, ValReturned);
1088}
1089
1090/// Lower the result values of a call into the
1091/// appropriate copies out of appropriate physical registers.
1092///
1093SDValue X86TargetLowering::LowerCallResult(
1094    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1095    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1096    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
1097    uint32_t *RegMask) const {
1098
1099  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
1100  // Assign locations to each value returned by this call.
1101  SmallVector<CCValAssign, 16> RVLocs;
1102  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1103                 *DAG.getContext());
1104  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1105
1106  // Copy all of the result registers out of their specified physreg.
1107  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
1108       ++I, ++InsIndex) {
1109    CCValAssign &VA = RVLocs[I];
1110    EVT CopyVT = VA.getLocVT();
1111
1112    // In some calling conventions we need to remove the used registers
1113    // from the register mask.
1114    if (RegMask) {
1115      for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
1116        RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
1117    }
1118
1119    // Report an error if there was an attempt to return FP values via XMM
1120    // registers.
1121    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
1122      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
1123      if (VA.getLocReg() == X86::XMM1)
1124        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1125      else
1126        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1127    } else if (!Subtarget.hasSSE2() &&
1128               X86::FR64XRegClass.contains(VA.getLocReg()) &&
1129               CopyVT == MVT::f64) {
1130      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
1131      if (VA.getLocReg() == X86::XMM1)
1132        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1133      else
1134        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1135    }
1136
1137    // If we prefer to use the value in xmm registers, copy it out as f80 and
1138    // use a truncate to move it from fp stack reg to xmm reg.
1139    bool RoundAfterCopy = false;
1140    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
1141        isScalarFPTypeInSSEReg(VA.getValVT())) {
1142      if (!Subtarget.hasX87())
1143        report_fatal_error("X87 register return with X87 disabled");
1144      CopyVT = MVT::f80;
1145      RoundAfterCopy = (CopyVT != VA.getLocVT());
1146    }
1147
1148    SDValue Val;
1149    if (VA.needsCustom()) {
1150      assert(VA.getValVT() == MVT::v64i1 &&
1151             "Currently the only custom case is when we split v64i1 to 2 regs");
1152      Val =
1153          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
1154    } else {
1155      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1156                  .getValue(1);
1157      Val = Chain.getValue(0);
1158      InGlue = Chain.getValue(2);
1159    }
1160
1161    if (RoundAfterCopy)
1162      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1163                        // This truncation won't change the value.
1164                        DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
1165
1166    if (VA.isExtInLoc()) {
1167      if (VA.getValVT().isVector() &&
1168          VA.getValVT().getScalarType() == MVT::i1 &&
1169          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1170           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1171        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1172        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
1173      } else
1174        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1175    }
1176
1177    if (VA.getLocInfo() == CCValAssign::BCvt)
1178      Val = DAG.getBitcast(VA.getValVT(), Val);
1179
1180    InVals.push_back(Val);
1181  }
1182
1183  return Chain;
1184}
1185
1186//===----------------------------------------------------------------------===//
1187//                C & StdCall & Fast Calling Convention implementation
1188//===----------------------------------------------------------------------===//
1189//  StdCall calling convention seems to be standard for many Windows' API
1190//  routines and around. It differs from C calling convention just a little:
1191//  callee should clean up the stack, not caller. Symbols should be also
1192//  decorated in some fancy way :) It doesn't support any vector arguments.
1193//  For info on fast calling convention see Fast Calling Convention (tail call)
1194//  implementation LowerX86_32FastCCCallTo.
1195
1196/// Determines whether Args, either a set of outgoing arguments to a call, or a
1197/// set of incoming args of a call, contains an sret pointer that the callee
1198/// pops
1199template <typename T>
1200static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
1201                             const X86Subtarget &Subtarget) {
1202  // Not C++20 (yet), so no concepts available.
1203  static_assert(std::is_same_v<T, ISD::OutputArg> ||
1204                    std::is_same_v<T, ISD::InputArg>,
1205                "requires ISD::OutputArg or ISD::InputArg");
1206
1207  // Only 32-bit pops the sret.  It's a 64-bit world these days, so early-out
1208  // for most compilations.
1209  if (!Subtarget.is32Bit())
1210    return false;
1211
1212  if (Args.empty())
1213    return false;
1214
1215  // Most calls do not have an sret argument, check the arg next.
1216  const ISD::ArgFlagsTy &Flags = Args[0].Flags;
1217  if (!Flags.isSRet() || Flags.isInReg())
1218    return false;
1219
1220  // The MSVCabi does not pop the sret.
1221  if (Subtarget.getTargetTriple().isOSMSVCRT())
1222    return false;
1223
1224  // MCUs don't pop the sret
1225  if (Subtarget.isTargetMCU())
1226    return false;
1227
1228  // Callee pops argument
1229  return true;
1230}
1231
1232/// Make a copy of an aggregate at address specified by "Src" to address
1233/// "Dst" with size and alignment information specified by the specific
1234/// parameter attribute. The copy will be passed as a byval function parameter.
1235static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
1236                                         SDValue Chain, ISD::ArgFlagsTy Flags,
1237                                         SelectionDAG &DAG, const SDLoc &dl) {
1238  SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
1239
1240  return DAG.getMemcpy(
1241      Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
1242      /*isVolatile*/ false, /*AlwaysInline=*/true,
1243      /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
1244}
1245
1246/// Return true if the calling convention is one that we can guarantee TCO for.
1247static bool canGuaranteeTCO(CallingConv::ID CC) {
1248  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1249          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
1250          CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
1251}
1252
1253/// Return true if we might ever do TCO for calls with this calling convention.
1254static bool mayTailCallThisCC(CallingConv::ID CC) {
1255  switch (CC) {
1256  // C calling conventions:
1257  case CallingConv::C:
1258  case CallingConv::Win64:
1259  case CallingConv::X86_64_SysV:
1260  // Callee pop conventions:
1261  case CallingConv::X86_ThisCall:
1262  case CallingConv::X86_StdCall:
1263  case CallingConv::X86_VectorCall:
1264  case CallingConv::X86_FastCall:
1265  // Swift:
1266  case CallingConv::Swift:
1267    return true;
1268  default:
1269    return canGuaranteeTCO(CC);
1270  }
1271}
1272
1273/// Return true if the function is being made into a tailcall target by
1274/// changing its ABI.
1275static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
1276  return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
1277         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
1278}
1279
1280bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1281  if (!CI->isTailCall())
1282    return false;
1283
1284  CallingConv::ID CalleeCC = CI->getCallingConv();
1285  if (!mayTailCallThisCC(CalleeCC))
1286    return false;
1287
1288  return true;
1289}
1290
1291SDValue
1292X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1293                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1294                                    const SDLoc &dl, SelectionDAG &DAG,
1295                                    const CCValAssign &VA,
1296                                    MachineFrameInfo &MFI, unsigned i) const {
1297  // Create the nodes corresponding to a load from this parameter slot.
1298  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1299  bool AlwaysUseMutable = shouldGuaranteeTCO(
1300      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
1301  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1302  EVT ValVT;
1303  MVT PtrVT = getPointerTy(DAG.getDataLayout());
1304
1305  // If value is passed by pointer we have address passed instead of the value
1306  // itself. No need to extend if the mask value and location share the same
1307  // absolute size.
1308  bool ExtendedInMem =
1309      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
1310      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
1311
1312  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
1313    ValVT = VA.getLocVT();
1314  else
1315    ValVT = VA.getValVT();
1316
1317  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1318  // changed with more analysis.
1319  // In case of tail call optimization mark all arguments mutable. Since they
1320  // could be overwritten by lowering of arguments in case of a tail call.
1321  if (Flags.isByVal()) {
1322    unsigned Bytes = Flags.getByValSize();
1323    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1324
1325    // FIXME: For now, all byval parameter objects are marked as aliasing. This
1326    // can be improved with deeper analysis.
1327    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
1328                                   /*isAliased=*/true);
1329    return DAG.getFrameIndex(FI, PtrVT);
1330  }
1331
1332  EVT ArgVT = Ins[i].ArgVT;
1333
1334  // If this is a vector that has been split into multiple parts, don't elide
1335  // the copy. The layout on the stack may not match the packed in-memory
1336  // layout.
1337  bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();
1338
1339  // This is an argument in memory. We might be able to perform copy elision.
1340  // If the argument is passed directly in memory without any extension, then we
1341  // can perform copy elision. Large vector types, for example, may be passed
1342  // indirectly by pointer.
1343  if (Flags.isCopyElisionCandidate() &&
1344      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
1345      !ScalarizedVector) {
1346    SDValue PartAddr;
1347    if (Ins[i].PartOffset == 0) {
1348      // If this is a one-part value or the first part of a multi-part value,
1349      // create a stack object for the entire argument value type and return a
1350      // load from our portion of it. This assumes that if the first part of an
1351      // argument is in memory, the rest will also be in memory.
1352      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
1353                                     /*IsImmutable=*/false);
1354      PartAddr = DAG.getFrameIndex(FI, PtrVT);
1355      return DAG.getLoad(
1356          ValVT, dl, Chain, PartAddr,
1357          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
1358    }
1359
1360    // This is not the first piece of an argument in memory. See if there is
1361    // already a fixed stack object including this offset. If so, assume it
1362    // was created by the PartOffset == 0 branch above and create a load from
1363    // the appropriate offset into it.
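    // Illustrative case: an i64 argument split into two i32 parts on a 32-bit
    // target. The PartOffset == 0 branch above created an 8-byte fixed object
    // for the whole i64; the second part (PartOffset == 4) finds that object
    // here and loads from offset 4 into it.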
1364    int64_t PartBegin = VA.getLocMemOffset();
1365    int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
1366    int FI = MFI.getObjectIndexBegin();
1367    for (; MFI.isFixedObjectIndex(FI); ++FI) {
1368      int64_t ObjBegin = MFI.getObjectOffset(FI);
1369      int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
1370      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
1371        break;
1372    }
1373    if (MFI.isFixedObjectIndex(FI)) {
1374      SDValue Addr =
1375          DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
1376                      DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
1377      return DAG.getLoad(ValVT, dl, Chain, Addr,
1378                         MachinePointerInfo::getFixedStack(
1379                             DAG.getMachineFunction(), FI, Ins[i].PartOffset));
1380    }
1381  }
1382
1383  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
1384                                 VA.getLocMemOffset(), isImmutable);
1385
1386  // Set SExt or ZExt flag.
1387  if (VA.getLocInfo() == CCValAssign::ZExt) {
1388    MFI.setObjectZExt(FI, true);
1389  } else if (VA.getLocInfo() == CCValAssign::SExt) {
1390    MFI.setObjectSExt(FI, true);
1391  }
1392
1393  MaybeAlign Alignment;
1394  if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1395      ValVT != MVT::f80)
1396    Alignment = MaybeAlign(4);
1397  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1398  SDValue Val = DAG.getLoad(
1399      ValVT, dl, Chain, FIN,
1400      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1401      Alignment);
1402  return ExtendedInMem
1403             ? (VA.getValVT().isVector()
1404                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
1405                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
1406             : Val;
1407}
1408
1409// FIXME: Get this from tablegen.
1410static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
1411                                                const X86Subtarget &Subtarget) {
1412  assert(Subtarget.is64Bit());
1413
1414  if (Subtarget.isCallingConvWin64(CallConv)) {
1415    static const MCPhysReg GPR64ArgRegsWin64[] = {
1416      X86::RCX, X86::RDX, X86::R8,  X86::R9
1417    };
1418    return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
1419  }
1420
1421  static const MCPhysReg GPR64ArgRegs64Bit[] = {
1422    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1423  };
1424  return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
1425}
1426
1427// FIXME: Get this from tablegen.
1428static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
1429                                                CallingConv::ID CallConv,
1430                                                const X86Subtarget &Subtarget) {
1431  assert(Subtarget.is64Bit());
1432  if (Subtarget.isCallingConvWin64(CallConv)) {
    // The XMM registers which might contain varargs parameters are shadowed
    // by their paired GPRs, so we only need to save the GPRs to their home
    // slots.
1436    // TODO: __vectorcall will change this.
1437    return std::nullopt;
1438  }
1439
1440  bool isSoftFloat = Subtarget.useSoftFloat();
1441  if (isSoftFloat || !Subtarget.hasSSE1())
1442    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
1443    // registers.
1444    return std::nullopt;
1445
1446  static const MCPhysReg XMMArgRegs64Bit[] = {
1447    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1448    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1449  };
1450  return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
1451}
1452
1453#ifndef NDEBUG
1454static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
1455  return llvm::is_sorted(
1456      ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
1457        return A.getValNo() < B.getValNo();
1458      });
1459}
1460#endif
1461
1462namespace {
/// This is a helper class for lowering variable argument (vararg) parameters.
1464class VarArgsLoweringHelper {
1465public:
1466  VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
1467                        SelectionDAG &DAG, const X86Subtarget &Subtarget,
1468                        CallingConv::ID CallConv, CCState &CCInfo)
1469      : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
1470        TheMachineFunction(DAG.getMachineFunction()),
1471        TheFunction(TheMachineFunction.getFunction()),
1472        FrameInfo(TheMachineFunction.getFrameInfo()),
1473        FrameLowering(*Subtarget.getFrameLowering()),
1474        TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
1475        CCInfo(CCInfo) {}
1476
  // Lower variable argument parameters.
1478  void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
1479
1480private:
1481  void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
1482
1483  void forwardMustTailParameters(SDValue &Chain);
1484
1485  bool is64Bit() const { return Subtarget.is64Bit(); }
1486  bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
1487
1488  X86MachineFunctionInfo *FuncInfo;
1489  const SDLoc &DL;
1490  SelectionDAG &DAG;
1491  const X86Subtarget &Subtarget;
1492  MachineFunction &TheMachineFunction;
1493  const Function &TheFunction;
1494  MachineFrameInfo &FrameInfo;
1495  const TargetFrameLowering &FrameLowering;
1496  const TargetLowering &TargLowering;
1497  CallingConv::ID CallConv;
1498  CCState &CCInfo;
1499};
1500} // namespace
1501
1502void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
1503    SDValue &Chain, unsigned StackSize) {
  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
  // We can skip this if there are no va_start calls.
1507  if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
1508                    CallConv != CallingConv::X86_ThisCall)) {
1509    FuncInfo->setVarArgsFrameIndex(
1510        FrameInfo.CreateFixedObject(1, StackSize, true));
1511  }
1512
1513  // 64-bit calling conventions support varargs and register parameters, so we
1514  // have to do extra work to spill them in the prologue.
1515  if (is64Bit()) {
    // Find the first unallocated GPR and XMM argument registers.
1517    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
1518    ArrayRef<MCPhysReg> ArgXMMs =
1519        get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
1520    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
1521    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
1522
1523    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
1524           "SSE register cannot be used when SSE is disabled!");
1525
1526    if (isWin64()) {
1527      // Get to the caller-allocated home save location.  Add 8 to account
1528      // for the return address.
1529      int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
1530      FuncInfo->setRegSaveFrameIndex(
1531          FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fix up the vararg frame index to point into the shadow area (4 x i64).
1533      if (NumIntRegs < 4)
1534        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1535    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded when the va_list is walked with va_arg.
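      // With the full SysV AMD64 register set this save area is 176 bytes:
      // 6 GPRs * 8 bytes followed by 8 XMM registers * 16 bytes, matching the
      // gp_offset/fp_offset bookkeeping that va_arg performs on the va_list.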
1539      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1540      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
1541      FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
1542          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
1543    }
1544
    SmallVector<SDValue, 6>
        LiveGPRs; // List of SDValues for GPR registers holding live-in values.
    SmallVector<SDValue, 8> LiveXMMRegs; // List of SDValues for XMM registers
                                         // holding live-in values.
    SDValue ALVal; // If applicable, holds the SDValue for the %al register.
1550
1551    // Gather all the live in physical registers.
1552    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
1553      Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
1554      LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
1555    }
1556    const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
1557    if (!AvailableXmms.empty()) {
1558      Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1559      ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
1560      for (MCPhysReg Reg : AvailableXmms) {
        // The fast register allocator spills virtual registers at basic
        // block boundaries, which leads to uses of XMM registers outside
        // of the check for %al. Pass physical registers to
        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
1565        TheMachineFunction.getRegInfo().addLiveIn(Reg);
1566        LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
1567      }
1568    }
1569
1570    // Store the integer parameter registers.
1571    SmallVector<SDValue, 8> MemOps;
1572    SDValue RSFIN =
1573        DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1574                          TargLowering.getPointerTy(DAG.getDataLayout()));
1575    unsigned Offset = FuncInfo->getVarArgsGPOffset();
1576    for (SDValue Val : LiveGPRs) {
1577      SDValue FIN = DAG.getNode(ISD::ADD, DL,
1578                                TargLowering.getPointerTy(DAG.getDataLayout()),
1579                                RSFIN, DAG.getIntPtrConstant(Offset, DL));
1580      SDValue Store =
1581          DAG.getStore(Val.getValue(1), DL, Val, FIN,
1582                       MachinePointerInfo::getFixedStack(
1583                           DAG.getMachineFunction(),
1584                           FuncInfo->getRegSaveFrameIndex(), Offset));
1585      MemOps.push_back(Store);
1586      Offset += 8;
1587    }
1588
1589    // Now store the XMM (fp + vector) parameter registers.
1590    if (!LiveXMMRegs.empty()) {
1591      SmallVector<SDValue, 12> SaveXMMOps;
1592      SaveXMMOps.push_back(Chain);
1593      SaveXMMOps.push_back(ALVal);
1594      SaveXMMOps.push_back(RSFIN);
1595      SaveXMMOps.push_back(
1596          DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
1597      llvm::append_range(SaveXMMOps, LiveXMMRegs);
1598      MachineMemOperand *StoreMMO =
1599          DAG.getMachineFunction().getMachineMemOperand(
1600              MachinePointerInfo::getFixedStack(
1601                  DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
1602                  Offset),
1603              MachineMemOperand::MOStore, 128, Align(16));
1604      MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
1605                                               DL, DAG.getVTList(MVT::Other),
1606                                               SaveXMMOps, MVT::i8, StoreMMO));
1607    }
1608
1609    if (!MemOps.empty())
1610      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
1611  }
1612}
1613
1614void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
1615  // Find the largest legal vector type.
1616  MVT VecVT = MVT::Other;
1617  // FIXME: Only some x86_32 calling conventions support AVX512.
1618  if (Subtarget.useAVX512Regs() &&
1619      (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
1620                     CallConv == CallingConv::Intel_OCL_BI)))
1621    VecVT = MVT::v16f32;
1622  else if (Subtarget.hasAVX())
1623    VecVT = MVT::v8f32;
1624  else if (Subtarget.hasSSE2())
1625    VecVT = MVT::v4f32;
1626
1627  // We forward some GPRs and some vector types.
1628  SmallVector<MVT, 2> RegParmTypes;
1629  MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
1630  RegParmTypes.push_back(IntVT);
1631  if (VecVT != MVT::Other)
1632    RegParmTypes.push_back(VecVT);
1633
1634  // Compute the set of forwarded registers. The rest are scratch.
1635  SmallVectorImpl<ForwardedRegister> &Forwards =
1636      FuncInfo->getForwardedMustTailRegParms();
1637  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
1638
1639  // Forward AL for SysV x86_64 targets, since it is used for varargs.
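  // (Per the SysV AMD64 ABI, %al carries an upper bound on the number of XMM
  // registers used by a varargs call, so a musttail thunk must forward it
  // unchanged.)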
1640  if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
1641    Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1642    Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
1643  }
1644
1645  // Copy all forwards from physical to virtual registers.
1646  for (ForwardedRegister &FR : Forwards) {
1647    // FIXME: Can we use a less constrained schedule?
1648    SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
1649    FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
1650        TargLowering.getRegClassFor(FR.VT));
1651    Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
1652  }
1653}
1654
1655void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
1656                                                   unsigned StackSize) {
  // Set the frame indices to the sentinel value 0xAAAAAAA to mark them unset.
  // If necessary, they will be set to the correct values later.
1659  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1660  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1661
1662  if (FrameInfo.hasVAStart())
1663    createVarArgAreaAndStoreRegisters(Chain, StackSize);
1664
1665  if (FrameInfo.hasMustTailInVarArgFunc())
1666    forwardMustTailParameters(Chain);
1667}
1668
1669SDValue X86TargetLowering::LowerFormalArguments(
1670    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1671    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1672    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1673  MachineFunction &MF = DAG.getMachineFunction();
1674  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1675
1676  const Function &F = MF.getFunction();
1677  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
1678      F.getName() == "main")
1679    FuncInfo->setForceFramePointer(true);
1680
1681  MachineFrameInfo &MFI = MF.getFrameInfo();
1682  bool Is64Bit = Subtarget.is64Bit();
1683  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
1684
  assert(!(IsVarArg && canGuaranteeTCO(CallConv)) &&
         "Var args not supported with calling conventions regcall, fastcc, "
         "ghc or hipe");
1688
1689  // Assign locations to all of the incoming arguments.
1690  SmallVector<CCValAssign, 16> ArgLocs;
1691  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1692
1693  // Allocate shadow area for Win64.
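  // The Win64 ABI reserves a 32-byte home area (4 x 8-byte slots shadowing
  // RCX, RDX, R8 and R9) immediately above the return address; account for it
  // before assigning stack offsets to arguments.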
1694  if (IsWin64)
1695    CCInfo.AllocateStack(32, Align(8));
1696
1697  CCInfo.AnalyzeArguments(Ins, CC_X86);
1698
  // In the vectorcall calling convention a second pass is required for the
  // HVA types.
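  // In vectorcall, homogeneous vector aggregates (HVAs) are assigned registers
  // only after every other argument has been considered, hence the extra pass.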
1701  if (CallingConv::X86_VectorCall == CallConv) {
1702    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
1703  }
1704
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
1707  assert(isSortedByValueNo(ArgLocs) &&
1708         "Argument Location list must be sorted before lowering");
1709
1710  SDValue ArgValue;
1711  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
1712       ++I, ++InsIndex) {
1713    assert(InsIndex < Ins.size() && "Invalid Ins index");
1714    CCValAssign &VA = ArgLocs[I];
1715
1716    if (VA.isRegLoc()) {
1717      EVT RegVT = VA.getLocVT();
1718      if (VA.needsCustom()) {
1719        assert(
1720            VA.getValVT() == MVT::v64i1 &&
1721            "Currently the only custom case is when we split v64i1 to 2 regs");
1722
        // In the regcall calling convention, v64i1 values compiled for a
        // 32-bit architecture are split up into two registers.
1725        ArgValue =
1726            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
1727      } else {
1728        const TargetRegisterClass *RC;
1729        if (RegVT == MVT::i8)
1730          RC = &X86::GR8RegClass;
1731        else if (RegVT == MVT::i16)
1732          RC = &X86::GR16RegClass;
1733        else if (RegVT == MVT::i32)
1734          RC = &X86::GR32RegClass;
1735        else if (Is64Bit && RegVT == MVT::i64)
1736          RC = &X86::GR64RegClass;
1737        else if (RegVT == MVT::f16)
1738          RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
1739        else if (RegVT == MVT::f32)
1740          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
1741        else if (RegVT == MVT::f64)
1742          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
1743        else if (RegVT == MVT::f80)
1744          RC = &X86::RFP80RegClass;
1745        else if (RegVT == MVT::f128)
1746          RC = &X86::VR128RegClass;
1747        else if (RegVT.is512BitVector())
1748          RC = &X86::VR512RegClass;
1749        else if (RegVT.is256BitVector())
1750          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
1751        else if (RegVT.is128BitVector())
1752          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
1753        else if (RegVT == MVT::x86mmx)
1754          RC = &X86::VR64RegClass;
1755        else if (RegVT == MVT::v1i1)
1756          RC = &X86::VK1RegClass;
1757        else if (RegVT == MVT::v8i1)
1758          RC = &X86::VK8RegClass;
1759        else if (RegVT == MVT::v16i1)
1760          RC = &X86::VK16RegClass;
1761        else if (RegVT == MVT::v32i1)
1762          RC = &X86::VK32RegClass;
1763        else if (RegVT == MVT::v64i1)
1764          RC = &X86::VK64RegClass;
1765        else
1766          llvm_unreachable("Unknown argument type!");
1767
1768        Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1769        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1770      }
1771
1772      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1773      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1774      // right size.
1775      if (VA.getLocInfo() == CCValAssign::SExt)
1776        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1777                               DAG.getValueType(VA.getValVT()));
1778      else if (VA.getLocInfo() == CCValAssign::ZExt)
1779        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1780                               DAG.getValueType(VA.getValVT()));
1781      else if (VA.getLocInfo() == CCValAssign::BCvt)
1782        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
1783
1784      if (VA.isExtInLoc()) {
1785        // Handle MMX values passed in XMM regs.
1786        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
1787          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
1788        else if (VA.getValVT().isVector() &&
1789                 VA.getValVT().getScalarType() == MVT::i1 &&
1790                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1791                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1792          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1793          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
1794        } else
1795          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1796      }
1797    } else {
1798      assert(VA.isMemLoc());
1799      ArgValue =
1800          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
1801    }
1802
    // If the value is passed via a pointer, do a load.
1804    if (VA.getLocInfo() == CCValAssign::Indirect &&
1805        !(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
1806      ArgValue =
1807          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
1808    }
1809
1810    InVals.push_back(ArgValue);
1811  }
1812
1813  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
1814    if (Ins[I].Flags.isSwiftAsync()) {
1815      auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
1816      if (Subtarget.is64Bit())
1817        X86FI->setHasSwiftAsyncContext(true);
1818      else {
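        // On 32-bit targets there is no register dedicated to the Swift async
        // context, so spill the incoming value to a frame slot recorded in
        // X86MachineFunctionInfo where later lowering can find it.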
1819        int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
1820        X86FI->setSwiftAsyncContextFrameIdx(FI);
1821        SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
1822                                  DAG.getFrameIndex(FI, MVT::i32),
1823                                  MachinePointerInfo::getFixedStack(MF, FI));
1824        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
1825      }
1826    }
1827
    // The Swift calling convention does not require us to copy the sret
    // argument into %rax/%eax for the return, so we don't set SRetReturnReg
    // for Swift.
1830    if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
1831      continue;
1832
1833    // All x86 ABIs require that for returning structs by value we copy the
1834    // sret argument into %rax/%eax (depending on ABI) for the return. Save
1835    // the argument into a virtual register so that we can access it from the
1836    // return points.
1837    if (Ins[I].Flags.isSRet()) {
1838      assert(!FuncInfo->getSRetReturnReg() &&
1839             "SRet return has already been set");
1840      MVT PtrTy = getPointerTy(DAG.getDataLayout());
1841      Register Reg =
1842          MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
1843      FuncInfo->setSRetReturnReg(Reg);
1844      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
1845      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1846      break;
1847    }
1848  }
1849
1850  unsigned StackSize = CCInfo.getStackSize();
1851  // Align stack specially for tail calls.
1852  if (shouldGuaranteeTCO(CallConv,
1853                         MF.getTarget().Options.GuaranteedTailCallOpt))
1854    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1855
1856  if (IsVarArg)
1857    VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
1858        .lowerVarArgsParameters(Chain, StackSize);
1859
1860  // Some CCs need callee pop.
1861  if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
1862                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
1863    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1864  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
1865    // X86 interrupts must pop the error code (and the alignment padding) if
1866    // present.
1867    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
1868  } else {
1869    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1870    // If this is an sret function, the return should pop the hidden pointer.
1871    if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
1872      FuncInfo->setBytesToPopOnReturn(4);
1873  }
1874
1875  if (!Is64Bit) {
1876    // RegSaveFrameIndex is X86-64 only.
1877    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1878  }
1879
1880  FuncInfo->setArgumentStackSize(StackSize);
1881
1882  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
1883    EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
1884    if (Personality == EHPersonality::CoreCLR) {
1885      assert(Is64Bit);
1886      // TODO: Add a mechanism to frame lowering that will allow us to indicate
1887      // that we'd prefer this slot be allocated towards the bottom of the frame
1888      // (i.e. near the stack pointer after allocating the frame).  Every
1889      // funclet needs a copy of this slot in its (mostly empty) frame, and the
1890      // offset from the bottom of this and each funclet's frame must be the
1891      // same, so the size of funclets' (mostly empty) frames is dictated by
1892      // how far this slot is from the bottom (since they allocate just enough
1893      // space to accommodate holding this slot at the correct offset).
1894      int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
1895      EHInfo->PSPSymFrameIdx = PSPSymFI;
1896    }
1897  }
1898
1899  if (shouldDisableArgRegFromCSR(CallConv) ||
1900      F.hasFnAttribute("no_caller_saved_registers")) {
1901    MachineRegisterInfo &MRI = MF.getRegInfo();
1902    for (std::pair<Register, Register> Pair : MRI.liveins())
1903      MRI.disableCalleeSavedRegister(Pair.first);
1904  }
1905
1906  return Chain;
1907}
1908
1909SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1910                                            SDValue Arg, const SDLoc &dl,
1911                                            SelectionDAG &DAG,
1912                                            const CCValAssign &VA,
1913                                            ISD::ArgFlagsTy Flags,
1914                                            bool isByVal) const {
1915  unsigned LocMemOffset = VA.getLocMemOffset();
1916  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1917  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1918                       StackPtr, PtrOff);
1919  if (isByVal)
1920    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1921
1922  MaybeAlign Alignment;
1923  if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1924      Arg.getSimpleValueType() != MVT::f80)
1925    Alignment = MaybeAlign(4);
1926  return DAG.getStore(
1927      Chain, dl, Arg, PtrOff,
1928      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
1929      Alignment);
1930}
1931
/// Emit a load of the return address if tail call
/// optimization is performed and it is required.
1934SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
1935    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
1936    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
1937  // Adjust the Return address stack slot.
1938  EVT VT = getPointerTy(DAG.getDataLayout());
1939  OutRetAddr = getReturnAddressFrameIndex(DAG);
1940
1941  // Load the "old" Return address.
1942  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
1943  return SDValue(OutRetAddr.getNode(), 1);
1944}
1945
1946/// Emit a store of the return address if tail call
1947/// optimization is performed and it is required (FPDiff!=0).
1948static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1949                                        SDValue Chain, SDValue RetAddrFrIdx,
1950                                        EVT PtrVT, unsigned SlotSize,
1951                                        int FPDiff, const SDLoc &dl) {
1952  // Store the return address to the appropriate stack slot.
1953  if (!FPDiff) return Chain;
1954  // Calculate the new stack slot for the return address.
1955  int NewReturnAddrFI =
1956    MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
1957                                         false);
1958  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
1959  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1960                       MachinePointerInfo::getFixedStack(
1961                           DAG.getMachineFunction(), NewReturnAddrFI));
1962  return Chain;
1963}
1964
/// Returns a vector_shuffle mask for a movs{s|d} or movd
/// operation of the specified width.
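/// For example, with a width of 4 the mask is <4, 1, 2, 3>: element 0 is taken
/// from V2 and the remaining elements are taken from V1.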
1967SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
1968                                   SDValue V1, SDValue V2) const {
1969  unsigned NumElems = VT.getVectorNumElements();
1970  SmallVector<int, 8> Mask;
1971  Mask.push_back(NumElems);
1972  for (unsigned i = 1; i != NumElems; ++i)
1973    Mask.push_back(i);
1974  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
1975}
1976
1977SDValue
1978X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1979                             SmallVectorImpl<SDValue> &InVals) const {
1980  SelectionDAG &DAG                     = CLI.DAG;
1981  SDLoc &dl                             = CLI.DL;
1982  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1983  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1984  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1985  SDValue Chain                         = CLI.Chain;
1986  SDValue Callee                        = CLI.Callee;
1987  CallingConv::ID CallConv              = CLI.CallConv;
1988  bool &isTailCall                      = CLI.IsTailCall;
1989  bool isVarArg                         = CLI.IsVarArg;
1990  const auto *CB                        = CLI.CB;
1991
1992  MachineFunction &MF = DAG.getMachineFunction();
1993  bool Is64Bit        = Subtarget.is64Bit();
1994  bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
1995  bool IsSibcall      = false;
1996  bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
1997      CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
1998  bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
1999  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2000  bool HasNCSR = (CB && isa<CallInst>(CB) &&
2001                  CB->hasFnAttr("no_caller_saved_registers"));
2002  bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
2003  bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
2004  bool IsCFICall = IsIndirectCall && CLI.CFIType;
2005  const Module *M = MF.getMMI().getModule();
2006  Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
2007
2008  MachineFunction::CallSiteInfo CSInfo;
2009  if (CallConv == CallingConv::X86_INTR)
2010    report_fatal_error("X86 interrupts may not be called directly");
2011
2012  bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2013  if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
2014    // If we are using a GOT, disable tail calls to external symbols with
2015    // default visibility. Tail calling such a symbol requires using a GOT
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
2018    // GuaranteedTailCallOpt will override this.
2019    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2020    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2021               G->getGlobal()->hasDefaultVisibility()))
2022      isTailCall = false;
2023  }
2024
2025  if (isTailCall && !IsMustTail) {
2026    // Check if it's really possible to do a tail call.
2027    isTailCall = IsEligibleForTailCallOptimization(
2028        Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
2029        Ins, DAG);
2030
2031    // Sibcalls are automatically detected tailcalls which do not require
2032    // ABI changes.
2033    if (!IsGuaranteeTCO && isTailCall)
2034      IsSibcall = true;
2035
2036    if (isTailCall)
2037      ++NumTailCalls;
2038  }
2039
2040  if (IsMustTail && !isTailCall)
2041    report_fatal_error("failed to perform tail call elimination on a call "
2042                       "site marked musttail");
2043
2044  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2045         "Var args not supported with calling convention fastcc, ghc or hipe");
2046
2047  // Analyze operands of the call, assigning locations to each operand.
2048  SmallVector<CCValAssign, 16> ArgLocs;
2049  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2050
2051  // Allocate shadow area for Win64.
2052  if (IsWin64)
2053    CCInfo.AllocateStack(32, Align(8));
2054
2055  CCInfo.AnalyzeArguments(Outs, CC_X86);
2056
2057  // In vectorcall calling convention a second pass is required for the HVA
2058  // types.
2059  if (CallingConv::X86_VectorCall == CallConv) {
2060    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
2061  }
2062
2063  // Get a count of how many bytes are to be pushed on the stack.
2064  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2065  if (IsSibcall)
    // This is a sibcall. The memory operands are already available in the
    // caller's incoming argument space (allocated by the caller's caller).
2068    NumBytes = 0;
2069  else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
2070    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2071
2072  int FPDiff = 0;
2073  if (isTailCall &&
2074      shouldGuaranteeTCO(CallConv,
2075                         MF.getTarget().Options.GuaranteedTailCallOpt)) {
2076    // Lower arguments at fp - stackoffset + fpdiff.
2077    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2078
2079    FPDiff = NumBytesCallerPushed - NumBytes;
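    // Illustrative example: if the caller was entered with 16 bytes of fastcc
    // stack arguments (so it pops 16 on return) and this call needs 32 bytes,
    // FPDiff is 16 - 32 = -16: the return address has to move 16 bytes towards
    // lower addresses to make room for the larger outgoing argument area (see
    // the "move area" in the stack layout comment further down in this file).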
2080
2081    // Set the delta of movement of the returnaddr stackslot.
2082    // But only set if delta is greater than previous delta.
2083    if (FPDiff < X86Info->getTCReturnAddrDelta())
2084      X86Info->setTCReturnAddrDelta(FPDiff);
2085  }
2086
2087  unsigned NumBytesToPush = NumBytes;
2088  unsigned NumBytesToPop = NumBytes;
2089
  // If we have an inalloca argument, all stack space has already been allocated
  // for us and is right at the top of the stack. We don't support multiple
  // arguments passed in memory when using inalloca.
2093  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2094    NumBytesToPush = 0;
2095    if (!ArgLocs.back().isMemLoc())
2096      report_fatal_error("cannot use inalloca attribute on a register "
2097                         "parameter");
2098    if (ArgLocs.back().getLocMemOffset() != 0)
2099      report_fatal_error("any parameter with the inalloca attribute must be "
2100                         "the only memory argument");
2101  } else if (CLI.IsPreallocated) {
2102    assert(ArgLocs.back().isMemLoc() &&
2103           "cannot use preallocated attribute on a register "
2104           "parameter");
2105    SmallVector<size_t, 4> PreallocatedOffsets;
2106    for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2107      if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
2108        PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
2109      }
2110    }
2111    auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2112    size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
2113    MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
2114    MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
2115    NumBytesToPush = 0;
2116  }
2117
2118  if (!IsSibcall && !IsMustTail)
2119    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
2120                                 NumBytes - NumBytesToPush, dl);
2121
2122  SDValue RetAddrFrIdx;
2123  // Load return address for tail calls.
2124  if (isTailCall && FPDiff)
2125    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2126                                    Is64Bit, FPDiff, dl);
2127
2128  SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2129  SmallVector<SDValue, 8> MemOpChains;
2130  SDValue StackPtr;
2131
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
2134  assert(isSortedByValueNo(ArgLocs) &&
2135         "Argument Location list must be sorted before lowering");
2136
  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
2139  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2140  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2141       ++I, ++OutIndex) {
2142    assert(OutIndex < Outs.size() && "Invalid Out index");
2143    // Skip inalloca/preallocated arguments, they have already been written.
2144    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2145    if (Flags.isInAlloca() || Flags.isPreallocated())
2146      continue;
2147
2148    CCValAssign &VA = ArgLocs[I];
2149    EVT RegVT = VA.getLocVT();
2150    SDValue Arg = OutVals[OutIndex];
2151    bool isByVal = Flags.isByVal();
2152
2153    // Promote the value if needed.
2154    switch (VA.getLocInfo()) {
2155    default: llvm_unreachable("Unknown loc info!");
2156    case CCValAssign::Full: break;
2157    case CCValAssign::SExt:
2158      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2159      break;
2160    case CCValAssign::ZExt:
2161      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2162      break;
2163    case CCValAssign::AExt:
2164      if (Arg.getValueType().isVector() &&
2165          Arg.getValueType().getVectorElementType() == MVT::i1)
2166        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
2167      else if (RegVT.is128BitVector()) {
2168        // Special case: passing MMX values in XMM registers.
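        // The x86mmx value is bitcast to i64, placed in lane 0 of a v2i64 via
        // SCALAR_TO_VECTOR, and then merged by getMOVL; the upper lane is left
        // undef.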
2169        Arg = DAG.getBitcast(MVT::i64, Arg);
2170        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2171        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2172      } else
2173        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2174      break;
2175    case CCValAssign::BCvt:
2176      Arg = DAG.getBitcast(RegVT, Arg);
2177      break;
2178    case CCValAssign::Indirect: {
2179      if (isByVal) {
2180        // Memcpy the argument to a temporary stack slot to prevent
2181        // the caller from seeing any modifications the callee may make
2182        // as guaranteed by the `byval` attribute.
2183        int FrameIdx = MF.getFrameInfo().CreateStackObject(
2184            Flags.getByValSize(),
2185            std::max(Align(16), Flags.getNonZeroByValAlign()), false);
2186        SDValue StackSlot =
2187            DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2188        Chain =
2189            CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
2190        // From now on treat this as a regular pointer
2191        Arg = StackSlot;
2192        isByVal = false;
2193      } else {
2194        // Store the argument.
2195        SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2196        int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2197        Chain = DAG.getStore(
2198            Chain, dl, Arg, SpillSlot,
2199            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2200        Arg = SpillSlot;
2201      }
2202      break;
2203    }
2204    }
2205
2206    if (VA.needsCustom()) {
2207      assert(VA.getValVT() == MVT::v64i1 &&
2208             "Currently the only custom case is when we split v64i1 to 2 regs");
2209      // Split v64i1 value into two registers
2210      Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
2211    } else if (VA.isRegLoc()) {
2212      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2213      const TargetOptions &Options = DAG.getTarget().Options;
2214      if (Options.EmitCallSiteInfo)
2215        CSInfo.emplace_back(VA.getLocReg(), I);
2216      if (isVarArg && IsWin64) {
        // The Win64 ABI requires an argument XMM register to be copied to the
        // corresponding shadow GPR if the callee is a varargs function.
2219        Register ShadowReg;
2220        switch (VA.getLocReg()) {
2221        case X86::XMM0: ShadowReg = X86::RCX; break;
2222        case X86::XMM1: ShadowReg = X86::RDX; break;
2223        case X86::XMM2: ShadowReg = X86::R8; break;
2224        case X86::XMM3: ShadowReg = X86::R9; break;
2225        }
2226        if (ShadowReg)
2227          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2228      }
2229    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2230      assert(VA.isMemLoc());
2231      if (!StackPtr.getNode())
2232        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2233                                      getPointerTy(DAG.getDataLayout()));
2234      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2235                                             dl, DAG, VA, Flags, isByVal));
2236    }
2237  }
2238
2239  if (!MemOpChains.empty())
2240    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2241
2242  if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires the GOT address to be in the EBX register before
    // function calls via the PLT (except for regcall).
2245    if (!isTailCall) {
      // An indirect call with the RegCall calling convention may use up all
      // the general registers, so it is not suitable to pin EBX to the GOT
      // address; just let the register allocator handle it.
2249      if (CallConv != CallingConv::X86_RegCall)
2250        RegsToPass.push_back(std::make_pair(
2251          Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2252                                          getPointerTy(DAG.getDataLayout()))));
2253    } else {
      // If we are tail calling and generating PIC/GOT-style code, load the
      // address of the callee into ECX. The value in ECX is used as the target
      // of the tail jump. This is done to circumvent the ebx/callee-saved
      // problem for tail calls on PIC/GOT architectures. Normally we would
      // just put the address of the GOT into ebx and then call target@PLT.
      // But for tail calls, ebx would be restored (since ebx is callee saved)
      // before jumping to the target@PLT.
2261
2262      // Note: The actual moving to ECX is done further down.
2263      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2264      if (G && !G->getGlobal()->hasLocalLinkage() &&
2265          G->getGlobal()->hasDefaultVisibility())
2266        Callee = LowerGlobalAddress(Callee, DAG);
2267      else if (isa<ExternalSymbolSDNode>(Callee))
2268        Callee = LowerExternalSymbol(Callee, DAG);
2269    }
2270  }
2271
2272  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2273      (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
2274    // From AMD64 ABI document:
2275    // For calls that may call functions that use varargs or stdargs
2276    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.
2281
2282    // Count the number of XMM registers allocated.
2283    static const MCPhysReg XMMArgRegs[] = {
2284      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2285      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2286    };
2287    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
2288    assert((Subtarget.hasSSE1() || !NumXMMRegs)
2289           && "SSE registers cannot be used when SSE is disabled");
2290    RegsToPass.push_back(std::make_pair(Register(X86::AL),
2291                                        DAG.getConstant(NumXMMRegs, dl,
2292                                                        MVT::i8)));
2293  }
2294
2295  if (isVarArg && IsMustTail) {
2296    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2297    for (const auto &F : Forwards) {
2298      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2299      RegsToPass.push_back(std::make_pair(F.PReg, Val));
2300    }
2301  }
2302
2303  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
2304  // don't need this because the eligibility check rejects calls that require
2305  // shuffling arguments passed in memory.
2306  if (!IsSibcall && isTailCall) {
2307    // Force all the incoming stack arguments to be loaded from the stack
2308    // before any new outgoing arguments are stored to the stack, because the
2309    // outgoing stack slots may alias the incoming argument stack slots, and
2310    // the alias isn't otherwise explicit. This is slightly more conservative
2311    // than necessary, because it means that each store effectively depends
2312    // on every argument instead of just those arguments it would clobber.
2313    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2314
2315    SmallVector<SDValue, 8> MemOpChains2;
2316    SDValue FIN;
2317    int FI = 0;
2318    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
2319         ++I, ++OutsIndex) {
2320      CCValAssign &VA = ArgLocs[I];
2321
2322      if (VA.isRegLoc()) {
2323        if (VA.needsCustom()) {
2324          assert((CallConv == CallingConv::X86_RegCall) &&
2325                 "Expecting custom case only in regcall calling convention");
          // This means that we are in the special case where one argument was
          // passed through two register locations; skip the next location.
2328          ++I;
2329        }
2330
2331        continue;
2332      }
2333
2334      assert(VA.isMemLoc());
2335      SDValue Arg = OutVals[OutsIndex];
2336      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
2337      // Skip inalloca/preallocated arguments.  They don't require any work.
2338      if (Flags.isInAlloca() || Flags.isPreallocated())
2339        continue;
2340      // Create frame index.
2341      int32_t Offset = VA.getLocMemOffset()+FPDiff;
2342      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2343      FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
2344      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2345
2346      if (Flags.isByVal()) {
2347        // Copy relative to framepointer.
2348        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
2349        if (!StackPtr.getNode())
2350          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2351                                        getPointerTy(DAG.getDataLayout()));
2352        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2353                             StackPtr, Source);
2354
2355        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2356                                                         ArgChain,
2357                                                         Flags, DAG, dl));
2358      } else {
2359        // Store relative to framepointer.
2360        MemOpChains2.push_back(DAG.getStore(
2361            ArgChain, dl, Arg, FIN,
2362            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
2363      }
2364    }
2365
2366    if (!MemOpChains2.empty())
2367      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
2368
2369    // Store the return address to the appropriate stack slot.
2370    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2371                                     getPointerTy(DAG.getDataLayout()),
2372                                     RegInfo->getSlotSize(), FPDiff, dl);
2373  }
2374
2375  // Build a sequence of copy-to-reg nodes chained together with token chain
2376  // and glue operands which copy the outgoing args into registers.
2377  SDValue InGlue;
2378  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2379    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2380                             RegsToPass[i].second, InGlue);
2381    InGlue = Chain.getValue(1);
2382  }
2383
2384  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
2385    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2386    // In the 64-bit large code model, we have to make all calls
2387    // through a register, since the call instruction's 32-bit
2388    // pc-relative offset may not be large enough to hold the whole
2389    // address.
2390  } else if (Callee->getOpcode() == ISD::GlobalAddress ||
2391             Callee->getOpcode() == ISD::ExternalSymbol) {
2392    // Lower direct calls to global addresses and external symbols. Setting
2393    // ForCall to true here has the effect of removing WrapperRIP when possible
2394    // to allow direct calls to be selected without first materializing the
2395    // address into a register.
2396    Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
2397  } else if (Subtarget.isTarget64BitILP32() &&
2398             Callee.getValueType() == MVT::i32) {
    // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI
2400    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
2401  }
2402
2403  // Returns a chain & a glue for retval copy to use.
2404  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2405  SmallVector<SDValue, 8> Ops;
2406
2407  if (!IsSibcall && isTailCall && !IsMustTail) {
2408    Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
2409    InGlue = Chain.getValue(1);
2410  }
2411
2412  Ops.push_back(Chain);
2413  Ops.push_back(Callee);
2414
2415  if (isTailCall)
2416    Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
2417
2418  // Add argument registers to the end of the list so that they are known live
2419  // into the call.
2420  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2421    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2422                                  RegsToPass[i].second.getValueType()));
2423
2424  // Add a register mask operand representing the call-preserved registers.
2425  const uint32_t *Mask = [&]() {
2426    auto AdaptedCC = CallConv;
2427    // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
2428    // use X86_INTR calling convention because it has the same CSR mask
2429    // (same preserved registers).
2430    if (HasNCSR)
2431      AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
    // to use the CSR_NoRegs_RegMask.
2434    if (CB && CB->hasFnAttr("no_callee_saved_registers"))
2435      AdaptedCC = (CallingConv::ID)CallingConv::GHC;
2436    return RegInfo->getCallPreservedMask(MF, AdaptedCC);
2437  }();
2438  assert(Mask && "Missing call preserved mask for calling convention");
2439
2440  // If this is an invoke in a 32-bit function using a funclet-based
2441  // personality, assume the function clobbers all registers. If an exception
2442  // is thrown, the runtime will not restore CSRs.
2443  // FIXME: Model this more precisely so that we can register allocate across
2444  // the normal edge and spill and fill across the exceptional edge.
2445  if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
2446    const Function &CallerFn = MF.getFunction();
2447    EHPersonality Pers =
2448        CallerFn.hasPersonalityFn()
2449            ? classifyEHPersonality(CallerFn.getPersonalityFn())
2450            : EHPersonality::Unknown;
2451    if (isFuncletEHPersonality(Pers))
2452      Mask = RegInfo->getNoPreservedMask();
2453  }
2454
2455  // Define a new register mask from the existing mask.
2456  uint32_t *RegMask = nullptr;
2457
2458  // In some calling conventions we need to remove the used physical registers
2459  // from the reg mask. Create a new RegMask for such calling conventions.
2460  // RegMask for calling conventions that disable only return registers (e.g.
2461  // preserve_most) will be modified later in LowerCallResult.
2462  bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
2463  if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
2464    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2465
2466    // Allocate a new Reg Mask and copy Mask.
2467    RegMask = MF.allocateRegMask();
2468    unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
2469    memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
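    // In a register mask a set bit means the corresponding register's value is
    // preserved across the call; clearing bits below marks those registers as
    // clobbered so they are not treated as callee-saved.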
2470
2471    // Make sure all sub registers of the argument registers are reset
2472    // in the RegMask.
2473    if (ShouldDisableArgRegs) {
2474      for (auto const &RegPair : RegsToPass)
2475        for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
2476          RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
2477    }
2478
2479    // Create the RegMask Operand according to our updated mask.
2480    Ops.push_back(DAG.getRegisterMask(RegMask));
2481  } else {
2482    // Create the RegMask Operand according to the static mask.
2483    Ops.push_back(DAG.getRegisterMask(Mask));
2484  }
2485
2486  if (InGlue.getNode())
2487    Ops.push_back(InGlue);
2488
2489  if (isTailCall) {
2490    // We used to do:
2491    //// If this is the first return lowered for this function, add the regs
2492    //// to the liveout set for the function.
2493    // This isn't right, although it's probably harmless on x86; liveouts
2494    // should be computed from returns not tail calls.  Consider a void
2495    // function making a tail call to a function returning int.
2496    MF.getFrameInfo().setHasTailCall();
2497    SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
2498
2499    if (IsCFICall)
2500      Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2501
2502    DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2503    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2504    return Ret;
2505  }
2506
2507  if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
2508    Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
2509  } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
2510    // Calls with a "clang.arc.attachedcall" bundle are special. They should be
2511    // expanded to the call, directly followed by a special marker sequence and
    // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
2513    assert(!isTailCall &&
2514           "tail calls cannot be marked with clang.arc.attachedcall");
2515    assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
2516
2517    // Add a target global address for the retainRV/claimRV runtime function
2518    // just before the call target.
2519    Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
2520    auto PtrVT = getPointerTy(DAG.getDataLayout());
2521    auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
2522    Ops.insert(Ops.begin() + 1, GA);
2523    Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
2524  } else {
2525    Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
2526  }

  if (IsCFICall)
    Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());

  InGlue = Chain.getValue(1);
  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

  // Save heapallocsite metadata.
  if (CLI.CB)
    if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
      DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       DAG.getTarget().Options.GuaranteedTailCallOpt))
    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
  else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
    // If this call passes a struct-return pointer, the callee
    // pops that struct pointer.
    NumBytesForCalleeToPop = 4;
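
  // e.g. on 32-bit, an stdcall callee taking two i32 arguments pops 8 bytes
  // with "ret 8"; in the sret case above only the 4-byte struct-return
  // pointer is popped by the callee.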

  // Returns a glue for the retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
                               InGlue, dl);
    InGlue = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, RegMask);
}

//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like the stdcall convention, the callee cleans up the arguments, except
//  that ECX is reserved for storing the address of the tail-called function.
//  Only 2 registers are free for argument passing (inreg). Tail call
//  optimization is performed provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On the X86_64 architecture with GOT-style position-independent code, only
//  local (within-module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this; Darwin's dyld
//  is one example.)
//  If the tail-called callee has more arguments than the caller, the caller
//  needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after
//  the original RETADDR, but before the saved frame pointer or the spilled
//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
//  Stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..
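//  e.g. if the caller reserved 8 bytes of stack arguments and the callee needs
//  24, the argument delta is 16 bytes, so a 16-byte move area is reserved and
//  the RETADDR can be relocated into it before the outgoing arguments
//  overwrite the caller's argument area.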

/// Align the stack size so that StackSize plus one slot (for the return
/// address) is a multiple of the stack alignment, e.g. 16n + 12 for a 16-byte
/// alignment requirement with 4-byte slots.
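/// e.g. with SlotSize = 4 and 16-byte alignment, StackSize = 20 becomes
/// alignTo(20 + 4, 16) - 4 = 32 - 4 = 28, i.e. 16n + 12.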
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
                                               SelectionDAG &DAG) const {
  const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
  const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
  assert(StackSize % SlotSize == 0 &&
         "StackSize must be a multiple of SlotSize");
  return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}

/// Return true if the given stack call argument is already available at the
/// same relative position in the caller's incoming argument stack.
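/// When it is, a tail call can reuse the caller's stack slot instead of
/// storing the argument again.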
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
    unsigned Op = Arg.getOpcode();
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
        Op == ISD::AssertZext) {
      Arg = Arg.getOperand(0);
      continue;
    }
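    // A truncate of an AssertZext back to the asserted type also leaves the
    // bits we care about unchanged, so look through that pattern too.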
    if (Op == ISD::TRUNCATE) {
      const SDValue &TruncInput = Arg.getOperand(0);
      if (TruncInput.getOpcode() == ISD::AssertZext &&
          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
              Arg.getValueType()) {
        Arg = TruncInput.getOperand(0);
        continue;
      }
    }
    break;
  }

  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!VR.isVirtual())
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
           Opcode == X86::LEA64_32r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  if (Offset != MFI.getObjectOffset(FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass the
  // mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
    return false;

  if (VA.getLocVT().getFixedSizeInBits() >
      Arg.getValueSizeInBits().getFixedValue()) {
    // If the argument location is wider than the argument type, check that any
    // extension flags match.
    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
      return false;
    }
  }

  return Bytes == MFI.getObjectSize(FI);
}

/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
    bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF.getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
  bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
      CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;

  // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;

  if (IsGuaranteeTCO) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }
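
  // e.g. under -tailcallopt, a fastcc function tail-calling another fastcc
  // function is accepted here without any of the sibcall checks below.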

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->hasStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if we are an sret-returning function and
  // the callee is incompatible. See the comment in LowerReturn about why
  // hasStructRetAttr is insufficient.
  if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
    // For a compatible tail call the callee must return our sret pointer. So it
    // needs to be (a) an sret function itself and (b) we pass our sret as its
    // sret. Condition (b) is harder to determine.
    return false;
  } else if (IsCalleePopSRet)
    // The callee pops an sret, so we cannot tail-call, as our caller doesn't
    // expect that.
    return false;
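
  // e.g. a caller that returns a struct through a hidden sret pointer must
  // hand that same pointer back in EAX/RAX; since we cannot easily prove the
  // callee would do so, we conservatively refuse the sibcall above.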

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (const auto &VA : ArgLocs)
      if (!VA.isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack. Therefore, if the result is not used, it is not safe to optimize
  // this into a sibcall.
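  // e.g. if the caller ignores an x86_fp80 result returned in ST0, a sibcall
  // would leave ST0 unpopped and unbalance the x87 register stack.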
  bool Unused = false;
  for (const auto &In : Ins) {
    if (!In.Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (const auto &VA : RVLocs) {
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  unsigned StackArgsSize = 0;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, Align(8));

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    StackArgsSize = CCInfo.getStackSize();

    if (CCInfo.getStackSize()) {
      // Check whether the arguments are already laid out in the same way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
      for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
        const CCValAssign &VA = ArgLocs[I];
        SDValue Arg = OutVals[I];
        ISD::ArgFlagsTy Flags = Outs[I].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI,
                                   TII, VA))
            return false;
        }
      }
    }

    bool PositionIndependent = isPositionIndependent();
    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
                                 PositionIndependent)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;
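      // e.g. in PIC mode, two 'inreg' arguments already assigned to ECX and
      // EDX use up the budget (the extra register is needed for the PIC
      // address computation), so the call address could not be register
      // allocated and we bail out below.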

      for (const auto &VA : ArgLocs) {
        if (!VA.isRegLoc())
          continue;
        Register Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  bool CalleeWillPop =
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt);

  if (unsigned BytesToPop =
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
    // If we have bytes to pop, the callee must pop them.
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }
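
  // e.g. a 32-bit stdcall caller that must pop 12 bytes of its own stack
  // arguments on return can only tail call a callee that also pops exactly
  // 12 bytes; otherwise the stack would be left unbalanced after the call.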

  return true;
}

/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
  // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    return !is64Bit;
  }
}
