1//===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file implements the lowering of LLVM calls to DAG nodes.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86.h"
15#include "X86CallingConv.h"
16#include "X86FrameLowering.h"
17#include "X86ISelLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86MachineFunctionInfo.h"
20#include "X86TargetMachine.h"
21#include "X86TargetObjectFile.h"
22#include "llvm/ADT/Statistic.h"
23#include "llvm/Analysis/ObjCARCUtil.h"
24#include "llvm/CodeGen/MachineJumpTableInfo.h"
25#include "llvm/CodeGen/MachineModuleInfo.h"
26#include "llvm/CodeGen/WinEHFuncInfo.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IRBuilder.h"
29
30#define DEBUG_TYPE "x86-isel"
31
32using namespace llvm;
33
34STATISTIC(NumTailCalls, "Number of tail calls");
35
36/// Call this when the user attempts to do something unsupported, like
37/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
38/// report_fatal_error, so calling code should attempt to recover without
39/// crashing.
40static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
41                             const char *Msg) {
42  MachineFunction &MF = DAG.getMachineFunction();
43  DAG.getContext()->diagnose(
44      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
45}
46
47/// Returns true if a CC can dynamically exclude a register from the list of
48/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
49/// the return registers.
50static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
51  switch (CC) {
52  default:
53    return false;
54  case CallingConv::X86_RegCall:
55  case CallingConv::PreserveMost:
56  case CallingConv::PreserveAll:
57    return true;
58  }
59}
60
61/// Returns true if a CC can dynamically exclude a register from the list of
62/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
63/// the parameters.
64static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
65  return CC == CallingConv::X86_RegCall;
66}
67
68static std::pair<MVT, unsigned>
69handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
70                                 const X86Subtarget &Subtarget) {
71  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
72  // convention is one that uses k registers.
73  if (NumElts == 2)
74    return {MVT::v2i64, 1};
75  if (NumElts == 4)
76    return {MVT::v4i32, 1};
77  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
78      CC != CallingConv::Intel_OCL_BI)
79    return {MVT::v8i16, 1};
80  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
81      CC != CallingConv::Intel_OCL_BI)
82    return {MVT::v16i8, 1};
83  // v32i1 passes in ymm unless we have BWI and the calling convention is
84  // regcall.
85  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
86    return {MVT::v32i8, 1};
87  // Split v64i1 vectors if we don't have v64i8 available.
88  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
89    if (Subtarget.useAVX512Regs())
90      return {MVT::v64i8, 1};
91    return {MVT::v32i8, 2};
92  }
93
94  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
95  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
96      NumElts > 64)
97    return {MVT::i8, NumElts};
98
99  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
100}
101
102MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
103                                                     CallingConv::ID CC,
104                                                     EVT VT) const {
105  if (VT.isVector()) {
106    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
107      unsigned NumElts = VT.getVectorNumElements();
108
109      MVT RegisterVT;
110      unsigned NumRegisters;
111      std::tie(RegisterVT, NumRegisters) =
112          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
113      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
114        return RegisterVT;
115    }
116
117    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
118      return MVT::v8f16;
119  }
120
121  // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
122  if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
123      !Subtarget.hasX87())
124    return MVT::i32;
125
126  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
127    return getRegisterTypeForCallingConv(Context, CC,
128                                         VT.changeVectorElementType(MVT::f16));
129
130  if (VT == MVT::bf16)
131    return MVT::f16;
132
133  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
134}
135
136unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
137                                                          CallingConv::ID CC,
138                                                          EVT VT) const {
139  if (VT.isVector()) {
140    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
141      unsigned NumElts = VT.getVectorNumElements();
142
143      MVT RegisterVT;
144      unsigned NumRegisters;
145      std::tie(RegisterVT, NumRegisters) =
146          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
147      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
148        return NumRegisters;
149    }
150
151    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
152      return 1;
153  }
154
155  // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
156  // x87 is disabled.
157  if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
158    if (VT == MVT::f64)
159      return 2;
160    if (VT == MVT::f80)
161      return 3;
162  }
163
164  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
165    return getNumRegistersForCallingConv(Context, CC,
166                                         VT.changeVectorElementType(MVT::f16));
167
168  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
169}
170
171unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
172    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
173    unsigned &NumIntermediates, MVT &RegisterVT) const {
174  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
175  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
176      Subtarget.hasAVX512() &&
177      (!isPowerOf2_32(VT.getVectorNumElements()) ||
178       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
179       VT.getVectorNumElements() > 64)) {
180    RegisterVT = MVT::i8;
181    IntermediateVT = MVT::i1;
182    NumIntermediates = VT.getVectorNumElements();
183    return NumIntermediates;
184  }
185
186  // Split v64i1 vectors if we don't have v64i8 available.
187  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
188      CC != CallingConv::X86_RegCall) {
189    RegisterVT = MVT::v32i8;
190    IntermediateVT = MVT::v32i1;
191    NumIntermediates = 2;
192    return 2;
193  }
194
195  // Split vNbf16 vectors according to vNf16.
196  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
197    VT = VT.changeVectorElementType(MVT::f16);
198
199  return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
200                                              NumIntermediates, RegisterVT);
201}
202
203EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
204                                          LLVMContext& Context,
205                                          EVT VT) const {
206  if (!VT.isVector())
207    return MVT::i8;
208
209  if (Subtarget.hasAVX512()) {
210    // Figure out what this type will be legalized to.
211    EVT LegalVT = VT;
212    while (getTypeAction(Context, LegalVT) != TypeLegal)
213      LegalVT = getTypeToTransformTo(Context, LegalVT);
214
215    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
216    if (LegalVT.getSimpleVT().is512BitVector())
217      return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
218
219    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
220      // If we legalized to less than a 512-bit vector, then we will use a vXi1
221      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
222      // vXi16/vXi8.
223      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
224      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
225        return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
226    }
227  }
228
229  return VT.changeVectorElementTypeToInteger();
230}
231
232/// Helper for getByValTypeAlignment to determine
233/// the desired ByVal argument alignment.
234static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
235  if (MaxAlign == 16)
236    return;
237  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
238    if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
239      MaxAlign = Align(16);
240  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
241    Align EltAlign;
242    getMaxByValAlign(ATy->getElementType(), EltAlign);
243    if (EltAlign > MaxAlign)
244      MaxAlign = EltAlign;
245  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
246    for (auto *EltTy : STy->elements()) {
247      Align EltAlign;
248      getMaxByValAlign(EltTy, EltAlign);
249      if (EltAlign > MaxAlign)
250        MaxAlign = EltAlign;
251      if (MaxAlign == 16)
252        break;
253    }
254  }
255}
256
257/// Return the desired alignment for ByVal aggregate
258/// function arguments in the caller parameter area. For X86, aggregates
259/// that contain SSE vectors are placed at 16-byte boundaries while the rest
260/// are at 4-byte boundaries.
261uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
262                                                  const DataLayout &DL) const {
263  if (Subtarget.is64Bit()) {
264    // Max of 8 and alignment of type.
265    Align TyAlign = DL.getABITypeAlign(Ty);
266    if (TyAlign > 8)
267      return TyAlign.value();
268    return 8;
269  }
270
271  Align Alignment(4);
272  if (Subtarget.hasSSE1())
273    getMaxByValAlign(Ty, Alignment);
274  return Alignment.value();
275}
276
277/// It returns EVT::Other if the type should be determined using generic
278/// target-independent logic.
279/// For vector ops we check that the overall size isn't larger than our
280/// preferred vector width.
281EVT X86TargetLowering::getOptimalMemOpType(
282    const MemOp &Op, const AttributeList &FuncAttributes) const {
283  if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
284    if (Op.size() >= 16 &&
285        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
286      // FIXME: Check if unaligned 64-byte accesses are slow.
287      if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
288          (Subtarget.getPreferVectorWidth() >= 512)) {
289        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
290      }
291      // FIXME: Check if unaligned 32-byte accesses are slow.
292      if (Op.size() >= 32 && Subtarget.hasAVX() &&
293          Subtarget.useLight256BitInstructions()) {
294        // Although this isn't a well-supported type for AVX1, we'll let
295        // legalization and shuffle lowering produce the optimal codegen. If we
296        // choose an optimal type with a vector element larger than a byte,
297        // getMemsetStores() may create an intermediate splat (using an integer
298        // multiply) before we splat as a vector.
299        return MVT::v32i8;
300      }
301      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
302        return MVT::v16i8;
303      // TODO: Can SSE1 handle a byte vector?
304      // If we have SSE1 registers we should be able to use them.
305      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
306          (Subtarget.getPreferVectorWidth() >= 128))
307        return MVT::v4f32;
308    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
309               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
310      // Do not use f64 to lower memcpy if source is string constant. It's
311      // better to use i32 to avoid the loads.
312      // Also, do not use f64 to lower memset unless this is a memset of zeros.
313      // The gymnastics of splatting a byte value into an XMM register and then
314      // only using 8-byte stores (because this is a CPU with slow unaligned
315      // 16-byte accesses) makes that a loser.
316      return MVT::f64;
317    }
318  }
319  // This is a compromise. If we reach here, unaligned accesses may be slow on
320  // this target. However, creating smaller, aligned accesses could be even
321  // slower and would certainly be a lot more code.
322  if (Subtarget.is64Bit() && Op.size() >= 8)
323    return MVT::i64;
324  return MVT::i32;
325}
326
327bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
328  if (VT == MVT::f32)
329    return Subtarget.hasSSE1();
330  if (VT == MVT::f64)
331    return Subtarget.hasSSE2();
332  return true;
333}
334
335static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
336  return (8 * Alignment.value()) % SizeInBits == 0;
337}
338
339bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
340  if (isBitAligned(Alignment, VT.getSizeInBits()))
341    return true;
342  switch (VT.getSizeInBits()) {
343  default:
344    // 8-byte and under are always assumed to be fast.
345    return true;
346  case 128:
347    return !Subtarget.isUnalignedMem16Slow();
348  case 256:
349    return !Subtarget.isUnalignedMem32Slow();
350    // TODO: What about AVX-512 (512-bit) accesses?
351  }
352}
353
354bool X86TargetLowering::allowsMisalignedMemoryAccesses(
355    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
356    unsigned *Fast) const {
357  if (Fast)
358    *Fast = isMemoryAccessFast(VT, Alignment);
359  // NonTemporal vector memory ops must be aligned.
360  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
361    // NT loads can only be vector aligned, so if its less aligned than the
362    // minimum vector size (which we can split the vector down to), we might as
363    // well use a regular unaligned vector load.
364    // We don't have any NT loads pre-SSE41.
365    if (!!(Flags & MachineMemOperand::MOLoad))
366      return (Alignment < 16 || !Subtarget.hasSSE41());
367    return false;
368  }
369  // Misaligned accesses of any size are always allowed.
370  return true;
371}
372
373bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
374                                           const DataLayout &DL, EVT VT,
375                                           unsigned AddrSpace, Align Alignment,
376                                           MachineMemOperand::Flags Flags,
377                                           unsigned *Fast) const {
378  if (Fast)
379    *Fast = isMemoryAccessFast(VT, Alignment);
380  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
381    if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
382                                       /*Fast=*/nullptr))
383      return true;
384    // NonTemporal vector memory ops are special, and must be aligned.
385    if (!isBitAligned(Alignment, VT.getSizeInBits()))
386      return false;
387    switch (VT.getSizeInBits()) {
388    case 128:
389      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
390        return true;
391      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
392        return true;
393      return false;
394    case 256:
395      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
396        return true;
397      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
398        return true;
399      return false;
400    case 512:
401      if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
402        return true;
403      return false;
404    default:
405      return false; // Don't have NonTemporal vector memory ops of this size.
406    }
407  }
408  return true;
409}
410
411/// Return the entry encoding for a jump table in the
412/// current function.  The returned value is a member of the
413/// MachineJumpTableInfo::JTEntryKind enum.
414unsigned X86TargetLowering::getJumpTableEncoding() const {
415  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
416  // symbol.
417  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
418    return MachineJumpTableInfo::EK_Custom32;
419  if (isPositionIndependent() &&
420      getTargetMachine().getCodeModel() == CodeModel::Large)
421    return MachineJumpTableInfo::EK_LabelDifference64;
422
423  // Otherwise, use the normal jump table encoding heuristics.
424  return TargetLowering::getJumpTableEncoding();
425}
426
427bool X86TargetLowering::useSoftFloat() const {
428  return Subtarget.useSoftFloat();
429}
430
431void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
432                                              ArgListTy &Args) const {
433
434  // Only relabel X86-32 for C / Stdcall CCs.
435  if (Subtarget.is64Bit())
436    return;
437  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
438    return;
439  unsigned ParamRegs = 0;
440  if (auto *M = MF->getFunction().getParent())
441    ParamRegs = M->getNumberRegisterParameters();
442
443  // Mark the first N int arguments as having reg
444  for (auto &Arg : Args) {
445    Type *T = Arg.Ty;
446    if (T->isIntOrPtrTy())
447      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
448        unsigned numRegs = 1;
449        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
450          numRegs = 2;
451        if (ParamRegs < numRegs)
452          return;
453        ParamRegs -= numRegs;
454        Arg.IsInReg = true;
455      }
456  }
457}
458
459const MCExpr *
460X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
461                                             const MachineBasicBlock *MBB,
462                                             unsigned uid,MCContext &Ctx) const{
463  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
464  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
465  // entries.
466  return MCSymbolRefExpr::create(MBB->getSymbol(),
467                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
468}
469
470/// Returns relocation base for the given PIC jumptable.
471SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
472                                                    SelectionDAG &DAG) const {
473  if (!Subtarget.is64Bit())
474    // This doesn't have SDLoc associated with it, but is not really the
475    // same as a Register.
476    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
477                       getPointerTy(DAG.getDataLayout()));
478  return Table;
479}
480
481/// This returns the relocation base for the given PIC jumptable,
482/// the same as getPICJumpTableRelocBase, but as an MCExpr.
483const MCExpr *X86TargetLowering::
484getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
485                             MCContext &Ctx) const {
486  // X86-64 uses RIP relative addressing based on the jump table label.
487  if (Subtarget.isPICStyleRIPRel() ||
488      (Subtarget.is64Bit() &&
489       getTargetMachine().getCodeModel() == CodeModel::Large))
490    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
491
492  // Otherwise, the reference is relative to the PIC base.
493  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
494}
495
496std::pair<const TargetRegisterClass *, uint8_t>
497X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
498                                           MVT VT) const {
499  const TargetRegisterClass *RRC = nullptr;
500  uint8_t Cost = 1;
501  switch (VT.SimpleTy) {
502  default:
503    return TargetLowering::findRepresentativeClass(TRI, VT);
504  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
505    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
506    break;
507  case MVT::x86mmx:
508    RRC = &X86::VR64RegClass;
509    break;
510  case MVT::f32: case MVT::f64:
511  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
512  case MVT::v4f32: case MVT::v2f64:
513  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
514  case MVT::v8f32: case MVT::v4f64:
515  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
516  case MVT::v16f32: case MVT::v8f64:
517    RRC = &X86::VR128XRegClass;
518    break;
519  }
520  return std::make_pair(RRC, Cost);
521}
522
523unsigned X86TargetLowering::getAddressSpace() const {
524  if (Subtarget.is64Bit())
525    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
526  return 256;
527}
528
529static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
530  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
531         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
532}
533
534static Constant* SegmentOffset(IRBuilderBase &IRB,
535                               int Offset, unsigned AddressSpace) {
536  return ConstantExpr::getIntToPtr(
537      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
538      IRB.getPtrTy(AddressSpace));
539}
540
541Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
542  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
543  // tcbhead_t; use it instead of the usual global variable (see
544  // sysdeps/{i386,x86_64}/nptl/tls.h)
545  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
546    unsigned AddressSpace = getAddressSpace();
547
548    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
549    if (Subtarget.isTargetFuchsia())
550      return SegmentOffset(IRB, 0x10, AddressSpace);
551
552    Module *M = IRB.GetInsertBlock()->getParent()->getParent();
553    // Specially, some users may customize the base reg and offset.
554    int Offset = M->getStackProtectorGuardOffset();
555    // If we don't set -stack-protector-guard-offset value:
556    // %fs:0x28, unless we're using a Kernel code model, in which case
557    // it's %gs:0x28.  gs:0x14 on i386.
558    if (Offset == INT_MAX)
559      Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
560
561    StringRef GuardReg = M->getStackProtectorGuardReg();
562    if (GuardReg == "fs")
563      AddressSpace = X86AS::FS;
564    else if (GuardReg == "gs")
565      AddressSpace = X86AS::GS;
566
567    // Use symbol guard if user specify.
568    StringRef GuardSymb = M->getStackProtectorGuardSymbol();
569    if (!GuardSymb.empty()) {
570      GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
571      if (!GV) {
572        Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
573                                       : Type::getInt32Ty(M->getContext());
574        GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
575                                nullptr, GuardSymb, nullptr,
576                                GlobalValue::NotThreadLocal, AddressSpace);
577        if (!Subtarget.isTargetDarwin())
578          GV->setDSOLocal(M->getDirectAccessExternalData());
579      }
580      return GV;
581    }
582
583    return SegmentOffset(IRB, Offset, AddressSpace);
584  }
585  return TargetLowering::getIRStackGuard(IRB);
586}
587
588void X86TargetLowering::insertSSPDeclarations(Module &M) const {
589  // MSVC CRT provides functionalities for stack protection.
590  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
591      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
592    // MSVC CRT has a global variable holding security cookie.
593    M.getOrInsertGlobal("__security_cookie",
594                        PointerType::getUnqual(M.getContext()));
595
596    // MSVC CRT has a function to validate security cookie.
597    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
598        "__security_check_cookie", Type::getVoidTy(M.getContext()),
599        PointerType::getUnqual(M.getContext()));
600    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
601      F->setCallingConv(CallingConv::X86_FastCall);
602      F->addParamAttr(0, Attribute::AttrKind::InReg);
603    }
604    return;
605  }
606
607  StringRef GuardMode = M.getStackProtectorGuard();
608
609  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
610  if ((GuardMode == "tls" || GuardMode.empty()) &&
611      hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
612    return;
613  TargetLowering::insertSSPDeclarations(M);
614}
615
616Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
617  // MSVC CRT has a global variable holding security cookie.
618  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
619      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
620    return M.getGlobalVariable("__security_cookie");
621  }
622  return TargetLowering::getSDagStackGuard(M);
623}
624
625Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
626  // MSVC CRT has a function to validate security cookie.
627  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
628      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
629    return M.getFunction("__security_check_cookie");
630  }
631  return TargetLowering::getSSPStackGuardCheck(M);
632}
633
634Value *
635X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
636  // Android provides a fixed TLS slot for the SafeStack pointer. See the
637  // definition of TLS_SLOT_SAFESTACK in
638  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
639  if (Subtarget.isTargetAndroid()) {
640    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
641    // %gs:0x24 on i386
642    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
643    return SegmentOffset(IRB, Offset, getAddressSpace());
644  }
645
646  // Fuchsia is similar.
647  if (Subtarget.isTargetFuchsia()) {
648    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
649    return SegmentOffset(IRB, 0x18, getAddressSpace());
650  }
651
652  return TargetLowering::getSafeStackPointerLocation(IRB);
653}
654
655//===----------------------------------------------------------------------===//
656//               Return Value Calling Convention Implementation
657//===----------------------------------------------------------------------===//
658
659bool X86TargetLowering::CanLowerReturn(
660    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
661    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
662  SmallVector<CCValAssign, 16> RVLocs;
663  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
664  return CCInfo.CheckReturn(Outs, RetCC_X86);
665}
666
667const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
668  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
669  return ScratchRegs;
670}
671
672ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
673  // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
674  // tests at the moment, which is not what we expected.
675  static const MCPhysReg RCRegs[] = {X86::MXCSR};
676  return RCRegs;
677}
678
679/// Lowers masks values (v*i1) to the local register values
680/// \returns DAG node after lowering to register type
681static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
682                               const SDLoc &DL, SelectionDAG &DAG) {
683  EVT ValVT = ValArg.getValueType();
684
685  if (ValVT == MVT::v1i1)
686    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
687                       DAG.getIntPtrConstant(0, DL));
688
689  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
690      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
691    // Two stage lowering might be required
692    // bitcast:   v8i1 -> i8 / v16i1 -> i16
693    // anyextend: i8   -> i32 / i16   -> i32
694    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
695    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
696    if (ValLoc == MVT::i32)
697      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
698    return ValToCopy;
699  }
700
701  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
702      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
703    // One stage lowering is required
704    // bitcast:   v32i1 -> i32 / v64i1 -> i64
705    return DAG.getBitcast(ValLoc, ValArg);
706  }
707
708  return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
709}
710
711/// Breaks v64i1 value into two registers and adds the new node to the DAG
712static void Passv64i1ArgInRegs(
713    const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
714    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
715    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
716  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
717  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
718  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
719  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
720         "The value should reside in two registers");
721
722  // Before splitting the value we cast it to i64
723  Arg = DAG.getBitcast(MVT::i64, Arg);
724
725  // Splitting the value into two i32 types
726  SDValue Lo, Hi;
727  std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);
728
729  // Attach the two i32 types into corresponding registers
730  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
731  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
732}
733
734SDValue
735X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
736                               bool isVarArg,
737                               const SmallVectorImpl<ISD::OutputArg> &Outs,
738                               const SmallVectorImpl<SDValue> &OutVals,
739                               const SDLoc &dl, SelectionDAG &DAG) const {
740  MachineFunction &MF = DAG.getMachineFunction();
741  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
742
743  // In some cases we need to disable registers from the default CSR list.
744  // For example, when they are used as return registers (preserve_* and X86's
745  // regcall) or for argument passing (X86's regcall).
746  bool ShouldDisableCalleeSavedRegister =
747      shouldDisableRetRegFromCSR(CallConv) ||
748      MF.getFunction().hasFnAttribute("no_caller_saved_registers");
749
750  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
751    report_fatal_error("X86 interrupts may not return any value");
752
753  SmallVector<CCValAssign, 16> RVLocs;
754  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
755  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
756
757  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
758  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
759       ++I, ++OutsIndex) {
760    CCValAssign &VA = RVLocs[I];
761    assert(VA.isRegLoc() && "Can only return in registers!");
762
763    // Add the register to the CalleeSaveDisableRegs list.
764    if (ShouldDisableCalleeSavedRegister)
765      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
766
767    SDValue ValToCopy = OutVals[OutsIndex];
768    EVT ValVT = ValToCopy.getValueType();
769
770    // Promote values to the appropriate types.
771    if (VA.getLocInfo() == CCValAssign::SExt)
772      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
773    else if (VA.getLocInfo() == CCValAssign::ZExt)
774      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
775    else if (VA.getLocInfo() == CCValAssign::AExt) {
776      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
777        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
778      else
779        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
780    }
781    else if (VA.getLocInfo() == CCValAssign::BCvt)
782      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
783
784    assert(VA.getLocInfo() != CCValAssign::FPExt &&
785           "Unexpected FP-extend for return value.");
786
787    // Report an error if we have attempted to return a value via an XMM
788    // register and SSE was disabled.
789    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
790      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
791      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
792    } else if (!Subtarget.hasSSE2() &&
793               X86::FR64XRegClass.contains(VA.getLocReg()) &&
794               ValVT == MVT::f64) {
795      // When returning a double via an XMM register, report an error if SSE2 is
796      // not enabled.
797      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
798      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
799    }
800
801    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
802    // the RET instruction and handled by the FP Stackifier.
803    if (VA.getLocReg() == X86::FP0 ||
804        VA.getLocReg() == X86::FP1) {
805      // If this is a copy from an xmm register to ST(0), use an FPExtend to
806      // change the value to the FP stack register class.
807      if (isScalarFPTypeInSSEReg(VA.getValVT()))
808        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
809      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
810      // Don't emit a copytoreg.
811      continue;
812    }
813
814    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
815    // which is returned in RAX / RDX.
816    if (Subtarget.is64Bit()) {
817      if (ValVT == MVT::x86mmx) {
818        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
819          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
820          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
821                                  ValToCopy);
822          // If we don't have SSE2 available, convert to v4f32 so the generated
823          // register is legal.
824          if (!Subtarget.hasSSE2())
825            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
826        }
827      }
828    }
829
830    if (VA.needsCustom()) {
831      assert(VA.getValVT() == MVT::v64i1 &&
832             "Currently the only custom case is when we split v64i1 to 2 regs");
833
834      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
835                         Subtarget);
836
837      // Add the second register to the CalleeSaveDisableRegs list.
838      if (ShouldDisableCalleeSavedRegister)
839        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
840    } else {
841      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
842    }
843  }
844
845  SDValue Glue;
846  SmallVector<SDValue, 6> RetOps;
847  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
848  // Operand #1 = Bytes To Pop
849  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
850                   MVT::i32));
851
852  // Copy the result values into the output registers.
853  for (auto &RetVal : RetVals) {
854    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
855      RetOps.push_back(RetVal.second);
856      continue; // Don't emit a copytoreg.
857    }
858
859    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
860    Glue = Chain.getValue(1);
861    RetOps.push_back(
862        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
863  }
864
865  // Swift calling convention does not require we copy the sret argument
866  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
867
868  // All x86 ABIs require that for returning structs by value we copy
869  // the sret argument into %rax/%eax (depending on ABI) for the return.
870  // We saved the argument into a virtual register in the entry block,
871  // so now we copy the value out and into %rax/%eax.
872  //
873  // Checking Function.hasStructRetAttr() here is insufficient because the IR
874  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
875  // false, then an sret argument may be implicitly inserted in the SelDAG. In
876  // either case FuncInfo->setSRetReturnReg() will have been called.
877  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
878    // When we have both sret and another return value, we should use the
879    // original Chain stored in RetOps[0], instead of the current Chain updated
880    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
881
882    // For the case of sret and another return value, we have
883    //   Chain_0 at the function entry
884    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
885    // If we use Chain_1 in getCopyFromReg, we will have
886    //   Val = getCopyFromReg(Chain_1)
887    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
888
889    // getCopyToReg(Chain_0) will be glued together with
890    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
891    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
892    //   Data dependency from Unit B to Unit A due to usage of Val in
893    //     getCopyToReg(Chain_1, Val)
894    //   Chain dependency from Unit A to Unit B
895
896    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
897    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
898                                     getPointerTy(MF.getDataLayout()));
899
900    Register RetValReg
901        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
902          X86::RAX : X86::EAX;
903    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
904    Glue = Chain.getValue(1);
905
906    // RAX/EAX now acts like a return value.
907    RetOps.push_back(
908        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
909
910    // Add the returned register to the CalleeSaveDisableRegs list. Don't do
911    // this however for preserve_most/preserve_all to minimize the number of
912    // callee-saved registers for these CCs.
913    if (ShouldDisableCalleeSavedRegister &&
914        CallConv != CallingConv::PreserveAll &&
915        CallConv != CallingConv::PreserveMost)
916      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
917  }
918
919  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
920  const MCPhysReg *I =
921      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
922  if (I) {
923    for (; *I; ++I) {
924      if (X86::GR64RegClass.contains(*I))
925        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
926      else
927        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
928    }
929  }
930
931  RetOps[0] = Chain;  // Update chain.
932
933  // Add the glue if we have it.
934  if (Glue.getNode())
935    RetOps.push_back(Glue);
936
937  X86ISD::NodeType opcode = X86ISD::RET_GLUE;
938  if (CallConv == CallingConv::X86_INTR)
939    opcode = X86ISD::IRET;
940  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
941}
942
943bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
944  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
945    return false;
946
947  SDValue TCChain = Chain;
948  SDNode *Copy = *N->use_begin();
949  if (Copy->getOpcode() == ISD::CopyToReg) {
950    // If the copy has a glue operand, we conservatively assume it isn't safe to
951    // perform a tail call.
952    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
953      return false;
954    TCChain = Copy->getOperand(0);
955  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
956    return false;
957
958  bool HasRet = false;
959  for (const SDNode *U : Copy->uses()) {
960    if (U->getOpcode() != X86ISD::RET_GLUE)
961      return false;
962    // If we are returning more than one value, we can definitely
963    // not make a tail call see PR19530
964    if (U->getNumOperands() > 4)
965      return false;
966    if (U->getNumOperands() == 4 &&
967        U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
968      return false;
969    HasRet = true;
970  }
971
972  if (!HasRet)
973    return false;
974
975  Chain = TCChain;
976  return true;
977}
978
979EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
980                                           ISD::NodeType ExtendKind) const {
981  MVT ReturnMVT = MVT::i32;
982
983  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
984  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
985    // The ABI does not require i1, i8 or i16 to be extended.
986    //
987    // On Darwin, there is code in the wild relying on Clang's old behaviour of
988    // always extending i8/i16 return values, so keep doing that for now.
989    // (PR26665).
990    ReturnMVT = MVT::i8;
991  }
992
993  EVT MinVT = getRegisterType(Context, ReturnMVT);
994  return VT.bitsLT(MinVT) ? MinVT : VT;
995}
996
997/// Reads two 32 bit registers and creates a 64 bit mask value.
998/// \param VA The current 32 bit value that need to be assigned.
999/// \param NextVA The next 32 bit value that need to be assigned.
1000/// \param Root The parent DAG node.
1001/// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
1002///                        glue purposes. In the case the DAG is already using
1003///                        physical register instead of virtual, we should glue
1004///                        our new SDValue to InGlue SDvalue.
1005/// \return a new SDvalue of size 64bit.
1006static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
1007                                SDValue &Root, SelectionDAG &DAG,
1008                                const SDLoc &DL, const X86Subtarget &Subtarget,
1009                                SDValue *InGlue = nullptr) {
1010  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
1011  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
1012  assert(VA.getValVT() == MVT::v64i1 &&
1013         "Expecting first location of 64 bit width type");
1014  assert(NextVA.getValVT() == VA.getValVT() &&
1015         "The locations should have the same type");
1016  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
1017         "The values should reside in two registers");
1018
1019  SDValue Lo, Hi;
1020  SDValue ArgValueLo, ArgValueHi;
1021
1022  MachineFunction &MF = DAG.getMachineFunction();
1023  const TargetRegisterClass *RC = &X86::GR32RegClass;
1024
1025  // Read a 32 bit value from the registers.
1026  if (nullptr == InGlue) {
1027    // When no physical register is present,
1028    // create an intermediate virtual register.
1029    Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1030    ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1031    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
1032    ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1033  } else {
1034    // When a physical register is available read the value from it and glue
1035    // the reads together.
1036    ArgValueLo =
1037      DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
1038    *InGlue = ArgValueLo.getValue(2);
1039    ArgValueHi =
1040      DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
1041    *InGlue = ArgValueHi.getValue(2);
1042  }
1043
1044  // Convert the i32 type into v32i1 type.
1045  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
1046
1047  // Convert the i32 type into v32i1 type.
1048  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
1049
1050  // Concatenate the two values together.
1051  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
1052}
1053
1054/// The function will lower a register of various sizes (8/16/32/64)
1055/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
1056/// \returns a DAG node contains the operand after lowering to mask type.
1057static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
1058                               const EVT &ValLoc, const SDLoc &DL,
1059                               SelectionDAG &DAG) {
1060  SDValue ValReturned = ValArg;
1061
1062  if (ValVT == MVT::v1i1)
1063    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);
1064
1065  if (ValVT == MVT::v64i1) {
1066    // In 32 bit machine, this case is handled by getv64i1Argument
1067    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
1068    // In 64 bit machine, There is no need to truncate the value only bitcast
1069  } else {
1070    MVT MaskLenVT;
1071    switch (ValVT.getSimpleVT().SimpleTy) {
1072    case MVT::v8i1:
1073      MaskLenVT = MVT::i8;
1074      break;
1075    case MVT::v16i1:
1076      MaskLenVT = MVT::i16;
1077      break;
1078    case MVT::v32i1:
1079      MaskLenVT = MVT::i32;
1080      break;
1081    default:
1082      llvm_unreachable("Expecting a vector of i1 types");
1083    }
1084
1085    ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
1086  }
1087  return DAG.getBitcast(ValVT, ValReturned);
1088}
1089
1090/// Lower the result values of a call into the
1091/// appropriate copies out of appropriate physical registers.
1092///
1093SDValue X86TargetLowering::LowerCallResult(
1094    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1095    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1096    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
1097    uint32_t *RegMask) const {
1098
1099  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
1100  // Assign locations to each value returned by this call.
1101  SmallVector<CCValAssign, 16> RVLocs;
1102  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1103                 *DAG.getContext());
1104  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1105
1106  // Copy all of the result registers out of their specified physreg.
1107  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
1108       ++I, ++InsIndex) {
1109    CCValAssign &VA = RVLocs[I];
1110    EVT CopyVT = VA.getLocVT();
1111
1112    // In some calling conventions we need to remove the used registers
1113    // from the register mask.
1114    if (RegMask) {
1115      for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
1116        RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
1117    }
1118
1119    // Report an error if there was an attempt to return FP values via XMM
1120    // registers.
1121    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
1122      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
1123      if (VA.getLocReg() == X86::XMM1)
1124        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1125      else
1126        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1127    } else if (!Subtarget.hasSSE2() &&
1128               X86::FR64XRegClass.contains(VA.getLocReg()) &&
1129               CopyVT == MVT::f64) {
1130      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
1131      if (VA.getLocReg() == X86::XMM1)
1132        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1133      else
1134        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1135    }
1136
1137    // If we prefer to use the value in xmm registers, copy it out as f80 and
1138    // use a truncate to move it from fp stack reg to xmm reg.
1139    bool RoundAfterCopy = false;
1140    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
1141        isScalarFPTypeInSSEReg(VA.getValVT())) {
1142      if (!Subtarget.hasX87())
1143        report_fatal_error("X87 register return with X87 disabled");
1144      CopyVT = MVT::f80;
1145      RoundAfterCopy = (CopyVT != VA.getLocVT());
1146    }
1147
1148    SDValue Val;
1149    if (VA.needsCustom()) {
1150      assert(VA.getValVT() == MVT::v64i1 &&
1151             "Currently the only custom case is when we split v64i1 to 2 regs");
1152      Val =
1153          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
1154    } else {
1155      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1156                  .getValue(1);
1157      Val = Chain.getValue(0);
1158      InGlue = Chain.getValue(2);
1159    }
1160
1161    if (RoundAfterCopy)
1162      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1163                        // This truncation won't change the value.
1164                        DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
1165
1166    if (VA.isExtInLoc()) {
1167      if (VA.getValVT().isVector() &&
1168          VA.getValVT().getScalarType() == MVT::i1 &&
1169          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1170           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1171        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1172        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
1173      } else
1174        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1175    }
1176
1177    if (VA.getLocInfo() == CCValAssign::BCvt)
1178      Val = DAG.getBitcast(VA.getValVT(), Val);
1179
1180    InVals.push_back(Val);
1181  }
1182
1183  return Chain;
1184}
1185
1186//===----------------------------------------------------------------------===//
1187//                C & StdCall & Fast Calling Convention implementation
1188//===----------------------------------------------------------------------===//
1189//  StdCall calling convention seems to be standard for many Windows' API
1190//  routines and around. It differs from C calling convention just a little:
1191//  callee should clean up the stack, not caller. Symbols should be also
1192//  decorated in some fancy way :) It doesn't support any vector arguments.
1193//  For info on fast calling convention see Fast Calling Convention (tail call)
1194//  implementation LowerX86_32FastCCCallTo.
1195
1196/// Determines whether Args, either a set of outgoing arguments to a call, or a
1197/// set of incoming args of a call, contains an sret pointer that the callee
1198/// pops
1199template <typename T>
1200static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
1201                             const X86Subtarget &Subtarget) {
1202  // Not C++20 (yet), so no concepts available.
1203  static_assert(std::is_same_v<T, ISD::OutputArg> ||
1204                    std::is_same_v<T, ISD::InputArg>,
1205                "requires ISD::OutputArg or ISD::InputArg");
1206
1207  // Only 32-bit pops the sret.  It's a 64-bit world these days, so early-out
1208  // for most compilations.
1209  if (!Subtarget.is32Bit())
1210    return false;
1211
1212  if (Args.empty())
1213    return false;
1214
1215  // Most calls do not have an sret argument, check the arg next.
1216  const ISD::ArgFlagsTy &Flags = Args[0].Flags;
1217  if (!Flags.isSRet() || Flags.isInReg())
1218    return false;
1219
1220  // The MSVCabi does not pop the sret.
1221  if (Subtarget.getTargetTriple().isOSMSVCRT())
1222    return false;
1223
1224  // MCUs don't pop the sret
1225  if (Subtarget.isTargetMCU())
1226    return false;
1227
1228  // Callee pops argument
1229  return true;
1230}
1231
1232/// Make a copy of an aggregate at address specified by "Src" to address
1233/// "Dst" with size and alignment information specified by the specific
1234/// parameter attribute. The copy will be passed as a byval function parameter.
1235static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
1236                                         SDValue Chain, ISD::ArgFlagsTy Flags,
1237                                         SelectionDAG &DAG, const SDLoc &dl) {
1238  SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
1239
1240  return DAG.getMemcpy(
1241      Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
1242      /*isVolatile*/ false, /*AlwaysInline=*/true,
1243      /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
1244}
1245
1246/// Return true if the calling convention is one that we can guarantee TCO for.
1247static bool canGuaranteeTCO(CallingConv::ID CC) {
1248  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1249          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
1250          CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
1251}
1252
1253/// Return true if we might ever do TCO for calls with this calling convention.
1254static bool mayTailCallThisCC(CallingConv::ID CC) {
1255  switch (CC) {
1256  // C calling conventions:
1257  case CallingConv::C:
1258  case CallingConv::Win64:
1259  case CallingConv::X86_64_SysV:
1260  // Callee pop conventions:
1261  case CallingConv::X86_ThisCall:
1262  case CallingConv::X86_StdCall:
1263  case CallingConv::X86_VectorCall:
1264  case CallingConv::X86_FastCall:
1265  // Swift:
1266  case CallingConv::Swift:
1267    return true;
1268  default:
1269    return canGuaranteeTCO(CC);
1270  }
1271}
1272
1273/// Return true if the function is being made into a tailcall target by
1274/// changing its ABI.
1275static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
1276  return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
1277         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
1278}
1279
1280bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1281  if (!CI->isTailCall())
1282    return false;
1283
1284  CallingConv::ID CalleeCC = CI->getCallingConv();
1285  if (!mayTailCallThisCC(CalleeCC))
1286    return false;
1287
1288  return true;
1289}
1290
1291SDValue
1292X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1293                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1294                                    const SDLoc &dl, SelectionDAG &DAG,
1295                                    const CCValAssign &VA,
1296                                    MachineFrameInfo &MFI, unsigned i) const {
1297  // Create the nodes corresponding to a load from this parameter slot.
1298  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1299  bool AlwaysUseMutable = shouldGuaranteeTCO(
1300      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
1301  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1302  EVT ValVT;
1303  MVT PtrVT = getPointerTy(DAG.getDataLayout());
1304
1305  // If value is passed by pointer we have address passed instead of the value
1306  // itself. No need to extend if the mask value and location share the same
1307  // absolute size.
1308  bool ExtendedInMem =
1309      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
1310      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
1311
1312  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
1313    ValVT = VA.getLocVT();
1314  else
1315    ValVT = VA.getValVT();
1316
1317  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1318  // changed with more analysis.
1319  // In case of tail call optimization mark all arguments mutable. Since they
1320  // could be overwritten by lowering of arguments in case of a tail call.
1321  if (Flags.isByVal()) {
1322    unsigned Bytes = Flags.getByValSize();
1323    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1324
1325    // FIXME: For now, all byval parameter objects are marked as aliasing. This
1326    // can be improved with deeper analysis.
1327    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
1328                                   /*isAliased=*/true);
1329    return DAG.getFrameIndex(FI, PtrVT);
1330  }
1331
1332  EVT ArgVT = Ins[i].ArgVT;
1333
1334  // If this is a vector that has been split into multiple parts, don't elide
1335  // the copy. The layout on the stack may not match the packed in-memory
1336  // layout.
1337  bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();
1338
1339  // This is an argument in memory. We might be able to perform copy elision.
1340  // If the argument is passed directly in memory without any extension, then we
1341  // can perform copy elision. Large vector types, for example, may be passed
1342  // indirectly by pointer.
1343  if (Flags.isCopyElisionCandidate() &&
1344      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
1345      !ScalarizedVector) {
1346    SDValue PartAddr;
1347    if (Ins[i].PartOffset == 0) {
1348      // If this is a one-part value or the first part of a multi-part value,
1349      // create a stack object for the entire argument value type and return a
1350      // load from our portion of it. This assumes that if the first part of an
1351      // argument is in memory, the rest will also be in memory.
1352      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
1353                                     /*IsImmutable=*/false);
1354      PartAddr = DAG.getFrameIndex(FI, PtrVT);
1355      return DAG.getLoad(
1356          ValVT, dl, Chain, PartAddr,
1357          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
1358    }
1359
1360    // This is not the first piece of an argument in memory. See if there is
1361    // already a fixed stack object including this offset. If so, assume it
1362    // was created by the PartOffset == 0 branch above and create a load from
1363    // the appropriate offset into it.
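    // Illustrative case: an i64 argument split into two i32 parts on a 32-bit
    // target. The PartOffset == 0 branch above created an 8-byte fixed object
    // for the whole i64; the second part (PartOffset == 4) finds that object
    // here and loads from offset 4 into it.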
1364    int64_t PartBegin = VA.getLocMemOffset();
1365    int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
1366    int FI = MFI.getObjectIndexBegin();
1367    for (; MFI.isFixedObjectIndex(FI); ++FI) {
1368      int64_t ObjBegin = MFI.getObjectOffset(FI);
1369      int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
1370      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
1371        break;
1372    }
1373    if (MFI.isFixedObjectIndex(FI)) {
1374      SDValue Addr =
1375          DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
1376                      DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
1377      return DAG.getLoad(ValVT, dl, Chain, Addr,
1378                         MachinePointerInfo::getFixedStack(
1379                             DAG.getMachineFunction(), FI, Ins[i].PartOffset));
1380    }
1381  }
1382
1383  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
1384                                 VA.getLocMemOffset(), isImmutable);
1385
1386  // Set SExt or ZExt flag.
1387  if (VA.getLocInfo() == CCValAssign::ZExt) {
1388    MFI.setObjectZExt(FI, true);
1389  } else if (VA.getLocInfo() == CCValAssign::SExt) {
1390    MFI.setObjectSExt(FI, true);
1391  }
1392
1393  MaybeAlign Alignment;
1394  if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1395      ValVT != MVT::f80)
1396    Alignment = MaybeAlign(4);
1397  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1398  SDValue Val = DAG.getLoad(
1399      ValVT, dl, Chain, FIN,
1400      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1401      Alignment);
1402  return ExtendedInMem
1403             ? (VA.getValVT().isVector()
1404                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
1405                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
1406             : Val;
1407}
1408
1409// FIXME: Get this from tablegen.
1410static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
1411                                                const X86Subtarget &Subtarget) {
1412  assert(Subtarget.is64Bit());
1413
1414  if (Subtarget.isCallingConvWin64(CallConv)) {
1415    static const MCPhysReg GPR64ArgRegsWin64[] = {
1416      X86::RCX, X86::RDX, X86::R8,  X86::R9
1417    };
1418    return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
1419  }
1420
1421  static const MCPhysReg GPR64ArgRegs64Bit[] = {
1422    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1423  };
1424  return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
1425}
1426
1427// FIXME: Get this from tablegen.
1428static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
1429                                                CallingConv::ID CallConv,
1430                                                const X86Subtarget &Subtarget) {
1431  assert(Subtarget.is64Bit());
1432  if (Subtarget.isCallingConvWin64(CallConv)) {
    // The XMM registers which might contain varargs parameters are shadowed
    // by their paired GPRs, so we only need to save the GPRs to their home
    // slots.
1436    // TODO: __vectorcall will change this.
1437    return std::nullopt;
1438  }
1439
1440  bool isSoftFloat = Subtarget.useSoftFloat();
1441  if (isSoftFloat || !Subtarget.hasSSE1())
1442    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
1443    // registers.
1444    return std::nullopt;
1445
1446  static const MCPhysReg XMMArgRegs64Bit[] = {
1447    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1448    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1449  };
1450  return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
1451}
1452
1453#ifndef NDEBUG
1454static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
1455  return llvm::is_sorted(
1456      ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
1457        return A.getValNo() < B.getValNo();
1458      });
1459}
1460#endif
1461
1462namespace {
/// This is a helper class for lowering variable argument (vararg) parameters.
1464class VarArgsLoweringHelper {
1465public:
1466  VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
1467                        SelectionDAG &DAG, const X86Subtarget &Subtarget,
1468                        CallingConv::ID CallConv, CCState &CCInfo)
1469      : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
1470        TheMachineFunction(DAG.getMachineFunction()),
1471        TheFunction(TheMachineFunction.getFunction()),
1472        FrameInfo(TheMachineFunction.getFrameInfo()),
1473        FrameLowering(*Subtarget.getFrameLowering()),
1474        TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
1475        CCInfo(CCInfo) {}
1476
  // Lower variable argument parameters.
1478  void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
1479
1480private:
1481  void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
1482
1483  void forwardMustTailParameters(SDValue &Chain);
1484
1485  bool is64Bit() const { return Subtarget.is64Bit(); }
1486  bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
1487
1488  X86MachineFunctionInfo *FuncInfo;
1489  const SDLoc &DL;
1490  SelectionDAG &DAG;
1491  const X86Subtarget &Subtarget;
1492  MachineFunction &TheMachineFunction;
1493  const Function &TheFunction;
1494  MachineFrameInfo &FrameInfo;
1495  const TargetFrameLowering &FrameLowering;
1496  const TargetLowering &TargLowering;
1497  CallingConv::ID CallConv;
1498  CCState &CCInfo;
1499};
1500} // namespace
1501
1502void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
1503    SDValue &Chain, unsigned StackSize) {
  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
  // We can skip this if there are no va_start calls.
1507  if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
1508                    CallConv != CallingConv::X86_ThisCall)) {
1509    FuncInfo->setVarArgsFrameIndex(
1510        FrameInfo.CreateFixedObject(1, StackSize, true));
1511  }
1512
1513  // 64-bit calling conventions support varargs and register parameters, so we
1514  // have to do extra work to spill them in the prologue.
1515  if (is64Bit()) {
    // Find the first unallocated GPR and XMM argument registers.
1517    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
1518    ArrayRef<MCPhysReg> ArgXMMs =
1519        get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
1520    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
1521    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
1522
1523    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
1524           "SSE register cannot be used when SSE is disabled!");
1525
1526    if (isWin64()) {
1527      // Get to the caller-allocated home save location.  Add 8 to account
1528      // for the return address.
1529      int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
1530      FuncInfo->setRegSaveFrameIndex(
1531          FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fix up the vararg frame index to point into the shadow area (4 x i64).
1533      if (NumIntRegs < 4)
1534        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1535    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded when the va_list is walked with va_arg.
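      // With the full SysV AMD64 register set this save area is 176 bytes:
      // 6 GPRs * 8 bytes followed by 8 XMM registers * 16 bytes, matching the
      // gp_offset/fp_offset bookkeeping that va_arg performs on the va_list.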
1539      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1540      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
1541      FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
1542          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
1543    }
1544
    SmallVector<SDValue, 6>
        LiveGPRs; // List of SDValues for GPR registers holding live-in values.
    SmallVector<SDValue, 8> LiveXMMRegs; // List of SDValues for XMM registers
                                         // holding live-in values.
    SDValue ALVal; // If applicable, holds the SDValue for the %al register.
1550
1551    // Gather all the live in physical registers.
1552    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
1553      Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
1554      LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
1555    }
1556    const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
1557    if (!AvailableXmms.empty()) {
1558      Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1559      ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
1560      for (MCPhysReg Reg : AvailableXmms) {
        // The fast register allocator spills virtual registers at basic
        // block boundaries, which leads to uses of XMM registers outside
        // of the check for %al. Pass physical registers to
        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
1565        TheMachineFunction.getRegInfo().addLiveIn(Reg);
1566        LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
1567      }
1568    }
1569
1570    // Store the integer parameter registers.
1571    SmallVector<SDValue, 8> MemOps;
1572    SDValue RSFIN =
1573        DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1574                          TargLowering.getPointerTy(DAG.getDataLayout()));
1575    unsigned Offset = FuncInfo->getVarArgsGPOffset();
1576    for (SDValue Val : LiveGPRs) {
1577      SDValue FIN = DAG.getNode(ISD::ADD, DL,
1578                                TargLowering.getPointerTy(DAG.getDataLayout()),
1579                                RSFIN, DAG.getIntPtrConstant(Offset, DL));
1580      SDValue Store =
1581          DAG.getStore(Val.getValue(1), DL, Val, FIN,
1582                       MachinePointerInfo::getFixedStack(
1583                           DAG.getMachineFunction(),
1584                           FuncInfo->getRegSaveFrameIndex(), Offset));
1585      MemOps.push_back(Store);
1586      Offset += 8;
1587    }
1588
1589    // Now store the XMM (fp + vector) parameter registers.
1590    if (!LiveXMMRegs.empty()) {
1591      SmallVector<SDValue, 12> SaveXMMOps;
1592      SaveXMMOps.push_back(Chain);
1593      SaveXMMOps.push_back(ALVal);
1594      SaveXMMOps.push_back(RSFIN);
1595      SaveXMMOps.push_back(
1596          DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
1597      llvm::append_range(SaveXMMOps, LiveXMMRegs);
1598      MachineMemOperand *StoreMMO =
1599          DAG.getMachineFunction().getMachineMemOperand(
1600              MachinePointerInfo::getFixedStack(
1601                  DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
1602                  Offset),
1603              MachineMemOperand::MOStore, 128, Align(16));
1604      MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
1605                                               DL, DAG.getVTList(MVT::Other),
1606                                               SaveXMMOps, MVT::i8, StoreMMO));
1607    }
1608
1609    if (!MemOps.empty())
1610      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
1611  }
1612}
1613
1614void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
1615  // Find the largest legal vector type.
1616  MVT VecVT = MVT::Other;
1617  // FIXME: Only some x86_32 calling conventions support AVX512.
1618  if (Subtarget.useAVX512Regs() &&
1619      (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
1620                     CallConv == CallingConv::Intel_OCL_BI)))
1621    VecVT = MVT::v16f32;
1622  else if (Subtarget.hasAVX())
1623    VecVT = MVT::v8f32;
1624  else if (Subtarget.hasSSE2())
1625    VecVT = MVT::v4f32;
1626
1627  // We forward some GPRs and some vector types.
1628  SmallVector<MVT, 2> RegParmTypes;
1629  MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
1630  RegParmTypes.push_back(IntVT);
1631  if (VecVT != MVT::Other)
1632    RegParmTypes.push_back(VecVT);
1633
1634  // Compute the set of forwarded registers. The rest are scratch.
1635  SmallVectorImpl<ForwardedRegister> &Forwards =
1636      FuncInfo->getForwardedMustTailRegParms();
1637  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
1638
1639  // Forward AL for SysV x86_64 targets, since it is used for varargs.
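  // (Per the SysV AMD64 ABI, %al carries an upper bound on the number of XMM
  // registers used by a varargs call, so a musttail thunk must forward it
  // unchanged.)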
1640  if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
1641    Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1642    Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
1643  }
1644
1645  // Copy all forwards from physical to virtual registers.
1646  for (ForwardedRegister &FR : Forwards) {
1647    // FIXME: Can we use a less constrained schedule?
1648    SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
1649    FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
1650        TargLowering.getRegClassFor(FR.VT));
1651    Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
1652  }
1653}
1654
1655void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
1656                                                   unsigned StackSize) {
  // Set the frame indices to the sentinel value 0xAAAAAAA to mark them unset.
  // If necessary, they will be set to the correct values later.
1659  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1660  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1661
1662  if (FrameInfo.hasVAStart())
1663    createVarArgAreaAndStoreRegisters(Chain, StackSize);
1664
1665  if (FrameInfo.hasMustTailInVarArgFunc())
1666    forwardMustTailParameters(Chain);
1667}
1668
1669SDValue X86TargetLowering::LowerFormalArguments(
1670    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1671    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1672    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1673  MachineFunction &MF = DAG.getMachineFunction();
1674  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1675
1676  const Function &F = MF.getFunction();
1677  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
1678      F.getName() == "main")
1679    FuncInfo->setForceFramePointer(true);
1680
1681  MachineFrameInfo &MFI = MF.getFrameInfo();
1682  bool Is64Bit = Subtarget.is64Bit();
1683  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
1684
  assert(!(IsVarArg && canGuaranteeTCO(CallConv)) &&
         "Var args not supported with calling conventions regcall, fastcc, "
         "ghc or hipe");
1688
1689  // Assign locations to all of the incoming arguments.
1690  SmallVector<CCValAssign, 16> ArgLocs;
1691  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1692
1693  // Allocate shadow area for Win64.
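  // The Win64 ABI reserves a 32-byte home area (4 x 8-byte slots shadowing
  // RCX, RDX, R8 and R9) immediately above the return address; account for it
  // before assigning stack offsets to arguments.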
1694  if (IsWin64)
1695    CCInfo.AllocateStack(32, Align(8));
1696
1697  CCInfo.AnalyzeArguments(Ins, CC_X86);
1698
  // In the vectorcall calling convention a second pass is required for the
  // HVA types.
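  // In vectorcall, homogeneous vector aggregates (HVAs) are assigned registers
  // only after every other argument has been considered, hence the extra pass.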
1701  if (CallingConv::X86_VectorCall == CallConv) {
1702    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
1703  }
1704
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
1707  assert(isSortedByValueNo(ArgLocs) &&
1708         "Argument Location list must be sorted before lowering");
1709
1710  SDValue ArgValue;
1711  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
1712       ++I, ++InsIndex) {
1713    assert(InsIndex < Ins.size() && "Invalid Ins index");
1714    CCValAssign &VA = ArgLocs[I];
1715
1716    if (VA.isRegLoc()) {
1717      EVT RegVT = VA.getLocVT();
1718      if (VA.needsCustom()) {
1719        assert(
1720            VA.getValVT() == MVT::v64i1 &&
1721            "Currently the only custom case is when we split v64i1 to 2 regs");
1722
        // In the regcall calling convention, v64i1 values compiled for a
        // 32-bit architecture are split up into two registers.
1725        ArgValue =
1726            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
1727      } else {
1728        const TargetRegisterClass *RC;
1729        if (RegVT == MVT::i8)
1730          RC = &X86::GR8RegClass;
1731        else if (RegVT == MVT::i16)
1732          RC = &X86::GR16RegClass;
1733        else if (RegVT == MVT::i32)
1734          RC = &X86::GR32RegClass;
1735        else if (Is64Bit && RegVT == MVT::i64)
1736          RC = &X86::GR64RegClass;
1737        else if (RegVT == MVT::f16)
1738          RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
1739        else if (RegVT == MVT::f32)
1740          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
1741        else if (RegVT == MVT::f64)
1742          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
1743        else if (RegVT == MVT::f80)
1744          RC = &X86::RFP80RegClass;
1745        else if (RegVT == MVT::f128)
1746          RC = &X86::VR128RegClass;
1747        else if (RegVT.is512BitVector())
1748          RC = &X86::VR512RegClass;
1749        else if (RegVT.is256BitVector())
1750          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
1751        else if (RegVT.is128BitVector())
1752          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
1753        else if (RegVT == MVT::x86mmx)
1754          RC = &X86::VR64RegClass;
1755        else if (RegVT == MVT::v1i1)
1756          RC = &X86::VK1RegClass;
1757        else if (RegVT == MVT::v8i1)
1758          RC = &X86::VK8RegClass;
1759        else if (RegVT == MVT::v16i1)
1760          RC = &X86::VK16RegClass;
1761        else if (RegVT == MVT::v32i1)
1762          RC = &X86::VK32RegClass;
1763        else if (RegVT == MVT::v64i1)
1764          RC = &X86::VK64RegClass;
1765        else
1766          llvm_unreachable("Unknown argument type!");
1767
1768        Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1769        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1770      }
1771
1772      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1773      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1774      // right size.
1775      if (VA.getLocInfo() == CCValAssign::SExt)
1776        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1777                               DAG.getValueType(VA.getValVT()));
1778      else if (VA.getLocInfo() == CCValAssign::ZExt)
1779        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1780                               DAG.getValueType(VA.getValVT()));
1781      else if (VA.getLocInfo() == CCValAssign::BCvt)
1782        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
1783
1784      if (VA.isExtInLoc()) {
1785        // Handle MMX values passed in XMM regs.
1786        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
1787          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
1788        else if (VA.getValVT().isVector() &&
1789                 VA.getValVT().getScalarType() == MVT::i1 &&
1790                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1791                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1792          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1793          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
1794        } else
1795          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1796      }
1797    } else {
1798      assert(VA.isMemLoc());
1799      ArgValue =
1800          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
1801    }
1802
    // If the value is passed via a pointer, do a load.
1804    if (VA.getLocInfo() == CCValAssign::Indirect &&
1805        !(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
1806      ArgValue =
1807          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
1808    }
1809
1810    InVals.push_back(ArgValue);
1811  }
1812
1813  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
1814    if (Ins[I].Flags.isSwiftAsync()) {
1815      auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
1816      if (Subtarget.is64Bit())
1817        X86FI->setHasSwiftAsyncContext(true);
1818      else {
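        // On 32-bit targets there is no register dedicated to the Swift async
        // context, so spill the incoming value to a frame slot recorded in
        // X86MachineFunctionInfo where later lowering can find it.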
1819        int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
1820        X86FI->setSwiftAsyncContextFrameIdx(FI);
1821        SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
1822                                  DAG.getFrameIndex(FI, MVT::i32),
1823                                  MachinePointerInfo::getFixedStack(MF, FI));
1824        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
1825      }
1826    }
1827
    // The Swift calling convention does not require us to copy the sret
    // argument into %rax/%eax for the return, so we don't set SRetReturnReg
    // for Swift.
1830    if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
1831      continue;
1832
1833    // All x86 ABIs require that for returning structs by value we copy the
1834    // sret argument into %rax/%eax (depending on ABI) for the return. Save
1835    // the argument into a virtual register so that we can access it from the
1836    // return points.
1837    if (Ins[I].Flags.isSRet()) {
1838      assert(!FuncInfo->getSRetReturnReg() &&
1839             "SRet return has already been set");
1840      MVT PtrTy = getPointerTy(DAG.getDataLayout());
1841      Register Reg =
1842          MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
1843      FuncInfo->setSRetReturnReg(Reg);
1844      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
1845      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1846      break;
1847    }
1848  }
1849
1850  unsigned StackSize = CCInfo.getStackSize();
1851  // Align stack specially for tail calls.
1852  if (shouldGuaranteeTCO(CallConv,
1853                         MF.getTarget().Options.GuaranteedTailCallOpt))
1854    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1855
1856  if (IsVarArg)
1857    VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
1858        .lowerVarArgsParameters(Chain, StackSize);
1859
1860  // Some CCs need callee pop.
1861  if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
1862                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
1863    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1864  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
1865    // X86 interrupts must pop the error code (and the alignment padding) if
1866    // present.
1867    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
1868  } else {
1869    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1870    // If this is an sret function, the return should pop the hidden pointer.
1871    if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
1872      FuncInfo->setBytesToPopOnReturn(4);
1873  }
1874
1875  if (!Is64Bit) {
1876    // RegSaveFrameIndex is X86-64 only.
1877    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1878  }
1879
1880  FuncInfo->setArgumentStackSize(StackSize);
1881
1882  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
1883    EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
1884    if (Personality == EHPersonality::CoreCLR) {
1885      assert(Is64Bit);
1886      // TODO: Add a mechanism to frame lowering that will allow us to indicate
1887      // that we'd prefer this slot be allocated towards the bottom of the frame
1888      // (i.e. near the stack pointer after allocating the frame).  Every
1889      // funclet needs a copy of this slot in its (mostly empty) frame, and the
1890      // offset from the bottom of this and each funclet's frame must be the
1891      // same, so the size of funclets' (mostly empty) frames is dictated by
1892      // how far this slot is from the bottom (since they allocate just enough
1893      // space to accommodate holding this slot at the correct offset).
1894      int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
1895      EHInfo->PSPSymFrameIdx = PSPSymFI;
1896    }
1897  }
1898
1899  if (shouldDisableArgRegFromCSR(CallConv) ||
1900      F.hasFnAttribute("no_caller_saved_registers")) {
1901    MachineRegisterInfo &MRI = MF.getRegInfo();
1902    for (std::pair<Register, Register> Pair : MRI.liveins())
1903      MRI.disableCalleeSavedRegister(Pair.first);
1904  }
1905
1906  return Chain;
1907}
1908
1909SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1910                                            SDValue Arg, const SDLoc &dl,
1911                                            SelectionDAG &DAG,
1912                                            const CCValAssign &VA,
1913                                            ISD::ArgFlagsTy Flags,
1914                                            bool isByVal) const {
1915  unsigned LocMemOffset = VA.getLocMemOffset();
1916  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1917  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1918                       StackPtr, PtrOff);
1919  if (isByVal)
1920    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1921
1922  MaybeAlign Alignment;
1923  if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1924      Arg.getSimpleValueType() != MVT::f80)
1925    Alignment = MaybeAlign(4);
1926  return DAG.getStore(
1927      Chain, dl, Arg, PtrOff,
1928      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
1929      Alignment);
1930}
1931
/// Emit a load of the return address if tail call
/// optimization is performed and it is required.
1934SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
1935    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
1936    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
1937  // Adjust the Return address stack slot.
1938  EVT VT = getPointerTy(DAG.getDataLayout());
1939  OutRetAddr = getReturnAddressFrameIndex(DAG);
1940
1941  // Load the "old" Return address.
1942  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
1943  return SDValue(OutRetAddr.getNode(), 1);
1944}
1945
1946/// Emit a store of the return address if tail call
1947/// optimization is performed and it is required (FPDiff!=0).
1948static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1949                                        SDValue Chain, SDValue RetAddrFrIdx,
1950                                        EVT PtrVT, unsigned SlotSize,
1951                                        int FPDiff, const SDLoc &dl) {
1952  // Store the return address to the appropriate stack slot.
1953  if (!FPDiff) return Chain;
1954  // Calculate the new stack slot for the return address.
1955  int NewReturnAddrFI =
1956    MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
1957                                         false);
1958  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
1959  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1960                       MachinePointerInfo::getFixedStack(
1961                           DAG.getMachineFunction(), NewReturnAddrFI));
1962  return Chain;
1963}
1964
/// Returns a vector_shuffle mask for a movs{s|d} or movd
/// operation of the specified width.
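/// For example, with a width of 4 the mask is <4, 1, 2, 3>: element 0 is taken
/// from V2 and the remaining elements are taken from V1.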
1967SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
1968                                   SDValue V1, SDValue V2) const {
1969  unsigned NumElems = VT.getVectorNumElements();
1970  SmallVector<int, 8> Mask;
1971  Mask.push_back(NumElems);
1972  for (unsigned i = 1; i != NumElems; ++i)
1973    Mask.push_back(i);
1974  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
1975}
1976
1977SDValue
1978X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1979                             SmallVectorImpl<SDValue> &InVals) const {
1980  SelectionDAG &DAG                     = CLI.DAG;
1981  SDLoc &dl                             = CLI.DL;
1982  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1983  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1984  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1985  SDValue Chain                         = CLI.Chain;
1986  SDValue Callee                        = CLI.Callee;
1987  CallingConv::ID CallConv              = CLI.CallConv;
1988  bool &isTailCall                      = CLI.IsTailCall;
1989  bool isVarArg                         = CLI.IsVarArg;
1990  const auto *CB                        = CLI.CB;
1991
1992  MachineFunction &MF = DAG.getMachineFunction();
1993  bool Is64Bit        = Subtarget.is64Bit();
1994  bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
1995  bool IsSibcall      = false;
1996  bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
1997      CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
1998  bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
1999  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2000  bool HasNCSR = (CB && isa<CallInst>(CB) &&
2001                  CB->hasFnAttr("no_caller_saved_registers"));
2002  bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
2003  bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
2004  bool IsCFICall = IsIndirectCall && CLI.CFIType;
2005  const Module *M = MF.getMMI().getModule();
2006  Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
2007
2008  MachineFunction::CallSiteInfo CSInfo;
2009  if (CallConv == CallingConv::X86_INTR)
2010    report_fatal_error("X86 interrupts may not be called directly");
2011
2012  bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2013  if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
2014    // If we are using a GOT, disable tail calls to external symbols with
2015    // default visibility. Tail calling such a symbol requires using a GOT
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
2018    // GuaranteedTailCallOpt will override this.
2019    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2020    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2021               G->getGlobal()->hasDefaultVisibility()))
2022      isTailCall = false;
2023  }
2024
2025  if (isTailCall && !IsMustTail) {
2026    // Check if it's really possible to do a tail call.
2027    isTailCall = IsEligibleForTailCallOptimization(
2028        Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
2029        Ins, DAG);
2030
2031    // Sibcalls are automatically detected tailcalls which do not require
2032    // ABI changes.
2033    if (!IsGuaranteeTCO && isTailCall)
2034      IsSibcall = true;
2035
2036    if (isTailCall)
2037      ++NumTailCalls;
2038  }
2039
2040  if (IsMustTail && !isTailCall)
2041    report_fatal_error("failed to perform tail call elimination on a call "
2042                       "site marked musttail");
2043
2044  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2045         "Var args not supported with calling convention fastcc, ghc or hipe");
2046
2047  // Analyze operands of the call, assigning locations to each operand.
2048  SmallVector<CCValAssign, 16> ArgLocs;
2049  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2050
2051  // Allocate shadow area for Win64.
2052  if (IsWin64)
2053    CCInfo.AllocateStack(32, Align(8));
2054
2055  CCInfo.AnalyzeArguments(Outs, CC_X86);
2056
2057  // In vectorcall calling convention a second pass is required for the HVA
2058  // types.
2059  if (CallingConv::X86_VectorCall == CallConv) {
2060    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
2061  }
2062
2063  // Get a count of how many bytes are to be pushed on the stack.
2064  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2065  if (IsSibcall)
    // This is a sibcall. The memory operands are already available in the
    // caller's incoming argument space (allocated by the caller's caller).
2068    NumBytes = 0;
2069  else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
2070    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2071
2072  int FPDiff = 0;
2073  if (isTailCall &&
2074      shouldGuaranteeTCO(CallConv,
2075                         MF.getTarget().Options.GuaranteedTailCallOpt)) {
2076    // Lower arguments at fp - stackoffset + fpdiff.
2077    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2078
2079    FPDiff = NumBytesCallerPushed - NumBytes;
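    // Illustrative example: if the caller was entered with 16 bytes of fastcc
    // stack arguments (so it pops 16 on return) and this call needs 32 bytes,
    // FPDiff is 16 - 32 = -16: the return address has to move 16 bytes towards
    // lower addresses to make room for the larger outgoing argument area (see
    // the "move area" in the stack layout comment further down in this file).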
2080
2081    // Set the delta of movement of the returnaddr stackslot.
2082    // But only set if delta is greater than previous delta.
2083    if (FPDiff < X86Info->getTCReturnAddrDelta())
2084      X86Info->setTCReturnAddrDelta(FPDiff);
2085  }
2086
2087  unsigned NumBytesToPush = NumBytes;
2088  unsigned NumBytesToPop = NumBytes;
2089
  // If we have an inalloca argument, all stack space has already been allocated
  // for us and is right at the top of the stack. We don't support multiple
  // arguments passed in memory when using inalloca.
2093  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2094    NumBytesToPush = 0;
2095    if (!ArgLocs.back().isMemLoc())
2096      report_fatal_error("cannot use inalloca attribute on a register "
2097                         "parameter");
2098    if (ArgLocs.back().getLocMemOffset() != 0)
2099      report_fatal_error("any parameter with the inalloca attribute must be "
2100                         "the only memory argument");
2101  } else if (CLI.IsPreallocated) {
2102    assert(ArgLocs.back().isMemLoc() &&
2103           "cannot use preallocated attribute on a register "
2104           "parameter");
2105    SmallVector<size_t, 4> PreallocatedOffsets;
2106    for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2107      if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
2108        PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
2109      }
2110    }
2111    auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2112    size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
2113    MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
2114    MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
2115    NumBytesToPush = 0;
2116  }
2117
2118  if (!IsSibcall && !IsMustTail)
2119    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
2120                                 NumBytes - NumBytesToPush, dl);
2121
2122  SDValue RetAddrFrIdx;
2123  // Load return address for tail calls.
2124  if (isTailCall && FPDiff)
2125    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2126                                    Is64Bit, FPDiff, dl);
2127
2128  SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2129  SmallVector<SDValue, 8> MemOpChains;
2130  SDValue StackPtr;
2131
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
2134  assert(isSortedByValueNo(ArgLocs) &&
2135         "Argument Location list must be sorted before lowering");
2136
  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
2139  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2140  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2141       ++I, ++OutIndex) {
2142    assert(OutIndex < Outs.size() && "Invalid Out index");
2143    // Skip inalloca/preallocated arguments, they have already been written.
2144    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2145    if (Flags.isInAlloca() || Flags.isPreallocated())
2146      continue;
2147
2148    CCValAssign &VA = ArgLocs[I];
2149    EVT RegVT = VA.getLocVT();
2150    SDValue Arg = OutVals[OutIndex];
2151    bool isByVal = Flags.isByVal();
2152
2153    // Promote the value if needed.
2154    switch (VA.getLocInfo()) {
2155    default: llvm_unreachable("Unknown loc info!");
2156    case CCValAssign::Full: break;
2157    case CCValAssign::SExt:
2158      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2159      break;
2160    case CCValAssign::ZExt:
2161      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2162      break;
2163    case CCValAssign::AExt:
2164      if (Arg.getValueType().isVector() &&
2165          Arg.getValueType().getVectorElementType() == MVT::i1)
2166        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
2167      else if (RegVT.is128BitVector()) {
2168        // Special case: passing MMX values in XMM registers.
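        // The x86mmx value is bitcast to i64, placed in lane 0 of a v2i64 via
        // SCALAR_TO_VECTOR, and then merged by getMOVL; the upper lane is left
        // undef.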
2169        Arg = DAG.getBitcast(MVT::i64, Arg);
2170        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2171        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2172      } else
2173        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2174      break;
2175    case CCValAssign::BCvt:
2176      Arg = DAG.getBitcast(RegVT, Arg);
2177      break;
2178    case CCValAssign::Indirect: {
2179      if (isByVal) {
2180        // Memcpy the argument to a temporary stack slot to prevent
2181        // the caller from seeing any modifications the callee may make
2182        // as guaranteed by the `byval` attribute.
2183        int FrameIdx = MF.getFrameInfo().CreateStackObject(
2184            Flags.getByValSize(),
2185            std::max(Align(16), Flags.getNonZeroByValAlign()), false);
2186        SDValue StackSlot =
2187            DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2188        Chain =
2189            CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
2190        // From now on treat this as a regular pointer
2191        Arg = StackSlot;
2192        isByVal = false;
2193      } else {
2194        // Store the argument.
2195        SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2196        int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2197        Chain = DAG.getStore(
2198            Chain, dl, Arg, SpillSlot,
2199            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2200        Arg = SpillSlot;
2201      }
2202      break;
2203    }
2204    }
2205
2206    if (VA.needsCustom()) {
2207      assert(VA.getValVT() == MVT::v64i1 &&
2208             "Currently the only custom case is when we split v64i1 to 2 regs");
2209      // Split v64i1 value into two registers
2210      Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
2211    } else if (VA.isRegLoc()) {
2212      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2213      const TargetOptions &Options = DAG.getTarget().Options;
2214      if (Options.EmitCallSiteInfo)
2215        CSInfo.emplace_back(VA.getLocReg(), I);
2216      if (isVarArg && IsWin64) {
        // The Win64 ABI requires an argument XMM register to be copied to the
        // corresponding shadow GPR if the callee is a varargs function.
2219        Register ShadowReg;
2220        switch (VA.getLocReg()) {
2221        case X86::XMM0: ShadowReg = X86::RCX; break;
2222        case X86::XMM1: ShadowReg = X86::RDX; break;
2223        case X86::XMM2: ShadowReg = X86::R8; break;
2224        case X86::XMM3: ShadowReg = X86::R9; break;
2225        }
2226        if (ShadowReg)
2227          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2228      }
2229    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2230      assert(VA.isMemLoc());
2231      if (!StackPtr.getNode())
2232        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2233                                      getPointerTy(DAG.getDataLayout()));
2234      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2235                                             dl, DAG, VA, Flags, isByVal));
2236    }
2237  }
2238
2239  if (!MemOpChains.empty())
2240    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2241
2242  if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires the GOT address to be in the EBX register before
    // function calls via the PLT (except for regcall).
2245    if (!isTailCall) {
      // An indirect call with the RegCall calling convention may use up all
      // the general registers, so it is not suitable to pin EBX to the GOT
      // address; just let the register allocator handle it.
2249      if (CallConv != CallingConv::X86_RegCall)
2250        RegsToPass.push_back(std::make_pair(
2251          Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2252                                          getPointerTy(DAG.getDataLayout()))));
2253    } else {
      // If we are tail calling and generating PIC/GOT-style code, load the
      // address of the callee into ECX. The value in ECX is used as the target
      // of the tail jump. This is done to circumvent the ebx/callee-saved
      // problem for tail calls on PIC/GOT architectures. Normally we would
      // just put the address of the GOT into ebx and then call target@PLT.
      // But for tail calls, ebx would be restored (since ebx is callee saved)
      // before jumping to the target@PLT.
2261
2262      // Note: The actual moving to ECX is done further down.
2263      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2264      if (G && !G->getGlobal()->hasLocalLinkage() &&
2265          G->getGlobal()->hasDefaultVisibility())
2266        Callee = LowerGlobalAddress(Callee, DAG);
2267      else if (isa<ExternalSymbolSDNode>(Callee))
2268        Callee = LowerExternalSymbol(Callee, DAG);
2269    }
2270  }
2271
2272  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2273      (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
2274    // From AMD64 ABI document:
2275    // For calls that may call functions that use varargs or stdargs
2276    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.
2281
2282    // Count the number of XMM registers allocated.
2283    static const MCPhysReg XMMArgRegs[] = {
2284      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2285      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2286    };
2287    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
2288    assert((Subtarget.hasSSE1() || !NumXMMRegs)
2289           && "SSE registers cannot be used when SSE is disabled");
2290    RegsToPass.push_back(std::make_pair(Register(X86::AL),
2291                                        DAG.getConstant(NumXMMRegs, dl,
2292                                                        MVT::i8)));
2293  }
2294
2295  if (isVarArg && IsMustTail) {
2296    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2297    for (const auto &F : Forwards) {
2298      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2299      RegsToPass.push_back(std::make_pair(F.PReg, Val));
2300    }
2301  }
2302
2303  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
2304  // don't need this because the eligibility check rejects calls that require
2305  // shuffling arguments passed in memory.
2306  if (!IsSibcall && isTailCall) {
2307    // Force all the incoming stack arguments to be loaded from the stack
2308    // before any new outgoing arguments are stored to the stack, because the
2309    // outgoing stack slots may alias the incoming argument stack slots, and
2310    // the alias isn't otherwise explicit. This is slightly more conservative
2311    // than necessary, because it means that each store effectively depends
2312    // on every argument instead of just those arguments it would clobber.
2313    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2314
2315    SmallVector<SDValue, 8> MemOpChains2;
2316    SDValue FIN;
2317    int FI = 0;
2318    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
2319         ++I, ++OutsIndex) {
2320      CCValAssign &VA = ArgLocs[I];
2321
2322      if (VA.isRegLoc()) {
2323        if (VA.needsCustom()) {
2324          assert((CallConv == CallingConv::X86_RegCall) &&
2325                 "Expecting custom case only in regcall calling convention");
          // This means that we are in the special case where one argument was
          // passed through two register locations; skip the next location.
2328          ++I;
2329        }
2330
2331        continue;
2332      }
2333
2334      assert(VA.isMemLoc());
2335      SDValue Arg = OutVals[OutsIndex];
2336      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
2337      // Skip inalloca/preallocated arguments.  They don't require any work.
2338      if (Flags.isInAlloca() || Flags.isPreallocated())
2339        continue;
2340      // Create frame index.
2341      int32_t Offset = VA.getLocMemOffset()+FPDiff;
2342      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2343      FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
2344      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2345
2346      if (Flags.isByVal()) {
2347        // Copy relative to framepointer.
2348        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
2349        if (!StackPtr.getNode())
2350          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2351                                        getPointerTy(DAG.getDataLayout()));
2352        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2353                             StackPtr, Source);
2354
2355        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2356                                                         ArgChain,
2357                                                         Flags, DAG, dl));
2358      } else {
2359        // Store relative to framepointer.
2360        MemOpChains2.push_back(DAG.getStore(
2361            ArgChain, dl, Arg, FIN,
2362            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
2363      }
2364    }
2365
2366    if (!MemOpChains2.empty())
2367      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
2368
2369    // Store the return address to the appropriate stack slot.
2370    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2371                                     getPointerTy(DAG.getDataLayout()),
2372                                     RegInfo->getSlotSize(), FPDiff, dl);
2373  }
2374
2375  // Build a sequence of copy-to-reg nodes chained together with token chain
2376  // and glue operands which copy the outgoing args into registers.
2377  SDValue InGlue;
2378  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2379    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2380                             RegsToPass[i].second, InGlue);
2381    InGlue = Chain.getValue(1);
2382  }
2383
2384  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
2385    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2386    // In the 64-bit large code model, we have to make all calls
2387    // through a register, since the call instruction's 32-bit
2388    // pc-relative offset may not be large enough to hold the whole
2389    // address.
2390  } else if (Callee->getOpcode() == ISD::GlobalAddress ||
2391             Callee->getOpcode() == ISD::ExternalSymbol) {
2392    // Lower direct calls to global addresses and external symbols. Setting
2393    // ForCall to true here has the effect of removing WrapperRIP when possible
2394    // to allow direct calls to be selected without first materializing the
2395    // address into a register.
2396    Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
2397  } else if (Subtarget.isTarget64BitILP32() &&
2398             Callee.getValueType() == MVT::i32) {
    // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI
2400    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
2401  }
2402
2403  // Returns a chain & a glue for retval copy to use.
2404  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2405  SmallVector<SDValue, 8> Ops;
2406
2407  if (!IsSibcall && isTailCall && !IsMustTail) {
2408    Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
2409    InGlue = Chain.getValue(1);
2410  }
2411
2412  Ops.push_back(Chain);
2413  Ops.push_back(Callee);
2414
2415  if (isTailCall)
2416    Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
2417
2418  // Add argument registers to the end of the list so that they are known live
2419  // into the call.
2420  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2421    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2422                                  RegsToPass[i].second.getValueType()));
2423
2424  // Add a register mask operand representing the call-preserved registers.
2425  const uint32_t *Mask = [&]() {
2426    auto AdaptedCC = CallConv;
2427    // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
2428    // use X86_INTR calling convention because it has the same CSR mask
2429    // (same preserved registers).
2430    if (HasNCSR)
2431      AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
    // to use the CSR_NoRegs_RegMask.
2434    if (CB && CB->hasFnAttr("no_callee_saved_registers"))
2435      AdaptedCC = (CallingConv::ID)CallingConv::GHC;
2436    return RegInfo->getCallPreservedMask(MF, AdaptedCC);
2437  }();
2438  assert(Mask && "Missing call preserved mask for calling convention");
2439
2440  // If this is an invoke in a 32-bit function using a funclet-based
2441  // personality, assume the function clobbers all registers. If an exception
2442  // is thrown, the runtime will not restore CSRs.
2443  // FIXME: Model this more precisely so that we can register allocate across
2444  // the normal edge and spill and fill across the exceptional edge.
2445  if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
2446    const Function &CallerFn = MF.getFunction();
2447    EHPersonality Pers =
2448        CallerFn.hasPersonalityFn()
2449            ? classifyEHPersonality(CallerFn.getPersonalityFn())
2450            : EHPersonality::Unknown;
2451    if (isFuncletEHPersonality(Pers))
2452      Mask = RegInfo->getNoPreservedMask();
2453  }
2454
2455  // Define a new register mask from the existing mask.
2456  uint32_t *RegMask = nullptr;
2457
2458  // In some calling conventions we need to remove the used physical registers
2459  // from the reg mask. Create a new RegMask for such calling conventions.
2460  // RegMask for calling conventions that disable only return registers (e.g.
2461  // preserve_most) will be modified later in LowerCallResult.
2462  bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
2463  if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
2464    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2465
2466    // Allocate a new Reg Mask and copy Mask.
2467    RegMask = MF.allocateRegMask();
2468    unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
2469    memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
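    // In a register mask a set bit means the corresponding register's value is
    // preserved across the call; clearing bits below marks those registers as
    // clobbered so they are not treated as callee-saved.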
2470
2471    // Make sure all sub registers of the argument registers are reset
2472    // in the RegMask.
2473    if (ShouldDisableArgRegs) {
2474      for (auto const &RegPair : RegsToPass)
2475        for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
2476          RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
2477    }
2478
2479    // Create the RegMask Operand according to our updated mask.
2480    Ops.push_back(DAG.getRegisterMask(RegMask));
2481  } else {
2482    // Create the RegMask Operand according to the static mask.
2483    Ops.push_back(DAG.getRegisterMask(Mask));
2484  }
2485
2486  if (InGlue.getNode())
2487    Ops.push_back(InGlue);
2488
2489  if (isTailCall) {
2490    // We used to do:
2491    //// If this is the first return lowered for this function, add the regs
2492    //// to the liveout set for the function.
2493    // This isn't right, although it's probably harmless on x86; liveouts
2494    // should be computed from returns not tail calls.  Consider a void
2495    // function making a tail call to a function returning int.
2496    MF.getFrameInfo().setHasTailCall();
2497    SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
2498
2499    if (IsCFICall)
2500      Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2501
2502    DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2503    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2504    return Ret;
2505  }
2506
2507  if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
2508    Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
2509  } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
2510    // Calls with a "clang.arc.attachedcall" bundle are special. They should be
2511    // expanded to the call, directly followed by a special marker sequence and
    // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
2513    assert(!isTailCall &&
2514           "tail calls cannot be marked with clang.arc.attachedcall");
2515    assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
2516
2517    // Add a target global address for the retainRV/claimRV runtime function
2518    // just before the call target.
2519    Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
2520    auto PtrVT = getPointerTy(DAG.getDataLayout());
2521    auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
2522    Ops.insert(Ops.begin() + 1, GA);
2523    Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
2524  } else {
2525    Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
2526  }

  if (IsCFICall)
    Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());

  InGlue = Chain.getValue(1);
  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

  // Save heapallocsite metadata.
  if (CLI.CB)
    if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
      DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       DAG.getTarget().Options.GuaranteedTailCallOpt))
    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
  else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
    // If this call passes a struct-return pointer, the callee
    // pops that struct pointer.
    NumBytesForCalleeToPop = 4;
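
  // e.g. on 32-bit, an stdcall callee taking two i32 arguments pops 8 bytes
  // with "ret 8"; in the sret case above only the 4-byte struct-return
  // pointer is popped by the callee.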

  // Returns a glue for the retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
                               InGlue, dl);
    InGlue = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, RegMask);
}

//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like the stdcall convention, the callee cleans up the arguments, except
//  that ECX is reserved for storing the address of the tail-called function.
//  Only 2 registers are free for argument passing (inreg). Tail call
//  optimization is performed provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On the X86_64 architecture with GOT-style position-independent code, only
//  local (within-module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this; Darwin's dyld
//  is one example.)
//  If the tail-called callee has more arguments than the caller, the caller
//  needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after
//  the original RETADDR, but before the saved frame pointer or the spilled
//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
//  Stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..
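//  e.g. if the caller reserved 8 bytes of stack arguments and the callee needs
//  24, the argument delta is 16 bytes, so a 16-byte move area is reserved and
//  the RETADDR can be relocated into it before the outgoing arguments
//  overwrite the caller's argument area.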

/// Align the stack size so that StackSize plus one slot (for the return
/// address) is a multiple of the stack alignment, e.g. 16n + 12 for a 16-byte
/// alignment requirement with 4-byte slots.
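/// e.g. with SlotSize = 4 and 16-byte alignment, StackSize = 20 becomes
/// alignTo(20 + 4, 16) - 4 = 32 - 4 = 28, i.e. 16n + 12.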
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
                                               SelectionDAG &DAG) const {
  const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
  const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
  assert(StackSize % SlotSize == 0 &&
         "StackSize must be a multiple of SlotSize");
  return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}

/// Return true if the given stack call argument is already available at the
/// same relative position in the caller's incoming argument stack.
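/// When it is, a tail call can reuse the caller's stack slot instead of
/// storing the argument again.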
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
    unsigned Op = Arg.getOpcode();
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
        Op == ISD::AssertZext) {
      Arg = Arg.getOperand(0);
      continue;
    }
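    // A truncate of an AssertZext back to the asserted type also leaves the
    // bits we care about unchanged, so look through that pattern too.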
    if (Op == ISD::TRUNCATE) {
      const SDValue &TruncInput = Arg.getOperand(0);
      if (TruncInput.getOpcode() == ISD::AssertZext &&
          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
              Arg.getValueType()) {
        Arg = TruncInput.getOperand(0);
        continue;
      }
    }
    break;
  }

  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!VR.isVirtual())
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
           Opcode == X86::LEA64_32r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  if (Offset != MFI.getObjectOffset(FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass the
  // mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
    return false;

  if (VA.getLocVT().getFixedSizeInBits() >
      Arg.getValueSizeInBits().getFixedValue()) {
    // If the argument location is wider than the argument type, check that any
    // extension flags match.
    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
      return false;
    }
  }

  return Bytes == MFI.getObjectSize(FI);
}

/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
    bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF.getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
  bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
      CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;

  // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;

  if (IsGuaranteeTCO) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }
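
  // e.g. under -tailcallopt, a fastcc function tail-calling another fastcc
  // function is accepted here without any of the sibcall checks below.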

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->hasStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if we are an sret-returning function and
  // the callee is incompatible. See the comment in LowerReturn about why
  // hasStructRetAttr is insufficient.
  if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
    // For a compatible tail call the callee must return our sret pointer. So it
    // needs to be (a) an sret function itself and (b) we pass our sret as its
    // sret. Condition (b) is harder to determine.
    return false;
  } else if (IsCalleePopSRet)
    // The callee pops an sret, so we cannot tail-call, as our caller doesn't
    // expect that.
    return false;
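
  // e.g. a caller that returns a struct through a hidden sret pointer must
  // hand that same pointer back in EAX/RAX; since we cannot easily prove the
  // callee would do so, we conservatively refuse the sibcall above.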

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (const auto &VA : ArgLocs)
      if (!VA.isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack. Therefore, if the result is not used, it is not safe to optimize
  // this into a sibcall.
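  // e.g. if the caller ignores an x86_fp80 result returned in ST0, a sibcall
  // would leave ST0 unpopped and unbalance the x87 register stack.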
  bool Unused = false;
  for (const auto &In : Ins) {
    if (!In.Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (const auto &VA : RVLocs) {
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  unsigned StackArgsSize = 0;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, Align(8));

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    StackArgsSize = CCInfo.getStackSize();

    if (CCInfo.getStackSize()) {
      // Check whether the arguments are already laid out in the same way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
      for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
        const CCValAssign &VA = ArgLocs[I];
        SDValue Arg = OutVals[I];
        ISD::ArgFlagsTy Flags = Outs[I].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI,
                                   TII, VA))
            return false;
        }
      }
    }

    bool PositionIndependent = isPositionIndependent();
    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
                                 PositionIndependent)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;
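      // e.g. in PIC mode, two 'inreg' arguments already assigned to ECX and
      // EDX use up the budget (the extra register is needed for the PIC
      // address computation), so the call address could not be register
      // allocated and we bail out below.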

      for (const auto &VA : ArgLocs) {
        if (!VA.isRegLoc())
          continue;
        Register Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  bool CalleeWillPop =
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt);

  if (unsigned BytesToPop =
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
    // If we have bytes to pop, the callee must pop them.
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }
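
  // e.g. a 32-bit stdcall caller that must pop 12 bytes of its own stack
  // arguments on return can only tail call a callee that also pops exactly
  // 12 bytes; otherwise the stack would be left unbalanced after the call.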

  return true;
}

/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
  // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    return !is64Bit;
  }
}
