1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86.h"
15#include "X86MachineFunctionInfo.h"
16#include "X86RegisterInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/SelectionDAGISel.h"
21#include "llvm/Config/llvm-config.h"
22#include "llvm/IR/ConstantRange.h"
23#include "llvm/IR/Function.h"
24#include "llvm/IR/Instructions.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsX86.h"
27#include "llvm/IR/Type.h"
28#include "llvm/Support/Debug.h"
29#include "llvm/Support/ErrorHandling.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Support/MathExtras.h"
32#include <stdint.h>
33using namespace llvm;
34
35#define DEBUG_TYPE "x86-isel"
36
37STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
38
39static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
40    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
41    cl::Hidden);
42
43static cl::opt<bool> EnablePromoteAnyextLoad(
44    "x86-promote-anyext-load", cl::init(true),
45    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
46
47//===----------------------------------------------------------------------===//
48//                      Pattern Matcher Implementation
49//===----------------------------------------------------------------------===//
50
51namespace {
52  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
53  /// numbers for the leaves of the matched tree.
54  struct X86ISelAddressMode {
55    enum {
56      RegBase,
57      FrameIndexBase
58    } BaseType;
59
60    // This is really a union, discriminated by BaseType!
61    SDValue Base_Reg;
62    int Base_FrameIndex;
63
64    unsigned Scale;
65    SDValue IndexReg;
66    int32_t Disp;
67    SDValue Segment;
68    const GlobalValue *GV;
69    const Constant *CP;
70    const BlockAddress *BlockAddr;
71    const char *ES;
72    MCSymbol *MCSym;
73    int JT;
74    Align Alignment;            // CP alignment.
75    unsigned char SymbolFlags;  // X86II::MO_*
76    bool NegateIndex = false;
77
78    X86ISelAddressMode()
79        : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
80          Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
81          MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {}
82
83    bool hasSymbolicDisplacement() const {
84      return GV != nullptr || CP != nullptr || ES != nullptr ||
85             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
86    }
87
88    bool hasBaseOrIndexReg() const {
89      return BaseType == FrameIndexBase ||
90             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
91    }
92
93    /// Return true if this addressing mode is already RIP-relative.
94    bool isRIPRelative() const {
95      if (BaseType != RegBase) return false;
96      if (RegisterSDNode *RegNode =
97            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
98        return RegNode->getReg() == X86::RIP;
99      return false;
100    }
101
102    void setBaseReg(SDValue Reg) {
103      BaseType = RegBase;
104      Base_Reg = Reg;
105    }
106
107#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
108    void dump(SelectionDAG *DAG = nullptr) {
109      dbgs() << "X86ISelAddressMode " << this << '\n';
110      dbgs() << "Base_Reg ";
111      if (Base_Reg.getNode())
112        Base_Reg.getNode()->dump(DAG);
113      else
114        dbgs() << "nul\n";
115      if (BaseType == FrameIndexBase)
116        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
117      dbgs() << " Scale " << Scale << '\n'
118             << "IndexReg ";
119      if (NegateIndex)
120        dbgs() << "negate ";
121      if (IndexReg.getNode())
122        IndexReg.getNode()->dump(DAG);
123      else
124        dbgs() << "nul\n";
125      dbgs() << " Disp " << Disp << '\n'
126             << "GV ";
127      if (GV)
128        GV->dump();
129      else
130        dbgs() << "nul";
131      dbgs() << " CP ";
132      if (CP)
133        CP->dump();
134      else
135        dbgs() << "nul";
136      dbgs() << '\n'
137             << "ES ";
138      if (ES)
139        dbgs() << ES;
140      else
141        dbgs() << "nul";
142      dbgs() << " MCSym ";
143      if (MCSym)
144        dbgs() << MCSym;
145      else
146        dbgs() << "nul";
147      dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
148    }
149#endif
150  };
151}
152
153namespace {
154  //===--------------------------------------------------------------------===//
155  /// ISel - X86-specific code to select X86 machine instructions for
156  /// SelectionDAG operations.
157  ///
158  class X86DAGToDAGISel final : public SelectionDAGISel {
159    /// Keep a pointer to the X86Subtarget around so that we can
160    /// make the right decision when generating code for different targets.
161    const X86Subtarget *Subtarget;
162
163    /// If true, selector should try to optimize for minimum code size.
164    bool OptForMinSize;
165
166    /// Disable direct TLS access through segment registers.
167    bool IndirectTlsSegRefs;
168
169  public:
170    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
171        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
172          OptForMinSize(false), IndirectTlsSegRefs(false) {}
173
174    StringRef getPassName() const override {
175      return "X86 DAG->DAG Instruction Selection";
176    }
177
178    bool runOnMachineFunction(MachineFunction &MF) override {
179      // Reset the subtarget each time through.
180      Subtarget = &MF.getSubtarget<X86Subtarget>();
181      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
182                             "indirect-tls-seg-refs");
183
184      // OptFor[Min]Size are used in pattern predicates that isel is matching.
185      OptForMinSize = MF.getFunction().hasMinSize();
186      assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
187             "OptForMinSize implies OptForSize");
188
189      SelectionDAGISel::runOnMachineFunction(MF);
190      return true;
191    }
192
193    void emitFunctionEntryCode() override;
194
195    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
196
197    void PreprocessISelDAG() override;
198    void PostprocessISelDAG() override;
199
200// Include the pieces autogenerated from the target description.
201#include "X86GenDAGISel.inc"
202
203  private:
204    void Select(SDNode *N) override;
205
206    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
207    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
208    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
209    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
210    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
211    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
212    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
213                                 unsigned Depth);
214    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
215    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
216                    SDValue &Scale, SDValue &Index, SDValue &Disp,
217                    SDValue &Segment);
218    bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
219                          SDValue ScaleOp, SDValue &Base, SDValue &Scale,
220                          SDValue &Index, SDValue &Disp, SDValue &Segment);
221    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
222    bool selectLEAAddr(SDValue N, SDValue &Base,
223                       SDValue &Scale, SDValue &Index, SDValue &Disp,
224                       SDValue &Segment);
225    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
226                            SDValue &Scale, SDValue &Index, SDValue &Disp,
227                            SDValue &Segment);
228    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
229                           SDValue &Scale, SDValue &Index, SDValue &Disp,
230                           SDValue &Segment);
231    bool selectRelocImm(SDValue N, SDValue &Op);
232
233    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
234                     SDValue &Base, SDValue &Scale,
235                     SDValue &Index, SDValue &Disp,
236                     SDValue &Segment);
237
238    // Convenience method where P is also root.
239    bool tryFoldLoad(SDNode *P, SDValue N,
240                     SDValue &Base, SDValue &Scale,
241                     SDValue &Index, SDValue &Disp,
242                     SDValue &Segment) {
243      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
244    }
245
246    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
247                          SDValue &Base, SDValue &Scale,
248                          SDValue &Index, SDValue &Disp,
249                          SDValue &Segment);
250
251    bool isProfitableToFormMaskedOp(SDNode *N) const;
252
253    /// Implement addressing mode selection for inline asm expressions.
254    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
255                                      unsigned ConstraintID,
256                                      std::vector<SDValue> &OutOps) override;
257
258    void emitSpecialCodeForMain();
259
260    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
261                                   MVT VT, SDValue &Base, SDValue &Scale,
262                                   SDValue &Index, SDValue &Disp,
263                                   SDValue &Segment) {
264      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
265        Base = CurDAG->getTargetFrameIndex(
266            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
267      else if (AM.Base_Reg.getNode())
268        Base = AM.Base_Reg;
269      else
270        Base = CurDAG->getRegister(0, VT);
271
272      Scale = getI8Imm(AM.Scale, DL);
273
274      // Negate the index if needed.
275      if (AM.NegateIndex) {
276        unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
277        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
278                                                     AM.IndexReg), 0);
279        AM.IndexReg = Neg;
280      }
281
282      if (AM.IndexReg.getNode())
283        Index = AM.IndexReg;
284      else
285        Index = CurDAG->getRegister(0, VT);
286
287      // These are 32-bit even in 64-bit mode since RIP-relative offset
288      // is 32-bit.
289      if (AM.GV)
290        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
291                                              MVT::i32, AM.Disp,
292                                              AM.SymbolFlags);
293      else if (AM.CP)
294        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
295                                             AM.Disp, AM.SymbolFlags);
296      else if (AM.ES) {
297        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
298        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
299      } else if (AM.MCSym) {
300        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
301        assert(AM.SymbolFlags == 0 && "oo");
302        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
303      } else if (AM.JT != -1) {
304        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
305        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
306      } else if (AM.BlockAddr)
307        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
308                                             AM.SymbolFlags);
309      else
310        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
311
312      if (AM.Segment.getNode())
313        Segment = AM.Segment;
314      else
315        Segment = CurDAG->getRegister(0, MVT::i16);
316    }
317
318    // Utility function to determine whether we should avoid selecting
319    // immediate forms of instructions for better code size or not.
320    // At a high level, we'd like to avoid such instructions when
321    // we have similar constants used within the same basic block
322    // that can be kept in a register.
323    //
324    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
325      uint32_t UseCount = 0;
326
327      // Do not want to hoist if we're not optimizing for size.
328      // TODO: We'd like to remove this restriction.
329      // See the comment in X86InstrInfo.td for more info.
330      if (!CurDAG->shouldOptForSize())
331        return false;
332
333      // Walk all the users of the immediate.
334      for (SDNode::use_iterator UI = N->use_begin(),
335           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
336
337        SDNode *User = *UI;
338
339        // This user is already selected. Count it as a legitimate use and
340        // move on.
341        if (User->isMachineOpcode()) {
342          UseCount++;
343          continue;
344        }
345
346        // We want to count stores of immediates as real uses.
347        if (User->getOpcode() == ISD::STORE &&
348            User->getOperand(1).getNode() == N) {
349          UseCount++;
350          continue;
351        }
352
353        // We don't currently match users that have > 2 operands (except
354        // for stores, which are handled above)
355        // Those instruction won't match in ISEL, for now, and would
356        // be counted incorrectly.
357        // This may change in the future as we add additional instruction
358        // types.
359        if (User->getNumOperands() != 2)
360          continue;
361
362        // If this is a sign-extended 8-bit integer immediate used in an ALU
363        // instruction, there is probably an opcode encoding to save space.
364        auto *C = dyn_cast<ConstantSDNode>(N);
365        if (C && isInt<8>(C->getSExtValue()))
366          continue;
367
368        // Immediates that are used for offsets as part of stack
369        // manipulation should be left alone. These are typically
370        // used to indicate SP offsets for argument passing and
371        // will get pulled into stores/pushes (implicitly).
372        if (User->getOpcode() == X86ISD::ADD ||
373            User->getOpcode() == ISD::ADD    ||
374            User->getOpcode() == X86ISD::SUB ||
375            User->getOpcode() == ISD::SUB) {
376
377          // Find the other operand of the add/sub.
378          SDValue OtherOp = User->getOperand(0);
379          if (OtherOp.getNode() == N)
380            OtherOp = User->getOperand(1);
381
382          // Don't count if the other operand is SP.
383          RegisterSDNode *RegNode;
384          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
385              (RegNode = dyn_cast_or_null<RegisterSDNode>(
386                 OtherOp->getOperand(1).getNode())))
387            if ((RegNode->getReg() == X86::ESP) ||
388                (RegNode->getReg() == X86::RSP))
389              continue;
390        }
391
392        // ... otherwise, count this and move on.
393        UseCount++;
394      }
395
396      // If we have more than 1 use, then recommend for hoisting.
397      return (UseCount > 1);
398    }
399
400    /// Return a target constant with the specified value of type i8.
401    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
402      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
403    }
404
405    /// Return a target constant with the specified value, of type i32.
406    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
407      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
408    }
409
410    /// Return a target constant with the specified value, of type i64.
411    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
412      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
413    }
414
415    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
416                                        const SDLoc &DL) {
417      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
418      uint64_t Index = N->getConstantOperandVal(1);
419      MVT VecVT = N->getOperand(0).getSimpleValueType();
420      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
421    }
422
423    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
424                                      const SDLoc &DL) {
425      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
426      uint64_t Index = N->getConstantOperandVal(2);
427      MVT VecVT = N->getSimpleValueType(0);
428      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
429    }
430
431    // Helper to detect unneeded and instructions on shift amounts. Called
432    // from PatFrags in tablegen.
433    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
434      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
435      const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
436
437      if (Val.countTrailingOnes() >= Width)
438        return true;
439
440      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
441      return Mask.countTrailingOnes() >= Width;
442    }
443
444    /// Return an SDNode that returns the value of the global base register.
445    /// Output instructions required to initialize the global base register,
446    /// if necessary.
447    SDNode *getGlobalBaseReg();
448
449    /// Return a reference to the TargetMachine, casted to the target-specific
450    /// type.
451    const X86TargetMachine &getTargetMachine() const {
452      return static_cast<const X86TargetMachine &>(TM);
453    }
454
455    /// Return a reference to the TargetInstrInfo, casted to the target-specific
456    /// type.
457    const X86InstrInfo *getInstrInfo() const {
458      return Subtarget->getInstrInfo();
459    }
460
461    /// Address-mode matching performs shift-of-and to and-of-shift
462    /// reassociation in order to expose more scaled addressing
463    /// opportunities.
464    bool ComplexPatternFuncMutatesDAG() const override {
465      return true;
466    }
467
468    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
469
470    // Indicates we should prefer to use a non-temporal load for this load.
471    bool useNonTemporalLoad(LoadSDNode *N) const {
472      if (!N->isNonTemporal())
473        return false;
474
475      unsigned StoreSize = N->getMemoryVT().getStoreSize();
476
477      if (N->getAlignment() < StoreSize)
478        return false;
479
480      switch (StoreSize) {
481      default: llvm_unreachable("Unsupported store size");
482      case 4:
483      case 8:
484        return false;
485      case 16:
486        return Subtarget->hasSSE41();
487      case 32:
488        return Subtarget->hasAVX2();
489      case 64:
490        return Subtarget->hasAVX512();
491      }
492    }
493
494    bool foldLoadStoreIntoMemOperand(SDNode *Node);
495    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
496    bool matchBitExtract(SDNode *Node);
497    bool shrinkAndImmediate(SDNode *N);
498    bool isMaskZeroExtended(SDNode *N) const;
499    bool tryShiftAmountMod(SDNode *N);
500    bool tryShrinkShlLogicImm(SDNode *N);
501    bool tryVPTERNLOG(SDNode *N);
502    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
503    bool tryMatchBitSelect(SDNode *N);
504
505    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
506                                const SDLoc &dl, MVT VT, SDNode *Node);
507    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
508                                const SDLoc &dl, MVT VT, SDNode *Node,
509                                SDValue &InFlag);
510
511    bool tryOptimizeRem8Extend(SDNode *N);
512
513    bool onlyUsesZeroFlag(SDValue Flags) const;
514    bool hasNoSignFlagUses(SDValue Flags) const;
515    bool hasNoCarryFlagUses(SDValue Flags) const;
516  };
517}
518
519
520// Returns true if this masked compare can be implemented legally with this
521// type.
522static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
523  unsigned Opcode = N->getOpcode();
524  if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM ||
525      Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE ||
526      Opcode == X86ISD::VFPCLASS) {
527    // We can get 256-bit 8 element types here without VLX being enabled. When
528    // this happens we will use 512-bit operations and the mask will not be
529    // zero extended.
530    EVT OpVT = N->getOperand(0).getValueType();
531    // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
532    // second operand.
533    if (Opcode == X86ISD::STRICT_CMPM)
534      OpVT = N->getOperand(1).getValueType();
535    if (OpVT.is256BitVector() || OpVT.is128BitVector())
536      return Subtarget->hasVLX();
537
538    return true;
539  }
540  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
541  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
542      Opcode == X86ISD::FSETCCM_SAE)
543    return true;
544
545  return false;
546}
547
548// Returns true if we can assume the writer of the mask has zero extended it
549// for us.
550bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
551  // If this is an AND, check if we have a compare on either side. As long as
552  // one side guarantees the mask is zero extended, the AND will preserve those
553  // zeros.
554  if (N->getOpcode() == ISD::AND)
555    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
556           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
557
558  return isLegalMaskCompare(N, Subtarget);
559}
560
561bool
562X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
563  if (OptLevel == CodeGenOpt::None) return false;
564
565  if (!N.hasOneUse())
566    return false;
567
568  if (N.getOpcode() != ISD::LOAD)
569    return true;
570
571  // Don't fold non-temporal loads if we have an instruction for them.
572  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
573    return false;
574
575  // If N is a load, do additional profitability checks.
576  if (U == Root) {
577    switch (U->getOpcode()) {
578    default: break;
579    case X86ISD::ADD:
580    case X86ISD::ADC:
581    case X86ISD::SUB:
582    case X86ISD::SBB:
583    case X86ISD::AND:
584    case X86ISD::XOR:
585    case X86ISD::OR:
586    case ISD::ADD:
587    case ISD::ADDCARRY:
588    case ISD::AND:
589    case ISD::OR:
590    case ISD::XOR: {
591      SDValue Op1 = U->getOperand(1);
592
593      // If the other operand is a 8-bit immediate we should fold the immediate
594      // instead. This reduces code size.
595      // e.g.
596      // movl 4(%esp), %eax
597      // addl $4, %eax
598      // vs.
599      // movl $4, %eax
600      // addl 4(%esp), %eax
601      // The former is 2 bytes shorter. In case where the increment is 1, then
602      // the saving can be 4 bytes (by using incl %eax).
603      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
604        if (Imm->getAPIntValue().isSignedIntN(8))
605          return false;
606
607        // If this is a 64-bit AND with an immediate that fits in 32-bits,
608        // prefer using the smaller and over folding the load. This is needed to
609        // make sure immediates created by shrinkAndImmediate are always folded.
610        // Ideally we would narrow the load during DAG combine and get the
611        // best of both worlds.
612        if (U->getOpcode() == ISD::AND &&
613            Imm->getAPIntValue().getBitWidth() == 64 &&
614            Imm->getAPIntValue().isIntN(32))
615          return false;
616
617        // If this really a zext_inreg that can be represented with a movzx
618        // instruction, prefer that.
619        // TODO: We could shrink the load and fold if it is non-volatile.
620        if (U->getOpcode() == ISD::AND &&
621            (Imm->getAPIntValue() == UINT8_MAX ||
622             Imm->getAPIntValue() == UINT16_MAX ||
623             Imm->getAPIntValue() == UINT32_MAX))
624          return false;
625
626        // ADD/SUB with can negate the immediate and use the opposite operation
627        // to fit 128 into a sign extended 8 bit immediate.
628        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
629            (-Imm->getAPIntValue()).isSignedIntN(8))
630          return false;
631
632        if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
633            (-Imm->getAPIntValue()).isSignedIntN(8) &&
634            hasNoCarryFlagUses(SDValue(U, 1)))
635          return false;
636      }
637
638      // If the other operand is a TLS address, we should fold it instead.
639      // This produces
640      // movl    %gs:0, %eax
641      // leal    i@NTPOFF(%eax), %eax
642      // instead of
643      // movl    $i@NTPOFF, %eax
644      // addl    %gs:0, %eax
645      // if the block also has an access to a second TLS address this will save
646      // a load.
647      // FIXME: This is probably also true for non-TLS addresses.
648      if (Op1.getOpcode() == X86ISD::Wrapper) {
649        SDValue Val = Op1.getOperand(0);
650        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
651          return false;
652      }
653
654      // Don't fold load if this matches the BTS/BTR/BTC patterns.
655      // BTS: (or X, (shl 1, n))
656      // BTR: (and X, (rotl -2, n))
657      // BTC: (xor X, (shl 1, n))
658      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
659        if (U->getOperand(0).getOpcode() == ISD::SHL &&
660            isOneConstant(U->getOperand(0).getOperand(0)))
661          return false;
662
663        if (U->getOperand(1).getOpcode() == ISD::SHL &&
664            isOneConstant(U->getOperand(1).getOperand(0)))
665          return false;
666      }
667      if (U->getOpcode() == ISD::AND) {
668        SDValue U0 = U->getOperand(0);
669        SDValue U1 = U->getOperand(1);
670        if (U0.getOpcode() == ISD::ROTL) {
671          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
672          if (C && C->getSExtValue() == -2)
673            return false;
674        }
675
676        if (U1.getOpcode() == ISD::ROTL) {
677          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
678          if (C && C->getSExtValue() == -2)
679            return false;
680        }
681      }
682
683      break;
684    }
685    case ISD::SHL:
686    case ISD::SRA:
687    case ISD::SRL:
688      // Don't fold a load into a shift by immediate. The BMI2 instructions
689      // support folding a load, but not an immediate. The legacy instructions
690      // support folding an immediate, but can't fold a load. Folding an
691      // immediate is preferable to folding a load.
692      if (isa<ConstantSDNode>(U->getOperand(1)))
693        return false;
694
695      break;
696    }
697  }
698
699  // Prevent folding a load if this can implemented with an insert_subreg or
700  // a move that implicitly zeroes.
701  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
702      isNullConstant(Root->getOperand(2)) &&
703      (Root->getOperand(0).isUndef() ||
704       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
705    return false;
706
707  return true;
708}
709
710// Indicates it is profitable to form an AVX512 masked operation. Returning
711// false will favor a masked register-register masked move or vblendm and the
712// operation will be selected separately.
713bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
714  assert(
715      (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
716      "Unexpected opcode!");
717
718  // If the operation has additional users, the operation will be duplicated.
719  // Check the use count to prevent that.
720  // FIXME: Are there cheap opcodes we might want to duplicate?
721  return N->getOperand(1).hasOneUse();
722}
723
724/// Replace the original chain operand of the call with
725/// load's chain operand and move load below the call's chain operand.
726static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
727                               SDValue Call, SDValue OrigChain) {
728  SmallVector<SDValue, 8> Ops;
729  SDValue Chain = OrigChain.getOperand(0);
730  if (Chain.getNode() == Load.getNode())
731    Ops.push_back(Load.getOperand(0));
732  else {
733    assert(Chain.getOpcode() == ISD::TokenFactor &&
734           "Unexpected chain operand");
735    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
736      if (Chain.getOperand(i).getNode() == Load.getNode())
737        Ops.push_back(Load.getOperand(0));
738      else
739        Ops.push_back(Chain.getOperand(i));
740    SDValue NewChain =
741      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
742    Ops.clear();
743    Ops.push_back(NewChain);
744  }
745  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
746  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
747  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
748                             Load.getOperand(1), Load.getOperand(2));
749
750  Ops.clear();
751  Ops.push_back(SDValue(Load.getNode(), 1));
752  Ops.append(Call->op_begin() + 1, Call->op_end());
753  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
754}
755
756/// Return true if call address is a load and it can be
757/// moved below CALLSEQ_START and the chains leading up to the call.
758/// Return the CALLSEQ_START by reference as a second output.
759/// In the case of a tail call, there isn't a callseq node between the call
760/// chain and the load.
761static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
762  // The transformation is somewhat dangerous if the call's chain was glued to
763  // the call. After MoveBelowOrigChain the load is moved between the call and
764  // the chain, this can create a cycle if the load is not folded. So it is
765  // *really* important that we are sure the load will be folded.
766  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
767    return false;
768  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
769  if (!LD ||
770      !LD->isSimple() ||
771      LD->getAddressingMode() != ISD::UNINDEXED ||
772      LD->getExtensionType() != ISD::NON_EXTLOAD)
773    return false;
774
775  // Now let's find the callseq_start.
776  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
777    if (!Chain.hasOneUse())
778      return false;
779    Chain = Chain.getOperand(0);
780  }
781
782  if (!Chain.getNumOperands())
783    return false;
784  // Since we are not checking for AA here, conservatively abort if the chain
785  // writes to memory. It's not safe to move the callee (a load) across a store.
786  if (isa<MemSDNode>(Chain.getNode()) &&
787      cast<MemSDNode>(Chain.getNode())->writeMem())
788    return false;
789  if (Chain.getOperand(0).getNode() == Callee.getNode())
790    return true;
791  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
792      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
793      Callee.getValue(1).hasOneUse())
794    return true;
795  return false;
796}
797
798void X86DAGToDAGISel::PreprocessISelDAG() {
799  bool MadeChange = false;
800  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
801       E = CurDAG->allnodes_end(); I != E; ) {
802    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
803
804    // If this is a target specific AND node with no flag usages, turn it back
805    // into ISD::AND to enable test instruction matching.
806    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
807      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
808                                    N->getOperand(0), N->getOperand(1));
809      --I;
810      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
811      ++I;
812      MadeChange = true;
813      continue;
814    }
815
816    /// Convert vector increment or decrement to sub/add with an all-ones
817    /// constant:
818    /// add X, <1, 1...> --> sub X, <-1, -1...>
819    /// sub X, <1, 1...> --> add X, <-1, -1...>
820    /// The all-ones vector constant can be materialized using a pcmpeq
821    /// instruction that is commonly recognized as an idiom (has no register
822    /// dependency), so that's better/smaller than loading a splat 1 constant.
823    if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
824        N->getSimpleValueType(0).isVector()) {
825
826      APInt SplatVal;
827      if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
828          SplatVal.isOneValue()) {
829        SDLoc DL(N);
830
831        MVT VT = N->getSimpleValueType(0);
832        unsigned NumElts = VT.getSizeInBits() / 32;
833        SDValue AllOnes =
834            CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
835        AllOnes = CurDAG->getBitcast(VT, AllOnes);
836
837        unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
838        SDValue Res =
839            CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
840        --I;
841        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
842        ++I;
843        MadeChange = true;
844        continue;
845      }
846    }
847
848    switch (N->getOpcode()) {
849    case X86ISD::VBROADCAST: {
850      MVT VT = N->getSimpleValueType(0);
851      // Emulate v32i16/v64i8 broadcast without BWI.
852      if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
853        MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
854        SDLoc dl(N);
855        SDValue NarrowBCast =
856            CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
857        SDValue Res =
858            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
859                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
860        unsigned Index = VT == MVT::v32i16 ? 16 : 32;
861        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
862                              CurDAG->getIntPtrConstant(Index, dl));
863
864        --I;
865        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
866        ++I;
867        MadeChange = true;
868        continue;
869      }
870
871      break;
872    }
873    case X86ISD::VBROADCAST_LOAD: {
874      MVT VT = N->getSimpleValueType(0);
875      // Emulate v32i16/v64i8 broadcast without BWI.
876      if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
877        MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
878        auto *MemNode = cast<MemSDNode>(N);
879        SDLoc dl(N);
880        SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
881        SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
882        SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
883            X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
884            MemNode->getMemOperand());
885        SDValue Res =
886            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
887                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
888        unsigned Index = VT == MVT::v32i16 ? 16 : 32;
889        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
890                              CurDAG->getIntPtrConstant(Index, dl));
891
892        --I;
893        SDValue To[] = {Res, NarrowBCast.getValue(1)};
894        CurDAG->ReplaceAllUsesWith(N, To);
895        ++I;
896        MadeChange = true;
897        continue;
898      }
899
900      break;
901    }
902    case ISD::VSELECT: {
903      // Replace VSELECT with non-mask conditions with with BLENDV.
904      if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
905        break;
906
907      assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
908      SDValue Blendv =
909          CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
910                          N->getOperand(0), N->getOperand(1), N->getOperand(2));
911      --I;
912      CurDAG->ReplaceAllUsesWith(N, Blendv.getNode());
913      ++I;
914      MadeChange = true;
915      continue;
916    }
917    case ISD::FP_ROUND:
918    case ISD::STRICT_FP_ROUND:
919    case ISD::FP_TO_SINT:
920    case ISD::FP_TO_UINT:
921    case ISD::STRICT_FP_TO_SINT:
922    case ISD::STRICT_FP_TO_UINT: {
923      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
924      // don't need 2 sets of patterns.
925      if (!N->getSimpleValueType(0).isVector())
926        break;
927
928      unsigned NewOpc;
929      switch (N->getOpcode()) {
930      default: llvm_unreachable("Unexpected opcode!");
931      case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
932      case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
933      case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
934      case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
935      case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
936      case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
937      }
938      SDValue Res;
939      if (N->isStrictFPOpcode())
940        Res =
941            CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
942                            {N->getOperand(0), N->getOperand(1)});
943      else
944        Res =
945            CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
946                            N->getOperand(0));
947      --I;
948      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
949      ++I;
950      MadeChange = true;
951      continue;
952    }
953    case ISD::SHL:
954    case ISD::SRA:
955    case ISD::SRL: {
956      // Replace vector shifts with their X86 specific equivalent so we don't
957      // need 2 sets of patterns.
958      if (!N->getValueType(0).isVector())
959        break;
960
961      unsigned NewOpc;
962      switch (N->getOpcode()) {
963      default: llvm_unreachable("Unexpected opcode!");
964      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
965      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
966      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
967      }
968      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
969                                    N->getOperand(0), N->getOperand(1));
970      --I;
971      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
972      ++I;
973      MadeChange = true;
974      continue;
975    }
976    case ISD::ANY_EXTEND:
977    case ISD::ANY_EXTEND_VECTOR_INREG: {
978      // Replace vector any extend with the zero extend equivalents so we don't
979      // need 2 sets of patterns. Ignore vXi1 extensions.
980      if (!N->getValueType(0).isVector())
981        break;
982
983      unsigned NewOpc;
984      if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
985        assert(N->getOpcode() == ISD::ANY_EXTEND &&
986               "Unexpected opcode for mask vector!");
987        NewOpc = ISD::SIGN_EXTEND;
988      } else {
989        NewOpc = N->getOpcode() == ISD::ANY_EXTEND
990                              ? ISD::ZERO_EXTEND
991                              : ISD::ZERO_EXTEND_VECTOR_INREG;
992      }
993
994      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
995                                    N->getOperand(0));
996      --I;
997      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
998      ++I;
999      MadeChange = true;
1000      continue;
1001    }
1002    case ISD::FCEIL:
1003    case ISD::STRICT_FCEIL:
1004    case ISD::FFLOOR:
1005    case ISD::STRICT_FFLOOR:
1006    case ISD::FTRUNC:
1007    case ISD::STRICT_FTRUNC:
1008    case ISD::FNEARBYINT:
1009    case ISD::STRICT_FNEARBYINT:
1010    case ISD::FRINT:
1011    case ISD::STRICT_FRINT: {
1012      // Replace fp rounding with their X86 specific equivalent so we don't
1013      // need 2 sets of patterns.
1014      unsigned Imm;
1015      switch (N->getOpcode()) {
1016      default: llvm_unreachable("Unexpected opcode!");
1017      case ISD::STRICT_FCEIL:
1018      case ISD::FCEIL:      Imm = 0xA; break;
1019      case ISD::STRICT_FFLOOR:
1020      case ISD::FFLOOR:     Imm = 0x9; break;
1021      case ISD::STRICT_FTRUNC:
1022      case ISD::FTRUNC:     Imm = 0xB; break;
1023      case ISD::STRICT_FNEARBYINT:
1024      case ISD::FNEARBYINT: Imm = 0xC; break;
1025      case ISD::STRICT_FRINT:
1026      case ISD::FRINT:      Imm = 0x4; break;
1027      }
1028      SDLoc dl(N);
1029      bool IsStrict = N->isStrictFPOpcode();
1030      SDValue Res;
1031      if (IsStrict)
1032        Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1033                              {N->getValueType(0), MVT::Other},
1034                              {N->getOperand(0), N->getOperand(1),
1035                               CurDAG->getTargetConstant(Imm, dl, MVT::i8)});
1036      else
1037        Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1038                              N->getOperand(0),
1039                              CurDAG->getTargetConstant(Imm, dl, MVT::i8));
1040      --I;
1041      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1042      ++I;
1043      MadeChange = true;
1044      continue;
1045    }
1046    case X86ISD::FANDN:
1047    case X86ISD::FAND:
1048    case X86ISD::FOR:
1049    case X86ISD::FXOR: {
1050      // Widen scalar fp logic ops to vector to reduce isel patterns.
1051      // FIXME: Can we do this during lowering/combine.
1052      MVT VT = N->getSimpleValueType(0);
1053      if (VT.isVector() || VT == MVT::f128)
1054        break;
1055
1056      MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
1057      SDLoc dl(N);
1058      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1059                                    N->getOperand(0));
1060      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1061                                    N->getOperand(1));
1062
1063      SDValue Res;
1064      if (Subtarget->hasSSE2()) {
1065        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1066        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1067        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1068        unsigned Opc;
1069        switch (N->getOpcode()) {
1070        default: llvm_unreachable("Unexpected opcode!");
1071        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1072        case X86ISD::FAND:  Opc = ISD::AND;      break;
1073        case X86ISD::FOR:   Opc = ISD::OR;       break;
1074        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
1075        }
1076        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1077        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1078      } else {
1079        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1080      }
1081      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1082                            CurDAG->getIntPtrConstant(0, dl));
1083      --I;
1084      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1085      ++I;
1086      MadeChange = true;
1087      continue;
1088    }
1089    }
1090
1091    if (OptLevel != CodeGenOpt::None &&
1092        // Only do this when the target can fold the load into the call or
1093        // jmp.
1094        !Subtarget->useIndirectThunkCalls() &&
1095        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1096         (N->getOpcode() == X86ISD::TC_RETURN &&
1097          (Subtarget->is64Bit() ||
1098           !getTargetMachine().isPositionIndependent())))) {
1099      /// Also try moving call address load from outside callseq_start to just
1100      /// before the call to allow it to be folded.
1101      ///
1102      ///     [Load chain]
1103      ///         ^
1104      ///         |
1105      ///       [Load]
1106      ///       ^    ^
1107      ///       |    |
1108      ///      /      \--
1109      ///     /          |
1110      ///[CALLSEQ_START] |
1111      ///     ^          |
1112      ///     |          |
1113      /// [LOAD/C2Reg]   |
1114      ///     |          |
1115      ///      \        /
1116      ///       \      /
1117      ///       [CALL]
1118      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1119      SDValue Chain = N->getOperand(0);
1120      SDValue Load  = N->getOperand(1);
1121      if (!isCalleeLoad(Load, Chain, HasCallSeq))
1122        continue;
1123      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1124      ++NumLoadMoved;
1125      MadeChange = true;
1126      continue;
1127    }
1128
1129    // Lower fpround and fpextend nodes that target the FP stack to be store and
1130    // load to the stack.  This is a gross hack.  We would like to simply mark
1131    // these as being illegal, but when we do that, legalize produces these when
1132    // it expands calls, then expands these in the same legalize pass.  We would
1133    // like dag combine to be able to hack on these between the call expansion
1134    // and the node legalization.  As such this pass basically does "really
1135    // late" legalization of these inline with the X86 isel pass.
1136    // FIXME: This should only happen when not compiled with -O0.
1137    switch (N->getOpcode()) {
1138    default: continue;
1139    case ISD::FP_ROUND:
1140    case ISD::FP_EXTEND:
1141    {
1142      MVT SrcVT = N->getOperand(0).getSimpleValueType();
1143      MVT DstVT = N->getSimpleValueType(0);
1144
1145      // If any of the sources are vectors, no fp stack involved.
1146      if (SrcVT.isVector() || DstVT.isVector())
1147        continue;
1148
1149      // If the source and destination are SSE registers, then this is a legal
1150      // conversion that should not be lowered.
1151      const X86TargetLowering *X86Lowering =
1152          static_cast<const X86TargetLowering *>(TLI);
1153      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1154      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1155      if (SrcIsSSE && DstIsSSE)
1156        continue;
1157
1158      if (!SrcIsSSE && !DstIsSSE) {
1159        // If this is an FPStack extension, it is a noop.
1160        if (N->getOpcode() == ISD::FP_EXTEND)
1161          continue;
1162        // If this is a value-preserving FPStack truncation, it is a noop.
1163        if (N->getConstantOperandVal(1))
1164          continue;
1165      }
1166
1167      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1168      // FPStack has extload and truncstore.  SSE can fold direct loads into other
1169      // operations.  Based on this, decide what we want to do.
1170      MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1171      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1172      int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1173      MachinePointerInfo MPI =
1174          MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1175      SDLoc dl(N);
1176
1177      // FIXME: optimize the case where the src/dest is a load or store?
1178
1179      SDValue Store = CurDAG->getTruncStore(
1180          CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1181      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1182                                          MemTmp, MPI, MemVT);
1183
1184      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1185      // extload we created.  This will cause general havok on the dag because
1186      // anything below the conversion could be folded into other existing nodes.
1187      // To avoid invalidating 'I', back it up to the convert node.
1188      --I;
1189      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1190      break;
1191    }
1192
1193    //The sequence of events for lowering STRICT_FP versions of these nodes requires
1194    //dealing with the chain differently, as there is already a preexisting chain.
1195    case ISD::STRICT_FP_ROUND:
1196    case ISD::STRICT_FP_EXTEND:
1197    {
1198      MVT SrcVT = N->getOperand(1).getSimpleValueType();
1199      MVT DstVT = N->getSimpleValueType(0);
1200
1201      // If any of the sources are vectors, no fp stack involved.
1202      if (SrcVT.isVector() || DstVT.isVector())
1203        continue;
1204
1205      // If the source and destination are SSE registers, then this is a legal
1206      // conversion that should not be lowered.
1207      const X86TargetLowering *X86Lowering =
1208          static_cast<const X86TargetLowering *>(TLI);
1209      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1210      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1211      if (SrcIsSSE && DstIsSSE)
1212        continue;
1213
1214      if (!SrcIsSSE && !DstIsSSE) {
1215        // If this is an FPStack extension, it is a noop.
1216        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1217          continue;
1218        // If this is a value-preserving FPStack truncation, it is a noop.
1219        if (N->getConstantOperandVal(2))
1220          continue;
1221      }
1222
1223      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1224      // FPStack has extload and truncstore.  SSE can fold direct loads into other
1225      // operations.  Based on this, decide what we want to do.
1226      MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1227      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1228      int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1229      MachinePointerInfo MPI =
1230          MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1231      SDLoc dl(N);
1232
1233      // FIXME: optimize the case where the src/dest is a load or store?
1234
1235      //Since the operation is StrictFP, use the preexisting chain.
1236      SDValue Store, Result;
1237      if (!SrcIsSSE) {
1238        SDVTList VTs = CurDAG->getVTList(MVT::Other);
1239        SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1240        Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1241                                            MPI, /*Align*/ None,
1242                                            MachineMemOperand::MOStore);
1243        if (N->getFlags().hasNoFPExcept()) {
1244          SDNodeFlags Flags = Store->getFlags();
1245          Flags.setNoFPExcept(true);
1246          Store->setFlags(Flags);
1247        }
1248      } else {
1249        assert(SrcVT == MemVT && "Unexpected VT!");
1250        Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1251                                 MPI);
1252      }
1253
1254      if (!DstIsSSE) {
1255        SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1256        SDValue Ops[] = {Store, MemTmp};
1257        Result = CurDAG->getMemIntrinsicNode(
1258            X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1259            /*Align*/ None, MachineMemOperand::MOLoad);
1260        if (N->getFlags().hasNoFPExcept()) {
1261          SDNodeFlags Flags = Result->getFlags();
1262          Flags.setNoFPExcept(true);
1263          Result->setFlags(Flags);
1264        }
1265      } else {
1266        assert(DstVT == MemVT && "Unexpected VT!");
1267        Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1268      }
1269
1270      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1271      // extload we created.  This will cause general havok on the dag because
1272      // anything below the conversion could be folded into other existing nodes.
1273      // To avoid invalidating 'I', back it up to the convert node.
1274      --I;
1275      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1276      break;
1277    }
1278    }
1279
1280
1281    // Now that we did that, the node is dead.  Increment the iterator to the
1282    // next node to process, then delete N.
1283    ++I;
1284    MadeChange = true;
1285  }
1286
1287  // Remove any dead nodes that may have been left behind.
1288  if (MadeChange)
1289    CurDAG->RemoveDeadNodes();
1290}
1291
1292// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1293bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1294  unsigned Opc = N->getMachineOpcode();
1295  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1296      Opc != X86::MOVSX64rr8)
1297    return false;
1298
1299  SDValue N0 = N->getOperand(0);
1300
1301  // We need to be extracting the lower bit of an extend.
1302  if (!N0.isMachineOpcode() ||
1303      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1304      N0.getConstantOperandVal(1) != X86::sub_8bit)
1305    return false;
1306
1307  // We're looking for either a movsx or movzx to match the original opcode.
1308  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1309                                                : X86::MOVSX32rr8_NOREX;
1310  SDValue N00 = N0.getOperand(0);
1311  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1312    return false;
1313
1314  if (Opc == X86::MOVSX64rr8) {
1315    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1316    // to 64.
1317    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1318                                                   MVT::i64, N00);
1319    ReplaceUses(N, Extend);
1320  } else {
1321    // Ok we can drop this extend and just use the original extend.
1322    ReplaceUses(N, N00.getNode());
1323  }
1324
1325  return true;
1326}
1327
1328void X86DAGToDAGISel::PostprocessISelDAG() {
1329  // Skip peepholes at -O0.
1330  if (TM.getOptLevel() == CodeGenOpt::None)
1331    return;
1332
1333  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1334
1335  bool MadeChange = false;
1336  while (Position != CurDAG->allnodes_begin()) {
1337    SDNode *N = &*--Position;
1338    // Skip dead nodes and any non-machine opcodes.
1339    if (N->use_empty() || !N->isMachineOpcode())
1340      continue;
1341
1342    if (tryOptimizeRem8Extend(N)) {
1343      MadeChange = true;
1344      continue;
1345    }
1346
1347    // Look for a TESTrr+ANDrr pattern where both operands of the test are
1348    // the same. Rewrite to remove the AND.
1349    unsigned Opc = N->getMachineOpcode();
1350    if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
1351         Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
1352        N->getOperand(0) == N->getOperand(1) &&
1353        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1354        N->getOperand(0).isMachineOpcode()) {
1355      SDValue And = N->getOperand(0);
1356      unsigned N0Opc = And.getMachineOpcode();
1357      if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
1358          N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
1359        MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1360                                                     MVT::i32,
1361                                                     And.getOperand(0),
1362                                                     And.getOperand(1));
1363        ReplaceUses(N, Test);
1364        MadeChange = true;
1365        continue;
1366      }
1367      if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
1368          N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
1369        unsigned NewOpc;
1370        switch (N0Opc) {
1371        case X86::AND8rm:  NewOpc = X86::TEST8mr; break;
1372        case X86::AND16rm: NewOpc = X86::TEST16mr; break;
1373        case X86::AND32rm: NewOpc = X86::TEST32mr; break;
1374        case X86::AND64rm: NewOpc = X86::TEST64mr; break;
1375        }
1376
1377        // Need to swap the memory and register operand.
1378        SDValue Ops[] = { And.getOperand(1),
1379                          And.getOperand(2),
1380                          And.getOperand(3),
1381                          And.getOperand(4),
1382                          And.getOperand(5),
1383                          And.getOperand(0),
1384                          And.getOperand(6)  /* Chain */ };
1385        MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1386                                                     MVT::i32, MVT::Other, Ops);
1387        CurDAG->setNodeMemRefs(
1388            Test, cast<MachineSDNode>(And.getNode())->memoperands());
1389        ReplaceUses(N, Test);
1390        MadeChange = true;
1391        continue;
1392      }
1393    }
1394
1395    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1396    // used. We're doing this late so we can prefer to fold the AND into masked
1397    // comparisons. Doing that can be better for the live range of the mask
1398    // register.
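    // For illustration: "%k = KANDWrr %k1, %k2; KORTESTWrr %k, %k" becomes
    // "KTESTWrr %k1, %k2" (for the W form only when AVX512DQ is available,
    // per the check below).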
1399    if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
1400         Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
1401        N->getOperand(0) == N->getOperand(1) &&
1402        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1403        N->getOperand(0).isMachineOpcode() &&
1404        onlyUsesZeroFlag(SDValue(N, 0))) {
1405      SDValue And = N->getOperand(0);
1406      unsigned N0Opc = And.getMachineOpcode();
1407      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1408      // KAND instructions and KTEST use the same ISA feature.
1409      if (N0Opc == X86::KANDBrr ||
1410          (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
1411          N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
1412        unsigned NewOpc;
1413        switch (Opc) {
1414        default: llvm_unreachable("Unexpected opcode!");
1415        case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
1416        case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
1417        case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
1418        case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
1419        }
1420        MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1421                                                      MVT::i32,
1422                                                      And.getOperand(0),
1423                                                      And.getOperand(1));
1424        ReplaceUses(N, KTest);
1425        MadeChange = true;
1426        continue;
1427      }
1428    }
1429
1430    // Attempt to remove vector moves that were inserted to zero upper bits.
1431    if (Opc != TargetOpcode::SUBREG_TO_REG)
1432      continue;
1433
1434    unsigned SubRegIdx = N->getConstantOperandVal(2);
1435    if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1436      continue;
1437
1438    SDValue Move = N->getOperand(1);
1439    if (!Move.isMachineOpcode())
1440      continue;
1441
1442    // Make sure it's one of the move opcodes we recognize.
1443    switch (Move.getMachineOpcode()) {
1444    default:
1445      continue;
1446    case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
1447    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
1448    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
1449    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
1450    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
1451    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
1452    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
1453    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
1454    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1455    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1456    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
1457    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
1458    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1459    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1460      break;
1461    }
1462
1463    SDValue In = Move.getOperand(0);
1464    if (!In.isMachineOpcode() ||
1465        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1466      continue;
1467
1468    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1469    // the SHA instructions, which use a legacy encoding.
1470    uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1471    if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1472        (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1473        (TSFlags & X86II::EncodingMask) != X86II::XOP)
1474      continue;
1475
1476    // The producing instruction is another vector instruction, so we can drop
1477    // the move.
1478    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1479    MadeChange = true;
1480  }
1481
1482  if (MadeChange)
1483    CurDAG->RemoveDeadNodes();
1484}
1485
1486
1487/// Emit any code that needs to be executed only in the main function.
1488void X86DAGToDAGISel::emitSpecialCodeForMain() {
1489  if (Subtarget->isTargetCygMing()) {
1490    TargetLowering::ArgListTy Args;
1491    auto &DL = CurDAG->getDataLayout();
1492
1493    TargetLowering::CallLoweringInfo CLI(*CurDAG);
1494    CLI.setChain(CurDAG->getRoot())
1495        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1496                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1497                   std::move(Args));
1498    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1499    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1500    CurDAG->setRoot(Result.second);
1501  }
1502}
1503
1504void X86DAGToDAGISel::emitFunctionEntryCode() {
1505  // If this is main, emit special code for main.
1506  const Function &F = MF->getFunction();
1507  if (F.hasExternalLinkage() && F.getName() == "main")
1508    emitSpecialCodeForMain();
1509}
1510
1511static bool isDispSafeForFrameIndex(int64_t Val) {
1512  // On 64-bit platforms, we can run into an issue where a frame index
1513  // includes a displacement that, when added to the explicit displacement,
1514  // will overflow the displacement field. Assuming that the frame index
1515  // displacement fits into a 31-bit integer (which is only slightly more
1516  // aggressive than the current fundamental assumption that it fits into
1517  // a 32-bit integer), a 31-bit disp should always be safe.
1518  return isInt<31>(Val);
1519}
1520
1521bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1522                                            X86ISelAddressMode &AM) {
1523  // We may have already matched a displacement and the caller just added the
1524  // symbolic displacement. So we still need to do the checks even if Offset
1525  // is zero.
1526
1527  int64_t Val = AM.Disp + Offset;
1528
1529  // Cannot combine ExternalSymbol displacements with integer offsets.
1530  if (Val != 0 && (AM.ES || AM.MCSym))
1531    return true;
1532
1533  CodeModel::Model M = TM.getCodeModel();
1534  if (Subtarget->is64Bit()) {
1535    if (Val != 0 &&
1536        !X86::isOffsetSuitableForCodeModel(Val, M,
1537                                           AM.hasSymbolicDisplacement()))
1538      return true;
1539    // In addition to the checks required for a register base, check that
1540    // we do not try to use an unsafe Disp with a frame index.
1541    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1542        !isDispSafeForFrameIndex(Val))
1543      return true;
1544  }
1545  AM.Disp = Val;
1546  return false;
1547
1548}
1549
1550bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
1551  SDValue Address = N->getOperand(1);
1552
1553  // load gs:0 -> GS segment register.
1554  // load fs:0 -> FS segment register.
1555  //
1556  // This optimization is valid because the GNU TLS model defines that
1557  // gs:0 (or fs:0 on X86-64) contains its own address.
1558  // For more information see http://people.redhat.com/drepper/tls.pdf
1559  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
1560    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
1561        !IndirectTlsSegRefs &&
1562        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1563         Subtarget->isTargetFuchsia()))
1564      switch (N->getPointerInfo().getAddrSpace()) {
1565      case X86AS::GS:
1566        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1567        return false;
1568      case X86AS::FS:
1569        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1570        return false;
1571      // Address space X86AS::SS is not handled here, because it is not used to
1572      // address TLS areas.
1573      }
1574
1575  return true;
1576}
1577
1578/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1579/// mode. These wrap things that will resolve down into a symbol reference.
1580/// If no match is possible, this returns true, otherwise it returns false.
1581bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1582  // If the addressing mode already has a symbol as the displacement, we can
1583  // never match another symbol.
1584  if (AM.hasSymbolicDisplacement())
1585    return true;
1586
1587  bool IsRIPRelTLS = false;
1588  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1589  if (IsRIPRel) {
1590    SDValue Val = N.getOperand(0);
1591    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1592      IsRIPRelTLS = true;
1593  }
1594
1595  // We can't use an addressing mode in the 64-bit large code model.
1596  // Global TLS addressing is an exception. In the medium code model,
1597  // we can use an addressing mode when RIP wrappers are present.
1598  // That signifies access to globals that are known to be "near",
1599  // such as the GOT itself.
1600  CodeModel::Model M = TM.getCodeModel();
1601  if (Subtarget->is64Bit() &&
1602      ((M == CodeModel::Large && !IsRIPRelTLS) ||
1603       (M == CodeModel::Medium && !IsRIPRel)))
1604    return true;
1605
1606  // Base and index reg must be 0 in order to use %rip as base.
1607  if (IsRIPRel && AM.hasBaseOrIndexReg())
1608    return true;
1609
1610  // Make a local copy in case we can't do this fold.
1611  X86ISelAddressMode Backup = AM;
1612
1613  int64_t Offset = 0;
1614  SDValue N0 = N.getOperand(0);
1615  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1616    AM.GV = G->getGlobal();
1617    AM.SymbolFlags = G->getTargetFlags();
1618    Offset = G->getOffset();
1619  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1620    AM.CP = CP->getConstVal();
1621    AM.Alignment = CP->getAlign();
1622    AM.SymbolFlags = CP->getTargetFlags();
1623    Offset = CP->getOffset();
1624  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1625    AM.ES = S->getSymbol();
1626    AM.SymbolFlags = S->getTargetFlags();
1627  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1628    AM.MCSym = S->getMCSymbol();
1629  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
1630    AM.JT = J->getIndex();
1631    AM.SymbolFlags = J->getTargetFlags();
1632  } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1633    AM.BlockAddr = BA->getBlockAddress();
1634    AM.SymbolFlags = BA->getTargetFlags();
1635    Offset = BA->getOffset();
1636  } else
1637    llvm_unreachable("Unhandled symbol reference node.");
1638
1639  if (foldOffsetIntoAddress(Offset, AM)) {
1640    AM = Backup;
1641    return true;
1642  }
1643
1644  if (IsRIPRel)
1645    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1646
1647  // Commit the changes now that we know this fold is safe.
1648  return false;
1649}
1650
1651/// Add the specified node to the specified addressing mode, returning true if
1652/// it cannot be done. This just pattern matches for the addressing mode.
1653bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1654  if (matchAddressRecursively(N, AM, 0))
1655    return true;
1656
1657  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1658  // a smaller encoding and avoids a scaled-index.
1659  if (AM.Scale == 2 &&
1660      AM.BaseType == X86ISelAddressMode::RegBase &&
1661      AM.Base_Reg.getNode() == nullptr) {
1662    AM.Base_Reg = AM.IndexReg;
1663    AM.Scale = 1;
1664  }
1665
1666  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1667  // because it has a smaller encoding.
1668  // TODO: Which other code models can use this?
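  // For example, an access to a global "g" with no base or index register can
  // then be emitted as g(%rip) instead of using an absolute 32-bit
  // displacement.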
1669  switch (TM.getCodeModel()) {
1670    default: break;
1671    case CodeModel::Small:
1672    case CodeModel::Kernel:
1673      if (Subtarget->is64Bit() &&
1674          AM.Scale == 1 &&
1675          AM.BaseType == X86ISelAddressMode::RegBase &&
1676          AM.Base_Reg.getNode() == nullptr &&
1677          AM.IndexReg.getNode() == nullptr &&
1678          AM.SymbolFlags == X86II::MO_NO_FLAG &&
1679          AM.hasSymbolicDisplacement())
1680        AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1681      break;
1682  }
1683
1684  return false;
1685}
1686
1687bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1688                               unsigned Depth) {
1689  // Add an artificial use to this node so that we can keep track of
1690  // it if it gets CSE'd with a different node.
1691  HandleSDNode Handle(N);
1692
1693  X86ISelAddressMode Backup = AM;
1694  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1695      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1696    return false;
1697  AM = Backup;
1698
1699  // Try again after commuting the operands.
1700  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
1701                               Depth + 1) &&
1702      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
1703    return false;
1704  AM = Backup;
1705
1706  // If we couldn't fold both operands into the address at the same time,
1707  // see if we can just put each operand into a register and fold at least
1708  // the add.
1709  if (AM.BaseType == X86ISelAddressMode::RegBase &&
1710      !AM.Base_Reg.getNode() &&
1711      !AM.IndexReg.getNode()) {
1712    N = Handle.getValue();
1713    AM.Base_Reg = N.getOperand(0);
1714    AM.IndexReg = N.getOperand(1);
1715    AM.Scale = 1;
1716    return false;
1717  }
1718  N = Handle.getValue();
1719  return true;
1720}
1721
1722// Insert a node into the DAG at least before the Pos node's position. This
1723// will reposition the node as needed, and will assign it a node ID that is <=
1724// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1725// IDs! The selection DAG must no longer depend on their uniqueness when this
1726// is used.
1727static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1728  if (N->getNodeId() == -1 ||
1729      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1730       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
1731    DAG.RepositionNode(Pos->getIterator(), N.getNode());
1732    // Mark Node as invalid for pruning, since after this it may be a successor
1733    // to a selected node but otherwise be in the same position as Pos.
1734    // Conservatively mark it with the same -abs(Id) to ensure the node id
1735    // invariant is preserved.
1736    N->setNodeId(Pos->getNodeId());
1737    SelectionDAGISel::InvalidateNodeId(N.getNode());
1738  }
1739}
1740
1741// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1742// safe. This allows us to convert the shift and AND into an h-register
1743// extract and a scaled index. Returns false if the simplification is
1744// performed.
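// For illustration, with C1 == 2: "((X >> 6) & 0x3fc)" becomes
// "(((X >> 8) & 0xff) << 2)", and the final shift-left is absorbed into the
// addressing mode as Scale == 4 with the 8-bit extract as the index.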
1745static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1746                                      uint64_t Mask,
1747                                      SDValue Shift, SDValue X,
1748                                      X86ISelAddressMode &AM) {
1749  if (Shift.getOpcode() != ISD::SRL ||
1750      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1751      !Shift.hasOneUse())
1752    return true;
1753
1754  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
1755  if (ScaleLog <= 0 || ScaleLog >= 4 ||
1756      Mask != (0xffu << ScaleLog))
1757    return true;
1758
1759  MVT VT = N.getSimpleValueType();
1760  SDLoc DL(N);
1761  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
1762  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
1763  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
1764  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
1765  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
1766  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
1767
1768  // Insert the new nodes into the topological ordering. We must do this in
1769  // a valid topological ordering as nothing is going to go back and re-sort
1770  // these nodes. We continually insert before 'N' in sequence as this is
1771  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1772  // hierarchy left to express.
1773  insertDAGNode(DAG, N, Eight);
1774  insertDAGNode(DAG, N, Srl);
1775  insertDAGNode(DAG, N, NewMask);
1776  insertDAGNode(DAG, N, And);
1777  insertDAGNode(DAG, N, ShlCount);
1778  insertDAGNode(DAG, N, Shl);
1779  DAG.ReplaceAllUsesWith(N, Shl);
1780  DAG.RemoveDeadNode(N.getNode());
1781  AM.IndexReg = And;
1782  AM.Scale = (1 << ScaleLog);
1783  return false;
1784}
1785
1786// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
1787// allows us to fold the shift into this addressing mode. Returns false if the
1788// transform succeeded.
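// For illustration: "((X << 2) & 0x3fc)" becomes "((X & 0xff) << 2)", and the
// shift-left is then folded into the addressing mode as Scale == 4 with
// (X & 0xff) as the index register.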
1789static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
1790                                        X86ISelAddressMode &AM) {
1791  SDValue Shift = N.getOperand(0);
1792
1793  // Use a signed mask so that shifting right will insert sign bits. These
1794  // bits will be removed when we shift the result left so it doesn't matter
1795  // what we use. This might allow a smaller immediate encoding.
1796  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
1797
1798  // If we have an any_extend feeding the AND, look through it to see if there
1799  // is a shift behind it. But only if the AND doesn't use the extended bits.
1800  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
1801  bool FoundAnyExtend = false;
1802  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
1803      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
1804      isUInt<32>(Mask)) {
1805    FoundAnyExtend = true;
1806    Shift = Shift.getOperand(0);
1807  }
1808
1809  if (Shift.getOpcode() != ISD::SHL ||
1810      !isa<ConstantSDNode>(Shift.getOperand(1)))
1811    return true;
1812
1813  SDValue X = Shift.getOperand(0);
1814
1815  // Not likely to be profitable if either the AND or SHIFT node has more
1816  // than one use (unless all uses are for address computation). Besides,
1817  // the isel mechanism requires their node ids to be reused.
1818  if (!N.hasOneUse() || !Shift.hasOneUse())
1819    return true;
1820
1821  // Verify that the shift amount is something we can fold.
1822  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1823  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
1824    return true;
1825
1826  MVT VT = N.getSimpleValueType();
1827  SDLoc DL(N);
1828  if (FoundAnyExtend) {
1829    SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
1830    insertDAGNode(DAG, N, NewX);
1831    X = NewX;
1832  }
1833
1834  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
1835  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
1836  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
1837
1838  // Insert the new nodes into the topological ordering. We must do this in
1839  // a valid topological ordering as nothing is going to go back and re-sort
1840  // these nodes. We continually insert before 'N' in sequence as this is
1841  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1842  // hierarchy left to express.
1843  insertDAGNode(DAG, N, NewMask);
1844  insertDAGNode(DAG, N, NewAnd);
1845  insertDAGNode(DAG, N, NewShift);
1846  DAG.ReplaceAllUsesWith(N, NewShift);
1847  DAG.RemoveDeadNode(N.getNode());
1848
1849  AM.Scale = 1 << ShiftAmt;
1850  AM.IndexReg = NewAnd;
1851  return false;
1852}
1853
1854// Implement some heroics to detect shifts of masked values where the mask can
1855// be replaced by extending the shift and undoing that in the addressing mode
1856// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1857// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1858// the addressing mode. This results in code such as:
1859//
1860//   int f(short *y, int *lookup_table) {
1861//     ...
1862//     return *y + lookup_table[*y >> 11];
1863//   }
1864//
1865// Turning into:
1866//   movzwl (%rdi), %eax
1867//   movl %eax, %ecx
1868//   shrl $11, %ecx
1869//   addl (%rsi,%rcx,4), %eax
1870//
1871// Instead of:
1872//   movzwl (%rdi), %eax
1873//   movl %eax, %ecx
1874//   shrl $9, %ecx
1875//   andl $124, %ecx
1876//   addl (%rsi,%rcx), %eax
1877//
1878// Note that this function assumes the mask is provided as a mask *after* the
1879// value is shifted. The input chain may or may not match that, but computing
1880// such a mask is trivial.
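// For the example above (illustrative): the canonicalized form is
// (and (srl x, 9), 0x7c); the mask has two trailing zeros, so the srl is
// widened to 11 and the remaining shift-left by 2 becomes Scale == 4.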
1881static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
1882                                    uint64_t Mask,
1883                                    SDValue Shift, SDValue X,
1884                                    X86ISelAddressMode &AM) {
1885  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
1886      !isa<ConstantSDNode>(Shift.getOperand(1)))
1887    return true;
1888
1889  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1890  unsigned MaskLZ = countLeadingZeros(Mask);
1891  unsigned MaskTZ = countTrailingZeros(Mask);
1892
1893  // The amount of shift we're trying to fit into the addressing mode is taken
1894  // from the trailing zeros of the mask.
1895  unsigned AMShiftAmt = MaskTZ;
1896
1897  // There is nothing we can do here unless the mask is removing some bits.
1898  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1899  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
1900
1901  // We also need to ensure that the mask is a contiguous run of bits.
1902  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
1903
1904  // Scale the leading zero count down based on the actual size of the value.
1905  // Also scale it down based on the size of the shift.
1906  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
1907  if (MaskLZ < ScaleDown)
1908    return true;
1909  MaskLZ -= ScaleDown;
1910
1911  // The final check is to ensure that any masked out high bits of X are
1912  // already known to be zero. Otherwise, the mask has a semantic impact
1913  // other than masking out a couple of low bits. Unfortunately, because of
1914  // the mask, zero extensions will be removed from operands in some cases.
1915  // This code works extra hard to look through extensions because we can
1916  // replace them with zero extensions cheaply if necessary.
1917  bool ReplacingAnyExtend = false;
1918  if (X.getOpcode() == ISD::ANY_EXTEND) {
1919    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
1920                          X.getOperand(0).getSimpleValueType().getSizeInBits();
1921    // Assume that we'll replace the any-extend with a zero-extend, and
1922    // narrow the search to the extended value.
1923    X = X.getOperand(0);
1924    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
1925    ReplacingAnyExtend = true;
1926  }
1927  APInt MaskedHighBits =
1928    APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
1929  KnownBits Known = DAG.computeKnownBits(X);
1930  if (MaskedHighBits != Known.Zero) return true;
1931
1932  // We've identified a pattern that can be transformed into a single shift
1933  // and an addressing mode. Make it so.
1934  MVT VT = N.getSimpleValueType();
1935  if (ReplacingAnyExtend) {
1936    assert(X.getValueType() != VT);
1937    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1938    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
1939    insertDAGNode(DAG, N, NewX);
1940    X = NewX;
1941  }
1942  SDLoc DL(N);
1943  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1944  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1945  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1946  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
1947
1948  // Insert the new nodes into the topological ordering. We must do this in
1949  // a valid topological ordering as nothing is going to go back and re-sort
1950  // these nodes. We continually insert before 'N' in sequence as this is
1951  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1952  // hierarchy left to express.
1953  insertDAGNode(DAG, N, NewSRLAmt);
1954  insertDAGNode(DAG, N, NewSRL);
1955  insertDAGNode(DAG, N, NewSHLAmt);
1956  insertDAGNode(DAG, N, NewSHL);
1957  DAG.ReplaceAllUsesWith(N, NewSHL);
1958  DAG.RemoveDeadNode(N.getNode());
1959
1960  AM.Scale = 1 << AMShiftAmt;
1961  AM.IndexReg = NewSRL;
1962  return false;
1963}
1964
1965// Transform "(X >> SHIFT) & (MASK << C1)" to
1966// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
1967// matched to a BEXTR later. Returns false if the simplification is performed.
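// For illustration, with SHIFT == 4 and C1 == 2: "((X >> 4) & (0xfff << 2))"
// becomes "(((X >> 6) & 0xfff) << 2)"; the srl+and can later match BEXTR and
// the trailing shl is expressed as Scale == 4.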
1968static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
1969                                   uint64_t Mask,
1970                                   SDValue Shift, SDValue X,
1971                                   X86ISelAddressMode &AM,
1972                                   const X86Subtarget &Subtarget) {
1973  if (Shift.getOpcode() != ISD::SRL ||
1974      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1975      !Shift.hasOneUse() || !N.hasOneUse())
1976    return true;
1977
1978  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
1979  if (!Subtarget.hasTBM() &&
1980      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
1981    return true;
1982
1983  // We need to ensure that the mask is a contiguous run of bits.
1984  if (!isShiftedMask_64(Mask)) return true;
1985
1986  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1987
1988  // The amount of shift we're trying to fit into the addressing mode is taken
1989  // from the trailing zeros of the mask.
1990  unsigned AMShiftAmt = countTrailingZeros(Mask);
1991
1992  // There is nothing we can do here unless the mask is removing some bits.
1993  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1994  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
1995
1996  MVT VT = N.getSimpleValueType();
1997  SDLoc DL(N);
1998  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1999  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
2000  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
2001  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
2002  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2003  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
2004
2005  // Insert the new nodes into the topological ordering. We must do this in
2006  // a valid topological ordering as nothing is going to go back and re-sort
2007  // these nodes. We continually insert before 'N' in sequence as this is
2008  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2009  // hierarchy left to express.
2010  insertDAGNode(DAG, N, NewSRLAmt);
2011  insertDAGNode(DAG, N, NewSRL);
2012  insertDAGNode(DAG, N, NewMask);
2013  insertDAGNode(DAG, N, NewAnd);
2014  insertDAGNode(DAG, N, NewSHLAmt);
2015  insertDAGNode(DAG, N, NewSHL);
2016  DAG.ReplaceAllUsesWith(N, NewSHL);
2017  DAG.RemoveDeadNode(N.getNode());
2018
2019  AM.Scale = 1 << AMShiftAmt;
2020  AM.IndexReg = NewAnd;
2021  return false;
2022}
2023
2024bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2025                                              unsigned Depth) {
2026  SDLoc dl(N);
2027  LLVM_DEBUG({
2028    dbgs() << "MatchAddress: ";
2029    AM.dump(CurDAG);
2030  });
2031  // Limit recursion.
2032  if (Depth > 5)
2033    return matchAddressBase(N, AM);
2034
2035  // If this is already a %rip relative address, we can only merge immediates
2036  // into it.  Instead of handling this in every case, we handle it here.
2037  // RIP relative addressing: %rip + 32-bit displacement!
2038  if (AM.isRIPRelative()) {
2039    // FIXME: JumpTable and ExternalSymbol address currently don't like
2040    // displacements.  It isn't very important, but this should be fixed for
2041    // consistency.
2042    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2043      return true;
2044
2045    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
2046      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2047        return false;
2048    return true;
2049  }
2050
2051  switch (N.getOpcode()) {
2052  default: break;
2053  case ISD::LOCAL_RECOVER: {
2054    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2055      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2056        // Use the symbol and don't prefix it.
2057        AM.MCSym = ESNode->getMCSymbol();
2058        return false;
2059      }
2060    break;
2061  }
2062  case ISD::Constant: {
2063    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2064    if (!foldOffsetIntoAddress(Val, AM))
2065      return false;
2066    break;
2067  }
2068
2069  case X86ISD::Wrapper:
2070  case X86ISD::WrapperRIP:
2071    if (!matchWrapper(N, AM))
2072      return false;
2073    break;
2074
2075  case ISD::LOAD:
2076    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2077      return false;
2078    break;
2079
2080  case ISD::FrameIndex:
2081    if (AM.BaseType == X86ISelAddressMode::RegBase &&
2082        AM.Base_Reg.getNode() == nullptr &&
2083        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2084      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2085      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2086      return false;
2087    }
2088    break;
2089
2090  case ISD::SHL:
2091    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2092      break;
2093
2094    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2095      unsigned Val = CN->getZExtValue();
2096      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2097      // that the base operand remains free for further matching. If
2098      // the base doesn't end up getting used, a post-processing step
2099      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2100      if (Val == 1 || Val == 2 || Val == 3) {
2101        AM.Scale = 1 << Val;
2102        SDValue ShVal = N.getOperand(0);
2103
2104        // Okay, we know that we have a scale by now.  However, if the scaled
2105        // value is an add of something and a constant, we can fold the
2106        // constant into the disp field here.
2107        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
2108          AM.IndexReg = ShVal.getOperand(0);
2109          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
2110          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
2111          if (!foldOffsetIntoAddress(Disp, AM))
2112            return false;
2113        }
2114
2115        AM.IndexReg = ShVal;
2116        return false;
2117      }
2118    }
2119    break;
2120
2121  case ISD::SRL: {
2122    // Scale must not be used already.
2123    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2124
2125    // We only handle up to 64-bit values here as those are what matter for
2126    // addressing mode optimizations.
2127    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2128           "Unexpected value size!");
2129
2130    SDValue And = N.getOperand(0);
2131    if (And.getOpcode() != ISD::AND) break;
2132    SDValue X = And.getOperand(0);
2133
2134    // The mask used for the transform is expected to be post-shift, but we
2135    // found the shift first so just apply the shift to the mask before passing
2136    // it down.
2137    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2138        !isa<ConstantSDNode>(And.getOperand(1)))
2139      break;
2140    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2141
2142    // Try to fold the mask and shift into the scale, and return false if we
2143    // succeed.
2144    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2145      return false;
2146    break;
2147  }
2148
2149  case ISD::SMUL_LOHI:
2150  case ISD::UMUL_LOHI:
2151    // A mul_lohi where we need the low part can be folded as a plain multiply.
2152    if (N.getResNo() != 0) break;
2153    LLVM_FALLTHROUGH;
2154  case ISD::MUL:
2155  case X86ISD::MUL_IMM:
2156    // X*[3,5,9] -> X+X*[2,4,8]
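    // For example, x*5 can be selected with x as both base and index and
    // Scale == 4, which an LEA encodes as (x,x,4).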
2157    if (AM.BaseType == X86ISelAddressMode::RegBase &&
2158        AM.Base_Reg.getNode() == nullptr &&
2159        AM.IndexReg.getNode() == nullptr) {
2160      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2161        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2162            CN->getZExtValue() == 9) {
2163          AM.Scale = unsigned(CN->getZExtValue())-1;
2164
2165          SDValue MulVal = N.getOperand(0);
2166          SDValue Reg;
2167
2168          // Okay, we know that we have a scale by now.  However, if the scaled
2169          // value is an add of something and a constant, we can fold the
2170          // constant into the disp field here.
2171          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2172              isa<ConstantSDNode>(MulVal.getOperand(1))) {
2173            Reg = MulVal.getOperand(0);
2174            ConstantSDNode *AddVal =
2175              cast<ConstantSDNode>(MulVal.getOperand(1));
2176            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2177            if (foldOffsetIntoAddress(Disp, AM))
2178              Reg = N.getOperand(0);
2179          } else {
2180            Reg = N.getOperand(0);
2181          }
2182
2183          AM.IndexReg = AM.Base_Reg = Reg;
2184          return false;
2185        }
2186    }
2187    break;
2188
2189  case ISD::SUB: {
2190    // Given A-B, if A can be completely folded into the address, leaving the
2191    // index field unused, use -B as the index.
2192    // This is a win if A has multiple parts that can be folded into
2193    // the address. It also saves a mov if the base register has
2194    // other uses, since it avoids a two-address sub instruction; however,
2195    // it costs an additional mov if the index register has other uses.
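    // For example (illustrative), in "(GV + 8) - B" the symbol and constant
    // fold into the displacement, leaving the index field free to hold the
    // negated B when the cost heuristic below deems it profitable.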
2196
2197    // Add an artificial use to this node so that we can keep track of
2198    // it if it gets CSE'd with a different node.
2199    HandleSDNode Handle(N);
2200
2201    // Test if the LHS of the sub can be folded.
2202    X86ISelAddressMode Backup = AM;
2203    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2204      N = Handle.getValue();
2205      AM = Backup;
2206      break;
2207    }
2208    N = Handle.getValue();
2209    // Test if the index field is free for use.
2210    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2211      AM = Backup;
2212      break;
2213    }
2214
2215    int Cost = 0;
2216    SDValue RHS = N.getOperand(1);
2217    // If the RHS involves a register with multiple uses, this
2218    // transformation incurs an extra mov, due to the neg instruction
2219    // clobbering its operand.
2220    if (!RHS.getNode()->hasOneUse() ||
2221        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2222        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2223        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2224        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2225         RHS.getOperand(0).getValueType() == MVT::i32))
2226      ++Cost;
2227    // If the base is a register with multiple uses, this
2228    // transformation may save a mov.
2229    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2230         !AM.Base_Reg.getNode()->hasOneUse()) ||
2231        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2232      --Cost;
2233    // If the folded LHS was interesting, this transformation saves
2234    // address arithmetic.
2235    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2236        ((AM.Disp != 0) && (Backup.Disp == 0)) +
2237        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2238      --Cost;
2239    // If it doesn't look like it may be an overall win, don't do it.
2240    if (Cost >= 0) {
2241      AM = Backup;
2242      break;
2243    }
2244
2245    // Ok, the transformation is legal and appears profitable. Go for it.
2246    // Negation will be emitted later to avoid creating dangling nodes if this
2247    // was an unprofitable LEA.
2248    AM.IndexReg = RHS;
2249    AM.NegateIndex = true;
2250    AM.Scale = 1;
2251    return false;
2252  }
2253
2254  case ISD::ADD:
2255    if (!matchAdd(N, AM, Depth))
2256      return false;
2257    break;
2258
2259  case ISD::OR:
2260    // We want to look through a transform in InstCombine and DAGCombiner that
2261    // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
2262    // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
2263    // An 'lea' can then be used to match the shift (multiply) and add:
2264    // and $1, %esi
2265    // lea (%rsi, %rdi, 8), %rax
2266    if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
2267        !matchAdd(N, AM, Depth))
2268      return false;
2269    break;
2270
2271  case ISD::AND: {
2272    // Perform some heroic transforms on an and of a constant-count shift
2273    // with a constant to enable use of the scaled offset field.
2274
2275    // Scale must not be used already.
2276    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2277
2278    // We only handle up to 64-bit values here as those are what matter for
2279    // addressing mode optimizations.
2280    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2281           "Unexpected value size!");
2282
2283    if (!isa<ConstantSDNode>(N.getOperand(1)))
2284      break;
2285
2286    if (N.getOperand(0).getOpcode() == ISD::SRL) {
2287      SDValue Shift = N.getOperand(0);
2288      SDValue X = Shift.getOperand(0);
2289
2290      uint64_t Mask = N.getConstantOperandVal(1);
2291
2292      // Try to fold the mask and shift into an extract and scale.
2293      if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2294        return false;
2295
2296      // Try to fold the mask and shift directly into the scale.
2297      if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2298        return false;
2299
2300      // Try to fold the mask and shift into BEXTR and scale.
2301      if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2302        return false;
2303    }
2304
2305    // Try to swap the mask and shift to place shifts which can be done as
2306    // a scale on the outside of the mask.
2307    if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2308      return false;
2309
2310    break;
2311  }
2312  case ISD::ZERO_EXTEND: {
2313    // Try to widen a zexted shift left to the same size as its use, so we can
2314    // match the shift as a scale factor.
2315    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2316      break;
2317    if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
2318      break;
2319
2320    // Give up if the shift is not a valid scale factor [1,2,3].
2321    SDValue Shl = N.getOperand(0);
2322    auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
2323    if (!ShAmtC || ShAmtC->getZExtValue() > 3)
2324      break;
2325
2326    // The narrow shift must only shift out zero bits (it must be 'nuw').
2327    // That makes it safe to widen to the destination type.
2328    APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
2329                                            ShAmtC->getZExtValue());
2330    if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
2331      break;
2332
2333    // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
2334    MVT VT = N.getSimpleValueType();
2335    SDLoc DL(N);
2336    SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
2337    SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
2338
2339    // Convert the shift to scale factor.
2340    AM.Scale = 1 << ShAmtC->getZExtValue();
2341    AM.IndexReg = Zext;
2342
2343    insertDAGNode(*CurDAG, N, Zext);
2344    insertDAGNode(*CurDAG, N, NewShl);
2345    CurDAG->ReplaceAllUsesWith(N, NewShl);
2346    CurDAG->RemoveDeadNode(N.getNode());
2347    return false;
2348  }
2349  }
2350
2351  return matchAddressBase(N, AM);
2352}
2353
2354/// Helper for MatchAddress. Add the specified node to the
2355/// specified addressing mode without any further recursion.
2356bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2357  // Is the base register already occupied?
2358  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2359    // If so, check to see if the scale index register is set.
2360    if (!AM.IndexReg.getNode()) {
2361      AM.IndexReg = N;
2362      AM.Scale = 1;
2363      return false;
2364    }
2365
2366    // Otherwise, we cannot select it.
2367    return true;
2368  }
2369
2370  // Default, generate it as a register.
2371  AM.BaseType = X86ISelAddressMode::RegBase;
2372  AM.Base_Reg = N;
2373  return false;
2374}
2375
2376/// Helper for selectVectorAddr. Handles things that can be folded into a
2377/// gather scatter address. The index register and scale should have already
2378/// been handled.
2379bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2380  // TODO: Support other operations.
2381  switch (N.getOpcode()) {
2382  case ISD::Constant: {
2383    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2384    if (!foldOffsetIntoAddress(Val, AM))
2385      return false;
2386    break;
2387  }
2388  case X86ISD::Wrapper:
2389    if (!matchWrapper(N, AM))
2390      return false;
2391    break;
2392  }
2393
2394  return matchAddressBase(N, AM);
2395}
2396
2397bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2398                                       SDValue IndexOp, SDValue ScaleOp,
2399                                       SDValue &Base, SDValue &Scale,
2400                                       SDValue &Index, SDValue &Disp,
2401                                       SDValue &Segment) {
2402  X86ISelAddressMode AM;
2403  AM.IndexReg = IndexOp;
2404  AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
2405
2406  unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2407  if (AddrSpace == X86AS::GS)
2408    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2409  if (AddrSpace == X86AS::FS)
2410    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2411  if (AddrSpace == X86AS::SS)
2412    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2413
2414  SDLoc DL(BasePtr);
2415  MVT VT = BasePtr.getSimpleValueType();
2416
2417  // Try to match into the base and displacement fields.
2418  if (matchVectorAddress(BasePtr, AM))
2419    return false;
2420
2421  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2422  return true;
2423}
2424
2425/// Returns true if it is able to pattern match an addressing mode.
2426/// It returns, by reference, the operands which make up the maximal
2427/// addressing mode it can match.
2428///
2429/// Parent is the parent node of the addr operand that is being matched.  It
2430/// is always a load, store, atomic node, or null.  It is only null when
2431/// checking memory operands for inline asm nodes.
2432bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2433                                 SDValue &Scale, SDValue &Index,
2434                                 SDValue &Disp, SDValue &Segment) {
2435  X86ISelAddressMode AM;
2436
2437  if (Parent &&
2438      // These opcodes are all the nodes that have an "addr:$ptr" operand but
2439      // are not MemSDNodes, and thus don't have proper addrspace info.
2440      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2441      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2442      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2443      Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2444      Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2445      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2446      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2447    unsigned AddrSpace =
2448      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2449    if (AddrSpace == X86AS::GS)
2450      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2451    if (AddrSpace == X86AS::FS)
2452      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2453    if (AddrSpace == X86AS::SS)
2454      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2455  }
2456
2457  // Save the DL and VT before calling matchAddress, it can invalidate N.
2458  SDLoc DL(N);
2459  MVT VT = N.getSimpleValueType();
2460
2461  if (matchAddress(N, AM))
2462    return false;
2463
2464  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2465  return true;
2466}
2467
2468bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2469  // In static codegen with small code model, we can get the address of a label
2470  // into a register with 'movl'.
2471  if (N->getOpcode() != X86ISD::Wrapper)
2472    return false;
2473
2474  N = N.getOperand(0);
2475
2476  // At least GNU as does not accept 'movl' for TPOFF relocations.
2477  // FIXME: We could use 'movl' when we know we are targeting MC.
2478  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2479    return false;
2480
2481  Imm = N;
2482  if (N->getOpcode() != ISD::TargetGlobalAddress)
2483    return TM.getCodeModel() == CodeModel::Small;
2484
2485  Optional<ConstantRange> CR =
2486      cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2487  if (!CR)
2488    return TM.getCodeModel() == CodeModel::Small;
2489
2490  return CR->getUnsignedMax().ult(1ull << 32);
2491}
2492
2493bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2494                                         SDValue &Scale, SDValue &Index,
2495                                         SDValue &Disp, SDValue &Segment) {
2496  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2497  SDLoc DL(N);
2498
2499  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2500    return false;
2501
2502  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2503  if (RN && RN->getReg() == 0)
2504    Base = CurDAG->getRegister(0, MVT::i64);
2505  else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2506    // Base could already be %rip, particularly in the x32 ABI.
2507    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2508                                                     MVT::i64), 0);
2509    Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2510                                         Base);
2511  }
2512
2513  RN = dyn_cast<RegisterSDNode>(Index);
2514  if (RN && RN->getReg() == 0)
2515    Index = CurDAG->getRegister(0, MVT::i64);
2516  else {
2517    assert(Index.getValueType() == MVT::i32 &&
2518           "Expect to be extending 32-bit registers for use in LEA");
2519    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2520                                                     MVT::i64), 0);
2521    Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2522                                          Index);
2523  }
2524
2525  return true;
2526}
2527
2528/// Calls SelectAddr and determines if the maximal addressing
2529/// mode it matches can be cost-effectively emitted as an LEA instruction.
2530bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2531                                    SDValue &Base, SDValue &Scale,
2532                                    SDValue &Index, SDValue &Disp,
2533                                    SDValue &Segment) {
2534  X86ISelAddressMode AM;
2535
2536  // Save the DL and VT before calling matchAddress, it can invalidate N.
2537  SDLoc DL(N);
2538  MVT VT = N.getSimpleValueType();
2539
2540  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2541  // segments.
2542  SDValue Copy = AM.Segment;
2543  SDValue T = CurDAG->getRegister(0, MVT::i32);
2544  AM.Segment = T;
2545  if (matchAddress(N, AM))
2546    return false;
2547  assert(T == AM.Segment);
2548  AM.Segment = Copy;
2549
2550  unsigned Complexity = 0;
2551  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
2552    Complexity = 1;
2553  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2554    Complexity = 4;
2555
2556  if (AM.IndexReg.getNode())
2557    Complexity++;
2558
2559  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
2560  // a simple shift.
2561  if (AM.Scale > 1)
2562    Complexity++;
2563
2564  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2565  // to a LEA. This is determined with some experimentation but is by no means
2566  // optimal (especially for code size consideration). LEA is nice because of
2567  // its three-address nature. Tweak the cost function again when we can run
2568  // convertToThreeAddress() at register allocation time.
2569  if (AM.hasSymbolicDisplacement()) {
2570    // For X86-64, always use LEA to materialize RIP-relative addresses.
2571    if (Subtarget->is64Bit())
2572      Complexity = 4;
2573    else
2574      Complexity += 2;
2575  }
2576
2577  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
2578  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
2579  // duplicating flag-producing instructions later in the pipeline.
2580  if (N.getOpcode() == ISD::ADD) {
2581    auto isMathWithFlags = [](SDValue V) {
2582      switch (V.getOpcode()) {
2583      case X86ISD::ADD:
2584      case X86ISD::SUB:
2585      case X86ISD::ADC:
2586      case X86ISD::SBB:
2587      /* TODO: These opcodes can be added safely, but we may want to justify
2588               their inclusion for different reasons (better for reg-alloc).
2589      case X86ISD::SMUL:
2590      case X86ISD::UMUL:
2591      case X86ISD::OR:
2592      case X86ISD::XOR:
2593      case X86ISD::AND:
2594      */
2595        // Value 1 is the flag output of the node - verify it's not dead.
2596        return !SDValue(V.getNode(), 1).use_empty();
2597      default:
2598        return false;
2599      }
2600    };
2601    // TODO: This could be an 'or' rather than 'and' to make the transform more
2602    //       likely to happen. We might want to factor in whether there's a
2603    //       load folding opportunity for the math op that disappears with LEA.
2604    if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
2605      Complexity++;
2606  }
2607
2608  if (AM.Disp)
2609    Complexity++;
2610
2611  // If it isn't worth using an LEA, reject it.
2612  if (Complexity <= 2)
2613    return false;
2614
2615  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2616  return true;
2617}
2618
2619/// This is only run on TargetGlobalTLSAddress nodes.
2620bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2621                                        SDValue &Scale, SDValue &Index,
2622                                        SDValue &Disp, SDValue &Segment) {
2623  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2624  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2625
2626  X86ISelAddressMode AM;
2627  AM.GV = GA->getGlobal();
2628  AM.Disp += GA->getOffset();
2629  AM.SymbolFlags = GA->getTargetFlags();
2630
2631  MVT VT = N.getSimpleValueType();
2632  if (VT == MVT::i32) {
2633    AM.Scale = 1;
2634    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2635  }
2636
2637  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
2638  return true;
2639}
2640
2641bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2642  // Keep track of the original value type and whether this value was
2643  // truncated. If we see a truncation from pointer type to VT that truncates
2644  // bits that are known to be zero, we can use a narrow reference.
2645  EVT VT = N.getValueType();
2646  bool WasTruncated = false;
2647  if (N.getOpcode() == ISD::TRUNCATE) {
2648    WasTruncated = true;
2649    N = N.getOperand(0);
2650  }
2651
2652  if (N.getOpcode() != X86ISD::Wrapper)
2653    return false;
2654
2655  // We can only use non-GlobalValues as immediates if they were not truncated,
2656  // as we do not have any range information. If we have a GlobalValue and the
2657  // address was not truncated, we can select it as an operand directly.
2658  unsigned Opc = N.getOperand(0)->getOpcode();
2659  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
2660    Op = N.getOperand(0);
2661    // We can only select the operand directly if we didn't have to look past a
2662    // truncate.
2663    return !WasTruncated;
2664  }
2665
2666  // Check that the global's range fits into VT.
2667  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2668  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2669  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
2670    return false;
2671
2672  // Okay, we can use a narrow reference.
2673  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2674                                      GA->getOffset(), GA->getTargetFlags());
2675  return true;
2676}
2677
2678bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2679                                  SDValue &Base, SDValue &Scale,
2680                                  SDValue &Index, SDValue &Disp,
2681                                  SDValue &Segment) {
2682  assert(Root && P && "Unknown root/parent nodes");
2683  if (!ISD::isNON_EXTLoad(N.getNode()) ||
2684      !IsProfitableToFold(N, P, Root) ||
2685      !IsLegalToFold(N, P, Root, OptLevel))
2686    return false;
2687
2688  return selectAddr(N.getNode(),
2689                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
2690}
2691
2692bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
2693                                       SDValue &Base, SDValue &Scale,
2694                                       SDValue &Index, SDValue &Disp,
2695                                       SDValue &Segment) {
2696  assert(Root && P && "Unknown root/parent nodes");
2697  if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
2698      !IsProfitableToFold(N, P, Root) ||
2699      !IsLegalToFold(N, P, Root, OptLevel))
2700    return false;
2701
2702  return selectAddr(N.getNode(),
2703                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
2704}
2705
2706/// Return an SDNode that returns the value of the global base register.
2707/// Output instructions required to initialize the global base register,
2708/// if necessary.
2709SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2710  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2711  auto &DL = MF->getDataLayout();
2712  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2713}
2714
2715bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2716  if (N->getOpcode() == ISD::TRUNCATE)
2717    N = N->getOperand(0).getNode();
2718  if (N->getOpcode() != X86ISD::Wrapper)
2719    return false;
2720
2721  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2722  if (!GA)
2723    return false;
2724
2725  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2726  return CR && CR->getSignedMin().sge(-1ull << Width) &&
2727         CR->getSignedMax().slt(1ull << Width);
2728}
2729
2730static X86::CondCode getCondFromNode(SDNode *N) {
2731  assert(N->isMachineOpcode() && "Unexpected node");
2732  X86::CondCode CC = X86::COND_INVALID;
2733  unsigned Opc = N->getMachineOpcode();
2734  if (Opc == X86::JCC_1)
2735    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
2736  else if (Opc == X86::SETCCr)
2737    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
2738  else if (Opc == X86::SETCCm)
2739    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
2740  else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
2741           Opc == X86::CMOV64rr)
2742    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
2743  else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
2744           Opc == X86::CMOV64rm)
2745    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
2746
2747  return CC;
2748}
2749
/// Return true if every use of the given X86ISD::CMP node's flag result is a
/// comparison that only tests the zero flag (ZF).
2752bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2753  // Examine each user of the node.
2754  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2755         UI != UE; ++UI) {
2756    // Only check things that use the flags.
2757    if (UI.getUse().getResNo() != Flags.getResNo())
2758      continue;
2759    // Only examine CopyToReg uses that copy to EFLAGS.
2760    if (UI->getOpcode() != ISD::CopyToReg ||
2761        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2762      return false;
2763    // Examine each user of the CopyToReg use.
2764    for (SDNode::use_iterator FlagUI = UI->use_begin(),
2765           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2766      // Only examine the Flag result.
2767      if (FlagUI.getUse().getResNo() != 1) continue;
2768      // Anything unusual: assume conservatively.
2769      if (!FlagUI->isMachineOpcode()) return false;
2770      // Examine the condition code of the user.
2771      X86::CondCode CC = getCondFromNode(*FlagUI);
2772
2773      switch (CC) {
2774      // Comparisons which only use the zero flag.
2775      case X86::COND_E: case X86::COND_NE:
2776        continue;
2777      // Anything else: assume conservatively.
2778      default:
2779        return false;
2780      }
2781    }
2782  }
2783  return true;
2784}
2785
/// Return true if the given X86ISD::CMP node has no uses which require the
/// sign flag (SF) to be accurate.
2788bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2789  // Examine each user of the node.
2790  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2791         UI != UE; ++UI) {
2792    // Only check things that use the flags.
2793    if (UI.getUse().getResNo() != Flags.getResNo())
2794      continue;
2795    // Only examine CopyToReg uses that copy to EFLAGS.
2796    if (UI->getOpcode() != ISD::CopyToReg ||
2797        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2798      return false;
2799    // Examine each user of the CopyToReg use.
2800    for (SDNode::use_iterator FlagUI = UI->use_begin(),
2801           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2802      // Only examine the Flag result.
2803      if (FlagUI.getUse().getResNo() != 1) continue;
2804      // Anything unusual: assume conservatively.
2805      if (!FlagUI->isMachineOpcode()) return false;
2806      // Examine the condition code of the user.
2807      X86::CondCode CC = getCondFromNode(*FlagUI);
2808
2809      switch (CC) {
2810      // Comparisons which don't examine the SF flag.
2811      case X86::COND_A: case X86::COND_AE:
2812      case X86::COND_B: case X86::COND_BE:
2813      case X86::COND_E: case X86::COND_NE:
2814      case X86::COND_O: case X86::COND_NO:
2815      case X86::COND_P: case X86::COND_NP:
2816        continue;
2817      // Anything else: assume conservatively.
2818      default:
2819        return false;
2820      }
2821    }
2822  }
2823  return true;
2824}
2825
2826static bool mayUseCarryFlag(X86::CondCode CC) {
2827  switch (CC) {
2828  // Comparisons which don't examine the CF flag.
2829  case X86::COND_O: case X86::COND_NO:
2830  case X86::COND_E: case X86::COND_NE:
2831  case X86::COND_S: case X86::COND_NS:
2832  case X86::COND_P: case X86::COND_NP:
2833  case X86::COND_L: case X86::COND_GE:
2834  case X86::COND_G: case X86::COND_LE:
2835    return false;
2836  // Anything else: assume conservatively.
2837  default:
2838    return true;
2839  }
2840}
2841
/// Return true if the given flag-setting node has no uses which require the
/// carry flag (CF) to be accurate.
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2845  // Examine each user of the node.
2846  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2847         UI != UE; ++UI) {
2848    // Only check things that use the flags.
2849    if (UI.getUse().getResNo() != Flags.getResNo())
2850      continue;
2851
2852    unsigned UIOpc = UI->getOpcode();
2853
2854    if (UIOpc == ISD::CopyToReg) {
2855      // Only examine CopyToReg uses that copy to EFLAGS.
2856      if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2857        return false;
2858      // Examine each user of the CopyToReg use.
2859      for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2860           FlagUI != FlagUE; ++FlagUI) {
2861        // Only examine the Flag result.
2862        if (FlagUI.getUse().getResNo() != 1)
2863          continue;
2864        // Anything unusual: assume conservatively.
2865        if (!FlagUI->isMachineOpcode())
2866          return false;
2867        // Examine the condition code of the user.
2868        X86::CondCode CC = getCondFromNode(*FlagUI);
2869
2870        if (mayUseCarryFlag(CC))
2871          return false;
2872      }
2873
2874      // This CopyToReg is ok. Move on to the next user.
2875      continue;
2876    }
2877
2878    // This might be an unselected node. So look for the pre-isel opcodes that
2879    // use flags.
2880    unsigned CCOpNo;
2881    switch (UIOpc) {
2882    default:
2883      // Something unusual. Be conservative.
2884      return false;
2885    case X86ISD::SETCC:       CCOpNo = 0; break;
2886    case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
2887    case X86ISD::CMOV:        CCOpNo = 2; break;
2888    case X86ISD::BRCOND:      CCOpNo = 2; break;
2889    }
2890
2891    X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2892    if (mayUseCarryFlag(CC))
2893      return false;
2894  }
2895  return true;
2896}
2897
/// Check whether or not the chain ending in StoreNode is suitable for the
/// {load; op; store} read-modify-write transformation.
2900static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2901                                        SDValue StoredVal, SelectionDAG *CurDAG,
2902                                        unsigned LoadOpNo,
2903                                        LoadSDNode *&LoadNode,
2904                                        SDValue &InputChain) {
2905  // Is the stored value result 0 of the operation?
2906  if (StoredVal.getResNo() != 0) return false;
2907
  // Are there any uses of the operation other than the store?
2909  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
2910
  // Is the store non-extending, non-indexed, and not marked non-temporal?
2912  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2913    return false;
2914
2915  SDValue Load = StoredVal->getOperand(LoadOpNo);
2916  // Is the stored value a non-extending and non-indexed load?
2917  if (!ISD::isNormalLoad(Load.getNode())) return false;
2918
2919  // Return LoadNode by reference.
2920  LoadNode = cast<LoadSDNode>(Load);
2921
2922  // Is store the only read of the loaded value?
2923  if (!Load.hasOneUse())
2924    return false;
2925
2926  // Is the address of the store the same as the load?
2927  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2928      LoadNode->getOffset() != StoreNode->getOffset())
2929    return false;
2930
2931  bool FoundLoad = false;
2932  SmallVector<SDValue, 4> ChainOps;
2933  SmallVector<const SDNode *, 4> LoopWorklist;
2934  SmallPtrSet<const SDNode *, 16> Visited;
2935  const unsigned int Max = 1024;
2936
2937  //  Visualization of Load-Op-Store fusion:
2938  // -------------------------
2939  // Legend:
2940  //    *-lines = Chain operand dependencies.
2941  //    |-lines = Normal operand dependencies.
2942  //    Dependencies flow down and right. n-suffix references multiple nodes.
2943  //
2944  //        C                        Xn  C
2945  //        *                         *  *
2946  //        *                          * *
2947  //  Xn  A-LD    Yn                    TF         Yn
2948  //   *    * \   |                       *        |
2949  //    *   *  \  |                        *       |
2950  //     *  *   \ |             =>       A--LD_OP_ST
2951  //      * *    \|                                 \
2952  //       TF    OP                                  \
2953  //         *   | \                                  Zn
2954  //          *  |  \
2955  //         A-ST    Zn
2956  //
2957
  // This merge induces dependencies from: #1: Xn -> LD, OP, Zn
2959  //                                      #2: Yn -> LD
2960  //                                      #3: ST -> Zn
2961
2962  // Ensure the transform is safe by checking for the dual
2963  // dependencies to make sure we do not induce a loop.
2964
2965  // As LD is a predecessor to both OP and ST we can do this by checking:
2966  //  a). if LD is a predecessor to a member of Xn or Yn.
2967  //  b). if a Zn is a predecessor to ST.
2968
2969  // However, (b) can only occur through being a chain predecessor to
2970  // ST, which is the same as Zn being a member or predecessor of Xn,
2971  // which is a subset of LD being a predecessor of Xn. So it's
2972  // subsumed by check (a).
2973
2974  SDValue Chain = StoreNode->getChain();
2975
2976  // Gather X elements in ChainOps.
2977  if (Chain == Load.getValue(1)) {
2978    FoundLoad = true;
2979    ChainOps.push_back(Load.getOperand(0));
2980  } else if (Chain.getOpcode() == ISD::TokenFactor) {
2981    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
2982      SDValue Op = Chain.getOperand(i);
2983      if (Op == Load.getValue(1)) {
2984        FoundLoad = true;
2985        // Drop Load, but keep its chain. No cycle check necessary.
2986        ChainOps.push_back(Load.getOperand(0));
2987        continue;
2988      }
2989      LoopWorklist.push_back(Op.getNode());
2990      ChainOps.push_back(Op);
2991    }
2992  }
2993
2994  if (!FoundLoad)
2995    return false;
2996
2997  // Worklist is currently Xn. Add Yn to worklist.
2998  for (SDValue Op : StoredVal->ops())
2999    if (Op.getNode() != LoadNode)
3000      LoopWorklist.push_back(Op.getNode());
3001
3002  // Check (a) if Load is a predecessor to Xn + Yn
3003  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3004                                   true))
3005    return false;
3006
3007  InputChain =
3008      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3009  return true;
3010}
3011
3012// Change a chain of {load; op; store} of the same value into a simple op
3013// through memory of that value, if the uses of the modified value and its
3014// address are suitable.
3015//
// The tablegen memory-operand pattern is currently not able to match the case
// where the EFLAGS of the original operation are used.
3018//
3019// To move this to tablegen, we'll need to improve tablegen to allow flags to
3020// be transferred from a node in the pattern to the result node, probably with
3021// a new keyword. For example, we have this
3022// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3023//  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3024//   (implicit EFLAGS)]>;
3025// but maybe need something like this
3026// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3027//  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3028//   (transferrable EFLAGS)]>;
3029//
3030// Until then, we manually fold these and instruction select the operation
3031// here.
3032bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3033  StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
3034  SDValue StoredVal = StoreNode->getOperand(1);
3035  unsigned Opc = StoredVal->getOpcode();
3036
  // Before we try to select anything, make sure this is a memory operand size
  // and opcode we can handle. Note that this must match the code below that
3039  // actually lowers the opcodes.
3040  EVT MemVT = StoreNode->getMemoryVT();
3041  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3042      MemVT != MVT::i8)
3043    return false;
3044
3045  bool IsCommutable = false;
3046  bool IsNegate = false;
3047  switch (Opc) {
3048  default:
3049    return false;
3050  case X86ISD::SUB:
3051    IsNegate = isNullConstant(StoredVal.getOperand(0));
3052    break;
3053  case X86ISD::SBB:
3054    break;
3055  case X86ISD::ADD:
3056  case X86ISD::ADC:
3057  case X86ISD::AND:
3058  case X86ISD::OR:
3059  case X86ISD::XOR:
3060    IsCommutable = true;
3061    break;
3062  }
3063
3064  unsigned LoadOpNo = IsNegate ? 1 : 0;
3065  LoadSDNode *LoadNode = nullptr;
3066  SDValue InputChain;
3067  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3068                                   LoadNode, InputChain)) {
3069    if (!IsCommutable)
3070      return false;
3071
3072    // This operation is commutable, try the other operand.
3073    LoadOpNo = 1;
3074    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3075                                     LoadNode, InputChain))
3076      return false;
3077  }
3078
3079  SDValue Base, Scale, Index, Disp, Segment;
3080  if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3081                  Segment))
3082    return false;
3083
3084  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3085                          unsigned Opc8) {
3086    switch (MemVT.getSimpleVT().SimpleTy) {
3087    case MVT::i64:
3088      return Opc64;
3089    case MVT::i32:
3090      return Opc32;
3091    case MVT::i16:
3092      return Opc16;
3093    case MVT::i8:
3094      return Opc8;
3095    default:
3096      llvm_unreachable("Invalid size!");
3097    }
3098  };
3099
3100  MachineSDNode *Result;
3101  switch (Opc) {
3102  case X86ISD::SUB:
3103    // Handle negate.
3104    if (IsNegate) {
3105      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3106                                     X86::NEG8m);
3107      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3108      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3109                                      MVT::Other, Ops);
3110      break;
3111    }
    LLVM_FALLTHROUGH;
3113  case X86ISD::ADD:
3114    // Try to match inc/dec.
3115    if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3116      bool IsOne = isOneConstant(StoredVal.getOperand(1));
3117      bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
      // An ADD/SUB of 1/-1 whose carry flag is unused can use INC/DEC.
3119      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
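        // add +1 and sub -1 become INC; add -1 and sub +1 become DEC.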
3120        unsigned NewOpc =
3121          ((Opc == X86ISD::ADD) == IsOne)
3122              ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3123              : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3124        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3125        Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3126                                        MVT::Other, Ops);
3127        break;
3128      }
3129    }
3130    LLVM_FALLTHROUGH;
3131  case X86ISD::ADC:
3132  case X86ISD::SBB:
3133  case X86ISD::AND:
3134  case X86ISD::OR:
3135  case X86ISD::XOR: {
3136    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3137      switch (Opc) {
3138      case X86ISD::ADD:
3139        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3140                            X86::ADD8mr);
3141      case X86ISD::ADC:
3142        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3143                            X86::ADC8mr);
3144      case X86ISD::SUB:
3145        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3146                            X86::SUB8mr);
3147      case X86ISD::SBB:
3148        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3149                            X86::SBB8mr);
3150      case X86ISD::AND:
3151        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3152                            X86::AND8mr);
3153      case X86ISD::OR:
3154        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3155      case X86ISD::XOR:
3156        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3157                            X86::XOR8mr);
3158      default:
3159        llvm_unreachable("Invalid opcode!");
3160      }
3161    };
3162    auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
3163      switch (Opc) {
3164      case X86ISD::ADD:
3165        return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
3166      case X86ISD::ADC:
3167        return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
3168      case X86ISD::SUB:
3169        return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
3170      case X86ISD::SBB:
3171        return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
3172      case X86ISD::AND:
3173        return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
3174      case X86ISD::OR:
3175        return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
3176      case X86ISD::XOR:
3177        return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
3178      default:
3179        llvm_unreachable("Invalid opcode!");
3180      }
3181    };
3182    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3183      switch (Opc) {
3184      case X86ISD::ADD:
3185        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3186                            X86::ADD8mi);
3187      case X86ISD::ADC:
3188        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3189                            X86::ADC8mi);
3190      case X86ISD::SUB:
3191        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3192                            X86::SUB8mi);
3193      case X86ISD::SBB:
3194        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3195                            X86::SBB8mi);
3196      case X86ISD::AND:
3197        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3198                            X86::AND8mi);
3199      case X86ISD::OR:
3200        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3201                            X86::OR8mi);
3202      case X86ISD::XOR:
3203        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3204                            X86::XOR8mi);
3205      default:
3206        llvm_unreachable("Invalid opcode!");
3207      }
3208    };
3209
3210    unsigned NewOpc = SelectRegOpcode(Opc);
    SDValue Operand = StoredVal->getOperand(1 - LoadOpNo);
3212
3213    // See if the operand is a constant that we can fold into an immediate
3214    // operand.
3215    if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3216      int64_t OperandV = OperandC->getSExtValue();
3217
3218      // Check if we can shrink the operand enough to fit in an immediate (or
3219      // fit into a smaller immediate) by negating it and switching the
3220      // operation.
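      // For example, with a 32-bit memory operand an add of 128 needs a
      // 4-byte immediate, but the equivalent subtract of -128 fits in a
      // sign-extended 8-bit immediate.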
3221      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3222          ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3223           (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3224            isInt<32>(-OperandV))) &&
3225          hasNoCarryFlagUses(StoredVal.getValue(1))) {
3226        OperandV = -OperandV;
3227        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3228      }
3229
3230      // First try to fit this into an Imm8 operand. If it doesn't fit, then try
3231      // the larger immediate operand.
3232      if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
3233        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3234        NewOpc = SelectImm8Opcode(Opc);
3235      } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3236        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3237        NewOpc = SelectImmOpcode(Opc);
3238      }
3239    }
3240
3241    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3242      SDValue CopyTo =
3243          CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3244                               StoredVal.getOperand(2), SDValue());
3245
3246      const SDValue Ops[] = {Base,    Scale,   Index,  Disp,
3247                             Segment, Operand, CopyTo, CopyTo.getValue(1)};
3248      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3249                                      Ops);
3250    } else {
3251      const SDValue Ops[] = {Base,    Scale,   Index,     Disp,
3252                             Segment, Operand, InputChain};
3253      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3254                                      Ops);
3255    }
3256    break;
3257  }
3258  default:
3259    llvm_unreachable("Invalid opcode!");
3260  }
3261
3262  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3263                                 LoadNode->getMemOperand()};
3264  CurDAG->setNodeMemRefs(Result, MemOps);
3265
3266  // Update Load Chain uses as well.
3267  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3268  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3269  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3270  CurDAG->RemoveDeadNode(Node);
3271  return true;
3272}
3273
3274// See if this is an  X & Mask  that we can match to BEXTR/BZHI.
3275// Where Mask is one of the following patterns:
3276//   a) x &  (1 << nbits) - 1
3277//   b) x & ~(-1 << nbits)
3278//   c) x &  (-1 >> (32 - y))
3279//   d) x << (32 - y) >> (32 - y)
3280bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3281  assert(
3282      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
3283      "Should be either an and-mask, or right-shift after clearing high bits.");
3284
  // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least
  // one of them.
3286  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3287    return false;
3288
3289  MVT NVT = Node->getSimpleValueType(0);
3290
3291  // Only supported for 32 and 64 bits.
3292  if (NVT != MVT::i32 && NVT != MVT::i64)
3293    return false;
3294
3295  SDValue NBits;
3296
  // If we have BMI2's BZHI, we are ok with multi-use patterns.
3298  // Else, if we only have BMI1's BEXTR, we require one-use.
3299  const bool CanHaveExtraUses = Subtarget->hasBMI2();
3300  auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
3301    return CanHaveExtraUses ||
3302           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3303  };
3304  auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
3305  auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
3306
3307  auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3308    if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3309      assert(V.getSimpleValueType() == MVT::i32 &&
3310             V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3311             "Expected i64 -> i32 truncation");
3312      V = V.getOperand(0);
3313    }
3314    return V;
3315  };
3316
3317  // a) x & ((1 << nbits) + (-1))
3318  auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
3319                        &NBits](SDValue Mask) -> bool {
3320    // Match `add`. Must only have one use!
3321    if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3322      return false;
    // We should be adding an all-ones constant (i.e. subtracting one).
3324    if (!isAllOnesConstant(Mask->getOperand(1)))
3325      return false;
3326    // Match `1 << nbits`. Might be truncated. Must only have one use!
3327    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3328    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3329      return false;
3330    if (!isOneConstant(M0->getOperand(0)))
3331      return false;
3332    NBits = M0->getOperand(1);
3333    return true;
3334  };
3335
3336  auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3337    V = peekThroughOneUseTruncation(V);
3338    return CurDAG->MaskedValueIsAllOnes(
3339        V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3340                                NVT.getSizeInBits()));
3341  };
3342
3343  // b) x & ~(-1 << nbits)
3344  auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3345                        &NBits](SDValue Mask) -> bool {
3346    // Match `~()`. Must only have one use!
3347    if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3348      return false;
3349    // The -1 only has to be all-ones for the final Node's NVT.
3350    if (!isAllOnes(Mask->getOperand(1)))
3351      return false;
3352    // Match `-1 << nbits`. Might be truncated. Must only have one use!
3353    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3354    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3355      return false;
3356    // The -1 only has to be all-ones for the final Node's NVT.
3357    if (!isAllOnes(M0->getOperand(0)))
3358      return false;
3359    NBits = M0->getOperand(1);
3360    return true;
3361  };
3362
3363  // Match potentially-truncated (bitwidth - y)
3364  auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
3365                                             unsigned Bitwidth) {
3366    // Skip over a truncate of the shift amount.
3367    if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
3368      ShiftAmt = ShiftAmt.getOperand(0);
3369      // The trunc should have been the only user of the real shift amount.
3370      if (!checkOneUse(ShiftAmt))
3371        return false;
3372    }
3373    // Match the shift amount as: (bitwidth - y). It should go away, too.
3374    if (ShiftAmt.getOpcode() != ISD::SUB)
3375      return false;
3376    auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
3377    if (!V0 || V0->getZExtValue() != Bitwidth)
3378      return false;
3379    NBits = ShiftAmt.getOperand(1);
3380    return true;
3381  };
3382
3383  // c) x &  (-1 >> (32 - y))
3384  auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
3385                        matchShiftAmt](SDValue Mask) -> bool {
3386    // The mask itself may be truncated.
3387    Mask = peekThroughOneUseTruncation(Mask);
3388    unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3389    // Match `l>>`. Must only have one use!
3390    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3391      return false;
3392    // We should be shifting truly all-ones constant.
3393    if (!isAllOnesConstant(Mask.getOperand(0)))
3394      return false;
3395    SDValue M1 = Mask.getOperand(1);
3396    // The shift amount should not be used externally.
3397    if (!checkOneUse(M1))
3398      return false;
3399    return matchShiftAmt(M1, Bitwidth);
3400  };
3401
3402  SDValue X;
3403
3404  // d) x << (32 - y) >> (32 - y)
3405  auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
3406                        &X](SDNode *Node) -> bool {
3407    if (Node->getOpcode() != ISD::SRL)
3408      return false;
3409    SDValue N0 = Node->getOperand(0);
3410    if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
3411      return false;
3412    unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3413    SDValue N1 = Node->getOperand(1);
3414    SDValue N01 = N0->getOperand(1);
3415    // Both of the shifts must be by the exact same value.
3416    // There should not be any uses of the shift amount outside of the pattern.
3417    if (N1 != N01 || !checkTwoUse(N1))
3418      return false;
3419    if (!matchShiftAmt(N1, Bitwidth))
3420      return false;
3421    X = N0->getOperand(0);
3422    return true;
3423  };
3424
3425  auto matchLowBitMask = [matchPatternA, matchPatternB,
3426                          matchPatternC](SDValue Mask) -> bool {
3427    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3428  };
3429
3430  if (Node->getOpcode() == ISD::AND) {
3431    X = Node->getOperand(0);
3432    SDValue Mask = Node->getOperand(1);
3433
3434    if (matchLowBitMask(Mask)) {
3435      // Great.
3436    } else {
3437      std::swap(X, Mask);
3438      if (!matchLowBitMask(Mask))
3439        return false;
3440    }
3441  } else if (!matchPatternD(Node))
3442    return false;
3443
3444  SDLoc DL(Node);
3445
3446  // Truncate the shift amount.
3447  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3448  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3449
3450  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3451  // All the other bits are undefined, we do not care about them.
3452  SDValue ImplDef = SDValue(
3453      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3454  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3455
3456  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3457  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3458  NBits = SDValue(
3459      CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
3460                             NBits, SRIdxVal), 0);
3461  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3462
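  // With BMI2, BZHI does exactly what we want: it zeroes all bits of X at
  // positions greater than or equal to the bit count held in the low 8 bits
  // of its second operand.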
3463  if (Subtarget->hasBMI2()) {
    // Great, just emit the BZHI.
3465    if (NVT != MVT::i32) {
3466      // But have to place the bit count into the wide-enough register first.
3467      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3468      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3469    }
3470
3471    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3472    ReplaceNode(Node, Extract.getNode());
3473    SelectCode(Extract.getNode());
3474    return true;
3475  }
3476
  // Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
  // shifted (potentially with a one-use trunc in between), and if the
  // truncation was the only use of the shift, look past that one-use
  // truncation.
3481  {
3482    SDValue RealX = peekThroughOneUseTruncation(X);
3483    // FIXME: only if the shift is one-use?
3484    if (RealX != X && RealX.getOpcode() == ISD::SRL)
3485      X = RealX;
3486  }
3487
3488  MVT XVT = X.getSimpleValueType();
3489
3490  // Else, emitting BEXTR requires one more step.
3491  // The 'control' of BEXTR has the pattern of:
3492  // [15...8 bit][ 7...0 bit] location
3493  // [ bit count][     shift] name
  // E.g. 0b00000010'00000001 means (x >> 0b1) & 0b11.
3495
3496  // Shift NBits left by 8 bits, thus producing 'control'.
  // This leaves the low 8 bits zero.
3498  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3499  insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
3500  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3501  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3502
3503  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3504  // FIXME: only if the shift is one-use?
3505  if (X.getOpcode() == ISD::SRL) {
3506    SDValue ShiftAmt = X.getOperand(1);
3507    X = X.getOperand(0);
3508
3509    assert(ShiftAmt.getValueType() == MVT::i8 &&
3510           "Expected shift amount to be i8");
3511
3512    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3513    // We could zext to i16 in some form, but we intentionally don't do that.
3514    SDValue OrigShiftAmt = ShiftAmt;
3515    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3516    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3517
3518    // And now 'or' these low 8 bits of shift amount into the 'control'.
3519    Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3520    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3521  }
3522
3523  // But have to place the 'control' into the wide-enough register first.
3524  if (XVT != MVT::i32) {
3525    Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
3526    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3527  }
3528
3529  // And finally, form the BEXTR itself.
3530  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3531
3532  // The 'X' was originally truncated. Do that now.
3533  if (XVT != NVT) {
3534    insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
3535    Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3536  }
3537
3538  ReplaceNode(Node, Extract.getNode());
3539  SelectCode(Extract.getNode());
3540
3541  return true;
3542}
3543
3544// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
3545MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3546  MVT NVT = Node->getSimpleValueType(0);
3547  SDLoc dl(Node);
3548
3549  SDValue N0 = Node->getOperand(0);
3550  SDValue N1 = Node->getOperand(1);
3551
3552  // If we have TBM we can use an immediate for the control. If we have BMI
3553  // we should only do this if the BEXTR instruction is implemented well.
3554  // Otherwise moving the control into a register makes this more costly.
3555  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3556  // hoisting the move immediate would make it worthwhile with a less optimal
3557  // BEXTR?
3558  bool PreferBEXTR =
3559      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
3560  if (!PreferBEXTR && !Subtarget->hasBMI2())
3561    return nullptr;
3562
3563  // Must have a shift right.
3564  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
3565    return nullptr;
3566
3567  // Shift can't have additional users.
3568  if (!N0->hasOneUse())
3569    return nullptr;
3570
3571  // Only supported for 32 and 64 bits.
3572  if (NVT != MVT::i32 && NVT != MVT::i64)
3573    return nullptr;
3574
3575  // Shift amount and RHS of and must be constant.
3576  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3577  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3578  if (!MaskCst || !ShiftCst)
3579    return nullptr;
3580
  // The AND's RHS must be a mask.
3582  uint64_t Mask = MaskCst->getZExtValue();
3583  if (!isMask_64(Mask))
3584    return nullptr;
3585
3586  uint64_t Shift = ShiftCst->getZExtValue();
3587  uint64_t MaskSize = countPopulation(Mask);
3588
3589  // Don't interfere with something that can be handled by extracting AH.
3590  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3591  if (Shift == 8 && MaskSize == 8)
3592    return nullptr;
3593
3594  // Make sure we are only using bits that were in the original value, not
3595  // shifted in.
3596  if (Shift + MaskSize > NVT.getSizeInBits())
3597    return nullptr;
3598
3599  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
3600  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
3601  // does not fit into 32 bits. Load folding is not a sufficient reason.
3602  if (!PreferBEXTR && MaskSize <= 32)
3603    return nullptr;
3604
3605  SDValue Control;
3606  unsigned ROpc, MOpc;
3607
3608  if (!PreferBEXTR) {
3609    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
3610    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
3611    // Let's perform the mask first, and apply shift later. Note that we need to
3612    // widen the mask to account for the fact that we'll apply shift afterwards!
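    // For example, (x >> 4) & 0xFFFFFFFFF (a 36-bit mask) becomes a BZHI with
    // a bit count of 40 followed by a right shift by 4.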
3613    Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
3614    ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
3615    MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
3616    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3617    Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3618  } else {
3619    // The 'control' of BEXTR has the pattern of:
3620    // [15...8 bit][ 7...0 bit] location
3621    // [ bit count][     shift] name
    // E.g. 0b00000010'00000001 means (x >> 0b1) & 0b11.
3623    Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3624    if (Subtarget->hasTBM()) {
3625      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3626      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3627    } else {
3628      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to be placed in a register.
3630      ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3631      MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3632      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3633      Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3634    }
3635  }
3636
3637  MachineSDNode *NewNode;
3638  SDValue Input = N0->getOperand(0);
3639  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3640  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3641    SDValue Ops[] = {
3642        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
3643    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3644    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3645    // Update the chain.
3646    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3647    // Record the mem-refs
3648    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3649  } else {
3650    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
3651  }
3652
3653  if (!PreferBEXTR) {
3654    // We still need to apply the shift.
3655    SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
3656    unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
3657    NewNode =
3658        CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
3659  }
3660
3661  return NewNode;
3662}
3663
// Emit a PCMPISTR(I/M) instruction.
3665MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3666                                             bool MayFoldLoad, const SDLoc &dl,
3667                                             MVT VT, SDNode *Node) {
3668  SDValue N0 = Node->getOperand(0);
3669  SDValue N1 = Node->getOperand(1);
3670  SDValue Imm = Node->getOperand(2);
3671  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3672  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3673
3674  // Try to fold a load. No need to check alignment.
3675  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3676  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3677    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3678                      N1.getOperand(0) };
3679    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3680    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3681    // Update the chain.
3682    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3683    // Record the mem-refs
3684    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3685    return CNode;
3686  }
3687
3688  SDValue Ops[] = { N0, N1, Imm };
3689  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3690  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3691  return CNode;
3692}
3693
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we
// need to emit a second instruction after this one. This is needed since we
// have two CopyToReg nodes glued before this and we need to continue that
// glue through.
3697MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3698                                             bool MayFoldLoad, const SDLoc &dl,
3699                                             MVT VT, SDNode *Node,
3700                                             SDValue &InFlag) {
3701  SDValue N0 = Node->getOperand(0);
3702  SDValue N2 = Node->getOperand(2);
3703  SDValue Imm = Node->getOperand(4);
3704  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3705  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3706
3707  // Try to fold a load. No need to check alignment.
3708  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3709  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3710    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3711                      N2.getOperand(0), InFlag };
3712    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3713    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3714    InFlag = SDValue(CNode, 3);
3715    // Update the chain.
3716    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3717    // Record the mem-refs
3718    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3719    return CNode;
3720  }
3721
3722  SDValue Ops[] = { N0, N2, Imm, InFlag };
3723  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3724  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3725  InFlag = SDValue(CNode, 2);
3726  return CNode;
3727}
3728
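// Try to simplify the amount of a scalar shift. Hardware shifts use only the
// low bits of the amount (modulo 32 or 64), so an ADD/SUB of a multiple of
// the bit width inside the amount can be dropped, and an amount of the form
// (C - x), where C is such a multiple, can be selected as a NEG of x.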
3729bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3730  EVT VT = N->getValueType(0);
3731
3732  // Only handle scalar shifts.
3733  if (VT.isVector())
3734    return false;
3735
3736  // Narrower shifts only mask to 5 bits in hardware.
3737  unsigned Size = VT == MVT::i64 ? 64 : 32;
3738
3739  SDValue OrigShiftAmt = N->getOperand(1);
3740  SDValue ShiftAmt = OrigShiftAmt;
3741  SDLoc DL(N);
3742
3743  // Skip over a truncate of the shift amount.
3744  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3745    ShiftAmt = ShiftAmt->getOperand(0);
3746
  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up a BZHI/BEXTR pattern.
3749
3750  SDValue NewShiftAmt;
3751  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3752    SDValue Add0 = ShiftAmt->getOperand(0);
3753    SDValue Add1 = ShiftAmt->getOperand(1);
3754    // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3755    // to avoid the ADD/SUB.
3756    if (isa<ConstantSDNode>(Add1) &&
3757        cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
3758      NewShiftAmt = Add0;
3759    // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3760    // generate a NEG instead of a SUB of a constant.
3761    } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3762               isa<ConstantSDNode>(Add0) &&
3763               cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
3764               cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
3765      // Insert a negate op.
3766      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
3767      // that uses it that's not a shift.
3768      EVT SubVT = ShiftAmt.getValueType();
3769      SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3770      SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3771      NewShiftAmt = Neg;
3772
3773      // Insert these operands into a valid topological order so they can
3774      // get selected independently.
3775      insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3776      insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3777    } else
3778      return false;
3779  } else
3780    return false;
3781
3782  if (NewShiftAmt.getValueType() != MVT::i8) {
3783    // Need to truncate the shift amount.
3784    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3785    // Add to a correct topological ordering.
3786    insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3787  }
3788
3789  // Insert a new mask to keep the shift amount legal. This should be removed
3790  // by isel patterns.
3791  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3792                                CurDAG->getConstant(Size - 1, DL, MVT::i8));
3793  // Place in a correct topological ordering.
3794  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3795
3796  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
3797                                                   NewShiftAmt);
3798  if (UpdatedNode != N) {
3799    // If we found an existing node, we should replace ourselves with that node
3800    // and wait for it to be selected after its other users.
3801    ReplaceNode(N, UpdatedNode);
3802    return true;
3803  }
3804
3805  // If the original shift amount is now dead, delete it so that we don't run
3806  // it through isel.
3807  if (OrigShiftAmt.getNode()->use_empty())
3808    CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3809
3810  // Now that we've optimized the shift amount, defer to normal isel to get
3811  // load folding and legacy vs BMI2 selection without repeating it here.
3812  SelectCode(N);
3813  return true;
3814}
3815
3816bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
3817  MVT NVT = N->getSimpleValueType(0);
3818  unsigned Opcode = N->getOpcode();
3819  SDLoc dl(N);
3820
3821  // For operations of the form (x << C1) op C2, check if we can use a smaller
3822  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
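  // For example, ((x << 8) | 0x4000) can be rewritten as ((x | 0x40) << 8),
  // turning a 4-byte immediate into a 1-byte one.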
3823  SDValue Shift = N->getOperand(0);
3824  SDValue N1 = N->getOperand(1);
3825
3826  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
3827  if (!Cst)
3828    return false;
3829
3830  int64_t Val = Cst->getSExtValue();
3831
3832  // If we have an any_extend feeding the AND, look through it to see if there
3833  // is a shift behind it. But only if the AND doesn't use the extended bits.
3834  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
3835  bool FoundAnyExtend = false;
3836  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
3837      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
3838      isUInt<32>(Val)) {
3839    FoundAnyExtend = true;
3840    Shift = Shift.getOperand(0);
3841  }
3842
3843  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
3844    return false;
3845
3846  // i8 is unshrinkable, i16 should be promoted to i32.
3847  if (NVT != MVT::i32 && NVT != MVT::i64)
3848    return false;
3849
3850  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
3851  if (!ShlCst)
3852    return false;
3853
3854  uint64_t ShAmt = ShlCst->getZExtValue();
3855
3856  // Make sure that we don't change the operation by removing bits.
3857  // This only matters for OR and XOR, AND is unaffected.
3858  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
3859  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
3860    return false;
3861
3862  // Check the minimum bitwidth for the new constant.
3863  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3864  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
3865    if (Opcode == ISD::AND) {
3866      // AND32ri is the same as AND64ri32 with zext imm.
3867      // Try this before sign extended immediates below.
3868      ShiftedVal = (uint64_t)Val >> ShAmt;
3869      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3870        return true;
3871      // Also swap order when the AND can become MOVZX.
3872      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
3873        return true;
3874    }
3875    ShiftedVal = Val >> ShAmt;
3876    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
3877        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
3878      return true;
3879    if (Opcode != ISD::AND) {
3880      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
3881      ShiftedVal = (uint64_t)Val >> ShAmt;
3882      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3883        return true;
3884    }
3885    return false;
3886  };
3887
3888  int64_t ShiftedVal;
3889  if (!CanShrinkImmediate(ShiftedVal))
3890    return false;
3891
3892  // Ok, we can reorder to get a smaller immediate.
3893
  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // We do this check late so that the MaskedValueIsZero call is deferred as
  // long as possible.
3897  if (Opcode == ISD::AND) {
3898    // Find the smallest zext this could possibly be.
3899    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
3900    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
3901
3902    // Figure out which bits need to be zero to achieve that mask.
3903    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
3904                                            ZExtWidth);
3905    NeededMask &= ~Cst->getAPIntValue();
3906
3907    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
3908      return false;
3909  }
3910
3911  SDValue X = Shift.getOperand(0);
3912  if (FoundAnyExtend) {
3913    SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
3914    insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
3915    X = NewX;
3916  }
3917
3918  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
3919  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
3920  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
3921  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
3922  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
3923                                   Shift.getOperand(1));
3924  ReplaceNode(N, NewSHL.getNode());
3925  SelectCode(NewSHL.getNode());
3926  return true;
3927}
3928
3929// Try to match two logic ops to a VPTERNLOG.
3930// FIXME: Handle inverted inputs?
3931// FIXME: Handle more complex patterns that use an operand more than once?
3932bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
3933  MVT NVT = N->getSimpleValueType(0);
3934
3935  // Make sure we support VPTERNLOG.
3936  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
3937      NVT.getVectorElementType() == MVT::i1)
3938    return false;
3939
3940  // We need VLX for 128/256-bit.
3941  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
3942    return false;
3943
3944  unsigned Opc1 = N->getOpcode();
3945  SDValue N0 = N->getOperand(0);
3946  SDValue N1 = N->getOperand(1);
3947
3948  auto isLogicOp = [](unsigned Opc) {
3949    return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
3950           Opc == X86ISD::ANDNP;
3951  };
3952
3953  SDValue A, B, C;
3954  unsigned Opc2;
3955  if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) {
3956    Opc2 = N1.getOpcode();
3957    A = N0;
3958    B = N1.getOperand(0);
3959    C = N1.getOperand(1);
3960  } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) {
3961    Opc2 = N0.getOpcode();
3962    A = N1;
3963    B = N0.getOperand(0);
3964    C = N0.getOperand(1);
3965  } else
3966    return false;
3967
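  // The VPTERNLOG immediate encodes a 3-input truth table: bit i of the
  // immediate is the result for the input combination where the bits of A, B
  // and C (A being the most significant) spell the value i. For example,
  // A & (B | C) is true for the inputs 5, 6 and 7, giving 0xe0.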
3968  uint64_t Imm;
3969  switch (Opc1) {
3970  default: llvm_unreachable("Unexpected opcode!");
3971  case ISD::AND:
3972    switch (Opc2) {
3973    default: llvm_unreachable("Unexpected opcode!");
3974    case ISD::AND:      Imm = 0x80; break;
3975    case ISD::OR:       Imm = 0xe0; break;
3976    case ISD::XOR:      Imm = 0x60; break;
3977    case X86ISD::ANDNP: Imm = 0x20; break;
3978    }
3979    break;
3980  case ISD::OR:
3981    switch (Opc2) {
3982    default: llvm_unreachable("Unexpected opcode!");
3983    case ISD::AND:      Imm = 0xf8; break;
3984    case ISD::OR:       Imm = 0xfe; break;
3985    case ISD::XOR:      Imm = 0xf6; break;
3986    case X86ISD::ANDNP: Imm = 0xf2; break;
3987    }
3988    break;
3989  case ISD::XOR:
3990    switch (Opc2) {
3991    default: llvm_unreachable("Unexpected opcode!");
3992    case ISD::AND:      Imm = 0x78; break;
3993    case ISD::OR:       Imm = 0x1e; break;
3994    case ISD::XOR:      Imm = 0x96; break;
3995    case X86ISD::ANDNP: Imm = 0xd2; break;
3996    }
3997    break;
3998  }
3999
4000  SDLoc DL(N);
4001  SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C,
4002                                CurDAG->getTargetConstant(Imm, DL, MVT::i8));
4003  ReplaceNode(N, New.getNode());
4004  SelectCode(New.getNode());
4005  return true;
4006}
4007
4008/// If the high bits of an 'and' operand are known zero, try setting the
4009/// high bits of an 'and' constant operand to produce a smaller encoding by
4010/// creating a small, sign-extended negative immediate rather than a large
4011/// positive one. This reverses a transform in SimplifyDemandedBits that
4012/// shrinks mask constants by clearing bits. There is also a possibility that
4013/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4014/// case, just replace the 'and'. Return 'true' if the node is replaced.
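/// For example, 'and eax, 0xfff0' needs a 4-byte immediate, but if the upper
/// 16 bits of the other operand are known to be zero, the mask can be
/// rewritten as -16, which fits in a sign-extended 8-bit immediate.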
4015bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4016  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4017  // have immediate operands.
4018  MVT VT = And->getSimpleValueType(0);
4019  if (VT != MVT::i32 && VT != MVT::i64)
4020    return false;
4021
4022  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4023  if (!And1C)
4024    return false;
4025
  // Bail out if the mask constant is already negative. It can't shrink any
  // more.
4027  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4028  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4029  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4030  // are negative too.
4031  APInt MaskVal = And1C->getAPIntValue();
4032  unsigned MaskLZ = MaskVal.countLeadingZeros();
4033  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4034    return false;
4035
4036  // Don't extend into the upper 32 bits of a 64 bit mask.
4037  if (VT == MVT::i64 && MaskLZ >= 32) {
4038    MaskLZ -= 32;
4039    MaskVal = MaskVal.trunc(32);
4040  }
4041
4042  SDValue And0 = And->getOperand(0);
4043  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4044  APInt NegMaskVal = MaskVal | HighZeros;
4045
4046  // If a negative constant would not allow a smaller encoding, there's no need
4047  // to continue. Only change the constant when we know it's a win.
4048  unsigned MinWidth = NegMaskVal.getMinSignedBits();
4049  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
4050    return false;
4051
4052  // Extend masks if we truncated above.
4053  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4054    NegMaskVal = NegMaskVal.zext(64);
4055    HighZeros = HighZeros.zext(64);
4056  }
4057
4058  // The variable operand must be all zeros in the top bits to allow using the
4059  // new, negative constant as the mask.
4060  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
4061    return false;
4062
4063  // Check if the mask is -1. In that case, this is an unnecessary instruction
4064  // that escaped earlier analysis.
4065  if (NegMaskVal.isAllOnesValue()) {
4066    ReplaceNode(And, And0.getNode());
4067    return true;
4068  }
4069
4070  // A negative mask allows a smaller encoding. Create a new 'and' node.
4071  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4072  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4073  ReplaceNode(And, NewAnd.getNode());
4074  SelectCode(NewAnd.getNode());
4075  return true;
4076}
4077
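// Pick the VPTESTM/VPTESTNM machine opcode for the given vector type,
// selecting among the register ('rr'), folded-load ('rm') and folded
// broadcast ('rmb') forms, with the masked ('k') variant when requested.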
4078static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4079                              bool FoldedBCast, bool Masked) {
4080#define VPTESTM_CASE(VT, SUFFIX) \
4081case MVT::VT: \
4082  if (Masked) \
4083    return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4084  return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4085
4086
4087#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4088default: llvm_unreachable("Unexpected VT!"); \
4089VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4090VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4091VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4092VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4093VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4094VPTESTM_CASE(v8i64, QZ##SUFFIX)
4095
4096#define VPTESTM_FULL_CASES(SUFFIX) \
4097VPTESTM_BROADCAST_CASES(SUFFIX) \
4098VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4099VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4100VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4101VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4102VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4103VPTESTM_CASE(v32i16, WZ##SUFFIX)
4104
4105  if (FoldedLoad) {
4106    switch (TestVT.SimpleTy) {
4107    VPTESTM_FULL_CASES(rm)
4108    }
4109  }
4110
4111  if (FoldedBCast) {
4112    switch (TestVT.SimpleTy) {
4113    VPTESTM_BROADCAST_CASES(rmb)
4114    }
4115  }
4116
4117  switch (TestVT.SimpleTy) {
4118  VPTESTM_FULL_CASES(rr)
4119  }
4120
4121#undef VPTESTM_FULL_CASES
4122#undef VPTESTM_BROADCAST_CASES
4123#undef VPTESTM_CASE
4124}
4125
4126// Try to create VPTESTM instruction. If InMask is not null, it will be used
4127// to form a masked operation.
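// VPTESTM sets each mask bit when the AND of the corresponding source elements
// is nonzero; VPTESTNM sets it when the AND is zero. That lets both
// (setcc (and X, Y), 0, ne/eq) and (setcc X, 0, ne/eq) map to one instruction.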
4128bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4129                                 SDValue InMask) {
4130  assert(Subtarget->hasAVX512() && "Expected AVX512!");
4131  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4132         "Unexpected VT!");
4133
4134  // Look for equal and not equal compares.
4135  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4136  if (CC != ISD::SETEQ && CC != ISD::SETNE)
4137    return false;
4138
4139  SDValue SetccOp0 = Setcc.getOperand(0);
4140  SDValue SetccOp1 = Setcc.getOperand(1);
4141
4142  // Canonicalize the all zero vector to the RHS.
4143  if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4144    std::swap(SetccOp0, SetccOp1);
4145
4146  // See if we're comparing against zero.
4147  if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4148    return false;
4149
4150  SDValue N0 = SetccOp0;
4151
4152  MVT CmpVT = N0.getSimpleValueType();
4153  MVT CmpSVT = CmpVT.getVectorElementType();
4154
4155  // Start with both operands the same. We'll try to refine this.
4156  SDValue Src0 = N0;
4157  SDValue Src1 = N0;
4158
4159  {
4160    // Look through single use bitcasts.
4161    SDValue N0Temp = N0;
4162    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4163      N0Temp = N0.getOperand(0);
4164
4165    // Look for single use AND.
4166    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4167      Src0 = N0Temp.getOperand(0);
4168      Src1 = N0Temp.getOperand(1);
4169    }
4170  }
4171
4172  // Without VLX we need to widen the load.
4173  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4174
4175  // We can only fold loads if the sources are unique.
4176  bool CanFoldLoads = Src0 != Src1;
4177
4178  // Try to fold loads unless we need to widen.
4179  bool FoldedLoad = false;
4180  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
4181  if (!Widen && CanFoldLoads) {
4182    Load = Src1;
4183    FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
4184                             Tmp4);
4185    if (!FoldedLoad) {
4186      // AND is commutative.
4187      Load = Src0;
4188      FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
4189                               Tmp3, Tmp4);
4190      if (FoldedLoad)
4191        std::swap(Src0, Src1);
4192    }
4193  }
4194
4195  auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
4196    // Look through single use bitcasts.
4197    if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
4198      Parent = Src.getNode();
4199      Src = Src.getOperand(0);
4200    }
4201
4202    if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
4203      auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
4204      if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
4205        return Src;
4206    }
4207
4208    return SDValue();
4209  };
4210
4211  // If we didn't fold a load, try to match a broadcast. There is no widening
4212  // limitation here, but only 32-bit and 64-bit element types are supported.
4213  bool FoldedBCast = false;
4214  if (!FoldedLoad && CanFoldLoads &&
4215      (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
4216    SDNode *ParentNode = N0.getNode();
4217    if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
4218      FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4219                                     Tmp1, Tmp2, Tmp3, Tmp4);
4220    }
4221
4222    // Try the other operand.
4223    if (!FoldedBCast) {
4224      SDNode *ParentNode = N0.getNode();
4225      if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
4226        FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4227                                       Tmp1, Tmp2, Tmp3, Tmp4);
4228        if (FoldedBCast)
4229          std::swap(Src0, Src1);
4230      }
4231    }
4232  }
4233
4234  auto getMaskRC = [](MVT MaskVT) {
4235    switch (MaskVT.SimpleTy) {
4236    default: llvm_unreachable("Unexpected VT!");
4237    case MVT::v2i1:  return X86::VK2RegClassID;
4238    case MVT::v4i1:  return X86::VK4RegClassID;
4239    case MVT::v8i1:  return X86::VK8RegClassID;
4240    case MVT::v16i1: return X86::VK16RegClassID;
4241    case MVT::v32i1: return X86::VK32RegClassID;
4242    case MVT::v64i1: return X86::VK64RegClassID;
4243    }
4244  };
4245
4246  bool IsMasked = InMask.getNode() != nullptr;
4247
4248  SDLoc dl(Root);
4249
4250  MVT ResVT = Setcc.getSimpleValueType();
4251  MVT MaskVT = ResVT;
4252  if (Widen) {
4253    // Widen the inputs using insert_subreg or copy_to_regclass.
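    // For example, without VLX a v4i32 compare is widened to v16i32 so the
    // 512-bit VPTESTM can be used; the v16i1 result is shrunk back to v4i1
    // after the node is created below.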
4254    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4255    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4256    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4257    CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4258    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4259    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4260                                                     CmpVT), 0);
4261    Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4262
4263    assert(!FoldedLoad && "Shouldn't have folded the load");
4264    if (!FoldedBCast)
4265      Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4266
4267    if (IsMasked) {
4268      // Widen the mask.
4269      unsigned RegClass = getMaskRC(MaskVT);
4270      SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4271      InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4272                                              dl, MaskVT, InMask, RC), 0);
4273    }
4274  }
4275
4276  bool IsTestN = CC == ISD::SETEQ;
4277  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4278                               IsMasked);
4279
4280  MachineSDNode *CNode;
4281  if (FoldedLoad || FoldedBCast) {
4282    SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4283
4284    if (IsMasked) {
4285      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4286                        Load.getOperand(0) };
4287      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4288    } else {
4289      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4290                        Load.getOperand(0) };
4291      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4292    }
4293
4294    // Update the chain.
4295    ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
4296    // Record the mem-refs
4297    CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
4298  } else {
4299    if (IsMasked)
4300      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4301    else
4302      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4303  }
4304
4305  // If we widened, we need to shrink the mask VT.
4306  if (Widen) {
4307    unsigned RegClass = getMaskRC(ResVT);
4308    SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4309    CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4310                                   dl, ResVT, SDValue(CNode, 0), RC);
4311  }
4312
4313  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4314  CurDAG->RemoveDeadNode(Root);
4315  return true;
4316}
4317
4318// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4319// into vpternlog.
4320bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4321  assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4322
4323  MVT NVT = N->getSimpleValueType(0);
4324
4325  // Make sure we support VPTERNLOG.
4326  if (!NVT.isVector() || !Subtarget->hasAVX512())
4327    return false;
4328
4329  // We need VLX for 128/256-bit.
4330  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4331    return false;
4332
4333  SDValue N0 = N->getOperand(0);
4334  SDValue N1 = N->getOperand(1);
4335
4336  // Canonicalize AND to LHS.
4337  if (N1.getOpcode() == ISD::AND)
4338    std::swap(N0, N1);
4339
4340  if (N0.getOpcode() != ISD::AND ||
4341      N1.getOpcode() != X86ISD::ANDNP ||
4342      !N0.hasOneUse() || !N1.hasOneUse())
4343    return false;
4344
4345  // ANDN is not commutable, so use it to pin down A and C.
4346  SDValue A = N1.getOperand(0);
4347  SDValue C = N1.getOperand(1);
4348
4349  // AND is commutable, if one operand matches A, the other operand is B.
4350  // Otherwise this isn't a match.
4351  SDValue B;
4352  if (N0.getOperand(0) == A)
4353    B = N0.getOperand(1);
4354  else if (N0.getOperand(1) == A)
4355    B = N0.getOperand(0);
4356  else
4357    return false;
4358
4359  SDLoc dl(N);
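  // 0xCA is the vpternlog truth table for "A ? B : C", which is exactly
  // (A & B) | (~A & C) with A taken as the first operand.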
4360  SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
4361  SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
4362  ReplaceNode(N, Ternlog.getNode());
4363  SelectCode(Ternlog.getNode());
4364  return true;
4365}
4366
4367void X86DAGToDAGISel::Select(SDNode *Node) {
4368  MVT NVT = Node->getSimpleValueType(0);
4369  unsigned Opcode = Node->getOpcode();
4370  SDLoc dl(Node);
4371
4372  if (Node->isMachineOpcode()) {
4373    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
4374    Node->setNodeId(-1);
4375    return;   // Already selected.
4376  }
4377
4378  switch (Opcode) {
4379  default: break;
4380  case ISD::INTRINSIC_VOID: {
4381    unsigned IntNo = Node->getConstantOperandVal(1);
4382    switch (IntNo) {
4383    default: break;
4384    case Intrinsic::x86_sse3_monitor:
4385    case Intrinsic::x86_monitorx:
4386    case Intrinsic::x86_clzero: {
4387      bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
4388
4389      unsigned Opc = 0;
4390      switch (IntNo) {
4391      default: llvm_unreachable("Unexpected intrinsic!");
4392      case Intrinsic::x86_sse3_monitor:
4393        if (!Subtarget->hasSSE3())
4394          break;
4395        Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
4396        break;
4397      case Intrinsic::x86_monitorx:
4398        if (!Subtarget->hasMWAITX())
4399          break;
4400        Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
4401        break;
4402      case Intrinsic::x86_clzero:
4403        if (!Subtarget->hasCLZERO())
4404          break;
4405        Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
4406        break;
4407      }
4408
4409      if (Opc) {
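        // These instructions take the address implicitly in EAX/RAX; MONITOR
        // and MONITORX also take extra operands in ECX and EDX (copied below).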
4410        unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
4411        SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
4412                                             Node->getOperand(2), SDValue());
4413        SDValue InFlag = Chain.getValue(1);
4414
4415        if (IntNo == Intrinsic::x86_sse3_monitor ||
4416            IntNo == Intrinsic::x86_monitorx) {
4417          // Copy the other two operands to ECX and EDX.
4418          Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
4419                                       InFlag);
4420          InFlag = Chain.getValue(1);
4421          Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
4422                                       InFlag);
4423          InFlag = Chain.getValue(1);
4424        }
4425
4426        MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
4427                                                      { Chain, InFlag});
4428        ReplaceNode(Node, CNode);
4429        return;
4430      }
4431
4432      break;
4433    }
4434    case Intrinsic::x86_tileloadd64:
4435    case Intrinsic::x86_tileloaddt164:
4436    case Intrinsic::x86_tilestored64: {
4437      if (!Subtarget->hasAMXTILE())
4438        break;
4439      unsigned Opc;
4440      switch (IntNo) {
4441      default: llvm_unreachable("Unexpected intrinsic!");
4442      case Intrinsic::x86_tileloadd64:   Opc = X86::PTILELOADD; break;
4443      case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
4444      case Intrinsic::x86_tilestored64:  Opc = X86::PTILESTORED; break;
4445      }
4446      // FIXME: Match displacement and scale.
4447      unsigned TIndex = Node->getConstantOperandVal(2);
4448      SDValue TReg = getI8Imm(TIndex, dl);
4449      SDValue Base = Node->getOperand(3);
4450      SDValue Scale = getI8Imm(1, dl);
4451      SDValue Index = Node->getOperand(4);
4452      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
4453      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
4454      SDValue Chain = Node->getOperand(0);
4455      MachineSDNode *CNode;
4456      if (Opc == X86::PTILESTORED) {
4457        SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
4458        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
4459      } else {
4460        SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
4461        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
4462      }
4463      ReplaceNode(Node, CNode);
4464      return;
4465    }
4466    }
4467    break;
4468  }
4469  case ISD::BRIND: {
4470    if (Subtarget->isTargetNaCl())
4471      // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
4472      // leave the instruction alone.
4473      break;
4474    if (Subtarget->isTarget64BitILP32()) {
4475      // Converts a 32-bit register to a 64-bit, zero-extended version of
4476      // it. This is needed because x86-64 can do many things, but jmp %r32
4477      // ain't one of them.
4478      SDValue Target = Node->getOperand(1);
4479      assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
4480      SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
4481      SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
4482                                      Node->getOperand(0), ZextTarget);
4483      ReplaceNode(Node, Brind.getNode());
4484      SelectCode(ZextTarget.getNode());
4485      SelectCode(Brind.getNode());
4486      return;
4487    }
4488    break;
4489  }
4490  case X86ISD::GlobalBaseReg:
4491    ReplaceNode(Node, getGlobalBaseReg());
4492    return;
4493
4494  case ISD::BITCAST:
4495    // Just drop all 128/256/512-bit bitcasts.
4496    if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
4497        NVT == MVT::f128) {
4498      ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
4499      CurDAG->RemoveDeadNode(Node);
4500      return;
4501    }
4502    break;
4503
4504  case ISD::SRL:
4505    if (matchBitExtract(Node))
4506      return;
4507    LLVM_FALLTHROUGH;
4508  case ISD::SRA:
4509  case ISD::SHL:
4510    if (tryShiftAmountMod(Node))
4511      return;
4512    break;
4513
4514  case ISD::AND:
4515    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
4516      // Try to form a masked VPTESTM. Operands can be in either order.
4517      SDValue N0 = Node->getOperand(0);
4518      SDValue N1 = Node->getOperand(1);
4519      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
4520          tryVPTESTM(Node, N0, N1))
4521        return;
4522      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
4523          tryVPTESTM(Node, N1, N0))
4524        return;
4525    }
4526
4527    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
4528      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4529      CurDAG->RemoveDeadNode(Node);
4530      return;
4531    }
4532    if (matchBitExtract(Node))
4533      return;
4534    if (AndImmShrink && shrinkAndImmediate(Node))
4535      return;
4536
4537    LLVM_FALLTHROUGH;
4538  case ISD::OR:
4539  case ISD::XOR:
4540    if (tryShrinkShlLogicImm(Node))
4541      return;
4542    if (Opcode == ISD::OR && tryMatchBitSelect(Node))
4543      return;
4544    if (tryVPTERNLOG(Node))
4545      return;
4546
4547    LLVM_FALLTHROUGH;
4548  case ISD::ADD:
4549  case ISD::SUB: {
4550    // Try to avoid folding immediates with multiple uses for optsize.
4551    // This code tries to select to register form directly to avoid going
4552    // through the isel table which might fold the immediate. We can't change
4553    // the add/sub/and/or/xor with immediate patterns in the tablegen files
4554    // to check the immediate use count without making the patterns
4555    // unavailable to the fast-isel table.
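    // For example, under optsize, if the same 32-bit constant is added in
    // several places, keeping it in a register and using the rr form at each
    // use is smaller than repeating the 4-byte immediate.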
4556    if (!CurDAG->shouldOptForSize())
4557      break;
4558
4559    // Only handle i8/i16/i32/i64.
4560    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
4561      break;
4562
4563    SDValue N0 = Node->getOperand(0);
4564    SDValue N1 = Node->getOperand(1);
4565
4566    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
4567    if (!Cst)
4568      break;
4569
4570    int64_t Val = Cst->getSExtValue();
4571
4572    // Make sure it's an immediate that is considered foldable.
4573    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
4574    if (!isInt<8>(Val) && !isInt<32>(Val))
4575      break;
4576
4577    // If this can match to INC/DEC, let it go.
4578    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
4579      break;
4580
4581    // Check if we should avoid folding this immediate.
4582    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
4583      break;
4584
4585    // We should not fold the immediate. So we need a register form instead.
4586    unsigned ROpc, MOpc;
4587    switch (NVT.SimpleTy) {
4588    default: llvm_unreachable("Unexpected VT!");
4589    case MVT::i8:
4590      switch (Opcode) {
4591      default: llvm_unreachable("Unexpected opcode!");
4592      case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
4593      case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
4594      case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
4595      case ISD::OR:  ROpc = X86::OR8rr;  MOpc = X86::OR8rm;  break;
4596      case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
4597      }
4598      break;
4599    case MVT::i16:
4600      switch (Opcode) {
4601      default: llvm_unreachable("Unexpected opcode!");
4602      case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
4603      case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
4604      case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
4605      case ISD::OR:  ROpc = X86::OR16rr;  MOpc = X86::OR16rm;  break;
4606      case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
4607      }
4608      break;
4609    case MVT::i32:
4610      switch (Opcode) {
4611      default: llvm_unreachable("Unexpected opcode!");
4612      case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
4613      case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
4614      case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
4615      case ISD::OR:  ROpc = X86::OR32rr;  MOpc = X86::OR32rm;  break;
4616      case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
4617      }
4618      break;
4619    case MVT::i64:
4620      switch (Opcode) {
4621      default: llvm_unreachable("Unexpected opcode!");
4622      case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
4623      case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
4624      case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
4625      case ISD::OR:  ROpc = X86::OR64rr;  MOpc = X86::OR64rm;  break;
4626      case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
4627      }
4628      break;
4629    }
4630
4631    // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
4632
4633    // If this is not a subtract, we can still try to fold a load.
4634    if (Opcode != ISD::SUB) {
4635      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4636      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4637        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4638        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4639        MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4640        // Update the chain.
4641        ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
4642        // Record the mem-refs
4643        CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
4644        ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4645        CurDAG->RemoveDeadNode(Node);
4646        return;
4647      }
4648    }
4649
4650    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
4651    return;
4652  }
4653
4654  case X86ISD::SMUL:
4655    // i16/i32/i64 are handled with isel patterns.
4656    if (NVT != MVT::i8)
4657      break;
4658    LLVM_FALLTHROUGH;
4659  case X86ISD::UMUL: {
4660    SDValue N0 = Node->getOperand(0);
4661    SDValue N1 = Node->getOperand(1);
4662
4663    unsigned LoReg, ROpc, MOpc;
4664    switch (NVT.SimpleTy) {
4665    default: llvm_unreachable("Unsupported VT!");
4666    case MVT::i8:
4667      LoReg = X86::AL;
4668      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
4669      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
4670      break;
4671    case MVT::i16:
4672      LoReg = X86::AX;
4673      ROpc = X86::MUL16r;
4674      MOpc = X86::MUL16m;
4675      break;
4676    case MVT::i32:
4677      LoReg = X86::EAX;
4678      ROpc = X86::MUL32r;
4679      MOpc = X86::MUL32m;
4680      break;
4681    case MVT::i64:
4682      LoReg = X86::RAX;
4683      ROpc = X86::MUL64r;
4684      MOpc = X86::MUL64m;
4685      break;
4686    }
4687
4688    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4689    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4690    // Multiply is commutative.
4691    if (!FoldedLoad) {
4692      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4693      if (FoldedLoad)
4694        std::swap(N0, N1);
4695    }
4696
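    // The one-operand MUL/IMUL forms implicitly use AL/AX/EAX/RAX as the other
    // source operand, so N0 has to be copied into that register first.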
4697    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4698                                          N0, SDValue()).getValue(1);
4699
4700    MachineSDNode *CNode;
4701    if (FoldedLoad) {
4702      // i16/i32/i64 use an instruction that produces a low and high result even
4703      // though only the low result is used.
4704      SDVTList VTs;
4705      if (NVT == MVT::i8)
4706        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4707      else
4708        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
4709
4710      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4711                        InFlag };
4712      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4713
4714      // Update the chain.
4715      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
4716      // Record the mem-refs
4717      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4718    } else {
4719      // i16/i32/i64 use an instruction that produces a low and high result even
4720      // though only the low result is used.
4721      SDVTList VTs;
4722      if (NVT == MVT::i8)
4723        VTs = CurDAG->getVTList(NVT, MVT::i32);
4724      else
4725        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
4726
4727      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
4728    }
4729
4730    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4731    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
4732    CurDAG->RemoveDeadNode(Node);
4733    return;
4734  }
4735
4736  case ISD::SMUL_LOHI:
4737  case ISD::UMUL_LOHI: {
4738    SDValue N0 = Node->getOperand(0);
4739    SDValue N1 = Node->getOperand(1);
4740
4741    unsigned Opc, MOpc;
4742    unsigned LoReg, HiReg;
4743    bool IsSigned = Opcode == ISD::SMUL_LOHI;
4744    bool UseMULX = !IsSigned && Subtarget->hasBMI2();
4745    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
4746    switch (NVT.SimpleTy) {
4747    default: llvm_unreachable("Unsupported VT!");
4748    case MVT::i32:
4749      Opc  = UseMULXHi ? X86::MULX32Hrr :
4750             UseMULX ? X86::MULX32rr :
4751             IsSigned ? X86::IMUL32r : X86::MUL32r;
4752      MOpc = UseMULXHi ? X86::MULX32Hrm :
4753             UseMULX ? X86::MULX32rm :
4754             IsSigned ? X86::IMUL32m : X86::MUL32m;
4755      LoReg = UseMULX ? X86::EDX : X86::EAX;
4756      HiReg = X86::EDX;
4757      break;
4758    case MVT::i64:
4759      Opc  = UseMULXHi ? X86::MULX64Hrr :
4760             UseMULX ? X86::MULX64rr :
4761             IsSigned ? X86::IMUL64r : X86::MUL64r;
4762      MOpc = UseMULXHi ? X86::MULX64Hrm :
4763             UseMULX ? X86::MULX64rm :
4764             IsSigned ? X86::IMUL64m : X86::MUL64m;
4765      LoReg = UseMULX ? X86::RDX : X86::RAX;
4766      HiReg = X86::RDX;
4767      break;
4768    }
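    // Note that MULX (BMI2) reads its implicit source from EDX/RDX and writes
    // both result halves to explicit destinations without touching EFLAGS,
    // which is why LoReg above is EDX/RDX when MULX is used.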
4769
4770    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4771    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4772    // Multiply is commutative.
4773    if (!foldedLoad) {
4774      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4775      if (foldedLoad)
4776        std::swap(N0, N1);
4777    }
4778
4779    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4780                                          N0, SDValue()).getValue(1);
4781    SDValue ResHi, ResLo;
4782    if (foldedLoad) {
4783      SDValue Chain;
4784      MachineSDNode *CNode = nullptr;
4785      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4786                        InFlag };
4787      if (UseMULXHi) {
4788        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4789        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4790        ResHi = SDValue(CNode, 0);
4791        Chain = SDValue(CNode, 1);
4792      } else if (UseMULX) {
4793        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
4794        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4795        ResHi = SDValue(CNode, 0);
4796        ResLo = SDValue(CNode, 1);
4797        Chain = SDValue(CNode, 2);
4798      } else {
4799        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4800        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4801        Chain = SDValue(CNode, 0);
4802        InFlag = SDValue(CNode, 1);
4803      }
4804
4805      // Update the chain.
4806      ReplaceUses(N1.getValue(1), Chain);
4807      // Record the mem-refs
4808      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4809    } else {
4810      SDValue Ops[] = { N1, InFlag };
4811      if (UseMULXHi) {
4812        SDVTList VTs = CurDAG->getVTList(NVT);
4813        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4814        ResHi = SDValue(CNode, 0);
4815      } else if (UseMULX) {
4816        SDVTList VTs = CurDAG->getVTList(NVT, NVT);
4817        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4818        ResHi = SDValue(CNode, 0);
4819        ResLo = SDValue(CNode, 1);
4820      } else {
4821        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4822        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4823        InFlag = SDValue(CNode, 0);
4824      }
4825    }
4826
4827    // Copy the low half of the result, if it is needed.
4828    if (!SDValue(Node, 0).use_empty()) {
4829      if (!ResLo) {
4830        assert(LoReg && "Register for low half is not defined!");
4831        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4832                                       NVT, InFlag);
4833        InFlag = ResLo.getValue(2);
4834      }
4835      ReplaceUses(SDValue(Node, 0), ResLo);
4836      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
4837                 dbgs() << '\n');
4838    }
4839    // Copy the high half of the result, if it is needed.
4840    if (!SDValue(Node, 1).use_empty()) {
4841      if (!ResHi) {
4842        assert(HiReg && "Register for high half is not defined!");
4843        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4844                                       NVT, InFlag);
4845        InFlag = ResHi.getValue(2);
4846      }
4847      ReplaceUses(SDValue(Node, 1), ResHi);
4848      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
4849                 dbgs() << '\n');
4850    }
4851
4852    CurDAG->RemoveDeadNode(Node);
4853    return;
4854  }
4855
4856  case ISD::SDIVREM:
4857  case ISD::UDIVREM: {
4858    SDValue N0 = Node->getOperand(0);
4859    SDValue N1 = Node->getOperand(1);
4860
4861    unsigned ROpc, MOpc;
4862    bool isSigned = Opcode == ISD::SDIVREM;
4863    if (!isSigned) {
4864      switch (NVT.SimpleTy) {
4865      default: llvm_unreachable("Unsupported VT!");
4866      case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
4867      case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
4868      case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
4869      case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
4870      }
4871    } else {
4872      switch (NVT.SimpleTy) {
4873      default: llvm_unreachable("Unsupported VT!");
4874      case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
4875      case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
4876      case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
4877      case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
4878      }
4879    }
4880
4881    unsigned LoReg, HiReg, ClrReg;
4882    unsigned SExtOpcode;
4883    switch (NVT.SimpleTy) {
4884    default: llvm_unreachable("Unsupported VT!");
4885    case MVT::i8:
4886      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
4887      SExtOpcode = 0; // Not used.
4888      break;
4889    case MVT::i16:
4890      LoReg = X86::AX;  HiReg = X86::DX;
4891      ClrReg = X86::DX;
4892      SExtOpcode = X86::CWD;
4893      break;
4894    case MVT::i32:
4895      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
4896      SExtOpcode = X86::CDQ;
4897      break;
4898    case MVT::i64:
4899      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
4900      SExtOpcode = X86::CQO;
4901      break;
4902    }
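    // DIV/IDIV leave the quotient in AL/AX/EAX/RAX and the remainder in
    // AH/DX/EDX/RDX; LoReg/HiReg above reflect that.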
4903
4904    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4905    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4906    bool signBitIsZero = CurDAG->SignBitIsZero(N0);
4907
4908    SDValue InFlag;
4909    if (NVT == MVT::i8) {
4910      // Special case for div8, just use a move with zero extension to AX to
4911      // clear the upper 8 bits (AH).
4912      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
4913      MachineSDNode *Move;
4914      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4915        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4916        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
4917                                                    : X86::MOVZX16rm8;
4918        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
4919        Chain = SDValue(Move, 1);
4920        ReplaceUses(N0.getValue(1), Chain);
4921        // Record the mem-refs
4922        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
4923      } else {
4924        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
4925                                                    : X86::MOVZX16rr8;
4926        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
4927        Chain = CurDAG->getEntryNode();
4928      }
4929      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
4930                                    SDValue());
4931      InFlag = Chain.getValue(1);
4932    } else {
4933      InFlag =
4934        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
4935                             LoReg, N0, SDValue()).getValue(1);
4936      if (isSigned && !signBitIsZero) {
4937        // Sign extend the low part into the high part.
4938        InFlag =
4939          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
4940      } else {
4941        // Zero out the high part, effectively zero extending the input.
4942        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
4943        SDValue ClrNode =
4944            SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
4945        switch (NVT.SimpleTy) {
4946        case MVT::i16:
4947          ClrNode =
4948              SDValue(CurDAG->getMachineNode(
4949                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
4950                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
4951                                                    MVT::i32)),
4952                      0);
4953          break;
4954        case MVT::i32:
4955          break;
4956        case MVT::i64:
4957          ClrNode =
4958              SDValue(CurDAG->getMachineNode(
4959                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
4960                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
4961                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
4962                                                    MVT::i32)),
4963                      0);
4964          break;
4965        default:
4966          llvm_unreachable("Unexpected division source");
4967        }
4968
4969        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
4970                                      ClrNode, InFlag).getValue(1);
4971      }
4972    }
4973
4974    if (foldedLoad) {
4975      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4976                        InFlag };
4977      MachineSDNode *CNode =
4978        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
4979      InFlag = SDValue(CNode, 1);
4980      // Update the chain.
4981      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
4982      // Record the mem-refs
4983      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4984    } else {
4985      InFlag =
4986        SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0);
4987    }
4988
4989    // Prevent use of AH in a REX instruction by explicitly copying it to
4990    // an ABCD_L register.
4991    //
4992    // The current assumption of the register allocator is that isel
4993    // won't generate explicit references to the GR8_ABCD_H registers. If
4994    // the allocator and/or the backend get enhanced to be more robust in
4995    // that regard, this can be, and should be, removed.
4996    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
4997      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
4998      unsigned AHExtOpcode =
4999          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5000
5001      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5002                                             MVT::Glue, AHCopy, InFlag);
5003      SDValue Result(RNode, 0);
5004      InFlag = SDValue(RNode, 1);
5005
5006      Result =
5007          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
5008
5009      ReplaceUses(SDValue(Node, 1), Result);
5010      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5011                 dbgs() << '\n');
5012    }
5013    // Copy the division (low) result, if it is needed.
5014    if (!SDValue(Node, 0).use_empty()) {
5015      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5016                                                LoReg, NVT, InFlag);
5017      InFlag = Result.getValue(2);
5018      ReplaceUses(SDValue(Node, 0), Result);
5019      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5020                 dbgs() << '\n');
5021    }
5022    // Copy the remainder (high) result, if it is needed.
5023    if (!SDValue(Node, 1).use_empty()) {
5024      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5025                                              HiReg, NVT, InFlag);
5026      InFlag = Result.getValue(2);
5027      ReplaceUses(SDValue(Node, 1), Result);
5028      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5029                 dbgs() << '\n');
5030    }
5031    CurDAG->RemoveDeadNode(Node);
5032    return;
5033  }
5034
5035  case X86ISD::FCMP:
5036  case X86ISD::STRICT_FCMP:
5037  case X86ISD::STRICT_FCMPS: {
5038    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
5039                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
5040    SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
5041    SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
5042
5043    // Save the original VT of the compare.
5044    MVT CmpVT = N0.getSimpleValueType();
5045
5046    // Floating point needs special handling if we don't have FCOMI.
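    // (FCOMI and CMOV were introduced together, so hasCMov() also implies
    // FCOMI is available.)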
5047    if (Subtarget->hasCMov())
5048      break;
5049
5050    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
5051
5052    unsigned Opc;
5053    switch (CmpVT.SimpleTy) {
5054    default: llvm_unreachable("Unexpected type!");
5055    case MVT::f32:
5056      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
5057      break;
5058    case MVT::f64:
5059      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
5060      break;
5061    case MVT::f80:
5062      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
5063      break;
5064    }
5065
5066    SDValue Cmp;
5067    SDValue Chain =
5068        IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
5069    if (IsStrictCmp) {
5070      SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other);
5071      Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
5072      Chain = Cmp.getValue(1);
5073    } else {
5074      Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0);
5075    }
5076
5077    // Move FPSW to AX.
5078    SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue());
5079    Chain = FPSW;
5080    SDValue FNSTSW =
5081        SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW,
5082                                       FPSW.getValue(1)),
5083                0);
5084
5085    // Extract upper 8-bits of AX.
5086    SDValue Extract =
5087        CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
5088
5089    // Move AH into flags.
5090    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
5091    assert(Subtarget->hasLAHFSAHF() &&
5092           "Target doesn't support SAHF or FCOMI?");
5093    SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
5094    Chain = AH;
5095    SDValue SAHF = SDValue(
5096        CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
5097
5098    if (IsStrictCmp)
5099      ReplaceUses(SDValue(Node, 1), Chain);
5100
5101    ReplaceUses(SDValue(Node, 0), SAHF);
5102    CurDAG->RemoveDeadNode(Node);
5103    return;
5104  }
5105
5106  case X86ISD::CMP: {
5107    SDValue N0 = Node->getOperand(0);
5108    SDValue N1 = Node->getOperand(1);
5109
5110    // Optimizations for TEST compares.
5111    if (!isNullConstant(N1))
5112      break;
5113
5114    // Save the original VT of the compare.
5115    MVT CmpVT = N0.getSimpleValueType();
5116
5117    // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
5118    // by a test instruction. The test should be removed later by
5119    // analyzeCompare if we are using only the zero flag.
5120    // TODO: Should we check the users and use the BEXTR flags directly?
5121    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
5122      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
5123        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
5124                                             : X86::TEST32rr;
5125        SDValue BEXTR = SDValue(NewNode, 0);
5126        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
5127        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5128        CurDAG->RemoveDeadNode(Node);
5129        return;
5130      }
5131    }
5132
5133    // We can peek through truncates, but we need to be careful below.
5134    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5135      N0 = N0.getOperand(0);
5136
5137    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5138    // use a smaller encoding.
5139    // Look past the truncate if CMP is the only use of it.
5140    if (N0.getOpcode() == ISD::AND &&
5141        N0.getNode()->hasOneUse() &&
5142        N0.getValueType() != MVT::i8) {
5143      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5144      if (!C) break;
5145      uint64_t Mask = C->getZExtValue();
5146
5147      // Check if we can replace AND+IMM64 with a shift. This is possible for
5148      // masks that are a contiguous run of ones at the top or bottom of the
5149      // value, when we only care about the zero flag.
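      // For example, (X & 0xFFFFFFFF00000000) == 0 can be tested with
      // "shrq $32, X; testq X, X", and (X & 0x0000FFFFFFFFFFFF) == 0 with
      // "shlq $16, X; testq X, X".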
5150      if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
5151          onlyUsesZeroFlag(SDValue(Node, 0))) {
5152        if (isMask_64(~Mask)) {
5153          unsigned TrailingZeros = countTrailingZeros(Mask);
5154          SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
5155          SDValue Shift =
5156            SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
5157                                           N0.getOperand(0), Imm), 0);
5158          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5159                                                       MVT::i32, Shift, Shift);
5160          ReplaceNode(Node, Test);
5161          return;
5162        }
5163        if (isMask_64(Mask)) {
5164          unsigned LeadingZeros = countLeadingZeros(Mask);
5165          SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
5166          SDValue Shift =
5167            SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
5168                                           N0.getOperand(0), Imm), 0);
5169          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5170                                                       MVT::i32, Shift, Shift);
5171          ReplaceNode(Node, Test);
5172          return;
5173        }
5174      }
5175
5176      MVT VT;
5177      int SubRegOp;
5178      unsigned ROpc, MOpc;
5179
5180      // For each of these checks we need to be careful if the sign flag is
5181      // being used. It is only safe to use the sign flag in two conditions,
5182      // either the sign bit in the shrunken mask is zero or the final test
5183      // size is equal to the original compare size.
5184
5185      if (isUInt<8>(Mask) &&
5186          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
5187           hasNoSignFlagUses(SDValue(Node, 0)))) {
5188        // For example, convert "testl %eax, $8" to "testb %al, $8"
5189        VT = MVT::i8;
5190        SubRegOp = X86::sub_8bit;
5191        ROpc = X86::TEST8ri;
5192        MOpc = X86::TEST8mi;
5193      } else if (OptForMinSize && isUInt<16>(Mask) &&
5194                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
5195                  hasNoSignFlagUses(SDValue(Node, 0)))) {
5196        // For example, "testl %eax, $32776" to "testw %ax, $32776".
5197        // NOTE: We only want to form TESTW instructions if optimizing for
5198        // min size. Otherwise we only save one byte and possibly get a length
5199        // changing prefix penalty in the decoders.
5200        VT = MVT::i16;
5201        SubRegOp = X86::sub_16bit;
5202        ROpc = X86::TEST16ri;
5203        MOpc = X86::TEST16mi;
5204      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
5205                 ((!(Mask & 0x80000000) &&
5206                   // Without minsize 16-bit Cmps can get here so we need to
5207                   // be sure we calculate the correct sign flag if needed.
5208                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
5209                  CmpVT == MVT::i32 ||
5210                  hasNoSignFlagUses(SDValue(Node, 0)))) {
5211        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
5212        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
5213        // Otherwise, we find ourselves in a position where we have to do
5214        // promotion. If previous passes did not promote the and, we assume
5215        // they had a good reason not to and do not promote here.
5216        VT = MVT::i32;
5217        SubRegOp = X86::sub_32bit;
5218        ROpc = X86::TEST32ri;
5219        MOpc = X86::TEST32mi;
5220      } else {
5221        // No eligible transformation was found.
5222        break;
5223      }
5224
5225      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
5226      SDValue Reg = N0.getOperand(0);
5227
5228      // Emit a testl or testw.
5229      MachineSDNode *NewNode;
5230      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5231      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5232        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
5233          if (!LoadN->isSimple()) {
5234            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
5235            if (MOpc == X86::TEST8mi && NumVolBits != 8)
5236              break;
5237            else if (MOpc == X86::TEST16mi && NumVolBits != 16)
5238              break;
5239            else if (MOpc == X86::TEST32mi && NumVolBits != 32)
5240              break;
5241          }
5242        }
5243        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
5244                          Reg.getOperand(0) };
5245        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
5246        // Update the chain.
5247        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
5248        // Record the mem-refs
5249        CurDAG->setNodeMemRefs(NewNode,
5250                               {cast<LoadSDNode>(Reg)->getMemOperand()});
5251      } else {
5252        // Extract the subregister if necessary.
5253        if (N0.getValueType() != VT)
5254          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
5255
5256        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
5257      }
5258      // Replace CMP with TEST.
5259      ReplaceNode(Node, NewNode);
5260      return;
5261    }
5262    break;
5263  }
5264  case X86ISD::PCMPISTR: {
5265    if (!Subtarget->hasSSE42())
5266      break;
5267
5268    bool NeedIndex = !SDValue(Node, 0).use_empty();
5269    bool NeedMask = !SDValue(Node, 1).use_empty();
5270    // We can't fold a load if we are going to make two instructions.
5271    bool MayFoldLoad = !NeedIndex || !NeedMask;
5272
5273    MachineSDNode *CNode;
5274    if (NeedMask) {
5275      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
5276      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
5277      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
5278      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5279    }
5280    if (NeedIndex || !NeedMask) {
5281      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
5282      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
5283      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
5284      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5285    }
5286
5287    // Connect the flag usage to the last instruction created.
5288    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5289    CurDAG->RemoveDeadNode(Node);
5290    return;
5291  }
5292  case X86ISD::PCMPESTR: {
5293    if (!Subtarget->hasSSE42())
5294      break;
5295
5296    // Copy the two implicit register inputs.
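    // PCMPESTRI/PCMPESTRM take the two explicit string lengths in EAX and EDX.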
5297    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
5298                                          Node->getOperand(1),
5299                                          SDValue()).getValue(1);
5300    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
5301                                  Node->getOperand(3), InFlag).getValue(1);
5302
5303    bool NeedIndex = !SDValue(Node, 0).use_empty();
5304    bool NeedMask = !SDValue(Node, 1).use_empty();
5305    // We can't fold a load if we are going to make two instructions.
5306    bool MayFoldLoad = !NeedIndex || !NeedMask;
5307
5308    MachineSDNode *CNode;
5309    if (NeedMask) {
5310      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
5311      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
5312      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
5313                           InFlag);
5314      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5315    }
5316    if (NeedIndex || !NeedMask) {
5317      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
5318      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
5319      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
5320      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5321    }
5322    // Connect the flag usage to the last instruction created.
5323    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5324    CurDAG->RemoveDeadNode(Node);
5325    return;
5326  }
5327
5328  case ISD::SETCC: {
5329    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
5330      return;
5331
5332    break;
5333  }
5334
5335  case ISD::STORE:
5336    if (foldLoadStoreIntoMemOperand(Node))
5337      return;
5338    break;
5339
5340  case X86ISD::SETCC_CARRY: {
5341    // We have to do this manually because tblgen will put the eflags copy in
5342    // the wrong place if we use an extract_subreg in the pattern.
5343    MVT VT = Node->getSimpleValueType(0);
5344
5345    // Copy flags to the EFLAGS register and glue it to next node.
5346    SDValue EFLAGS =
5347        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
5348                             Node->getOperand(1), SDValue());
5349
5350    // Create a 64-bit instruction if the result is 64-bits otherwise use the
5351    // 32-bit version.
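    // SETB_C32r/SETB_C64r are pseudos that expand to "sbb reg, reg", producing
    // 0 or -1 depending on the carry flag.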
5352    unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
5353    MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
5354    SDValue Result = SDValue(
5355        CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);
5356
5357    // For less than 32-bits we need to extract from the 32-bit node.
5358    if (VT == MVT::i8 || VT == MVT::i16) {
5359      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
5360      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
5361    }
5362
5363    ReplaceUses(SDValue(Node, 0), Result);
5364    CurDAG->RemoveDeadNode(Node);
5365    return;
5366  }
5367  case X86ISD::SBB: {
5368    if (isNullConstant(Node->getOperand(0)) &&
5369        isNullConstant(Node->getOperand(1))) {
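      // SBB with both inputs zero computes 0 - 0 - CF, materializing the carry
      // flag as an all-zeros or all-ones value.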
5370      MVT VT = Node->getSimpleValueType(0);
5371
5372      // Create zero.
5373      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5374      SDValue Zero =
5375          SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
5376      if (VT == MVT::i64) {
5377        Zero = SDValue(
5378            CurDAG->getMachineNode(
5379                TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5380                CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
5381                CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
5382            0);
5383      }
5384
5385      // Copy flags to the EFLAGS register and glue it to next node.
5386      SDValue EFLAGS =
5387          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
5388                               Node->getOperand(2), SDValue());
5389
5390      // Create a 64-bit instruction if the result is 64-bits otherwise use the
5391      // 32-bit version.
5392      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
5393      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
5394      VTs = CurDAG->getVTList(SBBVT, MVT::i32);
5395      SDValue Result =
5396          SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS,
5397                                         EFLAGS.getValue(1)}),
5398                  0);
5399
5400      // Replace the flag use.
5401      ReplaceUses(SDValue(Node, 1), Result.getValue(1));
5402
5403      // Replace the result use.
5404      if (!SDValue(Node, 0).use_empty()) {
5405        // For less than 32-bits we need to extract from the 32-bit node.
5406        if (VT == MVT::i8 || VT == MVT::i16) {
5407          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
5408          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
5409        }
5410        ReplaceUses(SDValue(Node, 0), Result);
5411      }
5412
5413      CurDAG->RemoveDeadNode(Node);
5414      return;
5415    }
5416    break;
5417  }
5418  case X86ISD::MGATHER: {
5419    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
5420    SDValue IndexOp = Mgt->getIndex();
5421    SDValue Mask = Mgt->getMask();
5422    MVT IndexVT = IndexOp.getSimpleValueType();
5423    MVT ValueVT = Node->getSimpleValueType(0);
5424    MVT MaskVT = Mask.getSimpleValueType();
5425
5426    // This is just to prevent crashes if the nodes are malformed somehow. We're
5427    // otherwise only doing loose type checking here, based on what a type
5428    // constraint would say, just like table-based isel.
5429    if (!ValueVT.isVector() || !MaskVT.isVector())
5430      break;
5431
5432    unsigned NumElts = ValueVT.getVectorNumElements();
5433    MVT ValueSVT = ValueVT.getVectorElementType();
5434
5435    bool IsFP = ValueSVT.isFloatingPoint();
5436    unsigned EltSize = ValueSVT.getSizeInBits();
5437
5438    unsigned Opc = 0;
5439    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
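    // AVX512 gathers take a k-register mask and merge into the pass-through
    // destination; the AVX2 forms instead take the mask as a vector register
    // operand, which is reflected in the operand lists built below.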
5440    if (AVX512Gather) {
5441      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
5442        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
5443      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
5444        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
5445      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
5446        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
5447      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
5448        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
5449      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
5450        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
5451      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
5452        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
5453      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
5454        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
5455      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
5456        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
5457      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
5458        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
5459      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
5460        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
5461      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
5462        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
5463      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
5464        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
5465    } else {
5466      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
5467             "Unexpected mask VT!");
5468      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
5469        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
5470      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
5471        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
5472      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
5473        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
5474      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
5475        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
5476      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
5477        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
5478      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
5479        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
5480      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
5481        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
5482      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
5483        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
5484    }
5485
5486    if (!Opc)
5487      break;
5488
5489    SDValue Base, Scale, Index, Disp, Segment;
5490    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
5491                          Base, Scale, Index, Disp, Segment))
5492      break;
5493
5494    SDValue PassThru = Mgt->getPassThru();
5495    SDValue Chain = Mgt->getChain();
    // The gather machine instructions define a mask result that the ISD node
    // does not have, so add a MaskVT result to the machine node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
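    // AVX512 and AVX/AVX2 gathers expect their operands in different orders:
    // the AVX512 forms take the mask right after the pass-through value, while
    // the AVX/AVX2 forms take it after the memory operands.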
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base,    Scale,
                       Index,    Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base,    Scale, Index,
                       Disp,     Segment, Mask,  Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
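    // Map the ISD results onto the machine node: result 0 is the gathered
    // value, and the chain (ISD result 1) comes after the machine node's mask
    // result, so it maps to result 2.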
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We
    // otherwise only do loose type checking here, based on what a type
    // constraint would say, just like table-based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
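    // Scatters only exist as AVX512 (EVEX) instructions, so the opcode is
    // chosen solely from the index type, element count, element width, and
    // whether the data is FP or integer.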
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // The scatter machine instructions define a mask result that the ISD node
    // does not have, so add it to the machine node's result list.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
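    // The ISD scatter only produces a chain; it maps to machine node result 1,
    // skipping over the mask result added above.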
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_SETUP: {
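    // Replace the IR call-site value operand with the function-local
    // preallocated id and emit the PREALLOCATED_SETUP pseudo instruction.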
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_ARG: {
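    // As above, but the PREALLOCATED_ARG pseudo also carries the argument
    // index and produces a pointer to that preallocated argument slot.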
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    SDValue ArgIndex = Node->getOperand(2);
    SDValue Ops[] = {CallIdValue, ArgIndex, Chain};
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_ARG, dl,
        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
                          MVT::Other),
        Ops);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                             std::vector<SDValue> &OutOps) {
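  // Every supported constraint is selected as a full 5-operand x86 memory
  // reference (base, scale, index, displacement, segment). Per the
  // SelectionDAGISel convention, returning true reports failure.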
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::Constraint_o: // offsetable        ??
  case InlineAsm::Constraint_v: // not offsetable    ??
  case InlineAsm::Constraint_m: // memory
  case InlineAsm::Constraint_X:
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOpt::Level OptLevel) {
  return new X86DAGToDAGISel(TM, OptLevel);
}