X86ISelDAGToDAG.cpp revision 363496
//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized DAG to an X86 DAG.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <stdint.h>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);
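// A sketch of the idea behind this option: if some bits of the other AND
// operand are already known to be zero, those bits may be set in the mask
// without changing the result, e.g. turning the mask 0xFFFFFF00 into
// 0xFFFFFFFF (-1) when the low byte is known zero, which encodes as a
// sign-extended 8-bit immediate instead of a full 32-bit one.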

//===----------------------------------------------------------------------===//
//                      Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// This corresponds to X86AddressMode, but uses SDValues instead of register
  /// numbers for the leaves of the matched tree.
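  /// An x86 memory operand has the general form
  ///   Segment : [Base + Scale * Index + Disp]
  /// so, e.g., "leaq 16(%rdi,%rcx,4), %rax" matches with Base = %rdi,
  /// Scale = 4, Index = %rcx, and Disp = 16.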
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex;

    unsigned Scale;
    SDValue IndexReg;
    int32_t Disp;
    SDValue Segment;
    const GlobalValue *GV;
    const Constant *CP;
    const BlockAddress *BlockAddr;
    const char *ES;
    MCSymbol *MCSym;
    int JT;
    unsigned Align;    // CP alignment.
    unsigned char SymbolFlags;  // X86II::MO_*
    bool NegateIndex = false;

    X86ISelAddressMode()
        : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
          Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
          MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}

    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Align << '\n';
    }
#endif
  };
}

namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for code size instead of
    /// performance.
    bool OptForSize;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    StringRef getPassName() const override {
      return "X86 DAG->DAG Instruction Selection";
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForSize = MF.getFunction().hasOptSize();
      OptForMinSize = MF.getFunction().hasMinSize();
      assert((!OptForMinSize || OptForSize) &&
             "OptForMinSize implies OptForSize");

      SelectionDAGISel::runOnMachineFunction(MF);
      return true;
    }

    void EmitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
                          SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
                            SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
                             SDValue &Base, SDValue &Scale,
                             SDValue &Index, SDValue &Disp,
                             SDValue &Segment,
                             SDValue &NodeWithChain);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(0, VT);

      Scale = getI8Imm(AM.Scale, DL);

      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                     AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(0, VT);

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                             AM.Align, AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "Unexpected symbol flags with MCSym.");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i16);
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size. At a high level,
    // we'd like to avoid such instructions when the same constant is used
    // several times within the same basic block and could be kept in a
    // register instead, as in the example below.
    //
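    // Hypothetical example (illustrative sizes): with the constant hoisted,
    //   movl $12345678, %eax
    //   movl %eax, (%rdi)
    //   movl %eax, (%rsi)
    // is smaller than materializing the 4-byte immediate in both stores:
    //   movl $12345678, (%rdi)
    //   movl $12345678, (%rsi)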
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!CurDAG->shouldOptForSize())
        return false;

      // Walk all the users of the immediate.
      for (SDNode::use_iterator UI = N->use_begin(),
           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {

        SDNode *User = *UI;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }
        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above). Those instructions won't
        // match in ISel for now, and would be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this can match to INC/DEC, don't count it as a use.
        if (User->getOpcode() == ISD::ADD &&
            (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD    ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than one use, recommend hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
    }

    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(1);
      MVT VecVT = N->getOperand(0).getSimpleValueType();
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    // Helper to detect unneeded AND instructions on shift amounts. Called
    // from PatFrags in tablegen.
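    // For example (illustrative): in (shl X, (and Y, 31)) with a 32-bit
    // shift, the hardware already masks the shift amount to 5 bits, so the
    // AND contributes nothing and the shift can be selected without it.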
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();

      if (Val.countTrailingOnes() >= Width)
        return true;

      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
      return Mask.countTrailingOnes() >= Width;
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    /// Returns whether this is a relocatable immediate in the range
    /// [-2^Width .. 2^Width-1].
    template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
      if (auto *CN = dyn_cast<ConstantSDNode>(N))
        return isInt<Width>(CN->getSExtValue());
      return isSExtAbsoluteSymbolRef(Width, N);
    }

    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      if (N->getAlignment() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false;
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }

    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool combineIncDecVector(SDNode *Node);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InFlag);

    bool tryOptimizeRem8Extend(SDNode *N);

    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
  };
}


// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM ||
      Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE ||
      Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8-element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // zero extended.
    EVT OpVT = N->getOperand(0).getValueType();
    // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
    // second operand.
    if (Opcode == X86ISD::STRICT_CMPM)
      OpVT = N->getOperand(1).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

    return true;
  }
  // Scalar opcodes use 128-bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
    return true;

  return false;
}

// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
}

bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOpt::None) return false;

  if (!N.hasOneUse())
    return false;

  // FIXME: Temporary hack to prevent strict floating point nodes from
  // folding into masked operations illegally.
  if (U == Root && Root->getOpcode() == ISD::VSELECT &&
      N.getOpcode() != ISD::LOAD && N.getOpcode() != X86ISD::VBROADCAST_LOAD)
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::ADDCARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In the case where the increment is 1,
      // the saving can be 4 bytes (by using incl %eax).
      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller AND over folding the load. This is needed
        // to make sure immediates created by shrinkAndImmediate are always
        // folded. Ideally we would narrow the load during DAG combine and get
        // the best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;

        // If this is really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB can negate the immediate and use the opposite operation
        // to fit 128 into a sign-extended 8-bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl    %gs:0, %eax
      // leal    i@NTPOFF(%eax), %eax
      // instead of
      // movl    $i@NTPOFF, %eax
      // addl    %gs:0, %eax
      // If the block also has an access to a second TLS address, this will
      // save a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))
          return false;

        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
    return false;

  return true;
}

/// Replace the original chain operand of the call with the load's chain
/// operand and move the load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}

/// Return true if the call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain; this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }

  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}

void X86DAGToDAGISel::PreprocessISelDAG() {
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }

    switch (N->getOpcode()) {
    case ISD::FP_ROUND:
    case ISD::STRICT_FP_ROUND:
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::STRICT_FP_TO_SINT:
    case ISD::STRICT_FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
      case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
      case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
      case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
      case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
      case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
      }
      SDValue Res;
      if (N->isStrictFPOpcode())
        Res =
            CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
                            {N->getOperand(0), N->getOperand(1)});
      else
        Res =
            CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                            N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(0).isVector() ||
          N->getOperand(0).getScalarValueSizeInBits() == 1)
        break;

      unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                            ? ISD::ZERO_EXTEND
                            : ISD::ZERO_EXTEND_VECTOR_INREG;

      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::FCEIL:
    case ISD::STRICT_FCEIL:
    case ISD::FFLOOR:
    case ISD::STRICT_FFLOOR:
    case ISD::FTRUNC:
    case ISD::STRICT_FTRUNC:
    case ISD::FNEARBYINT:
    case ISD::STRICT_FNEARBYINT:
    case ISD::FRINT:
    case ISD::STRICT_FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::STRICT_FCEIL:
      case ISD::FCEIL:      Imm = 0xA; break;
      case ISD::STRICT_FFLOOR:
      case ISD::FFLOOR:     Imm = 0x9; break;
      case ISD::STRICT_FTRUNC:
      case ISD::FTRUNC:     Imm = 0xB; break;
      case ISD::STRICT_FNEARBYINT:
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::STRICT_FRINT:
      case ISD::FRINT:      Imm = 0x4; break;
      }
      SDLoc dl(N);
      bool IsStrict = N->isStrictFPOpcode();
      SDValue Res;
      if (IsStrict)
        Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
                              {N->getValueType(0), MVT::Other},
                              {N->getOperand(0), N->getOperand(1),
                               CurDAG->getTargetConstant(Imm, dl, MVT::i8)});
      else
        Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
                              N->getOperand(0),
                              CurDAG->getTargetConstant(Imm, dl, MVT::i8));
      --I;
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine?
      MVT VT = N->getSimpleValueType(0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(0));
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND:  Opc = ISD::AND;      break;
        case X86ISD::FOR:   Opc = ISD::OR;       break;
        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
        }
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
      } else {
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
      }
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
                            CurDAG->getIntPtrConstant(0, dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    }

    if (OptLevel != CodeGenOpt::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useIndirectThunkCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving the call address load from outside callseq_start to
      /// just before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load  = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
        continue;
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
      ++NumLoadMoved;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be a store
    // and a load to/from the stack. This is a gross hack. We would like to
    // simply mark these as being illegal, but when we do that, legalize
    // produces these when it expands calls, then expands these in the same
    // legalize pass. We would like dag combine to be able to hack on these
    // between the call expansion and the node legalization. As such this pass
    // basically does "really late" legalization of these inline with the
    // X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(1))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore.  SSE can fold direct loads into other
      // operations.  Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
                                          MemTmp, MachinePointerInfo(), MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                          MachinePointerInfo(), MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the dag because
      // anything below the conversion could be folded into other existing
      // nodes. To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
      break;
    }

    // The sequence of events for lowering STRICT_FP versions of these nodes
    // requires dealing with the chain differently, as there is already a
    // preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(2))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore.  SSE can fold direct loads into other
      // operations.  Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      // Since the operation is StrictFP, use the preexisting chain.
      SDValue Store, Result;
      if (!SrcIsSSE) {
        SDVTList VTs = CurDAG->getVTList(MVT::Other);
        SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
        Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
                                            MachinePointerInfo(), 0,
                                            MachineMemOperand::MOStore);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Store->getFlags();
          Flags.setNoFPExcept(true);
          Store->setFlags(Flags);
        }
      } else {
        assert(SrcVT == MemVT && "Unexpected VT!");
        Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
                                 MachinePointerInfo());
      }

      if (!DstIsSSE) {
        SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
        SDValue Ops[] = {Store, MemTmp};
        Result = CurDAG->getMemIntrinsicNode(X86ISD::FLD, dl, VTs, Ops, MemVT,
                                             MachinePointerInfo(), 0,
                                             MachineMemOperand::MOLoad);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Result->getFlags();
          Flags.setNoFPExcept(true);
          Result->setFlags(Flags);
        }
      } else {
        assert(DstVT == MemVT && "Unexpected VT!");
        Result =
            CurDAG->getLoad(DstVT, dl, Store, MemTmp, MachinePointerInfo());
      }

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the dag because
      // anything below the conversion could be folded into other existing
      // nodes. To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
      break;
    }
    }


    // Now that we did that, the node is dead.  Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    CurDAG->DeleteNode(N);
  }

  // The load+call transform above can leave some dead nodes in the graph.
  // Make sure we remove them. It's possible some of the other transforms do
  // too, so just remove dead nodes unconditionally.
  CurDAG->RemoveDeadNodes();
}

// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
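// For example (illustrative): an i8 division leaves the remainder in AH,
// which is extracted with a NOREX movzx/movsx; a second extend of that
// already-extended value is redundant and can be dropped.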
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the low byte of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits, we still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
                                                   MVT::i64, N00);
    ReplaceUses(N, Extend);
  } else {
    // OK, we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
  }

  return true;
}

void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOpt::None)
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (tryOptimizeRem8Extend(N)) {
      MadeChange = true;
      continue;
    }

    // Look for a TESTrr+ANDrr pattern where both operands of the test are
    // the same. Rewrite to remove the AND.
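    // For example (illustrative):
    //   andl %esi, %edi
    //   testl %edi, %edi
    // can become a single "testl %esi, %edi" when the AND has no other
    // users, since TEST sets the same flags AND would.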
    unsigned Opc = N->getMachineOpcode();
    if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
         Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode()) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
          N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
        MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
                                                     MVT::i32,
                                                     And.getOperand(0),
                                                     And.getOperand(1));
        ReplaceUses(N, Test);
        MadeChange = true;
        continue;
      }
      if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
          N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
        unsigned NewOpc;
        switch (N0Opc) {
        case X86::AND8rm:  NewOpc = X86::TEST8mr; break;
        case X86::AND16rm: NewOpc = X86::TEST16mr; break;
        case X86::AND32rm: NewOpc = X86::TEST32mr; break;
        case X86::AND64rm: NewOpc = X86::TEST64mr; break;
        }

        // Need to swap the memory and register operands.
        SDValue Ops[] = { And.getOperand(1),
                          And.getOperand(2),
                          And.getOperand(3),
                          And.getOperand(4),
                          And.getOperand(5),
                          And.getOperand(0),
                          And.getOperand(6)  /* Chain */ };
        MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                     MVT::i32, MVT::Other, Ops);
        ReplaceUses(N, Test);
        MadeChange = true;
        continue;
      }
    }

    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
    // used. We're doing this late so we can prefer to fold the AND into masked
    // comparisons. Doing that can be better for the live range of the mask
    // register.
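    // For example (illustrative): "kandw %k1, %k2, %k0" followed by
    // "kortestw %k0, %k0" can become "ktestw %k1, %k2" when only ZF is
    // consumed.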
    if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
         Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode() &&
        onlyUsesZeroFlag(SDValue(N, 0))) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
      // KAND instructions and KTEST use the same ISA feature.
      if (N0Opc == X86::KANDBrr ||
          (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
          N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
        unsigned NewOpc;
        switch (Opc) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
        case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
        case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
        case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
        }
        MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                      MVT::i32,
                                                      And.getOperand(0),
                                                      And.getOperand(1));
        ReplaceUses(N, KTest);
        MadeChange = true;
        continue;
      }
    }

    // Attempt to remove vector moves that were inserted to zero upper bits.
    if (Opc != TargetOpcode::SUBREG_TO_REG)
      continue;

    unsigned SubRegIdx = N->getConstantOperandVal(2);
    if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
      continue;

    SDValue Move = N->getOperand(1);
    if (!Move.isMachineOpcode())
      continue;

    // Make sure it's one of the move opcodes we recognize.
    switch (Move.getMachineOpcode()) {
    default:
      continue;
    case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
      break;
    }

    SDValue In = Move.getOperand(0);
    if (!In.isMachineOpcode() ||
        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
      continue;

    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
    // legacy-encoded instructions such as SHA, which do not zero the upper
    // bits of the register.
    uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
    if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
        (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
        (TSFlags & X86II::EncodingMask) != X86II::XOP)
      continue;

    // The producing instruction is another vector instruction, so we can
    // drop the move.
    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
    MadeChange = true;
  }
1370
1371  if (MadeChange)
1372    CurDAG->RemoveDeadNodes();
1373}
1374
1375
1376/// Emit any code that needs to be executed only in the main function.
1377void X86DAGToDAGISel::emitSpecialCodeForMain() {
1378  if (Subtarget->isTargetCygMing()) {
1379    TargetLowering::ArgListTy Args;
1380    auto &DL = CurDAG->getDataLayout();
1381
1382    TargetLowering::CallLoweringInfo CLI(*CurDAG);
1383    CLI.setChain(CurDAG->getRoot())
1384        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1385                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1386                   std::move(Args));
1387    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1388    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1389    CurDAG->setRoot(Result.second);
1390  }
1391}
1392
1393void X86DAGToDAGISel::EmitFunctionEntryCode() {
1394  // If this is main, emit special code for main.
1395  const Function &F = MF->getFunction();
1396  if (F.hasExternalLinkage() && F.getName() == "main")
1397    emitSpecialCodeForMain();
1398}
1399
1400static bool isDispSafeForFrameIndex(int64_t Val) {
1401  // On 64-bit platforms, we can run into an issue where a frame index
1402  // includes a displacement that, when added to the explicit displacement,
1403  // will overflow the displacement field. Assuming that the frame index
1404  // displacement fits into a 31-bit integer  (which is only slightly more
1405  // aggressive than the current fundamental assumption that it fits into
1406  // a 32-bit integer), a 31-bit disp should always be safe.
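  // For example, two offsets that each fit in a signed 31-bit integer always
  // sum to a value that still fits in the signed 32-bit displacement field.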
  return isInt<31>(Val);
}

bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // If there's no offset to fold, we don't need to do any work.
  if (Offset == 0)
    return false;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (AM.ES || AM.MCSym)
    return true;

  int64_t Val = AM.Disp + Offset;
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (!X86::isOffsetSuitableForCodeModel(Val, M,
                                           AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))
      return true;
  }
  AM.Disp = Val;
  return false;
}

bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address.
  // For more information see http://people.redhat.com/drepper/tls.pdf
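  // For example, a load from constant address 0 in address space 257 can
  // select to "movq %fs:0, %rax" on x86-64 Linux, yielding the thread
  // pointer.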
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
        !IndirectTlsSegRefs &&
        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
         Subtarget->isTargetFuchsia()))
      switch (N->getPointerInfo().getAddrSpace()) {
      case 256:
        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
        return false;
      case 257:
        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
        return false;
      // Address space 258 is not handled here, because it is not used to
      // address TLS areas.
      }

  return true;
}

/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use an addressing mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() &&
      ((M == CodeModel::Large && !IsRIPRelTLS) ||
       (M == CodeModel::Medium && !IsRIPRel)))
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  int64_t Offset = 0;
  SDValue N0 = N.getOperand(0);
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
    AM.CP = CP->getConstVal();
    AM.Align = CP->getAlignment();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}

/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has a
  // smaller encoding and avoids a scaled index.
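  // For example, "leal (,%eax,2), %ecx" needs a SIB byte and, having no base
  // register, a 32-bit zero displacement, while "leal (%eax,%eax), %ecx"
  // computes the same value with a shorter encoding.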
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  // TODO: Which other code models can use this?
  switch (TM.getCodeModel()) {
    default: break;
    case CodeModel::Small:
    case CodeModel::Kernel:
      if (Subtarget->is64Bit() &&
          AM.Scale == 1 &&
          AM.BaseType == X86ISelAddressMode::RegBase &&
          AM.Base_Reg.getNode() == nullptr &&
          AM.IndexReg.getNode() == nullptr &&
          AM.SymbolFlags == X86II::MO_NO_FLAG &&
          AM.hasSymbolicDisplacement())
        AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
      break;
  }

  return false;
}

bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
    return false;
  AM = Backup;

  // Try again after commuting the operands.
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(0);
    AM.IndexReg = N.getOperand(1);
    AM.Scale = 1;
    return false;
  }
  N = Handle.getValue();
  return true;
}

// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
    // Mark Node as invalid for pruning, as after this it may be a successor
    // to a selected node but otherwise be in the same position as Pos.
    // Conservatively mark it with the same -abs(Id) to assure the node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N.getNode());
  }
}

// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and AND into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed (i.e. on success), true otherwise.
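// For example, with C1 = 2 this rewrites (X >> 6) & 0x3fc into
// ((X >> 8) & 0xff) << 2, i.e. an extract of bits [15:8] (an h-register on
// x86) used as an index with scale 4.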
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse())
    return true;

  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, Eight);
  insertDAGNode(DAG, N, Srl);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, And);
  insertDAGNode(DAG, N, ShlCount);
  insertDAGNode(DAG, N, Shl);
  DAG.ReplaceAllUsesWith(N, Shl);
  DAG.RemoveDeadNode(N.getNode());
  AM.IndexReg = And;
  AM.Scale = (1 << ScaleLog);
  return false;
}

// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
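// For example, (X << 3) & 0x1f8 becomes (X & 0x3f) << 3: the AND then uses a
// smaller immediate, and the left shift folds into the address as scale 8.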
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue Shift = N.getOperand(0);

  // Use a signed mask so that shifting right will insert sign bits. These
  // bits will be removed when we shift the result left so it doesn't matter
  // what we use. This might allow a smaller immediate encoding.
  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only do this if the AND doesn't use the
  // extended bits.
  // FIXME: Generalize this to other ANY_EXTENDs than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Mask)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  SDValue X = Shift.getOperand(0);

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // the isel mechanism requires their node IDs to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())
    return true;

  // Verify that the shift amount is something we can fold.
  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (FoundAnyExtend) {
    SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }

  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewShift);
  DAG.ReplaceAllUsesWith(N, NewShift);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}

// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
//   int f(short *y, int *lookup_table) {
//     ...
//     return *y + lookup_table[*y >> 11];
//   }
//
// Turning into:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $11, %ecx
//   addl (%rsi,%rcx,4), %eax
//
// Instead of:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $9, %ecx
//   andl $124, %ecx
//   addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    uint64_t Mask,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  unsigned MaskLZ = countLeadingZeros(Mask);
  unsigned MaskTZ = countTrailingZeros(Mask);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the trailing zeros of the mask.
  unsigned AMShiftAmt = MaskTZ;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;

  // We also need to ensure that the mask is a contiguous run of bits.
  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
    return true;
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;
  }
  APInt MaskedHighBits =
    APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
  KnownBits Known = DAG.computeKnownBits(X);
  if (MaskedHighBits != Known.Zero) return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewSRL;
  return false;
}

// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
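// For example, with SHIFT = 4, MASK = 0xff and C1 = 2:
//   (X >> 4) & 0x3fc  -->  ((X >> 6) & 0xff) << 2
// where the inner SRL+AND can later match BEXTR and the SHL becomes scale 4.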
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   uint64_t Mask,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse() || !N.hasOneUse())
    return true;

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  if (!Subtarget.hasTBM() &&
      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
    return true;

  // We need to ensure that the mask is a contiguous run of bits.
  if (!isShiftedMask_64(Mask)) return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the trailing zeros of the mask.
  unsigned AMShiftAmt = countTrailingZeros(Mask);

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}

bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  SDLoc dl(N);
  LLVM_DEBUG({
    dbgs() << "MatchAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth > 5)
    return matchAddressBase(N, AM);

  // If this is already a %rip relative address, we can only merge immediates
  // into it.  Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol addresses currently don't like
    // displacements.  It isn't very important, but this should be fixed for
    // consistency.
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
      return true;

    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
        return false;
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::LOCAL_RECOVER: {
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
        // Use the symbol and don't prefix it.
        AM.MCSym = ESNode->getMCSymbol();
        return false;
      }
    break;
  }
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))
      return false;
    break;
  }

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!matchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
      return false;
    break;

  case ISD::FrameIndex:
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        AM.Scale = 1 << Val;
        SDValue ShVal = N.getOperand(0);

        // Okay, we know that we have a scale by now.  However, if the scaled
        // value is an add of something and a constant, we can fold the
        // constant into the disp field here.
        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
          AM.IndexReg = ShVal.getOperand(0);
          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
          if (!foldOffsetIntoAddress(Disp, AM))
            return false;
        }

        AM.IndexReg = ShVal;
        return false;
      }
    }
    break;

  case ISD::SRL: {
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    SDValue And = N.getOperand(0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(0);

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first, so just apply the shift to the mask before
    // passing it down.
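    // For example, for (srl (and X, 0xff0), 4) the post-shift mask is
    // 0xff0 >> 4 = 0xff.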
    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
        !isa<ConstantSDNode>(And.getOperand(1)))
      break;
    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);

    // Try to fold the mask and shift into the scale, and return false if we
    // succeed.
    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
      return false;
    break;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    LLVM_FALLTHROUGH;
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
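    // e.g. x*5 can select to "leaq (%rax,%rax,4), %rcx" (registers
    // illustrative), using X as both the base and the scaled index.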
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(0);
          SDValue Reg;

          // Okay, we know that we have a scale by now.  However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(MulVal.getOperand(1))) {
            Reg = MulVal.getOperand(0);
            ConstantSDNode *AddVal =
              cast<ConstantSDNode>(MulVal.getOperand(1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            if (foldOffsetIntoAddress(Disp, AM))
              Reg = N.getOperand(0);
          } else {
            Reg = N.getOperand(0);
          }

          AM.IndexReg = AM.Base_Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address while leaving
    // the index field unused, use -B as the index. This is a win if A has
    // multiple parts that can be folded into the address. It also saves a mov
    // if the base register has other uses, since it avoids a two-address sub
    // instruction; however, it costs an additional mov if the index register
    // has other uses.
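    // For example (illustrative), "GlobalSym - idx" can become an LEA whose
    // displacement is GlobalSym and whose index is -idx (the negation is
    // emitted later), instead of a two-address subtraction.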

    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
      N = Handle.getValue();
      AM = Backup;
      break;
    }
    N = Handle.getValue();
    // Test if the index field is free for use.
    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
      AM = Backup;
      break;
    }

    int Cost = 0;
    SDValue RHS = N.getOperand(1);
    // If the RHS involves a register with multiple uses, this
    // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
    if (!RHS.getNode()->hasOneUse() ||
        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
         RHS.getOperand(0).getValueType() == MVT::i32))
      ++Cost;
    // If the base is a register with multiple uses, this
    // transformation may save a mov.
    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
         !AM.Base_Reg.getNode()->hasOneUse()) ||
        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      --Cost;
    // If the folded LHS was interesting, this transformation saves
    // address arithmetic.
    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
        ((AM.Disp != 0) && (Backup.Disp == 0)) +
        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
      --Cost;
    // If it doesn't look like it may be an overall win, don't do it.
    if (Cost >= 0) {
      AM = Backup;
      break;
    }

    // Ok, the transformation is legal and appears profitable. Go for it.
    // Negation will be emitted later to avoid creating dangling nodes if this
    // was an unprofitable LEA.
    AM.IndexReg = RHS;
    AM.NegateIndex = true;
    AM.Scale = 1;
    return false;
  }

  case ISD::ADD:
    if (!matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::OR:
    // We want to look through a transform in InstCombine and DAGCombiner that
    // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
    // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
    // An 'lea' can then be used to match the shift (multiply) and add:
    // and $1, %esi
    // lea (%rsi, %rdi, 8), %rax
    if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
        !matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::AND: {
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    if (!isa<ConstantSDNode>(N.getOperand(1)))
      break;

    if (N.getOperand(0).getOpcode() == ISD::SRL) {
      SDValue Shift = N.getOperand(0);
      SDValue X = Shift.getOperand(0);

      uint64_t Mask = N.getConstantOperandVal(1);

      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
        return false;
    }

    // Try to swap the mask and shift to place shifts which can be done as
    // a scale on the outside of the mask.
    if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
      return false;

    break;
  }
  case ISD::ZERO_EXTEND: {
    // Try to widen a zexted shift left to the same size as its use, so we can
    // match the shift as a scale factor.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;
    if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
      break;

    // Give up if the shift is not a valid scale factor [1,2,3].
    SDValue Shl = N.getOperand(0);
    auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
    if (!ShAmtC || ShAmtC->getZExtValue() > 3)
      break;

    // The narrow shift must only shift out zero bits (it must be 'nuw').
    // That makes it safe to widen to the destination type.
    APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
                                            ShAmtC->getZExtValue());
    if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
      break;

    // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
    MVT VT = N.getSimpleValueType();
    SDLoc DL(N);
    SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
    SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));

    // Convert the shift to a scale factor.
    AM.Scale = 1 << ShAmtC->getZExtValue();
    AM.IndexReg = Zext;

    insertDAGNode(*CurDAG, N, Zext);
    insertDAGNode(*CurDAG, N, NewShl);
    CurDAG->ReplaceAllUsesWith(N, NewShl);
    CurDAG->RemoveDeadNode(N.getNode());
    return false;
  }
  }

  return matchAddressBase(N, AM);
}

/// Helper for MatchAddress. Add the specified node to the
/// specified addressing mode without any further recursion.
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
  // Is the base register already occupied?
  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
    // If so, check to see if the scale index register is set.
    if (!AM.IndexReg.getNode()) {
      AM.IndexReg = N;
      AM.Scale = 1;
      return false;
    }

    // Otherwise, we cannot select it.
    return true;
  }

  // Default, generate it as a register.
  AM.BaseType = X86ISelAddressMode::RegBase;
  AM.Base_Reg = N;
  return false;
}

/// Helper for selectVectorAddr. Handles things that can be folded into a
/// gather/scatter address. The index register and scale should have already
/// been handled.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  // TODO: Support other operations.
  switch (N.getOpcode()) {
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))
      return false;
    break;
  }
  case X86ISD::Wrapper:
    if (!matchWrapper(N, AM))
      return false;
    break;
  }

  return matchAddressBase(N, AM);
}

bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                       SDValue &Scale, SDValue &Index,
                                       SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;
  auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
  AM.IndexReg = Mgs->getIndex();
  AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();

  unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
  if (AddrSpace == X86AS::GS)
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
  if (AddrSpace == X86AS::FS)
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
  if (AddrSpace == X86AS::SS)
    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);

  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Try to match into the base and displacement fields.
  if (matchVectorAddress(N, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}

/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
///
/// Parent is the parent node of the addr operand that is being matched.  It
/// is always a load, store, atomic node, or null.  It is only null when
/// checking memory operands for inline asm nodes.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                 SDValue &Scale, SDValue &Index,
                                 SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;

  if (Parent &&
      // These opcodes are all the nodes that have an "addr:$ptr" operand but
      // are not MemSDNodes, and thus don't have proper addrspace info.
      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
    unsigned AddrSpace =
      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
    // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
    if (AddrSpace == 256)
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
    if (AddrSpace == 257)
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
    if (AddrSpace == 258)
      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
  }

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  if (matchAddress(N, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}

// We can only fold a load if all nodes between it and the root node have a
// single use. If there are additional uses, we could end up duplicating the
// load.
static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
  while (User != Root) {
    if (!User->hasOneUse())
      return false;
    User = *User->use_begin();
  }

  return true;
}

/// Match a scalar SSE load. In particular, we want to match a load whose top
/// elements are either undef or zeros. The load flavor is derived from the
/// type of N, which is either v4f32 or v2f64.
///
/// We also return:
///   PatternNodeWithChain: this is the matched node that has a chain input
///   and output.
bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
                                          SDValue N, SDValue &Base,
                                          SDValue &Scale, SDValue &Index,
                                          SDValue &Disp, SDValue &Segment,
                                          SDValue &PatternNodeWithChain) {
  if (!hasSingleUsesFromRoot(Root, Parent))
    return false;

  // We can allow a full vector load here since narrowing a load is ok unless
  // it's volatile or atomic.
  if (ISD::isNON_EXTLoad(N.getNode())) {
    LoadSDNode *LD = cast<LoadSDNode>(N);
    if (LD->isSimple() &&
        IsProfitableToFold(N, LD, Root) &&
        IsLegalToFold(N, Parent, Root, OptLevel)) {
      PatternNodeWithChain = N;
      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  // We can also match the special zero extended load opcode.
  if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
    PatternNodeWithChain = N;
    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
        IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
      auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
      return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
  // once. Otherwise the load might get duplicated and the chain output of the
  // duplicate load will not be observed by all dependencies.
  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
    PatternNodeWithChain = N.getOperand(0);
    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  return false;
}


bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    uint64_t ImmVal = CN->getZExtValue();
    if (!isUInt<32>(ImmVal))
      return false;

    Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
    return true;
  }

  // In static codegen with small code model, we can get the address of a
  // label into a register with 'movl'.
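  // For example (illustrative): "movl $foo, %eax" materializes a 32-bit
  // absolute address, and the 32-bit write implicitly zeroes the upper half
  // of %rax.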
2436  if (N->getOpcode() != X86ISD::Wrapper)
2437    return false;
2438
2439  N = N.getOperand(0);
2440
2441  // At least GNU as does not accept 'movl' for TPOFF relocations.
2442  // FIXME: We could use 'movl' when we know we are targeting MC.
2443  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2444    return false;
2445
2446  Imm = N;
2447  if (N->getOpcode() != ISD::TargetGlobalAddress)
2448    return TM.getCodeModel() == CodeModel::Small;
2449
2450  Optional<ConstantRange> CR =
2451      cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2452  if (!CR)
2453    return TM.getCodeModel() == CodeModel::Small;
2454
2455  return CR->getUnsignedMax().ult(1ull << 32);
2456}
2457
2458bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2459                                         SDValue &Scale, SDValue &Index,
2460                                         SDValue &Disp, SDValue &Segment) {
2461  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2462  SDLoc DL(N);
2463
2464  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2465    return false;
2466
2467  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2468  if (RN && RN->getReg() == 0)
2469    Base = CurDAG->getRegister(0, MVT::i64);
2470  else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2471    // Base could already be %rip, particularly in the x32 ABI.
2472    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2473                                                     MVT::i64), 0);
2474    Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2475                                         Base);
2476  }
2477
2478  RN = dyn_cast<RegisterSDNode>(Index);
2479  if (RN && RN->getReg() == 0)
2480    Index = CurDAG->getRegister(0, MVT::i64);
2481  else {
2482    assert(Index.getValueType() == MVT::i32 &&
2483           "Expect to be extending 32-bit registers for use in LEA");
2484    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2485                                                     MVT::i64), 0);
2486    Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2487                                          Index);
2488  }
2489
2490  return true;
2491}
2492
2493/// Calls SelectAddr and determines if the maximal addressing
2494/// mode it matches can be cost effectively emitted as an LEA instruction.
2495bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2496                                    SDValue &Base, SDValue &Scale,
2497                                    SDValue &Index, SDValue &Disp,
2498                                    SDValue &Segment) {
2499  X86ISelAddressMode AM;
2500
2501  // Save the DL and VT before calling matchAddress, it can invalidate N.
2502  SDLoc DL(N);
2503  MVT VT = N.getSimpleValueType();
2504
2505  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2506  // segments.
2507  SDValue Copy = AM.Segment;
2508  SDValue T = CurDAG->getRegister(0, MVT::i32);
2509  AM.Segment = T;
2510  if (matchAddress(N, AM))
2511    return false;
2512  assert (T == AM.Segment);
2513  AM.Segment = Copy;
2514
2515  unsigned Complexity = 0;
2516  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
2517    Complexity = 1;
2518  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2519    Complexity = 4;
2520
2521  if (AM.IndexReg.getNode())
2522    Complexity++;
2523
2524  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
2525  // a simple shift.
2526  if (AM.Scale > 1)
2527    Complexity++;
2528
2529  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2530  // to a LEA. This is determined with some experimentation but is by no means
2531  // optimal (especially for code size consideration). LEA is nice because of
2532  // its three-address nature. Tweak the cost function again when we can run
2533  // convertToThreeAddress() at register allocation time.
2534  if (AM.hasSymbolicDisplacement()) {
2535    // For X86-64, always use LEA to materialize RIP-relative addresses.
2536    if (Subtarget->is64Bit())
2537      Complexity = 4;
2538    else
2539      Complexity += 2;
2540  }
2541
2542  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
2543  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
2544  // duplicating flag-producing instructions later in the pipeline.
2545  if (N.getOpcode() == ISD::ADD) {
2546    auto isMathWithFlags = [](SDValue V) {
2547      switch (V.getOpcode()) {
2548      case X86ISD::ADD:
2549      case X86ISD::SUB:
2550      case X86ISD::ADC:
2551      case X86ISD::SBB:
2552      /* TODO: These opcodes can be added safely, but we may want to justify
2553               their inclusion for different reasons (better for reg-alloc).
2554      case X86ISD::SMUL:
2555      case X86ISD::UMUL:
2556      case X86ISD::OR:
2557      case X86ISD::XOR:
2558      case X86ISD::AND:
2559      */
2560        // Value 1 is the flag output of the node - verify it's not dead.
2561        return !SDValue(V.getNode(), 1).use_empty();
2562      default:
2563        return false;
2564      }
2565    };
2566    // TODO: This could be an 'or' rather than 'and' to make the transform more
2567    //       likely to happen. We might want to factor in whether there's a
2568    //       load folding opportunity for the math op that disappears with LEA.
2569    if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
2570      Complexity++;
2571  }
2572
2573  if (AM.Disp)
2574    Complexity++;
2575
2576  // If it isn't worth using an LEA, reject it.
2577  if (Complexity <= 2)
2578    return false;
2579
2580  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2581  return true;
2582}
2583
2584/// This is only run on TargetGlobalTLSAddress nodes.
2585bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2586                                        SDValue &Scale, SDValue &Index,
2587                                        SDValue &Disp, SDValue &Segment) {
2588  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2589  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2590
2591  X86ISelAddressMode AM;
2592  AM.GV = GA->getGlobal();
2593  AM.Disp += GA->getOffset();
2594  AM.SymbolFlags = GA->getTargetFlags();
2595
2596  MVT VT = N.getSimpleValueType();
2597  if (VT == MVT::i32) {
2598    AM.Scale = 1;
2599    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2600  }
2601
2602  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
2603  return true;
2604}
2605
2606bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2607  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
2608    Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
2609                                   N.getValueType());
2610    return true;
2611  }
2612
2613  // Keep track of the original value type and whether this value was
2614  // truncated. If we see a truncation from pointer type to VT that truncates
2615  // bits that are known to be zero, we can use a narrow reference.
2616  EVT VT = N.getValueType();
2617  bool WasTruncated = false;
2618  if (N.getOpcode() == ISD::TRUNCATE) {
2619    WasTruncated = true;
2620    N = N.getOperand(0);
2621  }
2622
2623  if (N.getOpcode() != X86ISD::Wrapper)
2624    return false;
2625
2626  // We can only use non-GlobalValues as immediates if they were not truncated,
2627  // as we do not have any range information. If we have a GlobalValue and the
2628  // address was not truncated, we can select it as an operand directly.
2629  unsigned Opc = N.getOperand(0)->getOpcode();
2630  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
2631    Op = N.getOperand(0);
2632    // We can only select the operand directly if we didn't have to look past a
2633    // truncate.
2634    return !WasTruncated;
2635  }
2636
2637  // Check that the global's range fits into VT.
2638  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2639  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2640  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
2641    return false;
2642
2643  // Okay, we can use a narrow reference.
2644  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2645                                      GA->getOffset(), GA->getTargetFlags());
2646  return true;
2647}
2648
2649bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2650                                  SDValue &Base, SDValue &Scale,
2651                                  SDValue &Index, SDValue &Disp,
2652                                  SDValue &Segment) {
2653  assert(Root && P && "Unknown root/parent nodes");
2654  if (!ISD::isNON_EXTLoad(N.getNode()) ||
2655      !IsProfitableToFold(N, P, Root) ||
2656      !IsLegalToFold(N, P, Root, OptLevel))
2657    return false;
2658
2659  return selectAddr(N.getNode(),
2660                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
2661}
2662
2663bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
2664                                       SDValue &Base, SDValue &Scale,
2665                                       SDValue &Index, SDValue &Disp,
2666                                       SDValue &Segment) {
2667  assert(Root && P && "Unknown root/parent nodes");
2668  if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
2669      !IsProfitableToFold(N, P, Root) ||
2670      !IsLegalToFold(N, P, Root, OptLevel))
2671    return false;
2672
2673  return selectAddr(N.getNode(),
2674                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
2675}
2676
2677/// Return an SDNode that returns the value of the global base register.
2678/// Output instructions required to initialize the global base register,
2679/// if necessary.
2680SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2681  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2682  auto &DL = MF->getDataLayout();
2683  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2684}
2685
2686bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2687  if (N->getOpcode() == ISD::TRUNCATE)
2688    N = N->getOperand(0).getNode();
2689  if (N->getOpcode() != X86ISD::Wrapper)
2690    return false;
2691
2692  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2693  if (!GA)
2694    return false;
2695
2696  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2697  return CR && CR->getSignedMin().sge(-1ull << Width) &&
2698         CR->getSignedMax().slt(1ull << Width);
2699}
2700
2701static X86::CondCode getCondFromNode(SDNode *N) {
2702  assert(N->isMachineOpcode() && "Unexpected node");
2703  X86::CondCode CC = X86::COND_INVALID;
2704  unsigned Opc = N->getMachineOpcode();
2705  if (Opc == X86::JCC_1)
2706    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
2707  else if (Opc == X86::SETCCr)
2708    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
2709  else if (Opc == X86::SETCCm)
2710    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
2711  else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
2712           Opc == X86::CMOV64rr)
2713    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
2714  else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
2715           Opc == X86::CMOV64rm)
2716    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
2717
2718  return CC;
2719}
2720
2721/// Test whether the given X86ISD::CMP node has any users that use a flag
2722/// other than ZF.
2723bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2724  // Examine each user of the node.
2725  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2726         UI != UE; ++UI) {
2727    // Only check things that use the flags.
2728    if (UI.getUse().getResNo() != Flags.getResNo())
2729      continue;
2730    // Only examine CopyToReg uses that copy to EFLAGS.
2731    if (UI->getOpcode() != ISD::CopyToReg ||
2732        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2733      return false;
2734    // Examine each user of the CopyToReg use.
2735    for (SDNode::use_iterator FlagUI = UI->use_begin(),
2736           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2737      // Only examine the Flag result.
2738      if (FlagUI.getUse().getResNo() != 1) continue;
2739      // Anything unusual: assume conservatively.
2740      if (!FlagUI->isMachineOpcode()) return false;
2741      // Examine the condition code of the user.
2742      X86::CondCode CC = getCondFromNode(*FlagUI);
2743
2744      switch (CC) {
2745      // Comparisons which only use the zero flag.
2746      case X86::COND_E: case X86::COND_NE:
2747        continue;
2748      // Anything else: assume conservatively.
2749      default:
2750        return false;
2751      }
2752    }
2753  }
2754  return true;
2755}
2756
2757/// Test whether the given X86ISD::CMP node has any uses which require the SF
2758/// flag to be accurate.
2759bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2760  // Examine each user of the node.
2761  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2762         UI != UE; ++UI) {
2763    // Only check things that use the flags.
2764    if (UI.getUse().getResNo() != Flags.getResNo())
2765      continue;
2766    // Only examine CopyToReg uses that copy to EFLAGS.
2767    if (UI->getOpcode() != ISD::CopyToReg ||
2768        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2769      return false;
2770    // Examine each user of the CopyToReg use.
2771    for (SDNode::use_iterator FlagUI = UI->use_begin(),
2772           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2773      // Only examine the Flag result.
2774      if (FlagUI.getUse().getResNo() != 1) continue;
2775      // Anything unusual: assume conservatively.
2776      if (!FlagUI->isMachineOpcode()) return false;
2777      // Examine the condition code of the user.
2778      X86::CondCode CC = getCondFromNode(*FlagUI);
2779
2780      switch (CC) {
2781      // Comparisons which don't examine the SF flag.
2782      case X86::COND_A: case X86::COND_AE:
2783      case X86::COND_B: case X86::COND_BE:
2784      case X86::COND_E: case X86::COND_NE:
2785      case X86::COND_O: case X86::COND_NO:
2786      case X86::COND_P: case X86::COND_NP:
2787        continue;
2788      // Anything else: assume conservatively.
2789      default:
2790        return false;
2791      }
2792    }
2793  }
2794  return true;
2795}
2796
2797static bool mayUseCarryFlag(X86::CondCode CC) {
2798  switch (CC) {
2799  // Comparisons which don't examine the CF flag.
2800  case X86::COND_O: case X86::COND_NO:
2801  case X86::COND_E: case X86::COND_NE:
2802  case X86::COND_S: case X86::COND_NS:
2803  case X86::COND_P: case X86::COND_NP:
2804  case X86::COND_L: case X86::COND_GE:
2805  case X86::COND_G: case X86::COND_LE:
2806    return false;
2807  // Anything else: assume conservatively.
2808  default:
2809    return true;
2810  }
2811}
2812
2813/// Test whether the given node which sets flags has any uses which require the
2814/// CF flag to be accurate.
2815bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2816  // Examine each user of the node.
2817  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2818         UI != UE; ++UI) {
2819    // Only check things that use the flags.
2820    if (UI.getUse().getResNo() != Flags.getResNo())
2821      continue;
2822
2823    unsigned UIOpc = UI->getOpcode();
2824
2825    if (UIOpc == ISD::CopyToReg) {
2826      // Only examine CopyToReg uses that copy to EFLAGS.
2827      if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2828        return false;
2829      // Examine each user of the CopyToReg use.
2830      for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2831           FlagUI != FlagUE; ++FlagUI) {
2832        // Only examine the Flag result.
2833        if (FlagUI.getUse().getResNo() != 1)
2834          continue;
2835        // Anything unusual: assume conservatively.
2836        if (!FlagUI->isMachineOpcode())
2837          return false;
2838        // Examine the condition code of the user.
2839        X86::CondCode CC = getCondFromNode(*FlagUI);
2840
2841        if (mayUseCarryFlag(CC))
2842          return false;
2843      }
2844
2845      // This CopyToReg is ok. Move on to the next user.
2846      continue;
2847    }
2848
2849    // This might be an unselected node. So look for the pre-isel opcodes that
2850    // use flags.
2851    unsigned CCOpNo;
2852    switch (UIOpc) {
2853    default:
2854      // Something unusual. Be conservative.
2855      return false;
2856    case X86ISD::SETCC:       CCOpNo = 0; break;
2857    case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
2858    case X86ISD::CMOV:        CCOpNo = 2; break;
2859    case X86ISD::BRCOND:      CCOpNo = 2; break;
2860    }
2861
2862    X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2863    if (mayUseCarryFlag(CC))
2864      return false;
2865  }
2866  return true;
2867}
2868
2869/// Check whether the chain ending in StoreNode is suitable for the
2870/// {load; op; store} read-modify-write transformation.
2871static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2872                                        SDValue StoredVal, SelectionDAG *CurDAG,
2873                                        unsigned LoadOpNo,
2874                                        LoadSDNode *&LoadNode,
2875                                        SDValue &InputChain) {
2876  // Is the stored value result 0 of the operation?
2877  if (StoredVal.getResNo() != 0) return false;
2878
2879  // Are there any uses of the operation other than the store?
2880  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
2881
2882  // Is the store non-extending, non-indexed, and not non-temporal?
2883  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2884    return false;
2885
2886  SDValue Load = StoredVal->getOperand(LoadOpNo);
2887  // Is the stored value a non-extending and non-indexed load?
2888  if (!ISD::isNormalLoad(Load.getNode())) return false;
2889
2890  // Return LoadNode by reference.
2891  LoadNode = cast<LoadSDNode>(Load);
2892
2893  // Is the store the only read of the loaded value?
2894  if (!Load.hasOneUse())
2895    return false;
2896
2897  // Is the address of the store the same as the load?
2898  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2899      LoadNode->getOffset() != StoreNode->getOffset())
2900    return false;
2901
2902  bool FoundLoad = false;
2903  SmallVector<SDValue, 4> ChainOps;
2904  SmallVector<const SDNode *, 4> LoopWorklist;
2905  SmallPtrSet<const SDNode *, 16> Visited;
2906  const unsigned int Max = 1024;
2907
2908  //  Visualization of Load-Op-Store fusion:
2909  // -------------------------
2910  // Legend:
2911  //    *-lines = Chain operand dependencies.
2912  //    |-lines = Normal operand dependencies.
2913  //    Dependencies flow down and right. n-suffix references multiple nodes.
2914  //
2915  //        C                        Xn  C
2916  //        *                         *  *
2917  //        *                          * *
2918  //  Xn  A-LD    Yn                    TF         Yn
2919  //   *    * \   |                       *        |
2920  //    *   *  \  |                        *       |
2921  //     *  *   \ |             =>       A--LD_OP_ST
2922  //      * *    \|                                 \
2923  //       TF    OP                                  \
2924  //         *   | \                                  Zn
2925  //          *  |  \
2926  //         A-ST    Zn
2927  //
2928
2929  // This merge induces dependencies from: #1: Xn -> LD, OP, Zn
2930  //                                      #2: Yn -> LD
2931  //                                      #3: ST -> Zn
2932
2933  // Ensure the transform is safe by checking for the dual
2934  // dependencies to make sure we do not induce a loop.
2935
2936  // As LD is a predecessor to both OP and ST we can do this by checking:
2937  //  a) if LD is a predecessor of a member of Xn or Yn.
2938  //  b) if a Zn is a predecessor of ST.
2939
2940  // However, (b) can only occur through being a chain predecessor to
2941  // ST, which is the same as Zn being a member or predecessor of Xn,
2942  // which is a subset of LD being a predecessor of Xn. So it's
2943  // subsumed by check (a).
2944
2945  SDValue Chain = StoreNode->getChain();
2946
2947  // Gather X elements in ChainOps.
2948  if (Chain == Load.getValue(1)) {
2949    FoundLoad = true;
2950    ChainOps.push_back(Load.getOperand(0));
2951  } else if (Chain.getOpcode() == ISD::TokenFactor) {
2952    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
2953      SDValue Op = Chain.getOperand(i);
2954      if (Op == Load.getValue(1)) {
2955        FoundLoad = true;
2956        // Drop Load, but keep its chain. No cycle check necessary.
2957        ChainOps.push_back(Load.getOperand(0));
2958        continue;
2959      }
2960      LoopWorklist.push_back(Op.getNode());
2961      ChainOps.push_back(Op);
2962    }
2963  }
2964
2965  if (!FoundLoad)
2966    return false;
2967
2968  // Worklist is currently Xn. Add Yn to worklist.
2969  for (SDValue Op : StoredVal->ops())
2970    if (Op.getNode() != LoadNode)
2971      LoopWorklist.push_back(Op.getNode());
2972
2973  // Check (a) if Load is a predecessor to Xn + Yn
2974  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
2975                                   true))
2976    return false;
2977
2978  InputChain =
2979      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
2980  return true;
2981}
2982
2983// Change a chain of {load; op; store} of the same value into a simple op
2984// through memory of that value, if the uses of the modified value and its
2985// address are suitable.
2986//
2987// The tablegen memory-operand pattern is currently not able to match the
2988// case where the EFLAGS of the original operation are used.
2989//
2990// To move this to tablegen, we'll need to improve tablegen to allow flags to
2991// be transferred from a node in the pattern to the result node, probably with
2992// a new keyword. For example, we have this
2993// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2994//  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2995//   (implicit EFLAGS)]>;
2996// but maybe need something like this
2997// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2998//  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2999//   (transferrable EFLAGS)]>;
3000//
3001// Until then, we manually fold these and instruction select the operation
3002// here.
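// Illustrative example (not from the original source): a DAG of the form
//   t0: i32 = load [p]
//   t1: i32 = X86ISD::ADD t0, 7
//   store t1, [p]
// is selected by the code below as a single ADD32mi8, i.e. "addl $7, (p)",
// assuming the load/store pair passes the isFusableLoadOpStorePattern checks.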
3003bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3004  StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
3005  SDValue StoredVal = StoreNode->getOperand(1);
3006  unsigned Opc = StoredVal->getOpcode();
3007
3008  // Before we try to select anything, make sure this is a memory operand size
3009  // and an opcode we can handle. Note that this must match the code below that
3010  // actually lowers the opcodes.
3011  EVT MemVT = StoreNode->getMemoryVT();
3012  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3013      MemVT != MVT::i8)
3014    return false;
3015
3016  bool IsCommutable = false;
3017  bool IsNegate = false;
3018  switch (Opc) {
3019  default:
3020    return false;
3021  case X86ISD::SUB:
3022    IsNegate = isNullConstant(StoredVal.getOperand(0));
3023    break;
3024  case X86ISD::SBB:
3025    break;
3026  case X86ISD::ADD:
3027  case X86ISD::ADC:
3028  case X86ISD::AND:
3029  case X86ISD::OR:
3030  case X86ISD::XOR:
3031    IsCommutable = true;
3032    break;
3033  }
3034
3035  unsigned LoadOpNo = IsNegate ? 1 : 0;
3036  LoadSDNode *LoadNode = nullptr;
3037  SDValue InputChain;
3038  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3039                                   LoadNode, InputChain)) {
3040    if (!IsCommutable)
3041      return false;
3042
3043    // This operation is commutable, try the other operand.
3044    LoadOpNo = 1;
3045    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3046                                     LoadNode, InputChain))
3047      return false;
3048  }
3049
3050  SDValue Base, Scale, Index, Disp, Segment;
3051  if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3052                  Segment))
3053    return false;
3054
3055  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3056                          unsigned Opc8) {
3057    switch (MemVT.getSimpleVT().SimpleTy) {
3058    case MVT::i64:
3059      return Opc64;
3060    case MVT::i32:
3061      return Opc32;
3062    case MVT::i16:
3063      return Opc16;
3064    case MVT::i8:
3065      return Opc8;
3066    default:
3067      llvm_unreachable("Invalid size!");
3068    }
3069  };
3070
3071  MachineSDNode *Result;
3072  switch (Opc) {
3073  case X86ISD::SUB:
3074    // Handle negate.
3075    if (IsNegate) {
3076      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3077                                     X86::NEG8m);
3078      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3079      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3080                                      MVT::Other, Ops);
3081      break;
3082    }
3083    LLVM_FALLTHROUGH;
3084  case X86ISD::ADD:
3085    // Try to match inc/dec.
3086    if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3087      bool IsOne = isOneConstant(StoredVal.getOperand(1));
3088      bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3089      // ADD/SUB by 1/-1 can use INC/DEC when the carry flag isn't used.
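      // E.g. "store (X86ISD::ADD (load [p]), 1), [p]" can become INC32m
      // ("incl (p)"). This is only safe when CF is unused, because INC/DEC
      // leave CF unmodified, unlike ADD/SUB.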
3090      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3091        unsigned NewOpc =
3092          ((Opc == X86ISD::ADD) == IsOne)
3093              ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3094              : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3095        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3096        Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3097                                        MVT::Other, Ops);
3098        break;
3099      }
3100    }
3101    LLVM_FALLTHROUGH;
3102  case X86ISD::ADC:
3103  case X86ISD::SBB:
3104  case X86ISD::AND:
3105  case X86ISD::OR:
3106  case X86ISD::XOR: {
3107    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3108      switch (Opc) {
3109      case X86ISD::ADD:
3110        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3111                            X86::ADD8mr);
3112      case X86ISD::ADC:
3113        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3114                            X86::ADC8mr);
3115      case X86ISD::SUB:
3116        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3117                            X86::SUB8mr);
3118      case X86ISD::SBB:
3119        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3120                            X86::SBB8mr);
3121      case X86ISD::AND:
3122        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3123                            X86::AND8mr);
3124      case X86ISD::OR:
3125        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3126      case X86ISD::XOR:
3127        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3128                            X86::XOR8mr);
3129      default:
3130        llvm_unreachable("Invalid opcode!");
3131      }
3132    };
3133    auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
3134      switch (Opc) {
3135      case X86ISD::ADD:
3136        return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
3137      case X86ISD::ADC:
3138        return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
3139      case X86ISD::SUB:
3140        return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
3141      case X86ISD::SBB:
3142        return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
3143      case X86ISD::AND:
3144        return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
3145      case X86ISD::OR:
3146        return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
3147      case X86ISD::XOR:
3148        return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
3149      default:
3150        llvm_unreachable("Invalid opcode!");
3151      }
3152    };
3153    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3154      switch (Opc) {
3155      case X86ISD::ADD:
3156        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3157                            X86::ADD8mi);
3158      case X86ISD::ADC:
3159        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3160                            X86::ADC8mi);
3161      case X86ISD::SUB:
3162        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3163                            X86::SUB8mi);
3164      case X86ISD::SBB:
3165        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3166                            X86::SBB8mi);
3167      case X86ISD::AND:
3168        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3169                            X86::AND8mi);
3170      case X86ISD::OR:
3171        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3172                            X86::OR8mi);
3173      case X86ISD::XOR:
3174        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3175                            X86::XOR8mi);
3176      default:
3177        llvm_unreachable("Invalid opcode!");
3178      }
3179    };
3180
3181    unsigned NewOpc = SelectRegOpcode(Opc);
3182    SDValue Operand = StoredVal->getOperand(1 - LoadOpNo);
3183
3184    // See if the operand is a constant that we can fold into an immediate
3185    // operand.
3186    if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3187      int64_t OperandV = OperandC->getSExtValue();
3188
3189      // Check if we can shrink the operand enough to fit in an immediate (or
3190      // fit into a smaller immediate) by negating it and switching the
3191      // operation.
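      // E.g. "add dword ptr [p], 128" needs a 32-bit immediate, while
      // "sub dword ptr [p], -128" fits in a sign-extended imm8, so we negate
      // the constant and flip ADD<->SUB (only valid when CF is not consumed).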
3192      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3193          ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3194           (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3195            isInt<32>(-OperandV))) &&
3196          hasNoCarryFlagUses(StoredVal.getValue(1))) {
3197        OperandV = -OperandV;
3198        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3199      }
3200
3201      // First try to fit this into an Imm8 operand. If it doesn't fit, then try
3202      // the larger immediate operand.
3203      if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
3204        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3205        NewOpc = SelectImm8Opcode(Opc);
3206      } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3207        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3208        NewOpc = SelectImmOpcode(Opc);
3209      }
3210    }
3211
3212    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3213      SDValue CopyTo =
3214          CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3215                               StoredVal.getOperand(2), SDValue());
3216
3217      const SDValue Ops[] = {Base,    Scale,   Index,  Disp,
3218                             Segment, Operand, CopyTo, CopyTo.getValue(1)};
3219      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3220                                      Ops);
3221    } else {
3222      const SDValue Ops[] = {Base,    Scale,   Index,     Disp,
3223                             Segment, Operand, InputChain};
3224      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3225                                      Ops);
3226    }
3227    break;
3228  }
3229  default:
3230    llvm_unreachable("Invalid opcode!");
3231  }
3232
3233  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3234                                 LoadNode->getMemOperand()};
3235  CurDAG->setNodeMemRefs(Result, MemOps);
3236
3237  // Update Load Chain uses as well.
3238  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3239  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3240  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3241  CurDAG->RemoveDeadNode(Node);
3242  return true;
3243}
3244
3245// See if this is an  X & Mask  that we can match to BEXTR/BZHI,
3246// where Mask is one of the following patterns:
3247//   a) x &  (1 << nbits) - 1
3248//   b) x & ~(-1 << nbits)
3249//   c) x &  (-1 >> (32 - y))
3250//   d) x << (32 - y) >> (32 - y)
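// Written as C expressions (illustrative), all of these keep only the low
// n bits of an unsigned x; e.g. for 32 bits:
//   a) x & ((1u << n) - 1)
//   b) x & ~(~0u << n)
//   c) x & (~0u >> (32 - n))
//   d) (x << (32 - n)) >> (32 - n)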
3251bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3252  assert(
3253      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
3254      "Should be either an and-mask, or right-shift after clearing high bits.");
3255
3256  // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3257  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3258    return false;
3259
3260  MVT NVT = Node->getSimpleValueType(0);
3261
3262  // Only supported for 32 and 64 bits.
3263  if (NVT != MVT::i32 && NVT != MVT::i64)
3264    return false;
3265
3266  SDValue NBits;
3267
3268  // If we have BMI2's BZHI, we are ok with multi-use patterns.
3269  // Else, if we only have BMI1's BEXTR, we require one-use.
3270  const bool CanHaveExtraUses = Subtarget->hasBMI2();
3271  auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
3272    return CanHaveExtraUses ||
3273           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3274  };
3275  auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
3276  auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
3277
3278  auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3279    if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3280      assert(V.getSimpleValueType() == MVT::i32 &&
3281             V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3282             "Expected i64 -> i32 truncation");
3283      V = V.getOperand(0);
3284    }
3285    return V;
3286  };
3287
3288  // a) x & ((1 << nbits) + (-1))
3289  auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
3290                        &NBits](SDValue Mask) -> bool {
3291    // Match `add`. Must only have one use!
3292    if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3293      return false;
3294    // We should be adding all-ones constant (i.e. subtracting one.)
3295    if (!isAllOnesConstant(Mask->getOperand(1)))
3296      return false;
3297    // Match `1 << nbits`. Might be truncated. Must only have one use!
3298    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3299    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3300      return false;
3301    if (!isOneConstant(M0->getOperand(0)))
3302      return false;
3303    NBits = M0->getOperand(1);
3304    return true;
3305  };
3306
3307  auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3308    V = peekThroughOneUseTruncation(V);
3309    return CurDAG->MaskedValueIsAllOnes(
3310        V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3311                                NVT.getSizeInBits()));
3312  };
3313
3314  // b) x & ~(-1 << nbits)
3315  auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3316                        &NBits](SDValue Mask) -> bool {
3317    // Match `~()`. Must only have one use!
3318    if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3319      return false;
3320    // The -1 only has to be all-ones for the final Node's NVT.
3321    if (!isAllOnes(Mask->getOperand(1)))
3322      return false;
3323    // Match `-1 << nbits`. Might be truncated. Must only have one use!
3324    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3325    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3326      return false;
3327    // The -1 only has to be all-ones for the final Node's NVT.
3328    if (!isAllOnes(M0->getOperand(0)))
3329      return false;
3330    NBits = M0->getOperand(1);
3331    return true;
3332  };
3333
3334  // Match potentially-truncated (bitwidth - y)
3335  auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
3336                                             unsigned Bitwidth) {
3337    // Skip over a truncate of the shift amount.
3338    if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
3339      ShiftAmt = ShiftAmt.getOperand(0);
3340      // The trunc should have been the only user of the real shift amount.
3341      if (!checkOneUse(ShiftAmt))
3342        return false;
3343    }
3344    // Match the shift amount as: (bitwidth - y). It should go away, too.
3345    if (ShiftAmt.getOpcode() != ISD::SUB)
3346      return false;
3347    auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
3348    if (!V0 || V0->getZExtValue() != Bitwidth)
3349      return false;
3350    NBits = ShiftAmt.getOperand(1);
3351    return true;
3352  };
3353
3354  // c) x &  (-1 >> (32 - y))
3355  auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
3356                        matchShiftAmt](SDValue Mask) -> bool {
3357    // The mask itself may be truncated.
3358    Mask = peekThroughOneUseTruncation(Mask);
3359    unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3360    // Match `l>>`. Must only have one use!
3361    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3362      return false;
3363    // We should be shifting truly all-ones constant.
3364    if (!isAllOnesConstant(Mask.getOperand(0)))
3365      return false;
3366    SDValue M1 = Mask.getOperand(1);
3367    // The shift amount should not be used externally.
3368    if (!checkOneUse(M1))
3369      return false;
3370    return matchShiftAmt(M1, Bitwidth);
3371  };
3372
3373  SDValue X;
3374
3375  // d) x << (32 - y) >> (32 - y)
3376  auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
3377                        &X](SDNode *Node) -> bool {
3378    if (Node->getOpcode() != ISD::SRL)
3379      return false;
3380    SDValue N0 = Node->getOperand(0);
3381    if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
3382      return false;
3383    unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3384    SDValue N1 = Node->getOperand(1);
3385    SDValue N01 = N0->getOperand(1);
3386    // Both of the shifts must be by the exact same value.
3387    // There should not be any uses of the shift amount outside of the pattern.
3388    if (N1 != N01 || !checkTwoUse(N1))
3389      return false;
3390    if (!matchShiftAmt(N1, Bitwidth))
3391      return false;
3392    X = N0->getOperand(0);
3393    return true;
3394  };
3395
3396  auto matchLowBitMask = [matchPatternA, matchPatternB,
3397                          matchPatternC](SDValue Mask) -> bool {
3398    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3399  };
3400
3401  if (Node->getOpcode() == ISD::AND) {
3402    X = Node->getOperand(0);
3403    SDValue Mask = Node->getOperand(1);
3404
3405    if (matchLowBitMask(Mask)) {
3406      // Great.
3407    } else {
3408      std::swap(X, Mask);
3409      if (!matchLowBitMask(Mask))
3410        return false;
3411    }
3412  } else if (!matchPatternD(Node))
3413    return false;
3414
3415  SDLoc DL(Node);
3416
3417  // Truncate the shift amount.
3418  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3419  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3420
3421  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3422  // All the other bits are undefined, we do not care about them.
3423  SDValue ImplDef = SDValue(
3424      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3425  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3426
3427  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3428  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3429  NBits = SDValue(
3430      CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
3431                             NBits, SRIdxVal), 0);
3432  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3433
3434  if (Subtarget->hasBMI2()) {
3435    // Great, just emit the BZHI.
3436    if (NVT != MVT::i32) {
3437      // But we have to place the bit count into a wide-enough register first.
3438      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3439      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3440    }
3441
3442    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3443    ReplaceNode(Node, Extract.getNode());
3444    SelectCode(Extract.getNode());
3445    return true;
3446  }
3447
3448  // Else, if we do *NOT* have BMI2, find out whether 'X' is *logically*
3449  // shifted (potentially with a one-use truncation in between) and the
3450  // truncation was the only use of the shift; if so, look past the
3451  // one-use truncation.
3452  {
3453    SDValue RealX = peekThroughOneUseTruncation(X);
3454    // FIXME: only if the shift is one-use?
3455    if (RealX != X && RealX.getOpcode() == ISD::SRL)
3456      X = RealX;
3457  }
3458
3459  MVT XVT = X.getSimpleValueType();
3460
3461  // Else, emitting BEXTR requires one more step.
3462  // The 'control' of BEXTR has the pattern of:
3463  // [15...8 bit][ 7...0 bit] location
3464  // [ bit count][     shift] name
3465  // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b11
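  // For example, extracting the low 12 bits of (x >> 4) would use the
  // control word (12 << 8) | 4 = 0x0C04.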
3466
3467  // Shift NBits left by 8 bits, thus producing 'control'.
3468  // This leaves the low 8 bits zero.
3469  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3470  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3471  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3472
3473  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3474  // FIXME: only if the shift is one-use?
3475  if (X.getOpcode() == ISD::SRL) {
3476    SDValue ShiftAmt = X.getOperand(1);
3477    X = X.getOperand(0);
3478
3479    assert(ShiftAmt.getValueType() == MVT::i8 &&
3480           "Expected shift amount to be i8");
3481
3482    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3483    // We could zext to i16 in some form, but we intentionally don't do that.
3484    SDValue OrigShiftAmt = ShiftAmt;
3485    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3486    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3487
3488    // And now 'or' these low 8 bits of shift amount into the 'control'.
3489    Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3490    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3491  }
3492
3493  // But we have to place the 'control' into a wide-enough register first.
3494  if (XVT != MVT::i32) {
3495    Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
3496    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3497  }
3498
3499  // And finally, form the BEXTR itself.
3500  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3501
3502  // The 'X' was originally truncated. Do that now.
3503  if (XVT != NVT) {
3504    insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
3505    Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3506  }
3507
3508  ReplaceNode(Node, Extract.getNode());
3509  SelectCode(Extract.getNode());
3510
3511  return true;
3512}
3513
3514// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
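// For example (illustrative), (x >> 4) & 0xFFF has Shift == 4 and
// MaskSize == 12, giving a BEXTR control of (12 << 8) | 4 = 0x0C04. When
// BEXTR is not preferred, BZHI is used instead, but only for masks wider
// than 32 bits.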
3515MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3516  MVT NVT = Node->getSimpleValueType(0);
3517  SDLoc dl(Node);
3518
3519  SDValue N0 = Node->getOperand(0);
3520  SDValue N1 = Node->getOperand(1);
3521
3522  // If we have TBM we can use an immediate for the control. If we have BMI
3523  // we should only do this if the BEXTR instruction is implemented well.
3524  // Otherwise moving the control into a register makes this more costly.
3525  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3526  // hoisting the move immediate would make it worthwhile with a less optimal
3527  // BEXTR?
3528  bool PreferBEXTR =
3529      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
3530  if (!PreferBEXTR && !Subtarget->hasBMI2())
3531    return nullptr;
3532
3533  // Must have a shift right.
3534  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
3535    return nullptr;
3536
3537  // Shift can't have additional users.
3538  if (!N0->hasOneUse())
3539    return nullptr;
3540
3541  // Only supported for 32 and 64 bits.
3542  if (NVT != MVT::i32 && NVT != MVT::i64)
3543    return nullptr;
3544
3545  // Shift amount and RHS of and must be constant.
3546  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3547  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3548  if (!MaskCst || !ShiftCst)
3549    return nullptr;
3550
3551  // And RHS must be a mask.
3552  uint64_t Mask = MaskCst->getZExtValue();
3553  if (!isMask_64(Mask))
3554    return nullptr;
3555
3556  uint64_t Shift = ShiftCst->getZExtValue();
3557  uint64_t MaskSize = countPopulation(Mask);
3558
3559  // Don't interfere with something that can be handled by extracting AH.
3560  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3561  if (Shift == 8 && MaskSize == 8)
3562    return nullptr;
3563
3564  // Make sure we are only using bits that were in the original value, not
3565  // shifted in.
3566  if (Shift + MaskSize > NVT.getSizeInBits())
3567    return nullptr;
3568
3569  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
3570  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
3571  // does not fit into 32 bits. Load folding is not a sufficient reason.
3572  if (!PreferBEXTR && MaskSize <= 32)
3573    return nullptr;
3574
3575  SDValue Control;
3576  unsigned ROpc, MOpc;
3577
3578  if (!PreferBEXTR) {
3579    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
3580    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
3581    // Let's perform the mask first and apply the shift later. Note that we
3582    // need to widen the mask to account for the shift we apply afterwards!
3583    Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
3584    ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
3585    MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
3586    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3587    Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3588  } else {
3589    // The 'control' of BEXTR has the pattern of:
3590    // [15...8 bit][ 7...0 bit] location
3591    // [ bit count][     shift] name
3592    // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b11
3593    Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3594    if (Subtarget->hasTBM()) {
3595      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3596      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3597    } else {
3598      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
3599      // BMI requires the immediate to be placed in a register.
3600      ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3601      MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3602      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3603      Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3604    }
3605  }
3606
3607  MachineSDNode *NewNode;
3608  SDValue Input = N0->getOperand(0);
3609  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3610  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3611    SDValue Ops[] = {
3612        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
3613    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3614    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3615    // Update the chain.
3616    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3617    // Record the mem-refs
3618    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3619  } else {
3620    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
3621  }
3622
3623  if (!PreferBEXTR) {
3624    // We still need to apply the shift.
3625    SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
3626    unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
3627    NewNode =
3628        CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
3629  }
3630
3631  return NewNode;
3632}
3633
3634// Emit a PCMPISTR(I/M) instruction.
3635MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3636                                             bool MayFoldLoad, const SDLoc &dl,
3637                                             MVT VT, SDNode *Node) {
3638  SDValue N0 = Node->getOperand(0);
3639  SDValue N1 = Node->getOperand(1);
3640  SDValue Imm = Node->getOperand(2);
3641  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3642  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3643
3644  // Try to fold a load. No need to check alignment.
3645  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3646  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3647    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3648                      N1.getOperand(0) };
3649    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3650    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3651    // Update the chain.
3652    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3653    // Record the mem-refs
3654    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3655    return CNode;
3656  }
3657
3658  SDValue Ops[] = { N0, N1, Imm };
3659  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3660  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3661  return CNode;
3662}
3663
3664// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
3665// to emit a second instruction after this one. This is needed since we have two
3666// CopyToReg nodes glued before this and we need to continue that glue through.
3667MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3668                                             bool MayFoldLoad, const SDLoc &dl,
3669                                             MVT VT, SDNode *Node,
3670                                             SDValue &InFlag) {
3671  SDValue N0 = Node->getOperand(0);
3672  SDValue N2 = Node->getOperand(2);
3673  SDValue Imm = Node->getOperand(4);
3674  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3675  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3676
3677  // Try to fold a load. No need to check alignment.
3678  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3679  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3680    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3681                      N2.getOperand(0), InFlag };
3682    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3683    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3684    InFlag = SDValue(CNode, 3);
3685    // Update the chain.
3686    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3687    // Record the mem-refs
3688    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3689    return CNode;
3690  }
3691
3692  SDValue Ops[] = { N0, N2, Imm, InFlag };
3693  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3694  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3695  InFlag = SDValue(CNode, 2);
3696  return CNode;
3697}
3698
3699bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3700  EVT VT = N->getValueType(0);
3701
3702  // Only handle scalar shifts.
3703  if (VT.isVector())
3704    return false;
3705
3706  // Narrower shifts only mask to 5 bits in hardware.
3707  unsigned Size = VT == MVT::i64 ? 64 : 32;
3708
3709  SDValue OrigShiftAmt = N->getOperand(1);
3710  SDValue ShiftAmt = OrigShiftAmt;
3711  SDLoc DL(N);
3712
3713  // Skip over a truncate of the shift amount.
3714  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3715    ShiftAmt = ShiftAmt->getOperand(0);
3716
3717  // This function is called after X86DAGToDAGISel::matchBitExtract(),
3718  // so there is no risk of breaking a BZHI/BEXTR pattern here.
3719
3720  SDValue NewShiftAmt;
3721  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3722    SDValue Add0 = ShiftAmt->getOperand(0);
3723    SDValue Add1 = ShiftAmt->getOperand(1);
3724    // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3725    // to avoid the ADD/SUB.
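    // E.g. on i32, (x << (amt + 32)) behaves exactly like (x << amt), since
    // the hardware masks the shift amount to 5 bits, so drop the ADD.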
3726    if (isa<ConstantSDNode>(Add1) &&
3727        cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
3728      NewShiftAmt = Add0;
3729    // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3730    // generate a NEG instead of a SUB of a constant.
3731    } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3732               isa<ConstantSDNode>(Add0) &&
3733               cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
3734               cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
3735      // Insert a negate op.
3736      // TODO: This isn't guaranteed to replace the sub if it also feeds a
3737      // logic cone that is not a shift.
3738      EVT SubVT = ShiftAmt.getValueType();
3739      SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3740      SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3741      NewShiftAmt = Neg;
3742
3743      // Insert these operands into a valid topological order so they can
3744      // get selected independently.
3745      insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3746      insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3747    } else
3748      return false;
3749  } else
3750    return false;
3751
3752  if (NewShiftAmt.getValueType() != MVT::i8) {
3753    // Need to truncate the shift amount.
3754    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3755    // Add to a correct topological ordering.
3756    insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3757  }
3758
3759  // Insert a new mask to keep the shift amount legal. This should be removed
3760  // by isel patterns.
3761  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3762                                CurDAG->getConstant(Size - 1, DL, MVT::i8));
3763  // Place in a correct topological ordering.
3764  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3765
3766  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
3767                                                   NewShiftAmt);
3768  if (UpdatedNode != N) {
3769    // If we found an existing node, we should replace ourselves with that node
3770    // and wait for it to be selected after its other users.
3771    ReplaceNode(N, UpdatedNode);
3772    return true;
3773  }
3774
3775  // If the original shift amount is now dead, delete it so that we don't run
3776  // it through isel.
3777  if (OrigShiftAmt.getNode()->use_empty())
3778    CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3779
3780  // Now that we've optimized the shift amount, defer to normal isel to get
3781  // load folding and legacy vs BMI2 selection without repeating it here.
3782  SelectCode(N);
3783  return true;
3784}
3785
3786bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
3787  MVT NVT = N->getSimpleValueType(0);
3788  unsigned Opcode = N->getOpcode();
3789  SDLoc dl(N);
3790
3791  // For operations of the form (x << C1) op C2, check if we can use a smaller
3792  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
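  // E.g. "(x << 8) | 0x4000" can become "(x | 0x40) << 8", shrinking the
  // immediate from imm32 to imm8. For OR/XOR this is only done when the low
  // C1 bits of C2 are zero, so the result is unchanged.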
3793  SDValue Shift = N->getOperand(0);
3794  SDValue N1 = N->getOperand(1);
3795
3796  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
3797  if (!Cst)
3798    return false;
3799
3800  int64_t Val = Cst->getSExtValue();
3801
3802  // If we have an any_extend feeding the AND, look through it to see if there
3803  // is a shift behind it. But only if the AND doesn't use the extended bits.
3804  // FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
3805  bool FoundAnyExtend = false;
3806  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
3807      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
3808      isUInt<32>(Val)) {
3809    FoundAnyExtend = true;
3810    Shift = Shift.getOperand(0);
3811  }
3812
3813  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
3814    return false;
3815
3816  // i8 is unshrinkable, i16 should be promoted to i32.
3817  if (NVT != MVT::i32 && NVT != MVT::i64)
3818    return false;
3819
3820  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
3821  if (!ShlCst)
3822    return false;
3823
3824  uint64_t ShAmt = ShlCst->getZExtValue();
3825
3826  // Make sure that we don't change the operation by removing bits.
3827  // This only matters for OR and XOR, AND is unaffected.
3828  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
3829  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
3830    return false;
3831
3832  // Check the minimum bitwidth for the new constant.
3833  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3834  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
3835    if (Opcode == ISD::AND) {
3836      // AND32ri is the same as AND64ri32 with zext imm.
3837      // Try this before sign extended immediates below.
3838      ShiftedVal = (uint64_t)Val >> ShAmt;
3839      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3840        return true;
3841      // Also swap order when the AND can become MOVZX.
3842      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
3843        return true;
3844    }
3845    ShiftedVal = Val >> ShAmt;
3846    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
3847        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
3848      return true;
3849    if (Opcode != ISD::AND) {
3850      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
3851      ShiftedVal = (uint64_t)Val >> ShAmt;
3852      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3853        return true;
3854    }
3855    return false;
3856  };
3857
3858  int64_t ShiftedVal;
3859  if (!CanShrinkImmediate(ShiftedVal))
3860    return false;
3861
3862  // Ok, we can reorder to get a smaller immediate.
3863
3864  // But it's possible the original immediate allowed an AND to become MOVZX.
3865  // We do this check late in order to defer the MaskedValueIsZero call as
3866  // long as possible.
3867  if (Opcode == ISD::AND) {
3868    // Find the smallest zext this could possibly be.
3869    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
3870    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
3871
3872    // Figure out which bits need to be zero to achieve that mask.
3873    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
3874                                            ZExtWidth);
3875    NeededMask &= ~Cst->getAPIntValue();
3876
3877    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
3878      return false;
3879  }
3880
3881  SDValue X = Shift.getOperand(0);
3882  if (FoundAnyExtend) {
3883    SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
3884    insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
3885    X = NewX;
3886  }
3887
3888  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
3889  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
3890  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
3891  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
3892  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
3893                                   Shift.getOperand(1));
3894  ReplaceNode(N, NewSHL.getNode());
3895  SelectCode(NewSHL.getNode());
3896  return true;
3897}
3898
3899/// Convert vector increment or decrement to sub/add with an all-ones constant:
3900/// add X, <1, 1...> --> sub X, <-1, -1...>
3901/// sub X, <1, 1...> --> add X, <-1, -1...>
3902/// The all-ones vector constant can be materialized using a pcmpeq instruction
3903/// that is commonly recognized as an idiom (has no register dependency), so
3904/// that's better/smaller than loading a splat 1 constant.
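/// E.g. "add <4 x i32> %x, <1, 1, 1, 1>" becomes "sub %x, %ones", where
/// %ones is materialized by "pcmpeqd %xmm, %xmm" rather than a constant-pool
/// load.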
3905bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
3906  assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
3907         "Unexpected opcode for increment/decrement transform");
3908
3909  EVT VT = Node->getValueType(0);
3910  assert(VT.isVector() && "Should only be called for vectors.");
3911
3912  SDValue X = Node->getOperand(0);
3913  SDValue OneVec = Node->getOperand(1);
3914
3915  APInt SplatVal;
3916  if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
3917    return false;
3918
3919  SDLoc DL(Node);
3920  SDValue OneConstant, AllOnesVec;
3921
3922  APInt Ones = APInt::getAllOnesValue(32);
3923  assert(VT.getSizeInBits() % 32 == 0 &&
3924         "Expected bit count to be a multiple of 32");
3925  OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
3926  insertDAGNode(*CurDAG, X, OneConstant);
3927
3928  unsigned NumElts = VT.getSizeInBits() / 32;
3929  assert(NumElts > 0 && "Expected to get non-empty vector.");
3930  AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
3931                                           DL, OneConstant);
3932  insertDAGNode(*CurDAG, X, AllOnesVec);
3933
3934  AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
3935  insertDAGNode(*CurDAG, X, AllOnesVec);
3936
3937  unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
3938  SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
3939
3940  ReplaceNode(Node, NewNode.getNode());
3941  SelectCode(NewNode.getNode());
3942  return true;
3943}
3944
3945/// If the high bits of an 'and' operand are known zero, try setting the
3946/// high bits of an 'and' constant operand to produce a smaller encoding by
3947/// creating a small, sign-extended negative immediate rather than a large
3948/// positive one. This reverses a transform in SimplifyDemandedBits that
3949/// shrinks mask constants by clearing bits. There is also a possibility that
3950/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
3951/// case, just replace the 'and'. Return 'true' if the node is replaced.
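/// E.g. if the sign bit of %x is known zero, "and i32 %x, 0x7FFFFFF0" can be
/// rewritten as "and i32 %x, 0xFFFFFFF0" (i.e. -16), which encodes as a
/// sign-extended imm8 instead of an imm32.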
3952bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
3953  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
3954  // have immediate operands.
3955  MVT VT = And->getSimpleValueType(0);
3956  if (VT != MVT::i32 && VT != MVT::i64)
3957    return false;
3958
3959  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
3960  if (!And1C)
3961    return false;
3962
3963  // Bail out if the mask constant is already negative. It can't shrink further.
3964  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
3965  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
3966  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
3967  // are negative too.
3968  APInt MaskVal = And1C->getAPIntValue();
3969  unsigned MaskLZ = MaskVal.countLeadingZeros();
3970  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
3971    return false;
3972
3973  // Don't extend into the upper 32 bits of a 64 bit mask.
3974  if (VT == MVT::i64 && MaskLZ >= 32) {
3975    MaskLZ -= 32;
3976    MaskVal = MaskVal.trunc(32);
3977  }
3978
3979  SDValue And0 = And->getOperand(0);
3980  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
3981  APInt NegMaskVal = MaskVal | HighZeros;
3982
3983  // If a negative constant would not allow a smaller encoding, there's no need
3984  // to continue. Only change the constant when we know it's a win.
3985  unsigned MinWidth = NegMaskVal.getMinSignedBits();
3986  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
3987    return false;
3988
3989  // Extend masks if we truncated above.
3990  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
3991    NegMaskVal = NegMaskVal.zext(64);
3992    HighZeros = HighZeros.zext(64);
3993  }
3994
3995  // The variable operand must be all zeros in the top bits to allow using the
3996  // new, negative constant as the mask.
3997  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
3998    return false;
3999
4000  // Check if the mask is -1. In that case, this is an unnecessary instruction
4001  // that escaped earlier analysis.
4002  if (NegMaskVal.isAllOnesValue()) {
4003    ReplaceNode(And, And0.getNode());
4004    return true;
4005  }
4006
4007  // A negative mask allows a smaller encoding. Create a new 'and' node.
4008  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4009  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4010  ReplaceNode(And, NewAnd.getNode());
4011  SelectCode(NewAnd.getNode());
4012  return true;
4013}
4014
4015static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4016                              bool FoldedBCast, bool Masked) {
4017  if (Masked) {
4018    if (FoldedLoad) {
4019      switch (TestVT.SimpleTy) {
4020      default: llvm_unreachable("Unexpected VT!");
4021      case MVT::v16i8:
4022        return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
4023      case MVT::v8i16:
4024        return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
4025      case MVT::v4i32:
4026        return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
4027      case MVT::v2i64:
4028        return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
4029      case MVT::v32i8:
4030        return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
4031      case MVT::v16i16:
4032        return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
4033      case MVT::v8i32:
4034        return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
4035      case MVT::v4i64:
4036        return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
4037      case MVT::v64i8:
4038        return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
4039      case MVT::v32i16:
4040        return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
4041      case MVT::v16i32:
4042        return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
4043      case MVT::v8i64:
4044        return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
4045      }
4046    }
4047
4048    if (FoldedBCast) {
4049      switch (TestVT.SimpleTy) {
4050      default: llvm_unreachable("Unexpected VT!");
4051      case MVT::v4i32:
4052        return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
4053      case MVT::v2i64:
4054        return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
4055      case MVT::v8i32:
4056        return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
4057      case MVT::v4i64:
4058        return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
4059      case MVT::v16i32:
4060        return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
4061      case MVT::v8i64:
4062        return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
4063      }
4064    }
4065
4066    switch (TestVT.SimpleTy) {
4067    default: llvm_unreachable("Unexpected VT!");
4068    case MVT::v16i8:
4069      return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
4070    case MVT::v8i16:
4071      return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
4072    case MVT::v4i32:
4073      return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
4074    case MVT::v2i64:
4075      return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
4076    case MVT::v32i8:
4077      return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
4078    case MVT::v16i16:
4079      return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
4080    case MVT::v8i32:
4081      return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
4082    case MVT::v4i64:
4083      return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
4084    case MVT::v64i8:
4085      return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
4086    case MVT::v32i16:
4087      return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
4088    case MVT::v16i32:
4089      return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
4090    case MVT::v8i64:
4091      return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
4092    }
4093  }
4094
4095  if (FoldedLoad) {
4096    switch (TestVT.SimpleTy) {
4097    default: llvm_unreachable("Unexpected VT!");
4098    case MVT::v16i8:
4099      return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
4100    case MVT::v8i16:
4101      return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
4102    case MVT::v4i32:
4103      return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
4104    case MVT::v2i64:
4105      return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
4106    case MVT::v32i8:
4107      return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
4108    case MVT::v16i16:
4109      return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
4110    case MVT::v8i32:
4111      return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
4112    case MVT::v4i64:
4113      return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
4114    case MVT::v64i8:
4115      return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
4116    case MVT::v32i16:
4117      return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
4118    case MVT::v16i32:
4119      return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
4120    case MVT::v8i64:
4121      return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
4122    }
4123  }
4124
4125  if (FoldedBCast) {
4126    switch (TestVT.SimpleTy) {
4127    default: llvm_unreachable("Unexpected VT!");
4128    case MVT::v4i32:
4129      return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
4130    case MVT::v2i64:
4131      return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
4132    case MVT::v8i32:
4133      return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
4134    case MVT::v4i64:
4135      return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
4136    case MVT::v16i32:
4137      return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
4138    case MVT::v8i64:
4139      return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
4140    }
4141  }
4142
4143  switch (TestVT.SimpleTy) {
4144  default: llvm_unreachable("Unexpected VT!");
4145  case MVT::v16i8:
4146    return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
4147  case MVT::v8i16:
4148    return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
4149  case MVT::v4i32:
4150    return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
4151  case MVT::v2i64:
4152    return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
4153  case MVT::v32i8:
4154    return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
4155  case MVT::v16i16:
4156    return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
4157  case MVT::v8i32:
4158    return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
4159  case MVT::v4i64:
4160    return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
4161  case MVT::v64i8:
4162    return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
4163  case MVT::v32i16:
4164    return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
4165  case MVT::v16i32:
4166    return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
4167  case MVT::v8i64:
4168    return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
4169  }
4170}
4171
4172// Try to create a VPTESTM instruction. If InMask is not null, it will be used
4173// to form a masked operation.
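// For example, (setcc (and X, Y), 0, ne) with a vXi1 result can become a
// single VPTESTM (the SETEQ form becomes VPTESTNM), and a one-use AND of the
// setcc result can be folded in as the instruction's k-mask.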
4174bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4175                                 SDValue InMask) {
4176  assert(Subtarget->hasAVX512() && "Expected AVX512!");
4177  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4178         "Unexpected VT!");
4179
4180  // Look for equal and not equal compares.
4181  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4182  if (CC != ISD::SETEQ && CC != ISD::SETNE)
4183    return false;
4184
4185  SDValue SetccOp0 = Setcc.getOperand(0);
4186  SDValue SetccOp1 = Setcc.getOperand(1);
4187
4188  // Canonicalize the all zero vector to the RHS.
4189  if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4190    std::swap(SetccOp0, SetccOp1);
4191
4192  // See if we're comparing against zero.
4193  if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4194    return false;
4195
4196  SDValue N0 = SetccOp0;
4197
4198  MVT CmpVT = N0.getSimpleValueType();
4199  MVT CmpSVT = CmpVT.getVectorElementType();
4200
4201  // Start with both operands the same. We'll try to refine this.
4202  SDValue Src0 = N0;
4203  SDValue Src1 = N0;
4204
4205  {
4206    // Look through single use bitcasts.
4207    SDValue N0Temp = N0;
4208    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4209      N0Temp = N0.getOperand(0);
4210
4211    // Look for single use AND.
4212    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4213      Src0 = N0Temp.getOperand(0);
4214      Src1 = N0Temp.getOperand(1);
4215    }
4216  }
4217
4218  // Without VLX we need to widen the operation to 512 bits.
4219  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4220
4221  // We can only fold loads if the sources are unique.
4222  bool CanFoldLoads = Src0 != Src1;
4223
4224  // Try to fold loads unless we need to widen.
4225  bool FoldedLoad = false;
4226  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
4227  if (!Widen && CanFoldLoads) {
4228    Load = Src1;
4229    FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
4230                             Tmp4);
4231    if (!FoldedLoad) {
4232      // AND is commutative.
4233      Load = Src0;
4234      FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
4235                               Tmp3, Tmp4);
4236      if (FoldedLoad)
4237        std::swap(Src0, Src1);
4238    }
4239  }
4240
4241  auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
4242    // Look through single use bitcasts.
4243    if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
4244      Parent = Src.getNode();
4245      Src = Src.getOperand(0);
4246    }
4247
4248    if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
4249      auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
4250      if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
4251        return Src;
4252    }
4253
4254    return SDValue();
4255  };
4256
4257  // If we didn't fold a load, try to match a broadcast. Widening is not a
4258  // limitation here, but only 32- and 64-bit element types are supported.
4259  bool FoldedBCast = false;
4260  if (!FoldedLoad && CanFoldLoads &&
4261      (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
4262    SDNode *ParentNode = N0.getNode();
4263    if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
4264      FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4265                                     Tmp1, Tmp2, Tmp3, Tmp4);
4266    }
4267
4268    // Try the other operand.
4269    if (!FoldedBCast) {
4270      SDNode *ParentNode = N0.getNode();
4271      if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
4272        FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4273                                       Tmp1, Tmp2, Tmp3, Tmp4);
4274        if (FoldedBCast)
4275          std::swap(Src0, Src1);
4276      }
4277    }
4278  }
4279
4280  auto getMaskRC = [](MVT MaskVT) {
4281    switch (MaskVT.SimpleTy) {
4282    default: llvm_unreachable("Unexpected VT!");
4283    case MVT::v2i1:  return X86::VK2RegClassID;
4284    case MVT::v4i1:  return X86::VK4RegClassID;
4285    case MVT::v8i1:  return X86::VK8RegClassID;
4286    case MVT::v16i1: return X86::VK16RegClassID;
4287    case MVT::v32i1: return X86::VK32RegClassID;
4288    case MVT::v64i1: return X86::VK64RegClassID;
4289    }
4290  };
4291
4292  bool IsMasked = InMask.getNode() != nullptr;
4293
4294  SDLoc dl(Root);
4295
4296  MVT ResVT = Setcc.getSimpleValueType();
4297  MVT MaskVT = ResVT;
4298  if (Widen) {
4299    // Widen the inputs using insert_subreg or copy_to_regclass.
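    // (For example, without VLX a v4i32 compare is widened 4x to v16i32 and
    // its v4i1 mask to v16i1 so the 512-bit EVEX form can be used; 256-bit
    // compares only need 2x.)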
4300    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4301    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4302    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4303    CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4304    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4305    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4306                                                     CmpVT), 0);
4307    Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4308
4309    assert(!FoldedLoad && "Shouldn't have folded the load");
4310    if (!FoldedBCast)
4311      Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4312
4313    if (IsMasked) {
4314      // Widen the mask.
4315      unsigned RegClass = getMaskRC(MaskVT);
4316      SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4317      InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4318                                              dl, MaskVT, InMask, RC), 0);
4319    }
4320  }
4321
4322  bool IsTestN = CC == ISD::SETEQ;
4323  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4324                               IsMasked);
4325
4326  MachineSDNode *CNode;
4327  if (FoldedLoad || FoldedBCast) {
4328    SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4329
4330    if (IsMasked) {
4331      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4332                        Load.getOperand(0) };
4333      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4334    } else {
4335      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4336                        Load.getOperand(0) };
4337      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4338    }
4339
4340    // Update the chain.
4341    ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
4342    // Record the mem-refs
4343    CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
4344  } else {
4345    if (IsMasked)
4346      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4347    else
4348      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4349  }
4350
4351  // If we widened, we need to shrink the mask VT.
4352  if (Widen) {
4353    unsigned RegClass = getMaskRC(ResVT);
4354    SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4355    CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4356                                   dl, ResVT, SDValue(CNode, 0), RC);
4357  }
4358
4359  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4360  CurDAG->RemoveDeadNode(Root);
4361  return true;
4362}
4363
4364// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4365// into vpternlog.
4366bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4367  assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4368
4369  MVT NVT = N->getSimpleValueType(0);
4370
4371  // Make sure we support VPTERNLOG.
4372  if (!NVT.isVector() || !Subtarget->hasAVX512())
4373    return false;
4374
4375  // We need VLX for 128/256-bit.
4376  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4377    return false;
4378
4379  SDValue N0 = N->getOperand(0);
4380  SDValue N1 = N->getOperand(1);
4381
4382  // Canonicalize AND to LHS.
4383  if (N1.getOpcode() == ISD::AND)
4384    std::swap(N0, N1);
4385
4386  if (N0.getOpcode() != ISD::AND ||
4387      N1.getOpcode() != X86ISD::ANDNP ||
4388      !N0.hasOneUse() || !N1.hasOneUse())
4389    return false;
4390
4391  // ANDN is not commutable, use it to pin down A and C.
4392  SDValue A = N1.getOperand(0);
4393  SDValue C = N1.getOperand(1);
4394
4395  // AND is commutable; if one operand matches A, the other operand is B.
4396  // Otherwise this isn't a match.
4397  SDValue B;
4398  if (N0.getOperand(0) == A)
4399    B = N0.getOperand(1);
4400  else if (N0.getOperand(1) == A)
4401    B = N0.getOperand(0);
4402  else
4403    return false;
4404
4405  SDLoc dl(N);
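  // 0xCA is the VPTERNLOG truth table for (A & B) | (~A & C): imm8 bit
  // ((a << 2) | (b << 1) | c) holds the result for that input combination,
  // giving 0b11001010. Quick check with A=0b1100, B=0b1010, C=0b0110:
  // (A & B) | (~A & C) = 0b1000 | 0b0010 = 0b1010, i.e. A selects B or C.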
4406  SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
4407  SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
4408  ReplaceNode(N, Ternlog.getNode());
4409  SelectCode(Ternlog.getNode());
4410  return true;
4411}
4412
4413void X86DAGToDAGISel::Select(SDNode *Node) {
4414  MVT NVT = Node->getSimpleValueType(0);
4415  unsigned Opcode = Node->getOpcode();
4416  SDLoc dl(Node);
4417
4418  if (Node->isMachineOpcode()) {
4419    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
4420    Node->setNodeId(-1);
4421    return;   // Already selected.
4422  }
4423
4424  switch (Opcode) {
4425  default: break;
4426  case ISD::INTRINSIC_VOID: {
4427    unsigned IntNo = Node->getConstantOperandVal(1);
4428    switch (IntNo) {
4429    default: break;
4430    case Intrinsic::x86_sse3_monitor:
4431    case Intrinsic::x86_monitorx:
4432    case Intrinsic::x86_clzero: {
4433      bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
4434
4435      unsigned Opc = 0;
4436      switch (IntNo) {
4437      default: llvm_unreachable("Unexpected intrinsic!");
4438      case Intrinsic::x86_sse3_monitor:
4439        if (!Subtarget->hasSSE3())
4440          break;
4441        Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
4442        break;
4443      case Intrinsic::x86_monitorx:
4444        if (!Subtarget->hasMWAITX())
4445          break;
4446        Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
4447        break;
4448      case Intrinsic::x86_clzero:
4449        if (!Subtarget->hasCLZERO())
4450          break;
4451        Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
4452        break;
4453      }
4454
4455      if (Opc) {
4456        unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
4457        SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
4458                                             Node->getOperand(2), SDValue());
4459        SDValue InFlag = Chain.getValue(1);
4460
4461        if (IntNo == Intrinsic::x86_sse3_monitor ||
4462            IntNo == Intrinsic::x86_monitorx) {
4463          // Copy the other two operands to ECX and EDX.
4464          Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
4465                                       InFlag);
4466          InFlag = Chain.getValue(1);
4467          Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
4468                                       InFlag);
4469          InFlag = Chain.getValue(1);
4470        }
4471
4472        MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
4473                                                      { Chain, InFlag});
4474        ReplaceNode(Node, CNode);
4475        return;
4476      }
4477
4478      break;
4479    }
4480    }
4481
4482    break;
4483  }
4484  case ISD::BRIND: {
4485    if (Subtarget->isTargetNaCl())
4486      // NaCl has its own pass where jmp %r32 instructions are converted to
4487      // jmp %r64, so we leave the instruction alone.
4488      break;
4489    if (Subtarget->isTarget64BitILP32()) {
4490      // Converts a 32-bit register to a 64-bit, zero-extended version of
4491      // it. This is needed because x86-64 can do many things, but jmp %r32
4492      // ain't one of them.
4493      const SDValue &Target = Node->getOperand(1);
4494      assert(Target.getSimpleValueType() == llvm::MVT::i32);
4495      SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
4496      SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
4497                                      Node->getOperand(0), ZextTarget);
4498      ReplaceNode(Node, Brind.getNode());
4499      SelectCode(ZextTarget.getNode());
4500      SelectCode(Brind.getNode());
4501      return;
4502    }
4503    break;
4504  }
4505  case X86ISD::GlobalBaseReg:
4506    ReplaceNode(Node, getGlobalBaseReg());
4507    return;
4508
4509  case ISD::BITCAST:
4510    // Just drop all 128/256/512-bit bitcasts.
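    // (These are no-ops at the register level: all of these types live in
    // the same XMM/YMM/ZMM registers, so no instruction is needed.)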
4511    if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
4512        NVT == MVT::f128) {
4513      ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
4514      CurDAG->RemoveDeadNode(Node);
4515      return;
4516    }
4517    break;
4518
4519  case ISD::VSELECT: {
4520    // Replace VSELECT with non-mask conditions with BLENDV.
4521    if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
4522      break;
4523
4524    assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
4525    SDValue Blendv = CurDAG->getNode(
4526        X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
4527        Node->getOperand(1), Node->getOperand(2));
4528    ReplaceNode(Node, Blendv.getNode());
4529    SelectCode(Blendv.getNode());
4530    // We already called ReplaceUses.
4531    return;
4532  }
4533
4534  case ISD::SRL:
4535    if (matchBitExtract(Node))
4536      return;
4537    LLVM_FALLTHROUGH;
4538  case ISD::SRA:
4539  case ISD::SHL:
4540    if (tryShiftAmountMod(Node))
4541      return;
4542    break;
4543
4544  case ISD::AND:
4545    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
4546      // Try to form a masked VPTESTM. Operands can be in either order.
4547      SDValue N0 = Node->getOperand(0);
4548      SDValue N1 = Node->getOperand(1);
4549      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
4550          tryVPTESTM(Node, N0, N1))
4551        return;
4552      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
4553          tryVPTESTM(Node, N1, N0))
4554        return;
4555    }
4556
4557    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
4558      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4559      CurDAG->RemoveDeadNode(Node);
4560      return;
4561    }
4562    if (matchBitExtract(Node))
4563      return;
4564    if (AndImmShrink && shrinkAndImmediate(Node))
4565      return;
4566
4567    LLVM_FALLTHROUGH;
4568  case ISD::OR:
4569  case ISD::XOR:
4570    if (tryShrinkShlLogicImm(Node))
4571      return;
4572
4573    if (Opcode == ISD::OR && tryMatchBitSelect(Node))
4574      return;
4575
4576    LLVM_FALLTHROUGH;
4577  case ISD::ADD:
4578  case ISD::SUB: {
4579    if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
4580        combineIncDecVector(Node))
4581      return;
4582
4583    // Try to avoid folding immediates with multiple uses for optsize.
4584    // This code tries to select to register form directly to avoid going
4585    // through the isel table which might fold the immediate. We can't change
4586    // the add/sub/and/or/xor-with-immediate patterns in the tablegen files
4587    // to check the immediate use count without making the patterns
4588    // unavailable to the fast-isel table.
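    // (Rough size sketch: three "andl $0x12345678, %reg" forms cost 6 bytes
    // each, while one 5-byte "movl $0x12345678, %tmp" plus three 2-byte
    // register-register ANDs cost 11 bytes total.)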
4589    if (!OptForSize)
4590      break;
4591
4592    // Only handle i8/i16/i32/i64.
4593    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
4594      break;
4595
4596    SDValue N0 = Node->getOperand(0);
4597    SDValue N1 = Node->getOperand(1);
4598
4599    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
4600    if (!Cst)
4601      break;
4602
4603    int64_t Val = Cst->getSExtValue();
4604
4605    // Make sure it's an immediate that is considered foldable.
4606    // FIXME: Handle unsigned 32-bit immediates for 64-bit AND.
4607    if (!isInt<8>(Val) && !isInt<32>(Val))
4608      break;
4609
4610    // If this can match to INC/DEC, let it go.
4611    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
4612      break;
4613
4614    // Check if we should avoid folding this immediate.
4615    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
4616      break;
4617
4618    // We should not fold the immediate. So we need a register form instead.
4619    unsigned ROpc, MOpc;
4620    switch (NVT.SimpleTy) {
4621    default: llvm_unreachable("Unexpected VT!");
4622    case MVT::i8:
4623      switch (Opcode) {
4624      default: llvm_unreachable("Unexpected opcode!");
4625      case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
4626      case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
4627      case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
4628      case ISD::OR:  ROpc = X86::OR8rr;  MOpc = X86::OR8rm;  break;
4629      case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
4630      }
4631      break;
4632    case MVT::i16:
4633      switch (Opcode) {
4634      default: llvm_unreachable("Unexpected opcode!");
4635      case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
4636      case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
4637      case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
4638      case ISD::OR:  ROpc = X86::OR16rr;  MOpc = X86::OR16rm;  break;
4639      case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
4640      }
4641      break;
4642    case MVT::i32:
4643      switch (Opcode) {
4644      default: llvm_unreachable("Unexpected opcode!");
4645      case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
4646      case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
4647      case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
4648      case ISD::OR:  ROpc = X86::OR32rr;  MOpc = X86::OR32rm;  break;
4649      case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
4650      }
4651      break;
4652    case MVT::i64:
4653      switch (Opcode) {
4654      default: llvm_unreachable("Unexpected opcode!");
4655      case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
4656      case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
4657      case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
4658      case ISD::OR:  ROpc = X86::OR64rr;  MOpc = X86::OR64rm;  break;
4659      case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
4660      }
4661      break;
4662    }
4663
4664    // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
4665
4666    // If this is not a subtract, we can still try to fold a load.
4667    if (Opcode != ISD::SUB) {
4668      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4669      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4670        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4671        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4672        MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4673        // Update the chain.
4674        ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
4675        // Record the mem-refs
4676        CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
4677        ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4678        CurDAG->RemoveDeadNode(Node);
4679        return;
4680      }
4681    }
4682
4683    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
4684    return;
4685  }
4686
4687  case X86ISD::SMUL:
4688    // i16/i32/i64 are handled with isel patterns.
4689    if (NVT != MVT::i8)
4690      break;
4691    LLVM_FALLTHROUGH;
4692  case X86ISD::UMUL: {
4693    SDValue N0 = Node->getOperand(0);
4694    SDValue N1 = Node->getOperand(1);
4695
4696    unsigned LoReg, ROpc, MOpc;
4697    switch (NVT.SimpleTy) {
4698    default: llvm_unreachable("Unsupported VT!");
4699    case MVT::i8:
4700      LoReg = X86::AL;
4701      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
4702      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
4703      break;
4704    case MVT::i16:
4705      LoReg = X86::AX;
4706      ROpc = X86::MUL16r;
4707      MOpc = X86::MUL16m;
4708      break;
4709    case MVT::i32:
4710      LoReg = X86::EAX;
4711      ROpc = X86::MUL32r;
4712      MOpc = X86::MUL32m;
4713      break;
4714    case MVT::i64:
4715      LoReg = X86::RAX;
4716      ROpc = X86::MUL64r;
4717      MOpc = X86::MUL64m;
4718      break;
4719    }
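    // (The one-operand MUL/IMUL forms multiply implicitly by AL/AX/EAX/RAX
    // and write the double-width product to AX or to DX:AX / EDX:EAX /
    // RDX:RAX; only the low half is consumed here.)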
4720
4721    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4722    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4723    // Multiply is commutative.
4724    if (!FoldedLoad) {
4725      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4726      if (FoldedLoad)
4727        std::swap(N0, N1);
4728    }
4729
4730    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4731                                          N0, SDValue()).getValue(1);
4732
4733    MachineSDNode *CNode;
4734    if (FoldedLoad) {
4735      // i16/i32/i64 use an instruction that produces a low and high result even
4736      // though only the low result is used.
4737      SDVTList VTs;
4738      if (NVT == MVT::i8)
4739        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4740      else
4741        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
4742
4743      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4744                        InFlag };
4745      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4746
4747      // Update the chain.
4748      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
4749      // Record the mem-refs
4750      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4751    } else {
4752      // i16/i32/i64 use an instruction that produces a low and high result even
4753      // though only the low result is used.
4754      SDVTList VTs;
4755      if (NVT == MVT::i8)
4756        VTs = CurDAG->getVTList(NVT, MVT::i32);
4757      else
4758        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
4759
4760      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
4761    }
4762
4763    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4764    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
4765    CurDAG->RemoveDeadNode(Node);
4766    return;
4767  }
4768
4769  case ISD::SMUL_LOHI:
4770  case ISD::UMUL_LOHI: {
4771    SDValue N0 = Node->getOperand(0);
4772    SDValue N1 = Node->getOperand(1);
4773
4774    unsigned Opc, MOpc;
4775    bool isSigned = Opcode == ISD::SMUL_LOHI;
4776    if (!isSigned) {
4777      switch (NVT.SimpleTy) {
4778      default: llvm_unreachable("Unsupported VT!");
4779      case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
4780      case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
4781      }
4782    } else {
4783      switch (NVT.SimpleTy) {
4784      default: llvm_unreachable("Unsupported VT!");
4785      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
4786      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
4787      }
4788    }
4789
4790    unsigned SrcReg, LoReg, HiReg;
4791    switch (Opc) {
4792    default: llvm_unreachable("Unknown MUL opcode!");
4793    case X86::IMUL32r:
4794    case X86::MUL32r:
4795      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
4796      break;
4797    case X86::IMUL64r:
4798    case X86::MUL64r:
4799      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
4800      break;
4801    }
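    // (Sketch: a 64-bit (umul_lohi x, y) becomes MUL64r with x preloaded
    // into RAX; the low half is then copied out of RAX and the high half
    // out of RDX, each only if actually used.)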
4802
4803    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4804    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4805    // Multiply is commutative.
4806    if (!foldedLoad) {
4807      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4808      if (foldedLoad)
4809        std::swap(N0, N1);
4810    }
4811
4812    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
4813                                          N0, SDValue()).getValue(1);
4814    if (foldedLoad) {
4815      SDValue Chain;
4816      MachineSDNode *CNode = nullptr;
4817      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4818                        InFlag };
4819      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4820      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4821      Chain = SDValue(CNode, 0);
4822      InFlag = SDValue(CNode, 1);
4823
4824      // Update the chain.
4825      ReplaceUses(N1.getValue(1), Chain);
4826      // Record the mem-refs
4827      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4828    } else {
4829      SDValue Ops[] = { N1, InFlag };
4830      SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4831      SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4832      InFlag = SDValue(CNode, 0);
4833    }
4834
4835    // Copy the low half of the result, if it is needed.
4836    if (!SDValue(Node, 0).use_empty()) {
4837      assert(LoReg && "Register for low half is not defined!");
4838      SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4839                                             NVT, InFlag);
4840      InFlag = ResLo.getValue(2);
4841      ReplaceUses(SDValue(Node, 0), ResLo);
4842      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
4843                 dbgs() << '\n');
4844    }
4845    // Copy the high half of the result, if it is needed.
4846    if (!SDValue(Node, 1).use_empty()) {
4847      assert(HiReg && "Register for high half is not defined!");
4848      SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4849                                             NVT, InFlag);
4850      InFlag = ResHi.getValue(2);
4851      ReplaceUses(SDValue(Node, 1), ResHi);
4852      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
4853                 dbgs() << '\n');
4854    }
4855
4856    CurDAG->RemoveDeadNode(Node);
4857    return;
4858  }
4859
4860  case ISD::SDIVREM:
4861  case ISD::UDIVREM: {
4862    SDValue N0 = Node->getOperand(0);
4863    SDValue N1 = Node->getOperand(1);
4864
4865    unsigned Opc, MOpc;
4866    bool isSigned = Opcode == ISD::SDIVREM;
4867    if (!isSigned) {
4868      switch (NVT.SimpleTy) {
4869      default: llvm_unreachable("Unsupported VT!");
4870      case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
4871      case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
4872      case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
4873      case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
4874      }
4875    } else {
4876      switch (NVT.SimpleTy) {
4877      default: llvm_unreachable("Unsupported VT!");
4878      case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
4879      case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
4880      case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
4881      case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
4882      }
4883    }
4884
4885    unsigned LoReg, HiReg, ClrReg;
4886    unsigned SExtOpcode;
4887    switch (NVT.SimpleTy) {
4888    default: llvm_unreachable("Unsupported VT!");
4889    case MVT::i8:
4890      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
4891      SExtOpcode = 0; // Not used.
4892      break;
4893    case MVT::i16:
4894      LoReg = X86::AX;  HiReg = X86::DX;
4895      ClrReg = X86::DX;
4896      SExtOpcode = X86::CWD;
4897      break;
4898    case MVT::i32:
4899      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
4900      SExtOpcode = X86::CDQ;
4901      break;
4902    case MVT::i64:
4903      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
4904      SExtOpcode = X86::CQO;
4905      break;
4906    }
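    // (For example, for 32-bit udiv, DIV r/m32 divides EDX:EAX and leaves
    // the quotient in EAX and the remainder in EDX, so EDX must first be
    // zeroed; sdiv instead sign-fills it with CDQ.)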
4907
4908    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4909    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4910    bool signBitIsZero = CurDAG->SignBitIsZero(N0);
4911
4912    SDValue InFlag;
4913    if (NVT == MVT::i8) {
4914      // Special case for div8, just use a move with zero extension to AX to
4915      // clear the upper 8 bits (AH).
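      // (DIV r/m8 divides the full 16-bit AX and leaves the quotient in AL
      // and the remainder in AH.)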
4916      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
4917      MachineSDNode *Move;
4918      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4919        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4920        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
4921                                                    : X86::MOVZX16rm8;
4922        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
4923        Chain = SDValue(Move, 1);
4924        ReplaceUses(N0.getValue(1), Chain);
4925        // Record the mem-refs
4926        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
4927      } else {
4928        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
4929                                                    : X86::MOVZX16rr8;
4930        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
4931        Chain = CurDAG->getEntryNode();
4932      }
4933      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
4934                                    SDValue());
4935      InFlag = Chain.getValue(1);
4936    } else {
4937      InFlag =
4938        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
4939                             LoReg, N0, SDValue()).getValue(1);
4940      if (isSigned && !signBitIsZero) {
4941        // Sign extend the low part into the high part.
4942        InFlag =
4943          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
4944      } else {
4945        // Zero out the high part, effectively zero extending the input.
4946        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
4947        switch (NVT.SimpleTy) {
4948        case MVT::i16:
4949          ClrNode =
4950              SDValue(CurDAG->getMachineNode(
4951                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
4952                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
4953                                                    MVT::i32)),
4954                      0);
4955          break;
4956        case MVT::i32:
4957          break;
4958        case MVT::i64:
4959          ClrNode =
4960              SDValue(CurDAG->getMachineNode(
4961                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
4962                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
4963                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
4964                                                    MVT::i32)),
4965                      0);
4966          break;
4967        default:
4968          llvm_unreachable("Unexpected division source");
4969        }
4970
4971        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
4972                                      ClrNode, InFlag).getValue(1);
4973      }
4974    }
4975
4976    if (foldedLoad) {
4977      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4978                        InFlag };
4979      MachineSDNode *CNode =
4980        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
4981      InFlag = SDValue(CNode, 1);
4982      // Update the chain.
4983      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
4984      // Record the mem-refs
4985      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4986    } else {
4987      InFlag =
4988        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
4989    }
4990
4991    // Prevent use of AH in a REX instruction by explicitly copying it to
4992    // an ABCD_L register.
4993    //
4994    // The current assumption of the register allocator is that isel
4995    // won't generate explicit references to the GR8_ABCD_H registers. If
4996    // the allocator and/or the backend get enhanced to be more robust in
4997    // that regard, this can be, and should be, removed.
4998    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
4999      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5000      unsigned AHExtOpcode =
5001          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5002
5003      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5004                                             MVT::Glue, AHCopy, InFlag);
5005      SDValue Result(RNode, 0);
5006      InFlag = SDValue(RNode, 1);
5007
5008      Result =
5009          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
5010
5011      ReplaceUses(SDValue(Node, 1), Result);
5012      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5013                 dbgs() << '\n');
5014    }
5015    // Copy the division (low) result, if it is needed.
5016    if (!SDValue(Node, 0).use_empty()) {
5017      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5018                                                LoReg, NVT, InFlag);
5019      InFlag = Result.getValue(2);
5020      ReplaceUses(SDValue(Node, 0), Result);
5021      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5022                 dbgs() << '\n');
5023    }
5024    // Copy the remainder (high) result, if it is needed.
5025    if (!SDValue(Node, 1).use_empty()) {
5026      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5027                                              HiReg, NVT, InFlag);
5028      InFlag = Result.getValue(2);
5029      ReplaceUses(SDValue(Node, 1), Result);
5030      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5031                 dbgs() << '\n');
5032    }
5033    CurDAG->RemoveDeadNode(Node);
5034    return;
5035  }
5036
5037  case X86ISD::CMP: {
5038    SDValue N0 = Node->getOperand(0);
5039    SDValue N1 = Node->getOperand(1);
5040
5041    // Optimizations for TEST compares.
5042    if (!isNullConstant(N1))
5043      break;
5044
5045    // Save the original VT of the compare.
5046    MVT CmpVT = N0.getSimpleValueType();
5047
5048    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5049    // by a test instruction. The test should be removed later by
5050    // analyzeCompare if we are using only the zero flag.
5051    // TODO: Should we check the users and use the BEXTR flags directly?
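    // (For example, ((X >> 4) & 0xFFF) == 0 can become BEXTR with control
    // 0x0C04 (start = 4 in the low byte, length = 12 in the next byte),
    // followed by a TEST of the result against itself.)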
5052    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
5053      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
5054        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
5055                                             : X86::TEST32rr;
5056        SDValue BEXTR = SDValue(NewNode, 0);
5057        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
5058        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5059        CurDAG->RemoveDeadNode(Node);
5060        return;
5061      }
5062    }
5063
5064    // We can peek through truncates, but we need to be careful below.
5065    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5066      N0 = N0.getOperand(0);
5067
5068    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5069    // use a smaller encoding.
5070    // Look past the truncate if CMP is the only use of it.
5071    if (N0.getOpcode() == ISD::AND &&
5072        N0.getNode()->hasOneUse() &&
5073        N0.getValueType() != MVT::i8) {
5074      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5075      if (!C) break;
5076      uint64_t Mask = C->getZExtValue();
5077
5078      // Check if we can replace AND+IMM64 with a shift. This is possible for
5079      // contiguous masks like 0xFFFFFFFF00000000 or 0x0000FFFFFFFFFFFF when we
5080      // care only about the zero flag.
5081      if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
5082          onlyUsesZeroFlag(SDValue(Node, 0))) {
5083        if (isMask_64(~Mask)) {
5084          unsigned TrailingZeros = countTrailingZeros(Mask);
5085          SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
5086          SDValue Shift =
5087            SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
5088                                           N0.getOperand(0), Imm), 0);
5089          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5090                                                       MVT::i32, Shift, Shift);
5091          ReplaceNode(Node, Test);
5092          return;
5093        }
5094        if (isMask_64(Mask)) {
5095          unsigned LeadingZeros = countLeadingZeros(Mask);
5096          SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
5097          SDValue Shift =
5098            SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
5099                                           N0.getOperand(0), Imm), 0);
5100          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5101                                                       MVT::i32, Shift, Shift);
5102          ReplaceNode(Node, Test);
5103          return;
5104        }
5105      }
5106
5107      MVT VT;
5108      int SubRegOp;
5109      unsigned ROpc, MOpc;
5110
5111      // For each of these checks we need to be careful if the sign flag is
5112      // being used. It is only safe to use the sign flag in two conditions,
5113      // either the sign bit in the shrunken mask is zero or the final test
5114      // size is equal to the original compare size.
5115
5116      if (isUInt<8>(Mask) &&
5117          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
5118           hasNoSignFlagUses(SDValue(Node, 0)))) {
5119        // For example, convert "testl %eax, $8" to "testb %al, $8"
5120        VT = MVT::i8;
5121        SubRegOp = X86::sub_8bit;
5122        ROpc = X86::TEST8ri;
5123        MOpc = X86::TEST8mi;
5124      } else if (OptForMinSize && isUInt<16>(Mask) &&
5125                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
5126                  hasNoSignFlagUses(SDValue(Node, 0)))) {
5127        // For example, "testl %eax, $32776" to "testw %ax, $32776".
5128        // NOTE: We only want to form TESTW instructions if optimizing for
5129        // min size. Otherwise we only save one byte and possibly get a length
5130        // changing prefix penalty in the decoders.
5131        VT = MVT::i16;
5132        SubRegOp = X86::sub_16bit;
5133        ROpc = X86::TEST16ri;
5134        MOpc = X86::TEST16mi;
5135      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
5136                 ((!(Mask & 0x80000000) &&
5137                   // Without minsize 16-bit Cmps can get here so we need to
5138                   // be sure we calculate the correct sign flag if needed.
5139                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
5140                  CmpVT == MVT::i32 ||
5141                  hasNoSignFlagUses(SDValue(Node, 0)))) {
5142        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
5143        // NOTE: We only want to run this transform if N0 is 32 or 64 bits.
5144        // Otherwise, we find ourselves in a position where we have to do
5145        // promotion. If previous passes did not promote the and, we assume
5146        // they had a good reason not to and do not promote here.
5147        VT = MVT::i32;
5148        SubRegOp = X86::sub_32bit;
5149        ROpc = X86::TEST32ri;
5150        MOpc = X86::TEST32mi;
5151      } else {
5152        // No eligible transformation was found.
5153        break;
5154      }
5155
5156      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
5157      SDValue Reg = N0.getOperand(0);
5158
5159      // Emit a testl or testw.
5160      MachineSDNode *NewNode;
5161      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5162      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5163        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
5164          if (!LoadN->isSimple()) {
5165            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
5166            if (MOpc == X86::TEST8mi && NumVolBits != 8)
5167              break;
5168            else if (MOpc == X86::TEST16mi && NumVolBits != 16)
5169              break;
5170            else if (MOpc == X86::TEST32mi && NumVolBits != 32)
5171              break;
5172          }
5173        }
5174        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
5175                          Reg.getOperand(0) };
5176        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
5177        // Update the chain.
5178        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
5179        // Record the mem-refs
5180        CurDAG->setNodeMemRefs(NewNode,
5181                               {cast<LoadSDNode>(Reg)->getMemOperand()});
5182      } else {
5183        // Extract the subregister if necessary.
5184        if (N0.getValueType() != VT)
5185          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
5186
5187        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
5188      }
5189      // Replace CMP with TEST.
5190      ReplaceNode(Node, NewNode);
5191      return;
5192    }
5193    break;
5194  }
5195  case X86ISD::PCMPISTR: {
5196    if (!Subtarget->hasSSE42())
5197      break;
5198
5199    bool NeedIndex = !SDValue(Node, 0).use_empty();
5200    bool NeedMask = !SDValue(Node, 1).use_empty();
5201    // We can't fold a load if we are going to make two instructions.
5202    bool MayFoldLoad = !NeedIndex || !NeedMask;
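    // (Folding the same load into both the index and the mask forms would
    // duplicate the memory access, so folding is only attempted when one
    // instruction suffices.)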
5203
5204    MachineSDNode *CNode;
5205    if (NeedMask) {
5206      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
5207      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
5208      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
5209      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5210    }
5211    if (NeedIndex || !NeedMask) {
5212      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
5213      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
5214      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
5215      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5216    }
5217
5218    // Connect the flag usage to the last instruction created.
5219    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5220    CurDAG->RemoveDeadNode(Node);
5221    return;
5222  }
5223  case X86ISD::PCMPESTR: {
5224    if (!Subtarget->hasSSE42())
5225      break;
5226
5227    // Copy the two implicit register inputs.
5228    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
5229                                          Node->getOperand(1),
5230                                          SDValue()).getValue(1);
5231    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
5232                                  Node->getOperand(3), InFlag).getValue(1);
5233
5234    bool NeedIndex = !SDValue(Node, 0).use_empty();
5235    bool NeedMask = !SDValue(Node, 1).use_empty();
5236    // We can't fold a load if we are going to make two instructions.
5237    bool MayFoldLoad = !NeedIndex || !NeedMask;
5238
5239    MachineSDNode *CNode;
5240    if (NeedMask) {
5241      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
5242      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
5243      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
5244                           InFlag);
5245      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5246    }
5247    if (NeedIndex || !NeedMask) {
5248      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
5249      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
5250      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
5251      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5252    }
5253    // Connect the flag usage to the last instruction created.
5254    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5255    CurDAG->RemoveDeadNode(Node);
5256    return;
5257  }
5258
5259  case ISD::SETCC: {
5260    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
5261      return;
5262
5263    break;
5264  }
5265
5266  case ISD::STORE:
5267    if (foldLoadStoreIntoMemOperand(Node))
5268      return;
5269    break;
5270  }
5271
5272  SelectCode(Node);
5273}
5274
5275bool X86DAGToDAGISel::
5276SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
5277                             std::vector<SDValue> &OutOps) {
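  // By SelectionDAGISel convention this hook returns true on failure. On
  // success, the five operands pushed below form the standard x86 memory
  // reference tuple: base, scale, index, displacement, segment.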
5278  SDValue Op0, Op1, Op2, Op3, Op4;
5279  switch (ConstraintID) {
5280  default:
5281    llvm_unreachable("Unexpected asm memory constraint");
5282  case InlineAsm::Constraint_o: // offsetable        ??
5283  case InlineAsm::Constraint_v: // not offsetable    ??
5284  case InlineAsm::Constraint_m: // memory
5285  case InlineAsm::Constraint_X:
5286    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
5287      return true;
5288    break;
5289  }
5290
5291  OutOps.push_back(Op0);
5292  OutOps.push_back(Op1);
5293  OutOps.push_back(Op2);
5294  OutOps.push_back(Op3);
5295  OutOps.push_back(Op4);
5296  return false;
5297}
5298
5299/// This pass converts a legalized DAG into an X86-specific DAG,
5300/// ready for instruction scheduling.
5301FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
5302                                     CodeGenOpt::Level OptLevel) {
5303  return new X86DAGToDAGISel(TM, OptLevel);
5304}
5305