1//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the AArch64 target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64TargetMachine.h"
14#include "MCTargetDesc/AArch64AddressingModes.h"
15#include "llvm/ADT/APSInt.h"
16#include "llvm/CodeGen/SelectionDAGISel.h"
17#include "llvm/IR/Function.h" // To access function attributes.
18#include "llvm/IR/GlobalValue.h"
19#include "llvm/IR/Intrinsics.h"
20#include "llvm/IR/IntrinsicsAArch64.h"
21#include "llvm/Support/Debug.h"
22#include "llvm/Support/ErrorHandling.h"
23#include "llvm/Support/KnownBits.h"
24#include "llvm/Support/MathExtras.h"
25#include "llvm/Support/raw_ostream.h"
26
27using namespace llvm;
28
29#define DEBUG_TYPE "aarch64-isel"
30
31//===--------------------------------------------------------------------===//
32/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
33/// instructions for SelectionDAG operations.
34///
35namespace {
36
37class AArch64DAGToDAGISel : public SelectionDAGISel {
38
39  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
40  /// make the right decision when generating code for different targets.
41  const AArch64Subtarget *Subtarget;
42
43public:
44  explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
45                               CodeGenOpt::Level OptLevel)
46      : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr) {}
47
48  StringRef getPassName() const override {
49    return "AArch64 Instruction Selection";
50  }
51
52  bool runOnMachineFunction(MachineFunction &MF) override {
53    Subtarget = &MF.getSubtarget<AArch64Subtarget>();
54    return SelectionDAGISel::runOnMachineFunction(MF);
55  }
56
57  void Select(SDNode *Node) override;
58
59  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
60  /// inline asm expressions.
61  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
62                                    unsigned ConstraintID,
63                                    std::vector<SDValue> &OutOps) override;
64
65  bool tryMLAV64LaneV128(SDNode *N);
66  bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
67  bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
68  bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
69  bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
70  bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
71    return SelectShiftedRegister(N, false, Reg, Shift);
72  }
73  bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
74    return SelectShiftedRegister(N, true, Reg, Shift);
75  }
76  bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
77    return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
78  }
79  bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
80    return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
81  }
82  bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
83    return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
84  }
85  bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
86    return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
87  }
88  bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
89    return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
90  }
91  bool SelectAddrModeIndexedS9S128(SDValue N, SDValue &Base, SDValue &OffImm) {
92    return SelectAddrModeIndexedBitWidth(N, true, 9, 16, Base, OffImm);
93  }
94  bool SelectAddrModeIndexedU6S128(SDValue N, SDValue &Base, SDValue &OffImm) {
95    return SelectAddrModeIndexedBitWidth(N, false, 6, 16, Base, OffImm);
96  }
97  bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
98    return SelectAddrModeIndexed(N, 1, Base, OffImm);
99  }
100  bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
101    return SelectAddrModeIndexed(N, 2, Base, OffImm);
102  }
103  bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
104    return SelectAddrModeIndexed(N, 4, Base, OffImm);
105  }
106  bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
107    return SelectAddrModeIndexed(N, 8, Base, OffImm);
108  }
109  bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
110    return SelectAddrModeIndexed(N, 16, Base, OffImm);
111  }
112  bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
113    return SelectAddrModeUnscaled(N, 1, Base, OffImm);
114  }
115  bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
116    return SelectAddrModeUnscaled(N, 2, Base, OffImm);
117  }
118  bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
119    return SelectAddrModeUnscaled(N, 4, Base, OffImm);
120  }
121  bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
122    return SelectAddrModeUnscaled(N, 8, Base, OffImm);
123  }
124  bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
125    return SelectAddrModeUnscaled(N, 16, Base, OffImm);
126  }
127
128  template<int Width>
129  bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
130                         SDValue &SignExtend, SDValue &DoShift) {
131    return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
132  }
133
134  template<int Width>
135  bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
136                         SDValue &SignExtend, SDValue &DoShift) {
137    return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
138  }
139
140  bool SelectDupZeroOrUndef(SDValue N) {
141    switch(N->getOpcode()) {
142    case ISD::UNDEF:
143      return true;
144    case AArch64ISD::DUP:
145    case ISD::SPLAT_VECTOR: {
146      auto Opnd0 = N->getOperand(0);
147      if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
148        if (CN->isNullValue())
149          return true;
150      if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
151        if (CN->isZero())
152          return true;
153      break;
154    }
155    default:
156      break;
157    }
158
159    return false;
160  }
161
162  template<MVT::SimpleValueType VT>
163  bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
164    return SelectSVEAddSubImm(N, VT, Imm, Shift);
165  }
166
167  template<MVT::SimpleValueType VT>
168  bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
169    return SelectSVELogicalImm(N, VT, Imm);
170  }
171
172  // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
173  template<signed Min, signed Max, signed Scale, bool Shift>
174  bool SelectCntImm(SDValue N, SDValue &Imm) {
175    if (!isa<ConstantSDNode>(N))
176      return false;
177
178    int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
179    if (Shift)
180      MulImm = 1LL << MulImm;
181
182    if ((MulImm % std::abs(Scale)) != 0)
183      return false;
184
185    MulImm /= Scale;
186    if ((MulImm >= Min) && (MulImm <= Max)) {
187      Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32);
188      return true;
189    }
190
191    return false;
192  }
193
194  /// Form sequences of consecutive 64/128-bit registers for use in NEON
195  /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
196  /// between 1 and 4 elements. If it contains a single element that is returned
197  /// unchanged; otherwise a REG_SEQUENCE value is returned.
198  SDValue createDTuple(ArrayRef<SDValue> Vecs);
199  SDValue createQTuple(ArrayRef<SDValue> Vecs);
200
201  /// Generic helper for the createDTuple/createQTuple
202  /// functions. Those should almost always be called instead.
203  SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
204                      const unsigned SubRegs[]);
205
206  void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
207
208  bool tryIndexedLoad(SDNode *N);
209
210  bool trySelectStackSlotTagP(SDNode *N);
211  void SelectTagP(SDNode *N);
212
213  void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
214                     unsigned SubRegIdx);
215  void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
216                         unsigned SubRegIdx);
217  void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
218  void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
219
220  void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
221  void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
222  void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
223  void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
224
225  bool tryBitfieldExtractOp(SDNode *N);
226  bool tryBitfieldExtractOpFromSExt(SDNode *N);
227  bool tryBitfieldInsertOp(SDNode *N);
228  bool tryBitfieldInsertInZeroOp(SDNode *N);
229  bool tryShiftAmountMod(SDNode *N);
230  bool tryHighFPExt(SDNode *N);
231
232  bool tryReadRegister(SDNode *N);
233  bool tryWriteRegister(SDNode *N);
234
235// Include the pieces autogenerated from the target description.
236#include "AArch64GenDAGISel.inc"
237
238private:
239  bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
240                             SDValue &Shift);
241  bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
242                               SDValue &OffImm) {
243    return SelectAddrModeIndexedBitWidth(N, true, 7, Size, Base, OffImm);
244  }
245  bool SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, unsigned BW,
246                                     unsigned Size, SDValue &Base,
247                                     SDValue &OffImm);
248  bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
249                             SDValue &OffImm);
250  bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
251                              SDValue &OffImm);
252  bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
253                         SDValue &Offset, SDValue &SignExtend,
254                         SDValue &DoShift);
255  bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
256                         SDValue &Offset, SDValue &SignExtend,
257                         SDValue &DoShift);
258  bool isWorthFolding(SDValue V) const;
259  bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
260                         SDValue &Offset, SDValue &SignExtend);
261
262  template<unsigned RegWidth>
263  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
264    return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
265  }
266
267  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
268
269  bool SelectCMP_SWAP(SDNode *N);
270
271  bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
272
273  bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
274
275  bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
276
277  bool SelectSVEArithImm(SDValue N, SDValue &Imm);
278};
279} // end anonymous namespace
280
281/// isIntImmediate - This method tests to see if the node is a constant
282/// operand. If so Imm will receive the 32-bit value.
283static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
284  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
285    Imm = C->getZExtValue();
286    return true;
287  }
288  return false;
289}
290
291// isIntImmediate - This method tests to see if a constant operand.
292// If so Imm will receive the value.
293static bool isIntImmediate(SDValue N, uint64_t &Imm) {
294  return isIntImmediate(N.getNode(), Imm);
295}
296
297// isOpcWithIntImmediate - This method tests to see if the node is a specific
298// opcode and that it has a immediate integer right operand.
299// If so Imm will receive the 32 bit value.
300static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
301                                  uint64_t &Imm) {
302  return N->getOpcode() == Opc &&
303         isIntImmediate(N->getOperand(1).getNode(), Imm);
304}
305
306bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
307    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
308  switch(ConstraintID) {
309  default:
310    llvm_unreachable("Unexpected asm memory constraint");
311  case InlineAsm::Constraint_m:
312  case InlineAsm::Constraint_Q:
313    // We need to make sure that this one operand does not end up in XZR, thus
314    // require the address to be in a PointerRegClass register.
315    const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
316    const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF);
317    SDLoc dl(Op);
318    SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64);
319    SDValue NewOp =
320        SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
321                                       dl, Op.getValueType(),
322                                       Op, RC), 0);
323    OutOps.push_back(NewOp);
324    return false;
325  }
326  return true;
327}
328
329/// SelectArithImmed - Select an immediate value that can be represented as
330/// a 12-bit value shifted left by either 0 or 12.  If so, return true with
331/// Val set to the 12-bit value and Shift set to the shifter operand.
332bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
333                                           SDValue &Shift) {
334  // This function is called from the addsub_shifted_imm ComplexPattern,
335  // which lists [imm] as the list of opcode it's interested in, however
336  // we still need to check whether the operand is actually an immediate
337  // here because the ComplexPattern opcode list is only used in
338  // root-level opcode matching.
339  if (!isa<ConstantSDNode>(N.getNode()))
340    return false;
341
342  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
343  unsigned ShiftAmt;
344
345  if (Immed >> 12 == 0) {
346    ShiftAmt = 0;
347  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
348    ShiftAmt = 12;
349    Immed = Immed >> 12;
350  } else
351    return false;
352
353  unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
354  SDLoc dl(N);
355  Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32);
356  Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32);
357  return true;
358}
359
360/// SelectNegArithImmed - As above, but negates the value before trying to
361/// select it.
362bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
363                                              SDValue &Shift) {
364  // This function is called from the addsub_shifted_imm ComplexPattern,
365  // which lists [imm] as the list of opcode it's interested in, however
366  // we still need to check whether the operand is actually an immediate
367  // here because the ComplexPattern opcode list is only used in
368  // root-level opcode matching.
369  if (!isa<ConstantSDNode>(N.getNode()))
370    return false;
371
372  // The immediate operand must be a 24-bit zero-extended immediate.
373  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
374
375  // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
376  // have the opposite effect on the C flag, so this pattern mustn't match under
377  // those circumstances.
378  if (Immed == 0)
379    return false;
380
381  if (N.getValueType() == MVT::i32)
382    Immed = ~((uint32_t)Immed) + 1;
383  else
384    Immed = ~Immed + 1ULL;
385  if (Immed & 0xFFFFFFFFFF000000ULL)
386    return false;
387
388  Immed &= 0xFFFFFFULL;
389  return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val,
390                          Shift);
391}
392
393/// getShiftTypeForNode - Translate a shift node to the corresponding
394/// ShiftType value.
395static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
396  switch (N.getOpcode()) {
397  default:
398    return AArch64_AM::InvalidShiftExtend;
399  case ISD::SHL:
400    return AArch64_AM::LSL;
401  case ISD::SRL:
402    return AArch64_AM::LSR;
403  case ISD::SRA:
404    return AArch64_AM::ASR;
405  case ISD::ROTR:
406    return AArch64_AM::ROR;
407  }
408}
409
410/// Determine whether it is worth it to fold SHL into the addressing
411/// mode.
412static bool isWorthFoldingSHL(SDValue V) {
413  assert(V.getOpcode() == ISD::SHL && "invalid opcode");
414  // It is worth folding logical shift of up to three places.
415  auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
416  if (!CSD)
417    return false;
418  unsigned ShiftVal = CSD->getZExtValue();
419  if (ShiftVal > 3)
420    return false;
421
422  // Check if this particular node is reused in any non-memory related
423  // operation.  If yes, do not try to fold this node into the address
424  // computation, since the computation will be kept.
425  const SDNode *Node = V.getNode();
426  for (SDNode *UI : Node->uses())
427    if (!isa<MemSDNode>(*UI))
428      for (SDNode *UII : UI->uses())
429        if (!isa<MemSDNode>(*UII))
430          return false;
431  return true;
432}
433
434/// Determine whether it is worth to fold V into an extended register.
435bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
436  // Trivial if we are optimizing for code size or if there is only
437  // one use of the value.
438  if (CurDAG->shouldOptForSize() || V.hasOneUse())
439    return true;
440  // If a subtarget has a fastpath LSL we can fold a logical shift into
441  // the addressing mode and save a cycle.
442  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
443      isWorthFoldingSHL(V))
444    return true;
445  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
446    const SDValue LHS = V.getOperand(0);
447    const SDValue RHS = V.getOperand(1);
448    if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
449      return true;
450    if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
451      return true;
452  }
453
454  // It hurts otherwise, since the value will be reused.
455  return false;
456}
457
458/// SelectShiftedRegister - Select a "shifted register" operand.  If the value
459/// is not shifted, set the Shift operand to default of "LSL 0".  The logical
460/// instructions allow the shifted register to be rotated, but the arithmetic
461/// instructions do not.  The AllowROR parameter specifies whether ROR is
462/// supported.
463bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
464                                                SDValue &Reg, SDValue &Shift) {
465  AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
466  if (ShType == AArch64_AM::InvalidShiftExtend)
467    return false;
468  if (!AllowROR && ShType == AArch64_AM::ROR)
469    return false;
470
471  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
472    unsigned BitSize = N.getValueSizeInBits();
473    unsigned Val = RHS->getZExtValue() & (BitSize - 1);
474    unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
475
476    Reg = N.getOperand(0);
477    Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
478    return isWorthFolding(N);
479  }
480
481  return false;
482}
483
484/// getExtendTypeForNode - Translate an extend node to the corresponding
485/// ExtendType value.
486static AArch64_AM::ShiftExtendType
487getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
488  if (N.getOpcode() == ISD::SIGN_EXTEND ||
489      N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
490    EVT SrcVT;
491    if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
492      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
493    else
494      SrcVT = N.getOperand(0).getValueType();
495
496    if (!IsLoadStore && SrcVT == MVT::i8)
497      return AArch64_AM::SXTB;
498    else if (!IsLoadStore && SrcVT == MVT::i16)
499      return AArch64_AM::SXTH;
500    else if (SrcVT == MVT::i32)
501      return AArch64_AM::SXTW;
502    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
503
504    return AArch64_AM::InvalidShiftExtend;
505  } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
506             N.getOpcode() == ISD::ANY_EXTEND) {
507    EVT SrcVT = N.getOperand(0).getValueType();
508    if (!IsLoadStore && SrcVT == MVT::i8)
509      return AArch64_AM::UXTB;
510    else if (!IsLoadStore && SrcVT == MVT::i16)
511      return AArch64_AM::UXTH;
512    else if (SrcVT == MVT::i32)
513      return AArch64_AM::UXTW;
514    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
515
516    return AArch64_AM::InvalidShiftExtend;
517  } else if (N.getOpcode() == ISD::AND) {
518    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
519    if (!CSD)
520      return AArch64_AM::InvalidShiftExtend;
521    uint64_t AndMask = CSD->getZExtValue();
522
523    switch (AndMask) {
524    default:
525      return AArch64_AM::InvalidShiftExtend;
526    case 0xFF:
527      return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
528    case 0xFFFF:
529      return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
530    case 0xFFFFFFFF:
531      return AArch64_AM::UXTW;
532    }
533  }
534
535  return AArch64_AM::InvalidShiftExtend;
536}
537
538// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
539static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
540  if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
541      DL->getOpcode() != AArch64ISD::DUPLANE32)
542    return false;
543
544  SDValue SV = DL->getOperand(0);
545  if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
546    return false;
547
548  SDValue EV = SV.getOperand(1);
549  if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
550    return false;
551
552  ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
553  ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
554  LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
555  LaneOp = EV.getOperand(0);
556
557  return true;
558}
559
560// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
561// high lane extract.
562static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
563                             SDValue &LaneOp, int &LaneIdx) {
564
565  if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
566    std::swap(Op0, Op1);
567    if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
568      return false;
569  }
570  StdOp = Op1;
571  return true;
572}
573
574/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
575/// is a lane in the upper half of a 128-bit vector.  Recognize and select this
576/// so that we don't emit unnecessary lane extracts.
577bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
578  SDLoc dl(N);
579  SDValue Op0 = N->getOperand(0);
580  SDValue Op1 = N->getOperand(1);
581  SDValue MLAOp1;   // Will hold ordinary multiplicand for MLA.
582  SDValue MLAOp2;   // Will hold lane-accessed multiplicand for MLA.
583  int LaneIdx = -1; // Will hold the lane index.
584
585  if (Op1.getOpcode() != ISD::MUL ||
586      !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
587                        LaneIdx)) {
588    std::swap(Op0, Op1);
589    if (Op1.getOpcode() != ISD::MUL ||
590        !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
591                          LaneIdx))
592      return false;
593  }
594
595  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
596
597  SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
598
599  unsigned MLAOpc = ~0U;
600
601  switch (N->getSimpleValueType(0).SimpleTy) {
602  default:
603    llvm_unreachable("Unrecognized MLA.");
604  case MVT::v4i16:
605    MLAOpc = AArch64::MLAv4i16_indexed;
606    break;
607  case MVT::v8i16:
608    MLAOpc = AArch64::MLAv8i16_indexed;
609    break;
610  case MVT::v2i32:
611    MLAOpc = AArch64::MLAv2i32_indexed;
612    break;
613  case MVT::v4i32:
614    MLAOpc = AArch64::MLAv4i32_indexed;
615    break;
616  }
617
618  ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
619  return true;
620}
621
622bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
623  SDLoc dl(N);
624  SDValue SMULLOp0;
625  SDValue SMULLOp1;
626  int LaneIdx;
627
628  if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
629                        LaneIdx))
630    return false;
631
632  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
633
634  SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
635
636  unsigned SMULLOpc = ~0U;
637
638  if (IntNo == Intrinsic::aarch64_neon_smull) {
639    switch (N->getSimpleValueType(0).SimpleTy) {
640    default:
641      llvm_unreachable("Unrecognized SMULL.");
642    case MVT::v4i32:
643      SMULLOpc = AArch64::SMULLv4i16_indexed;
644      break;
645    case MVT::v2i64:
646      SMULLOpc = AArch64::SMULLv2i32_indexed;
647      break;
648    }
649  } else if (IntNo == Intrinsic::aarch64_neon_umull) {
650    switch (N->getSimpleValueType(0).SimpleTy) {
651    default:
652      llvm_unreachable("Unrecognized SMULL.");
653    case MVT::v4i32:
654      SMULLOpc = AArch64::UMULLv4i16_indexed;
655      break;
656    case MVT::v2i64:
657      SMULLOpc = AArch64::UMULLv2i32_indexed;
658      break;
659    }
660  } else
661    llvm_unreachable("Unrecognized intrinsic.");
662
663  ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
664  return true;
665}
666
667/// Instructions that accept extend modifiers like UXTW expect the register
668/// being extended to be a GPR32, but the incoming DAG might be acting on a
669/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
670/// this is the case.
671static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
672  if (N.getValueType() == MVT::i32)
673    return N;
674
675  SDLoc dl(N);
676  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
677  MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
678                                               dl, MVT::i32, N, SubReg);
679  return SDValue(Node, 0);
680}
681
682
683/// SelectArithExtendedRegister - Select a "extended register" operand.  This
684/// operand folds in an extend followed by an optional left shift.
685bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
686                                                      SDValue &Shift) {
687  unsigned ShiftVal = 0;
688  AArch64_AM::ShiftExtendType Ext;
689
690  if (N.getOpcode() == ISD::SHL) {
691    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
692    if (!CSD)
693      return false;
694    ShiftVal = CSD->getZExtValue();
695    if (ShiftVal > 4)
696      return false;
697
698    Ext = getExtendTypeForNode(N.getOperand(0));
699    if (Ext == AArch64_AM::InvalidShiftExtend)
700      return false;
701
702    Reg = N.getOperand(0).getOperand(0);
703  } else {
704    Ext = getExtendTypeForNode(N);
705    if (Ext == AArch64_AM::InvalidShiftExtend)
706      return false;
707
708    Reg = N.getOperand(0);
709
710    // Don't match if free 32-bit -> 64-bit zext can be used instead.
711    if (Ext == AArch64_AM::UXTW &&
712        Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
713      return false;
714  }
715
716  // AArch64 mandates that the RHS of the operation must use the smallest
717  // register class that could contain the size being extended from.  Thus,
718  // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
719  // there might not be an actual 32-bit value in the program.  We can
720  // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
721  assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
722  Reg = narrowIfNeeded(CurDAG, Reg);
723  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
724                                    MVT::i32);
725  return isWorthFolding(N);
726}
727
728/// If there's a use of this ADDlow that's not itself a load/store then we'll
729/// need to create a real ADD instruction from it anyway and there's no point in
730/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
731/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
732/// leads to duplicated ADRP instructions.
733static bool isWorthFoldingADDlow(SDValue N) {
734  for (auto Use : N->uses()) {
735    if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
736        Use->getOpcode() != ISD::ATOMIC_LOAD &&
737        Use->getOpcode() != ISD::ATOMIC_STORE)
738      return false;
739
740    // ldar and stlr have much more restrictive addressing modes (just a
741    // register).
742    if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
743      return false;
744  }
745
746  return true;
747}
748
749/// SelectAddrModeIndexedBitWidth - Select a "register plus scaled (un)signed BW-bit
750/// immediate" address.  The "Size" argument is the size in bytes of the memory
751/// reference, which determines the scale.
752bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm,
753                                                        unsigned BW, unsigned Size,
754                                                        SDValue &Base,
755                                                        SDValue &OffImm) {
756  SDLoc dl(N);
757  const DataLayout &DL = CurDAG->getDataLayout();
758  const TargetLowering *TLI = getTargetLowering();
759  if (N.getOpcode() == ISD::FrameIndex) {
760    int FI = cast<FrameIndexSDNode>(N)->getIndex();
761    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
762    OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
763    return true;
764  }
765
766  // As opposed to the (12-bit) Indexed addressing mode below, the 7/9-bit signed
767  // selected here doesn't support labels/immediates, only base+offset.
768  if (CurDAG->isBaseWithConstantOffset(N)) {
769    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
770      if (IsSignedImm) {
771        int64_t RHSC = RHS->getSExtValue();
772        unsigned Scale = Log2_32(Size);
773        int64_t Range = 0x1LL << (BW - 1);
774
775        if ((RHSC & (Size - 1)) == 0 && RHSC >= -(Range << Scale) &&
776            RHSC < (Range << Scale)) {
777          Base = N.getOperand(0);
778          if (Base.getOpcode() == ISD::FrameIndex) {
779            int FI = cast<FrameIndexSDNode>(Base)->getIndex();
780            Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
781          }
782          OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
783          return true;
784        }
785      } else {
786        // unsigned Immediate
787        uint64_t RHSC = RHS->getZExtValue();
788        unsigned Scale = Log2_32(Size);
789        uint64_t Range = 0x1ULL << BW;
790
791        if ((RHSC & (Size - 1)) == 0 && RHSC < (Range << Scale)) {
792          Base = N.getOperand(0);
793          if (Base.getOpcode() == ISD::FrameIndex) {
794            int FI = cast<FrameIndexSDNode>(Base)->getIndex();
795            Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
796          }
797          OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
798          return true;
799        }
800      }
801    }
802  }
803  // Base only. The address will be materialized into a register before
804  // the memory is accessed.
805  //    add x0, Xbase, #offset
806  //    stp x1, x2, [x0]
807  Base = N;
808  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
809  return true;
810}
811
812/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
813/// immediate" address.  The "Size" argument is the size in bytes of the memory
814/// reference, which determines the scale.
815bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
816                                              SDValue &Base, SDValue &OffImm) {
817  SDLoc dl(N);
818  const DataLayout &DL = CurDAG->getDataLayout();
819  const TargetLowering *TLI = getTargetLowering();
820  if (N.getOpcode() == ISD::FrameIndex) {
821    int FI = cast<FrameIndexSDNode>(N)->getIndex();
822    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
823    OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
824    return true;
825  }
826
827  if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
828    GlobalAddressSDNode *GAN =
829        dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
830    Base = N.getOperand(0);
831    OffImm = N.getOperand(1);
832    if (!GAN)
833      return true;
834
835    if (GAN->getOffset() % Size == 0) {
836      const GlobalValue *GV = GAN->getGlobal();
837      unsigned Alignment = GV->getAlignment();
838      Type *Ty = GV->getValueType();
839      if (Alignment == 0 && Ty->isSized())
840        Alignment = DL.getABITypeAlignment(Ty);
841
842      if (Alignment >= Size)
843        return true;
844    }
845  }
846
847  if (CurDAG->isBaseWithConstantOffset(N)) {
848    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
849      int64_t RHSC = (int64_t)RHS->getZExtValue();
850      unsigned Scale = Log2_32(Size);
851      if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
852        Base = N.getOperand(0);
853        if (Base.getOpcode() == ISD::FrameIndex) {
854          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
855          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
856        }
857        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
858        return true;
859      }
860    }
861  }
862
863  // Before falling back to our general case, check if the unscaled
864  // instructions can handle this. If so, that's preferable.
865  if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
866    return false;
867
868  // Base only. The address will be materialized into a register before
869  // the memory is accessed.
870  //    add x0, Xbase, #offset
871  //    ldr x0, [x0]
872  Base = N;
873  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
874  return true;
875}
876
877/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
878/// immediate" address.  This should only match when there is an offset that
879/// is not valid for a scaled immediate addressing mode.  The "Size" argument
880/// is the size in bytes of the memory reference, which is needed here to know
881/// what is valid for a scaled immediate.
882bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
883                                                 SDValue &Base,
884                                                 SDValue &OffImm) {
885  if (!CurDAG->isBaseWithConstantOffset(N))
886    return false;
887  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
888    int64_t RHSC = RHS->getSExtValue();
889    // If the offset is valid as a scaled immediate, don't match here.
890    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
891        RHSC < (0x1000 << Log2_32(Size)))
892      return false;
893    if (RHSC >= -256 && RHSC < 256) {
894      Base = N.getOperand(0);
895      if (Base.getOpcode() == ISD::FrameIndex) {
896        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
897        const TargetLowering *TLI = getTargetLowering();
898        Base = CurDAG->getTargetFrameIndex(
899            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
900      }
901      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
902      return true;
903    }
904  }
905  return false;
906}
907
908static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
909  SDLoc dl(N);
910  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
911  SDValue ImpDef = SDValue(
912      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0);
913  MachineSDNode *Node = CurDAG->getMachineNode(
914      TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
915  return SDValue(Node, 0);
916}
917
918/// Check if the given SHL node (\p N), can be used to form an
919/// extended register for an addressing mode.
920bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
921                                            bool WantExtend, SDValue &Offset,
922                                            SDValue &SignExtend) {
923  assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
924  ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
925  if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
926    return false;
927
928  SDLoc dl(N);
929  if (WantExtend) {
930    AArch64_AM::ShiftExtendType Ext =
931        getExtendTypeForNode(N.getOperand(0), true);
932    if (Ext == AArch64_AM::InvalidShiftExtend)
933      return false;
934
935    Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
936    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
937                                           MVT::i32);
938  } else {
939    Offset = N.getOperand(0);
940    SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
941  }
942
943  unsigned LegalShiftVal = Log2_32(Size);
944  unsigned ShiftVal = CSD->getZExtValue();
945
946  if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
947    return false;
948
949  return isWorthFolding(N);
950}
951
952bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
953                                            SDValue &Base, SDValue &Offset,
954                                            SDValue &SignExtend,
955                                            SDValue &DoShift) {
956  if (N.getOpcode() != ISD::ADD)
957    return false;
958  SDValue LHS = N.getOperand(0);
959  SDValue RHS = N.getOperand(1);
960  SDLoc dl(N);
961
962  // We don't want to match immediate adds here, because they are better lowered
963  // to the register-immediate addressing modes.
964  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
965    return false;
966
967  // Check if this particular node is reused in any non-memory related
968  // operation.  If yes, do not try to fold this node into the address
969  // computation, since the computation will be kept.
970  const SDNode *Node = N.getNode();
971  for (SDNode *UI : Node->uses()) {
972    if (!isa<MemSDNode>(*UI))
973      return false;
974  }
975
976  // Remember if it is worth folding N when it produces extended register.
977  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
978
979  // Try to match a shifted extend on the RHS.
980  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
981      SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
982    Base = LHS;
983    DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
984    return true;
985  }
986
987  // Try to match a shifted extend on the LHS.
988  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
989      SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
990    Base = RHS;
991    DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
992    return true;
993  }
994
995  // There was no shift, whatever else we find.
996  DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
997
998  AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
999  // Try to match an unshifted extend on the LHS.
1000  if (IsExtendedRegisterWorthFolding &&
1001      (Ext = getExtendTypeForNode(LHS, true)) !=
1002          AArch64_AM::InvalidShiftExtend) {
1003    Base = RHS;
1004    Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
1005    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1006                                           MVT::i32);
1007    if (isWorthFolding(LHS))
1008      return true;
1009  }
1010
1011  // Try to match an unshifted extend on the RHS.
1012  if (IsExtendedRegisterWorthFolding &&
1013      (Ext = getExtendTypeForNode(RHS, true)) !=
1014          AArch64_AM::InvalidShiftExtend) {
1015    Base = LHS;
1016    Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
1017    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1018                                           MVT::i32);
1019    if (isWorthFolding(RHS))
1020      return true;
1021  }
1022
1023  return false;
1024}
1025
1026// Check if the given immediate is preferred by ADD. If an immediate can be
1027// encoded in an ADD, or it can be encoded in an "ADD LSL #12" and can not be
1028// encoded by one MOVZ, return true.
1029static bool isPreferredADD(int64_t ImmOff) {
1030  // Constant in [0x0, 0xfff] can be encoded in ADD.
1031  if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
1032    return true;
1033  // Check if it can be encoded in an "ADD LSL #12".
1034  if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
1035    // As a single MOVZ is faster than a "ADD of LSL #12", ignore such constant.
1036    return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
1037           (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
1038  return false;
1039}
1040
1041bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
1042                                            SDValue &Base, SDValue &Offset,
1043                                            SDValue &SignExtend,
1044                                            SDValue &DoShift) {
1045  if (N.getOpcode() != ISD::ADD)
1046    return false;
1047  SDValue LHS = N.getOperand(0);
1048  SDValue RHS = N.getOperand(1);
1049  SDLoc DL(N);
1050
1051  // Check if this particular node is reused in any non-memory related
1052  // operation.  If yes, do not try to fold this node into the address
1053  // computation, since the computation will be kept.
1054  const SDNode *Node = N.getNode();
1055  for (SDNode *UI : Node->uses()) {
1056    if (!isa<MemSDNode>(*UI))
1057      return false;
1058  }
1059
1060  // Watch out if RHS is a wide immediate, it can not be selected into
1061  // [BaseReg+Imm] addressing mode. Also it may not be able to be encoded into
1062  // ADD/SUB. Instead it will use [BaseReg + 0] address mode and generate
1063  // instructions like:
1064  //     MOV  X0, WideImmediate
1065  //     ADD  X1, BaseReg, X0
1066  //     LDR  X2, [X1, 0]
1067  // For such situation, using [BaseReg, XReg] addressing mode can save one
1068  // ADD/SUB:
1069  //     MOV  X0, WideImmediate
1070  //     LDR  X2, [BaseReg, X0]
1071  if (isa<ConstantSDNode>(RHS)) {
1072    int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
1073    unsigned Scale = Log2_32(Size);
1074    // Skip the immediate can be selected by load/store addressing mode.
1075    // Also skip the immediate can be encoded by a single ADD (SUB is also
1076    // checked by using -ImmOff).
1077    if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
1078        isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
1079      return false;
1080
1081    SDValue Ops[] = { RHS };
1082    SDNode *MOVI =
1083        CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
1084    SDValue MOVIV = SDValue(MOVI, 0);
1085    // This ADD of two X register will be selected into [Reg+Reg] mode.
1086    N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
1087  }
1088
1089  // Remember if it is worth folding N when it produces extended register.
1090  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
1091
1092  // Try to match a shifted extend on the RHS.
1093  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
1094      SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
1095    Base = LHS;
1096    DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1097    return true;
1098  }
1099
1100  // Try to match a shifted extend on the LHS.
1101  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
1102      SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
1103    Base = RHS;
1104    DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1105    return true;
1106  }
1107
1108  // Match any non-shifted, non-extend, non-immediate add expression.
1109  Base = LHS;
1110  Offset = RHS;
1111  SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
1112  DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
1113  // Reg1 + Reg2 is free: no check needed.
1114  return true;
1115}
1116
1117SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
1118  static const unsigned RegClassIDs[] = {
1119      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
1120  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
1121                                     AArch64::dsub2, AArch64::dsub3};
1122
1123  return createTuple(Regs, RegClassIDs, SubRegs);
1124}
1125
1126SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
1127  static const unsigned RegClassIDs[] = {
1128      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
1129  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
1130                                     AArch64::qsub2, AArch64::qsub3};
1131
1132  return createTuple(Regs, RegClassIDs, SubRegs);
1133}
1134
1135SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
1136                                         const unsigned RegClassIDs[],
1137                                         const unsigned SubRegs[]) {
1138  // There's no special register-class for a vector-list of 1 element: it's just
1139  // a vector.
1140  if (Regs.size() == 1)
1141    return Regs[0];
1142
1143  assert(Regs.size() >= 2 && Regs.size() <= 4);
1144
1145  SDLoc DL(Regs[0]);
1146
1147  SmallVector<SDValue, 4> Ops;
1148
1149  // First operand of REG_SEQUENCE is the desired RegClass.
1150  Ops.push_back(
1151      CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
1152
1153  // Then we get pairs of source & subregister-position for the components.
1154  for (unsigned i = 0; i < Regs.size(); ++i) {
1155    Ops.push_back(Regs[i]);
1156    Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
1157  }
1158
1159  SDNode *N =
1160      CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
1161  return SDValue(N, 0);
1162}
1163
1164void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
1165                                      bool isExt) {
1166  SDLoc dl(N);
1167  EVT VT = N->getValueType(0);
1168
1169  unsigned ExtOff = isExt;
1170
1171  // Form a REG_SEQUENCE to force register allocation.
1172  unsigned Vec0Off = ExtOff + 1;
1173  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
1174                               N->op_begin() + Vec0Off + NumVecs);
1175  SDValue RegSeq = createQTuple(Regs);
1176
1177  SmallVector<SDValue, 6> Ops;
1178  if (isExt)
1179    Ops.push_back(N->getOperand(1));
1180  Ops.push_back(RegSeq);
1181  Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
1182  ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
1183}
1184
1185bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
1186  LoadSDNode *LD = cast<LoadSDNode>(N);
1187  if (LD->isUnindexed())
1188    return false;
1189  EVT VT = LD->getMemoryVT();
1190  EVT DstVT = N->getValueType(0);
1191  ISD::MemIndexedMode AM = LD->getAddressingMode();
1192  bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
1193
1194  // We're not doing validity checking here. That was done when checking
1195  // if we should mark the load as indexed or not. We're just selecting
1196  // the right instruction.
1197  unsigned Opcode = 0;
1198
1199  ISD::LoadExtType ExtType = LD->getExtensionType();
1200  bool InsertTo64 = false;
1201  if (VT == MVT::i64)
1202    Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
1203  else if (VT == MVT::i32) {
1204    if (ExtType == ISD::NON_EXTLOAD)
1205      Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1206    else if (ExtType == ISD::SEXTLOAD)
1207      Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
1208    else {
1209      Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1210      InsertTo64 = true;
1211      // The result of the load is only i32. It's the subreg_to_reg that makes
1212      // it into an i64.
1213      DstVT = MVT::i32;
1214    }
1215  } else if (VT == MVT::i16) {
1216    if (ExtType == ISD::SEXTLOAD) {
1217      if (DstVT == MVT::i64)
1218        Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
1219      else
1220        Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
1221    } else {
1222      Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
1223      InsertTo64 = DstVT == MVT::i64;
1224      // The result of the load is only i32. It's the subreg_to_reg that makes
1225      // it into an i64.
1226      DstVT = MVT::i32;
1227    }
1228  } else if (VT == MVT::i8) {
1229    if (ExtType == ISD::SEXTLOAD) {
1230      if (DstVT == MVT::i64)
1231        Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
1232      else
1233        Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
1234    } else {
1235      Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
1236      InsertTo64 = DstVT == MVT::i64;
1237      // The result of the load is only i32. It's the subreg_to_reg that makes
1238      // it into an i64.
1239      DstVT = MVT::i32;
1240    }
1241  } else if (VT == MVT::f16) {
1242    Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
1243  } else if (VT == MVT::f32) {
1244    Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
1245  } else if (VT == MVT::f64 || VT.is64BitVector()) {
1246    Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
1247  } else if (VT.is128BitVector()) {
1248    Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
1249  } else
1250    return false;
1251  SDValue Chain = LD->getChain();
1252  SDValue Base = LD->getBasePtr();
1253  ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
1254  int OffsetVal = (int)OffsetOp->getZExtValue();
1255  SDLoc dl(N);
1256  SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
1257  SDValue Ops[] = { Base, Offset, Chain };
1258  SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
1259                                       MVT::Other, Ops);
1260  // Either way, we're replacing the node, so tell the caller that.
1261  SDValue LoadedVal = SDValue(Res, 1);
1262  if (InsertTo64) {
1263    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
1264    LoadedVal =
1265        SDValue(CurDAG->getMachineNode(
1266                    AArch64::SUBREG_TO_REG, dl, MVT::i64,
1267                    CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
1268                    SubReg),
1269                0);
1270  }
1271
1272  ReplaceUses(SDValue(N, 0), LoadedVal);
1273  ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
1274  ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
1275  CurDAG->RemoveDeadNode(N);
1276  return true;
1277}
1278
1279void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
1280                                     unsigned SubRegIdx) {
1281  SDLoc dl(N);
1282  EVT VT = N->getValueType(0);
1283  SDValue Chain = N->getOperand(0);
1284
1285  SDValue Ops[] = {N->getOperand(2), // Mem operand;
1286                   Chain};
1287
1288  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1289
1290  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1291  SDValue SuperReg = SDValue(Ld, 0);
1292  for (unsigned i = 0; i < NumVecs; ++i)
1293    ReplaceUses(SDValue(N, i),
1294        CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1295
1296  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1297
1298  // Transfer memoperands.
1299  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1300  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
1301
1302  CurDAG->RemoveDeadNode(N);
1303}
1304
1305void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
1306                                         unsigned Opc, unsigned SubRegIdx) {
1307  SDLoc dl(N);
1308  EVT VT = N->getValueType(0);
1309  SDValue Chain = N->getOperand(0);
1310
1311  SDValue Ops[] = {N->getOperand(1), // Mem operand
1312                   N->getOperand(2), // Incremental
1313                   Chain};
1314
1315  const EVT ResTys[] = {MVT::i64, // Type of the write back register
1316                        MVT::Untyped, MVT::Other};
1317
1318  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1319
1320  // Update uses of write back register
1321  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1322
1323  // Update uses of vector list
1324  SDValue SuperReg = SDValue(Ld, 1);
1325  if (NumVecs == 1)
1326    ReplaceUses(SDValue(N, 0), SuperReg);
1327  else
1328    for (unsigned i = 0; i < NumVecs; ++i)
1329      ReplaceUses(SDValue(N, i),
1330          CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1331
1332  // Update the chain
1333  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1334  CurDAG->RemoveDeadNode(N);
1335}
1336
1337void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
1338                                      unsigned Opc) {
1339  SDLoc dl(N);
1340  EVT VT = N->getOperand(2)->getValueType(0);
1341
1342  // Form a REG_SEQUENCE to force register allocation.
1343  bool Is128Bit = VT.getSizeInBits() == 128;
1344  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1345  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1346
1347  SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
1348  SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
1349
1350  // Transfer memoperands.
1351  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1352  CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1353
1354  ReplaceNode(N, St);
1355}
1356
1357void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
1358                                          unsigned Opc) {
1359  SDLoc dl(N);
1360  EVT VT = N->getOperand(2)->getValueType(0);
1361  const EVT ResTys[] = {MVT::i64,    // Type of the write back register
1362                        MVT::Other}; // Type for the Chain
1363
1364  // Form a REG_SEQUENCE to force register allocation.
1365  bool Is128Bit = VT.getSizeInBits() == 128;
1366  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1367  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1368
1369  SDValue Ops[] = {RegSeq,
1370                   N->getOperand(NumVecs + 1), // base register
1371                   N->getOperand(NumVecs + 2), // Incremental
1372                   N->getOperand(0)};          // Chain
1373  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1374
1375  ReplaceNode(N, St);
1376}
1377
1378namespace {
1379/// WidenVector - Given a value in the V64 register class, produce the
1380/// equivalent value in the V128 register class.
1381class WidenVector {
1382  SelectionDAG &DAG;
1383
1384public:
1385  WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
1386
1387  SDValue operator()(SDValue V64Reg) {
1388    EVT VT = V64Reg.getValueType();
1389    unsigned NarrowSize = VT.getVectorNumElements();
1390    MVT EltTy = VT.getVectorElementType().getSimpleVT();
1391    MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
1392    SDLoc DL(V64Reg);
1393
1394    SDValue Undef =
1395        SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
1396    return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
1397  }
1398};
1399} // namespace
1400
1401/// NarrowVector - Given a value in the V128 register class, produce the
1402/// equivalent value in the V64 register class.
1403static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
1404  EVT VT = V128Reg.getValueType();
1405  unsigned WideSize = VT.getVectorNumElements();
1406  MVT EltTy = VT.getVectorElementType().getSimpleVT();
1407  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
1408
1409  return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
1410                                    V128Reg);
1411}
1412
1413void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
1414                                         unsigned Opc) {
1415  SDLoc dl(N);
1416  EVT VT = N->getValueType(0);
1417  bool Narrow = VT.getSizeInBits() == 64;
1418
1419  // Form a REG_SEQUENCE to force register allocation.
1420  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1421
1422  if (Narrow)
1423    transform(Regs, Regs.begin(),
1424                   WidenVector(*CurDAG));
1425
1426  SDValue RegSeq = createQTuple(Regs);
1427
1428  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1429
1430  unsigned LaneNo =
1431      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1432
1433  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1434                   N->getOperand(NumVecs + 3), N->getOperand(0)};
1435  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1436  SDValue SuperReg = SDValue(Ld, 0);
1437
1438  EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1439  static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1440                                    AArch64::qsub2, AArch64::qsub3 };
1441  for (unsigned i = 0; i < NumVecs; ++i) {
1442    SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
1443    if (Narrow)
1444      NV = NarrowVector(NV, *CurDAG);
1445    ReplaceUses(SDValue(N, i), NV);
1446  }
1447
1448  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1449  CurDAG->RemoveDeadNode(N);
1450}
1451
1452void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
1453                                             unsigned Opc) {
1454  SDLoc dl(N);
1455  EVT VT = N->getValueType(0);
1456  bool Narrow = VT.getSizeInBits() == 64;
1457
1458  // Form a REG_SEQUENCE to force register allocation.
1459  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1460
1461  if (Narrow)
1462    transform(Regs, Regs.begin(),
1463                   WidenVector(*CurDAG));
1464
1465  SDValue RegSeq = createQTuple(Regs);
1466
1467  const EVT ResTys[] = {MVT::i64, // Type of the write back register
1468                        RegSeq->getValueType(0), MVT::Other};
1469
1470  unsigned LaneNo =
1471      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1472
1473  SDValue Ops[] = {RegSeq,
1474                   CurDAG->getTargetConstant(LaneNo, dl,
1475                                             MVT::i64),         // Lane Number
1476                   N->getOperand(NumVecs + 2),                  // Base register
1477                   N->getOperand(NumVecs + 3),                  // Incremental
1478                   N->getOperand(0)};
1479  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1480
1481  // Update uses of the write back register
1482  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1483
1484  // Update uses of the vector list
1485  SDValue SuperReg = SDValue(Ld, 1);
1486  if (NumVecs == 1) {
1487    ReplaceUses(SDValue(N, 0),
1488                Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
1489  } else {
1490    EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1491    static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1492                                      AArch64::qsub2, AArch64::qsub3 };
1493    for (unsigned i = 0; i < NumVecs; ++i) {
1494      SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
1495                                                  SuperReg);
1496      if (Narrow)
1497        NV = NarrowVector(NV, *CurDAG);
1498      ReplaceUses(SDValue(N, i), NV);
1499    }
1500  }
1501
1502  // Update the Chain
1503  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1504  CurDAG->RemoveDeadNode(N);
1505}
1506
1507void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
1508                                          unsigned Opc) {
1509  SDLoc dl(N);
1510  EVT VT = N->getOperand(2)->getValueType(0);
1511  bool Narrow = VT.getSizeInBits() == 64;
1512
1513  // Form a REG_SEQUENCE to force register allocation.
1514  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1515
1516  if (Narrow)
1517    transform(Regs, Regs.begin(),
1518                   WidenVector(*CurDAG));
1519
1520  SDValue RegSeq = createQTuple(Regs);
1521
1522  unsigned LaneNo =
1523      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1524
1525  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1526                   N->getOperand(NumVecs + 3), N->getOperand(0)};
1527  SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
1528
1529  // Transfer memoperands.
1530  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1531  CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1532
1533  ReplaceNode(N, St);
1534}
1535
1536void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
1537                                              unsigned Opc) {
1538  SDLoc dl(N);
1539  EVT VT = N->getOperand(2)->getValueType(0);
1540  bool Narrow = VT.getSizeInBits() == 64;
1541
1542  // Form a REG_SEQUENCE to force register allocation.
1543  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1544
1545  if (Narrow)
1546    transform(Regs, Regs.begin(),
1547                   WidenVector(*CurDAG));
1548
1549  SDValue RegSeq = createQTuple(Regs);
1550
1551  const EVT ResTys[] = {MVT::i64, // Type of the write back register
1552                        MVT::Other};
1553
1554  unsigned LaneNo =
1555      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1556
1557  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1558                   N->getOperand(NumVecs + 2), // Base Register
1559                   N->getOperand(NumVecs + 3), // Incremental
1560                   N->getOperand(0)};
1561  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1562
1563  // Transfer memoperands.
1564  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1565  CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1566
1567  ReplaceNode(N, St);
1568}
1569
1570static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
1571                                       unsigned &Opc, SDValue &Opd0,
1572                                       unsigned &LSB, unsigned &MSB,
1573                                       unsigned NumberOfIgnoredLowBits,
1574                                       bool BiggerPattern) {
1575  assert(N->getOpcode() == ISD::AND &&
1576         "N must be a AND operation to call this function");
1577
1578  EVT VT = N->getValueType(0);
1579
1580  // Here we can test the type of VT and return false when the type does not
1581  // match, but since it is done prior to that call in the current context
1582  // we turned that into an assert to avoid redundant code.
1583  assert((VT == MVT::i32 || VT == MVT::i64) &&
1584         "Type checking must have been done before calling this function");
1585
1586  // FIXME: simplify-demanded-bits in DAGCombine will probably have
1587  // changed the AND node to a 32-bit mask operation. We'll have to
1588  // undo that as part of the transform here if we want to catch all
1589  // the opportunities.
1590  // Currently the NumberOfIgnoredLowBits argument helps to recover
1591  // form these situations when matching bigger pattern (bitfield insert).
1592
1593  // For unsigned extracts, check for a shift right and mask
1594  uint64_t AndImm = 0;
1595  if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
1596    return false;
1597
1598  const SDNode *Op0 = N->getOperand(0).getNode();
1599
1600  // Because of simplify-demanded-bits in DAGCombine, the mask may have been
1601  // simplified. Try to undo that
1602  AndImm |= maskTrailingOnes<uint64_t>(NumberOfIgnoredLowBits);
1603
1604  // The immediate is a mask of the low bits iff imm & (imm+1) == 0
1605  if (AndImm & (AndImm + 1))
1606    return false;
1607
1608  bool ClampMSB = false;
1609  uint64_t SrlImm = 0;
1610  // Handle the SRL + ANY_EXTEND case.
1611  if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
1612      isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
1613    // Extend the incoming operand of the SRL to 64-bit.
1614    Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
1615    // Make sure to clamp the MSB so that we preserve the semantics of the
1616    // original operations.
1617    ClampMSB = true;
1618  } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
1619             isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
1620                                   SrlImm)) {
1621    // If the shift result was truncated, we can still combine them.
1622    Opd0 = Op0->getOperand(0).getOperand(0);
1623
1624    // Use the type of SRL node.
1625    VT = Opd0->getValueType(0);
1626  } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
1627    Opd0 = Op0->getOperand(0);
1628  } else if (BiggerPattern) {
1629    // Let's pretend a 0 shift right has been performed.
1630    // The resulting code will be at least as good as the original one
1631    // plus it may expose more opportunities for bitfield insert pattern.
1632    // FIXME: Currently we limit this to the bigger pattern, because
1633    // some optimizations expect AND and not UBFM.
1634    Opd0 = N->getOperand(0);
1635  } else
1636    return false;
1637
1638  // Bail out on large immediates. This happens when no proper
1639  // combining/constant folding was performed.
1640  if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
1641    LLVM_DEBUG(
1642        (dbgs() << N
1643                << ": Found large shift immediate, this should not happen\n"));
1644    return false;
1645  }
1646
1647  LSB = SrlImm;
1648  MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
1649                                 : countTrailingOnes<uint64_t>(AndImm)) -
1650        1;
1651  if (ClampMSB)
1652    // Since we're moving the extend before the right shift operation, we need
1653    // to clamp the MSB to make sure we don't shift in undefined bits instead of
1654    // the zeros which would get shifted in with the original right shift
1655    // operation.
1656    MSB = MSB > 31 ? 31 : MSB;
1657
1658  Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
1659  return true;
1660}
1661
1662static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
1663                                             SDValue &Opd0, unsigned &Immr,
1664                                             unsigned &Imms) {
1665  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
1666
1667  EVT VT = N->getValueType(0);
1668  unsigned BitWidth = VT.getSizeInBits();
1669  assert((VT == MVT::i32 || VT == MVT::i64) &&
1670         "Type checking must have been done before calling this function");
1671
1672  SDValue Op = N->getOperand(0);
1673  if (Op->getOpcode() == ISD::TRUNCATE) {
1674    Op = Op->getOperand(0);
1675    VT = Op->getValueType(0);
1676    BitWidth = VT.getSizeInBits();
1677  }
1678
1679  uint64_t ShiftImm;
1680  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
1681      !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
1682    return false;
1683
1684  unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
1685  if (ShiftImm + Width > BitWidth)
1686    return false;
1687
1688  Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
1689  Opd0 = Op.getOperand(0);
1690  Immr = ShiftImm;
1691  Imms = ShiftImm + Width - 1;
1692  return true;
1693}
1694
1695static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
1696                                          SDValue &Opd0, unsigned &LSB,
1697                                          unsigned &MSB) {
1698  // We are looking for the following pattern which basically extracts several
1699  // continuous bits from the source value and places it from the LSB of the
1700  // destination value, all other bits of the destination value or set to zero:
1701  //
1702  // Value2 = AND Value, MaskImm
1703  // SRL Value2, ShiftImm
1704  //
1705  // with MaskImm >> ShiftImm to search for the bit width.
1706  //
1707  // This gets selected into a single UBFM:
1708  //
1709  // UBFM Value, ShiftImm, BitWide + SrlImm -1
1710  //
1711
1712  if (N->getOpcode() != ISD::SRL)
1713    return false;
1714
1715  uint64_t AndMask = 0;
1716  if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
1717    return false;
1718
1719  Opd0 = N->getOperand(0).getOperand(0);
1720
1721  uint64_t SrlImm = 0;
1722  if (!isIntImmediate(N->getOperand(1), SrlImm))
1723    return false;
1724
1725  // Check whether we really have several bits extract here.
1726  unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
1727  if (BitWide && isMask_64(AndMask >> SrlImm)) {
1728    if (N->getValueType(0) == MVT::i32)
1729      Opc = AArch64::UBFMWri;
1730    else
1731      Opc = AArch64::UBFMXri;
1732
1733    LSB = SrlImm;
1734    MSB = BitWide + SrlImm - 1;
1735    return true;
1736  }
1737
1738  return false;
1739}
1740
1741static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
1742                                       unsigned &Immr, unsigned &Imms,
1743                                       bool BiggerPattern) {
1744  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
1745         "N must be a SHR/SRA operation to call this function");
1746
1747  EVT VT = N->getValueType(0);
1748
1749  // Here we can test the type of VT and return false when the type does not
1750  // match, but since it is done prior to that call in the current context
1751  // we turned that into an assert to avoid redundant code.
1752  assert((VT == MVT::i32 || VT == MVT::i64) &&
1753         "Type checking must have been done before calling this function");
1754
1755  // Check for AND + SRL doing several bits extract.
1756  if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
1757    return true;
1758
1759  // We're looking for a shift of a shift.
1760  uint64_t ShlImm = 0;
1761  uint64_t TruncBits = 0;
1762  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
1763    Opd0 = N->getOperand(0).getOperand(0);
1764  } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
1765             N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
1766    // We are looking for a shift of truncate. Truncate from i64 to i32 could
1767    // be considered as setting high 32 bits as zero. Our strategy here is to
1768    // always generate 64bit UBFM. This consistency will help the CSE pass
1769    // later find more redundancy.
1770    Opd0 = N->getOperand(0).getOperand(0);
1771    TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
1772    VT = Opd0.getValueType();
1773    assert(VT == MVT::i64 && "the promoted type should be i64");
1774  } else if (BiggerPattern) {
1775    // Let's pretend a 0 shift left has been performed.
1776    // FIXME: Currently we limit this to the bigger pattern case,
1777    // because some optimizations expect AND and not UBFM
1778    Opd0 = N->getOperand(0);
1779  } else
1780    return false;
1781
1782  // Missing combines/constant folding may have left us with strange
1783  // constants.
1784  if (ShlImm >= VT.getSizeInBits()) {
1785    LLVM_DEBUG(
1786        (dbgs() << N
1787                << ": Found large shift immediate, this should not happen\n"));
1788    return false;
1789  }
1790
1791  uint64_t SrlImm = 0;
1792  if (!isIntImmediate(N->getOperand(1), SrlImm))
1793    return false;
1794
1795  assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
1796         "bad amount in shift node!");
1797  int immr = SrlImm - ShlImm;
1798  Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
1799  Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
1800  // SRA requires a signed extraction
1801  if (VT == MVT::i32)
1802    Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
1803  else
1804    Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
1805  return true;
1806}
1807
1808bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
1809  assert(N->getOpcode() == ISD::SIGN_EXTEND);
1810
1811  EVT VT = N->getValueType(0);
1812  EVT NarrowVT = N->getOperand(0)->getValueType(0);
1813  if (VT != MVT::i64 || NarrowVT != MVT::i32)
1814    return false;
1815
1816  uint64_t ShiftImm;
1817  SDValue Op = N->getOperand(0);
1818  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
1819    return false;
1820
1821  SDLoc dl(N);
1822  // Extend the incoming operand of the shift to 64-bits.
1823  SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
1824  unsigned Immr = ShiftImm;
1825  unsigned Imms = NarrowVT.getSizeInBits() - 1;
1826  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
1827                   CurDAG->getTargetConstant(Imms, dl, VT)};
1828  CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
1829  return true;
1830}
1831
1832/// Try to form fcvtl2 instructions from a floating-point extend of a high-half
1833/// extract of a subvector.
1834bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
1835  assert(N->getOpcode() == ISD::FP_EXTEND);
1836
1837  // There are 2 forms of fcvtl2 - extend to double or extend to float.
1838  SDValue Extract = N->getOperand(0);
1839  EVT VT = N->getValueType(0);
1840  EVT NarrowVT = Extract.getValueType();
1841  if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
1842      (VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
1843    return false;
1844
1845  // Optionally look past a bitcast.
1846  Extract = peekThroughBitcasts(Extract);
1847  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
1848    return false;
1849
1850  // Match extract from start of high half index.
1851  // Example: v8i16 -> v4i16 means the extract must begin at index 4.
1852  unsigned ExtractIndex = Extract.getConstantOperandVal(1);
1853  if (ExtractIndex != Extract.getValueType().getVectorNumElements())
1854    return false;
1855
1856  auto Opcode = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
1857  CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0));
1858  return true;
1859}
1860
1861static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
1862                                SDValue &Opd0, unsigned &Immr, unsigned &Imms,
1863                                unsigned NumberOfIgnoredLowBits = 0,
1864                                bool BiggerPattern = false) {
1865  if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
1866    return false;
1867
1868  switch (N->getOpcode()) {
1869  default:
1870    if (!N->isMachineOpcode())
1871      return false;
1872    break;
1873  case ISD::AND:
1874    return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
1875                                      NumberOfIgnoredLowBits, BiggerPattern);
1876  case ISD::SRL:
1877  case ISD::SRA:
1878    return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
1879
1880  case ISD::SIGN_EXTEND_INREG:
1881    return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
1882  }
1883
1884  unsigned NOpc = N->getMachineOpcode();
1885  switch (NOpc) {
1886  default:
1887    return false;
1888  case AArch64::SBFMWri:
1889  case AArch64::UBFMWri:
1890  case AArch64::SBFMXri:
1891  case AArch64::UBFMXri:
1892    Opc = NOpc;
1893    Opd0 = N->getOperand(0);
1894    Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
1895    Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
1896    return true;
1897  }
1898  // Unreachable
1899  return false;
1900}
1901
1902bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
1903  unsigned Opc, Immr, Imms;
1904  SDValue Opd0;
1905  if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
1906    return false;
1907
1908  EVT VT = N->getValueType(0);
1909  SDLoc dl(N);
1910
1911  // If the bit extract operation is 64bit but the original type is 32bit, we
1912  // need to add one EXTRACT_SUBREG.
1913  if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
1914    SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
1915                       CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
1916
1917    SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
1918    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
1919    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
1920                                          MVT::i32, SDValue(BFM, 0), SubReg));
1921    return true;
1922  }
1923
1924  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
1925                   CurDAG->getTargetConstant(Imms, dl, VT)};
1926  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
1927  return true;
1928}
1929
1930/// Does DstMask form a complementary pair with the mask provided by
1931/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
1932/// this asks whether DstMask zeroes precisely those bits that will be set by
1933/// the other half.
1934static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
1935                              unsigned NumberOfIgnoredHighBits, EVT VT) {
1936  assert((VT == MVT::i32 || VT == MVT::i64) &&
1937         "i32 or i64 mask type expected!");
1938  unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
1939
1940  APInt SignificantDstMask = APInt(BitWidth, DstMask);
1941  APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
1942
1943  return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
1944         (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
1945}
1946
1947// Look for bits that will be useful for later uses.
1948// A bit is consider useless as soon as it is dropped and never used
1949// before it as been dropped.
1950// E.g., looking for useful bit of x
1951// 1. y = x & 0x7
1952// 2. z = y >> 2
1953// After #1, x useful bits are 0x7, then the useful bits of x, live through
1954// y.
1955// After #2, the useful bits of x are 0x4.
1956// However, if x is used on an unpredicatable instruction, then all its bits
1957// are useful.
1958// E.g.
1959// 1. y = x & 0x7
1960// 2. z = y >> 2
1961// 3. str x, [@x]
1962static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
1963
1964static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
1965                                              unsigned Depth) {
1966  uint64_t Imm =
1967      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
1968  Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
1969  UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
1970  getUsefulBits(Op, UsefulBits, Depth + 1);
1971}
1972
1973static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
1974                                             uint64_t Imm, uint64_t MSB,
1975                                             unsigned Depth) {
1976  // inherit the bitwidth value
1977  APInt OpUsefulBits(UsefulBits);
1978  OpUsefulBits = 1;
1979
1980  if (MSB >= Imm) {
1981    OpUsefulBits <<= MSB - Imm + 1;
1982    --OpUsefulBits;
1983    // The interesting part will be in the lower part of the result
1984    getUsefulBits(Op, OpUsefulBits, Depth + 1);
1985    // The interesting part was starting at Imm in the argument
1986    OpUsefulBits <<= Imm;
1987  } else {
1988    OpUsefulBits <<= MSB + 1;
1989    --OpUsefulBits;
1990    // The interesting part will be shifted in the result
1991    OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm;
1992    getUsefulBits(Op, OpUsefulBits, Depth + 1);
1993    // The interesting part was at zero in the argument
1994    OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm);
1995  }
1996
1997  UsefulBits &= OpUsefulBits;
1998}
1999
2000static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
2001                                  unsigned Depth) {
2002  uint64_t Imm =
2003      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
2004  uint64_t MSB =
2005      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2006
2007  getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
2008}
2009
2010static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
2011                                              unsigned Depth) {
2012  uint64_t ShiftTypeAndValue =
2013      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2014  APInt Mask(UsefulBits);
2015  Mask.clearAllBits();
2016  Mask.flipAllBits();
2017
2018  if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
2019    // Shift Left
2020    uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
2021    Mask <<= ShiftAmt;
2022    getUsefulBits(Op, Mask, Depth + 1);
2023    Mask.lshrInPlace(ShiftAmt);
2024  } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
2025    // Shift Right
2026    // We do not handle AArch64_AM::ASR, because the sign will change the
2027    // number of useful bits
2028    uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
2029    Mask.lshrInPlace(ShiftAmt);
2030    getUsefulBits(Op, Mask, Depth + 1);
2031    Mask <<= ShiftAmt;
2032  } else
2033    return;
2034
2035  UsefulBits &= Mask;
2036}
2037
2038static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
2039                                 unsigned Depth) {
2040  uint64_t Imm =
2041      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2042  uint64_t MSB =
2043      cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
2044
2045  APInt OpUsefulBits(UsefulBits);
2046  OpUsefulBits = 1;
2047
2048  APInt ResultUsefulBits(UsefulBits.getBitWidth(), 0);
2049  ResultUsefulBits.flipAllBits();
2050  APInt Mask(UsefulBits.getBitWidth(), 0);
2051
2052  getUsefulBits(Op, ResultUsefulBits, Depth + 1);
2053
2054  if (MSB >= Imm) {
2055    // The instruction is a BFXIL.
2056    uint64_t Width = MSB - Imm + 1;
2057    uint64_t LSB = Imm;
2058
2059    OpUsefulBits <<= Width;
2060    --OpUsefulBits;
2061
2062    if (Op.getOperand(1) == Orig) {
2063      // Copy the low bits from the result to bits starting from LSB.
2064      Mask = ResultUsefulBits & OpUsefulBits;
2065      Mask <<= LSB;
2066    }
2067
2068    if (Op.getOperand(0) == Orig)
2069      // Bits starting from LSB in the input contribute to the result.
2070      Mask |= (ResultUsefulBits & ~OpUsefulBits);
2071  } else {
2072    // The instruction is a BFI.
2073    uint64_t Width = MSB + 1;
2074    uint64_t LSB = UsefulBits.getBitWidth() - Imm;
2075
2076    OpUsefulBits <<= Width;
2077    --OpUsefulBits;
2078    OpUsefulBits <<= LSB;
2079
2080    if (Op.getOperand(1) == Orig) {
2081      // Copy the bits from the result to the zero bits.
2082      Mask = ResultUsefulBits & OpUsefulBits;
2083      Mask.lshrInPlace(LSB);
2084    }
2085
2086    if (Op.getOperand(0) == Orig)
2087      Mask |= (ResultUsefulBits & ~OpUsefulBits);
2088  }
2089
2090  UsefulBits &= Mask;
2091}
2092
2093static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
2094                                SDValue Orig, unsigned Depth) {
2095
2096  // Users of this node should have already been instruction selected
2097  // FIXME: Can we turn that into an assert?
2098  if (!UserNode->isMachineOpcode())
2099    return;
2100
2101  switch (UserNode->getMachineOpcode()) {
2102  default:
2103    return;
2104  case AArch64::ANDSWri:
2105  case AArch64::ANDSXri:
2106  case AArch64::ANDWri:
2107  case AArch64::ANDXri:
2108    // We increment Depth only when we call the getUsefulBits
2109    return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
2110                                             Depth);
2111  case AArch64::UBFMWri:
2112  case AArch64::UBFMXri:
2113    return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
2114
2115  case AArch64::ORRWrs:
2116  case AArch64::ORRXrs:
2117    if (UserNode->getOperand(1) != Orig)
2118      return;
2119    return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
2120                                             Depth);
2121  case AArch64::BFMWri:
2122  case AArch64::BFMXri:
2123    return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
2124
2125  case AArch64::STRBBui:
2126  case AArch64::STURBBi:
2127    if (UserNode->getOperand(0) != Orig)
2128      return;
2129    UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
2130    return;
2131
2132  case AArch64::STRHHui:
2133  case AArch64::STURHHi:
2134    if (UserNode->getOperand(0) != Orig)
2135      return;
2136    UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
2137    return;
2138  }
2139}
2140
2141static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
2142  if (Depth >= SelectionDAG::MaxRecursionDepth)
2143    return;
2144  // Initialize UsefulBits
2145  if (!Depth) {
2146    unsigned Bitwidth = Op.getScalarValueSizeInBits();
2147    // At the beginning, assume every produced bits is useful
2148    UsefulBits = APInt(Bitwidth, 0);
2149    UsefulBits.flipAllBits();
2150  }
2151  APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
2152
2153  for (SDNode *Node : Op.getNode()->uses()) {
2154    // A use cannot produce useful bits
2155    APInt UsefulBitsForUse = APInt(UsefulBits);
2156    getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
2157    UsersUsefulBits |= UsefulBitsForUse;
2158  }
2159  // UsefulBits contains the produced bits that are meaningful for the
2160  // current definition, thus a user cannot make a bit meaningful at
2161  // this point
2162  UsefulBits &= UsersUsefulBits;
2163}
2164
2165/// Create a machine node performing a notional SHL of Op by ShlAmount. If
2166/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
2167/// 0, return Op unchanged.
2168static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
2169  if (ShlAmount == 0)
2170    return Op;
2171
2172  EVT VT = Op.getValueType();
2173  SDLoc dl(Op);
2174  unsigned BitWidth = VT.getSizeInBits();
2175  unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
2176
2177  SDNode *ShiftNode;
2178  if (ShlAmount > 0) {
2179    // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
2180    ShiftNode = CurDAG->getMachineNode(
2181        UBFMOpc, dl, VT, Op,
2182        CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT),
2183        CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT));
2184  } else {
2185    // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
2186    assert(ShlAmount < 0 && "expected right shift");
2187    int ShrAmount = -ShlAmount;
2188    ShiftNode = CurDAG->getMachineNode(
2189        UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT),
2190        CurDAG->getTargetConstant(BitWidth - 1, dl, VT));
2191  }
2192
2193  return SDValue(ShiftNode, 0);
2194}
2195
2196/// Does this tree qualify as an attempt to move a bitfield into position,
2197/// essentially "(and (shl VAL, N), Mask)".
2198static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
2199                                    bool BiggerPattern,
2200                                    SDValue &Src, int &ShiftAmount,
2201                                    int &MaskWidth) {
2202  EVT VT = Op.getValueType();
2203  unsigned BitWidth = VT.getSizeInBits();
2204  (void)BitWidth;
2205  assert(BitWidth == 32 || BitWidth == 64);
2206
2207  KnownBits Known = CurDAG->computeKnownBits(Op);
2208
2209  // Non-zero in the sense that they're not provably zero, which is the key
2210  // point if we want to use this value
2211  uint64_t NonZeroBits = (~Known.Zero).getZExtValue();
2212
2213  // Discard a constant AND mask if present. It's safe because the node will
2214  // already have been factored into the computeKnownBits calculation above.
2215  uint64_t AndImm;
2216  if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
2217    assert((~APInt(BitWidth, AndImm) & ~Known.Zero) == 0);
2218    Op = Op.getOperand(0);
2219  }
2220
2221  // Don't match if the SHL has more than one use, since then we'll end up
2222  // generating SHL+UBFIZ instead of just keeping SHL+AND.
2223  if (!BiggerPattern && !Op.hasOneUse())
2224    return false;
2225
2226  uint64_t ShlImm;
2227  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
2228    return false;
2229  Op = Op.getOperand(0);
2230
2231  if (!isShiftedMask_64(NonZeroBits))
2232    return false;
2233
2234  ShiftAmount = countTrailingZeros(NonZeroBits);
2235  MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount);
2236
2237  // BFI encompasses sufficiently many nodes that it's worth inserting an extra
2238  // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
2239  // amount.  BiggerPattern is true when this pattern is being matched for BFI,
2240  // BiggerPattern is false when this pattern is being matched for UBFIZ, in
2241  // which case it is not profitable to insert an extra shift.
2242  if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
2243    return false;
2244  Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
2245
2246  return true;
2247}
2248
2249static bool isShiftedMask(uint64_t Mask, EVT VT) {
2250  assert(VT == MVT::i32 || VT == MVT::i64);
2251  if (VT == MVT::i32)
2252    return isShiftedMask_32(Mask);
2253  return isShiftedMask_64(Mask);
2254}
2255
2256// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
2257// inserted only sets known zero bits.
2258static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
2259  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2260
2261  EVT VT = N->getValueType(0);
2262  if (VT != MVT::i32 && VT != MVT::i64)
2263    return false;
2264
2265  unsigned BitWidth = VT.getSizeInBits();
2266
2267  uint64_t OrImm;
2268  if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
2269    return false;
2270
2271  // Skip this transformation if the ORR immediate can be encoded in the ORR.
2272  // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely
2273  // performance neutral.
2274  if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
2275    return false;
2276
2277  uint64_t MaskImm;
2278  SDValue And = N->getOperand(0);
2279  // Must be a single use AND with an immediate operand.
2280  if (!And.hasOneUse() ||
2281      !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
2282    return false;
2283
2284  // Compute the Known Zero for the AND as this allows us to catch more general
2285  // cases than just looking for AND with imm.
2286  KnownBits Known = CurDAG->computeKnownBits(And);
2287
2288  // Non-zero in the sense that they're not provably zero, which is the key
2289  // point if we want to use this value.
2290  uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
2291
2292  // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
2293  if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
2294    return false;
2295
2296  // The bits being inserted must only set those bits that are known to be zero.
2297  if ((OrImm & NotKnownZero) != 0) {
2298    // FIXME:  It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
2299    // currently handle this case.
2300    return false;
2301  }
2302
2303  // BFI/BFXIL dst, src, #lsb, #width.
2304  int LSB = countTrailingOnes(NotKnownZero);
2305  int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
2306
2307  // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
2308  unsigned ImmR = (BitWidth - LSB) % BitWidth;
2309  unsigned ImmS = Width - 1;
2310
2311  // If we're creating a BFI instruction avoid cases where we need more
2312  // instructions to materialize the BFI constant as compared to the original
2313  // ORR.  A BFXIL will use the same constant as the original ORR, so the code
2314  // should be no worse in this case.
2315  bool IsBFI = LSB != 0;
2316  uint64_t BFIImm = OrImm >> LSB;
2317  if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
2318    // We have a BFI instruction and we know the constant can't be materialized
2319    // with a ORR-immediate with the zero register.
2320    unsigned OrChunks = 0, BFIChunks = 0;
2321    for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
2322      if (((OrImm >> Shift) & 0xFFFF) != 0)
2323        ++OrChunks;
2324      if (((BFIImm >> Shift) & 0xFFFF) != 0)
2325        ++BFIChunks;
2326    }
2327    if (BFIChunks > OrChunks)
2328      return false;
2329  }
2330
2331  // Materialize the constant to be inserted.
2332  SDLoc DL(N);
2333  unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
2334  SDNode *MOVI = CurDAG->getMachineNode(
2335      MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
2336
2337  // Create the BFI/BFXIL instruction.
2338  SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
2339                   CurDAG->getTargetConstant(ImmR, DL, VT),
2340                   CurDAG->getTargetConstant(ImmS, DL, VT)};
2341  unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2342  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2343  return true;
2344}
2345
2346static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
2347                                      SelectionDAG *CurDAG) {
2348  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2349
2350  EVT VT = N->getValueType(0);
2351  if (VT != MVT::i32 && VT != MVT::i64)
2352    return false;
2353
2354  unsigned BitWidth = VT.getSizeInBits();
2355
2356  // Because of simplify-demanded-bits in DAGCombine, involved masks may not
2357  // have the expected shape. Try to undo that.
2358
2359  unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
2360  unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
2361
2362  // Given a OR operation, check if we have the following pattern
2363  // ubfm c, b, imm, imm2 (or something that does the same jobs, see
2364  //                       isBitfieldExtractOp)
2365  // d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
2366  //                 countTrailingZeros(mask2) == imm2 - imm + 1
2367  // f = d | c
2368  // if yes, replace the OR instruction with:
2369  // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
2370
2371  // OR is commutative, check all combinations of operand order and values of
2372  // BiggerPattern, i.e.
2373  //     Opd0, Opd1, BiggerPattern=false
2374  //     Opd1, Opd0, BiggerPattern=false
2375  //     Opd0, Opd1, BiggerPattern=true
2376  //     Opd1, Opd0, BiggerPattern=true
2377  // Several of these combinations may match, so check with BiggerPattern=false
2378  // first since that will produce better results by matching more instructions
2379  // and/or inserting fewer extra instructions.
2380  for (int I = 0; I < 4; ++I) {
2381
2382    SDValue Dst, Src;
2383    unsigned ImmR, ImmS;
2384    bool BiggerPattern = I / 2;
2385    SDValue OrOpd0Val = N->getOperand(I % 2);
2386    SDNode *OrOpd0 = OrOpd0Val.getNode();
2387    SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
2388    SDNode *OrOpd1 = OrOpd1Val.getNode();
2389
2390    unsigned BFXOpc;
2391    int DstLSB, Width;
2392    if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
2393                            NumberOfIgnoredLowBits, BiggerPattern)) {
2394      // Check that the returned opcode is compatible with the pattern,
2395      // i.e., same type and zero extended (U and not S)
2396      if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
2397          (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
2398        continue;
2399
2400      // Compute the width of the bitfield insertion
2401      DstLSB = 0;
2402      Width = ImmS - ImmR + 1;
2403      // FIXME: This constraint is to catch bitfield insertion we may
2404      // want to widen the pattern if we want to grab general bitfied
2405      // move case
2406      if (Width <= 0)
2407        continue;
2408
2409      // If the mask on the insertee is correct, we have a BFXIL operation. We
2410      // can share the ImmR and ImmS values from the already-computed UBFM.
2411    } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
2412                                       BiggerPattern,
2413                                       Src, DstLSB, Width)) {
2414      ImmR = (BitWidth - DstLSB) % BitWidth;
2415      ImmS = Width - 1;
2416    } else
2417      continue;
2418
2419    // Check the second part of the pattern
2420    EVT VT = OrOpd1Val.getValueType();
2421    assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
2422
2423    // Compute the Known Zero for the candidate of the first operand.
2424    // This allows to catch more general case than just looking for
2425    // AND with imm. Indeed, simplify-demanded-bits may have removed
2426    // the AND instruction because it proves it was useless.
2427    KnownBits Known = CurDAG->computeKnownBits(OrOpd1Val);
2428
2429    // Check if there is enough room for the second operand to appear
2430    // in the first one
2431    APInt BitsToBeInserted =
2432        APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width);
2433
2434    if ((BitsToBeInserted & ~Known.Zero) != 0)
2435      continue;
2436
2437    // Set the first operand
2438    uint64_t Imm;
2439    if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
2440        isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
2441      // In that case, we can eliminate the AND
2442      Dst = OrOpd1->getOperand(0);
2443    else
2444      // Maybe the AND has been removed by simplify-demanded-bits
2445      // or is useful because it discards more bits
2446      Dst = OrOpd1Val;
2447
2448    // both parts match
2449    SDLoc DL(N);
2450    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
2451                     CurDAG->getTargetConstant(ImmS, DL, VT)};
2452    unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2453    CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2454    return true;
2455  }
2456
2457  // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
2458  // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
2459  // mask (e.g., 0x000ffff0).
2460  uint64_t Mask0Imm, Mask1Imm;
2461  SDValue And0 = N->getOperand(0);
2462  SDValue And1 = N->getOperand(1);
2463  if (And0.hasOneUse() && And1.hasOneUse() &&
2464      isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
2465      isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
2466      APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
2467      (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
2468
2469    // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
2470    // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
2471    // bits to be inserted.
2472    if (isShiftedMask(Mask0Imm, VT)) {
2473      std::swap(And0, And1);
2474      std::swap(Mask0Imm, Mask1Imm);
2475    }
2476
2477    SDValue Src = And1->getOperand(0);
2478    SDValue Dst = And0->getOperand(0);
2479    unsigned LSB = countTrailingZeros(Mask1Imm);
2480    int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
2481
2482    // The BFXIL inserts the low-order bits from a source register, so right
2483    // shift the needed bits into place.
2484    SDLoc DL(N);
2485    unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
2486    SDNode *LSR = CurDAG->getMachineNode(
2487        ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
2488        CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
2489
2490    // BFXIL is an alias of BFM, so translate to BFM operands.
2491    unsigned ImmR = (BitWidth - LSB) % BitWidth;
2492    unsigned ImmS = Width - 1;
2493
2494    // Create the BFXIL instruction.
2495    SDValue Ops[] = {Dst, SDValue(LSR, 0),
2496                     CurDAG->getTargetConstant(ImmR, DL, VT),
2497                     CurDAG->getTargetConstant(ImmS, DL, VT)};
2498    unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2499    CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2500    return true;
2501  }
2502
2503  return false;
2504}
2505
2506bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
2507  if (N->getOpcode() != ISD::OR)
2508    return false;
2509
2510  APInt NUsefulBits;
2511  getUsefulBits(SDValue(N, 0), NUsefulBits);
2512
2513  // If all bits are not useful, just return UNDEF.
2514  if (!NUsefulBits) {
2515    CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
2516    return true;
2517  }
2518
2519  if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
2520    return true;
2521
2522  return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
2523}
2524
2525/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
2526/// equivalent of a left shift by a constant amount followed by an and masking
2527/// out a contiguous set of bits.
2528bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
2529  if (N->getOpcode() != ISD::AND)
2530    return false;
2531
2532  EVT VT = N->getValueType(0);
2533  if (VT != MVT::i32 && VT != MVT::i64)
2534    return false;
2535
2536  SDValue Op0;
2537  int DstLSB, Width;
2538  if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
2539                               Op0, DstLSB, Width))
2540    return false;
2541
2542  // ImmR is the rotate right amount.
2543  unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
2544  // ImmS is the most significant bit of the source to be moved.
2545  unsigned ImmS = Width - 1;
2546
2547  SDLoc DL(N);
2548  SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
2549                   CurDAG->getTargetConstant(ImmS, DL, VT)};
2550  unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
2551  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2552  return true;
2553}
2554
2555/// tryShiftAmountMod - Take advantage of built-in mod of shift amount in
2556/// variable shift/rotate instructions.
2557bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
2558  EVT VT = N->getValueType(0);
2559
2560  unsigned Opc;
2561  switch (N->getOpcode()) {
2562  case ISD::ROTR:
2563    Opc = (VT == MVT::i32) ? AArch64::RORVWr : AArch64::RORVXr;
2564    break;
2565  case ISD::SHL:
2566    Opc = (VT == MVT::i32) ? AArch64::LSLVWr : AArch64::LSLVXr;
2567    break;
2568  case ISD::SRL:
2569    Opc = (VT == MVT::i32) ? AArch64::LSRVWr : AArch64::LSRVXr;
2570    break;
2571  case ISD::SRA:
2572    Opc = (VT == MVT::i32) ? AArch64::ASRVWr : AArch64::ASRVXr;
2573    break;
2574  default:
2575    return false;
2576  }
2577
2578  uint64_t Size;
2579  uint64_t Bits;
2580  if (VT == MVT::i32) {
2581    Bits = 5;
2582    Size = 32;
2583  } else if (VT == MVT::i64) {
2584    Bits = 6;
2585    Size = 64;
2586  } else
2587    return false;
2588
2589  SDValue ShiftAmt = N->getOperand(1);
2590  SDLoc DL(N);
2591  SDValue NewShiftAmt;
2592
2593  // Skip over an extend of the shift amount.
2594  if (ShiftAmt->getOpcode() == ISD::ZERO_EXTEND ||
2595      ShiftAmt->getOpcode() == ISD::ANY_EXTEND)
2596    ShiftAmt = ShiftAmt->getOperand(0);
2597
2598  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
2599    SDValue Add0 = ShiftAmt->getOperand(0);
2600    SDValue Add1 = ShiftAmt->getOperand(1);
2601    uint64_t Add0Imm;
2602    uint64_t Add1Imm;
2603    // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
2604    // to avoid the ADD/SUB.
2605    if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0))
2606      NewShiftAmt = Add0;
2607    // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
2608    // generate a NEG instead of a SUB of a constant.
2609    else if (ShiftAmt->getOpcode() == ISD::SUB &&
2610             isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
2611             (Add0Imm % Size == 0)) {
2612      unsigned NegOpc;
2613      unsigned ZeroReg;
2614      EVT SubVT = ShiftAmt->getValueType(0);
2615      if (SubVT == MVT::i32) {
2616        NegOpc = AArch64::SUBWrr;
2617        ZeroReg = AArch64::WZR;
2618      } else {
2619        assert(SubVT == MVT::i64);
2620        NegOpc = AArch64::SUBXrr;
2621        ZeroReg = AArch64::XZR;
2622      }
2623      SDValue Zero =
2624          CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
2625      MachineSDNode *Neg =
2626          CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
2627      NewShiftAmt = SDValue(Neg, 0);
2628    } else
2629      return false;
2630  } else {
2631    // If the shift amount is masked with an AND, check that the mask covers the
2632    // bits that are implicitly ANDed off by the above opcodes and if so, skip
2633    // the AND.
2634    uint64_t MaskImm;
2635    if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
2636      return false;
2637
2638    if (countTrailingOnes(MaskImm) < Bits)
2639      return false;
2640
2641    NewShiftAmt = ShiftAmt->getOperand(0);
2642  }
2643
2644  // Narrow/widen the shift amount to match the size of the shift operation.
2645  if (VT == MVT::i32)
2646    NewShiftAmt = narrowIfNeeded(CurDAG, NewShiftAmt);
2647  else if (VT == MVT::i64 && NewShiftAmt->getValueType(0) == MVT::i32) {
2648    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, DL, MVT::i32);
2649    MachineSDNode *Ext = CurDAG->getMachineNode(
2650        AArch64::SUBREG_TO_REG, DL, VT,
2651        CurDAG->getTargetConstant(0, DL, MVT::i64), NewShiftAmt, SubReg);
2652    NewShiftAmt = SDValue(Ext, 0);
2653  }
2654
2655  SDValue Ops[] = {N->getOperand(0), NewShiftAmt};
2656  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2657  return true;
2658}
2659
2660bool
2661AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
2662                                              unsigned RegWidth) {
2663  APFloat FVal(0.0);
2664  if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
2665    FVal = CN->getValueAPF();
2666  else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
2667    // Some otherwise illegal constants are allowed in this case.
2668    if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
2669        !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
2670      return false;
2671
2672    ConstantPoolSDNode *CN =
2673        dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
2674    FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
2675  } else
2676    return false;
2677
2678  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
2679  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
2680  // x-register.
2681  //
2682  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
2683  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
2684  // integers.
2685  bool IsExact;
2686
2687  // fbits is between 1 and 64 in the worst-case, which means the fmul
2688  // could have 2^64 as an actual operand. Need 65 bits of precision.
2689  APSInt IntVal(65, true);
2690  FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
2691
2692  // N.b. isPowerOf2 also checks for > 0.
2693  if (!IsExact || !IntVal.isPowerOf2()) return false;
2694  unsigned FBits = IntVal.logBase2();
2695
2696  // Checks above should have guaranteed that we haven't lost information in
2697  // finding FBits, but it must still be in range.
2698  if (FBits == 0 || FBits > RegWidth) return false;
2699
2700  FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
2701  return true;
2702}
2703
2704// Inspects a register string of the form o0:op1:CRn:CRm:op2 gets the fields
2705// of the string and obtains the integer values from them and combines these
2706// into a single value to be used in the MRS/MSR instruction.
2707static int getIntOperandFromRegisterString(StringRef RegString) {
2708  SmallVector<StringRef, 5> Fields;
2709  RegString.split(Fields, ':');
2710
2711  if (Fields.size() == 1)
2712    return -1;
2713
2714  assert(Fields.size() == 5
2715            && "Invalid number of fields in read register string");
2716
2717  SmallVector<int, 5> Ops;
2718  bool AllIntFields = true;
2719
2720  for (StringRef Field : Fields) {
2721    unsigned IntField;
2722    AllIntFields &= !Field.getAsInteger(10, IntField);
2723    Ops.push_back(IntField);
2724  }
2725
2726  assert(AllIntFields &&
2727          "Unexpected non-integer value in special register string.");
2728
2729  // Need to combine the integer fields of the string into a single value
2730  // based on the bit encoding of MRS/MSR instruction.
2731  return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
2732         (Ops[3] << 3) | (Ops[4]);
2733}
2734
2735// Lower the read_register intrinsic to an MRS instruction node if the special
2736// register string argument is either of the form detailed in the ALCE (the
2737// form described in getIntOperandsFromRegsterString) or is a named register
2738// known by the MRS SysReg mapper.
2739bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
2740  const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
2741  const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
2742  SDLoc DL(N);
2743
2744  int Reg = getIntOperandFromRegisterString(RegString->getString());
2745  if (Reg != -1) {
2746    ReplaceNode(N, CurDAG->getMachineNode(
2747                       AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
2748                       CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2749                       N->getOperand(0)));
2750    return true;
2751  }
2752
2753  // Use the sysreg mapper to map the remaining possible strings to the
2754  // value for the register to be used for the instruction operand.
2755  auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
2756  if (TheReg && TheReg->Readable &&
2757      TheReg->haveFeatures(Subtarget->getFeatureBits()))
2758    Reg = TheReg->Encoding;
2759  else
2760    Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
2761
2762  if (Reg != -1) {
2763    ReplaceNode(N, CurDAG->getMachineNode(
2764                       AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
2765                       CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2766                       N->getOperand(0)));
2767    return true;
2768  }
2769
2770  if (RegString->getString() == "pc") {
2771    ReplaceNode(N, CurDAG->getMachineNode(
2772                       AArch64::ADR, DL, N->getSimpleValueType(0), MVT::Other,
2773                       CurDAG->getTargetConstant(0, DL, MVT::i32),
2774                       N->getOperand(0)));
2775    return true;
2776  }
2777
2778  return false;
2779}
2780
2781// Lower the write_register intrinsic to an MSR instruction node if the special
2782// register string argument is either of the form detailed in the ALCE (the
2783// form described in getIntOperandsFromRegsterString) or is a named register
2784// known by the MSR SysReg mapper.
2785bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
2786  const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
2787  const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
2788  SDLoc DL(N);
2789
2790  int Reg = getIntOperandFromRegisterString(RegString->getString());
2791  if (Reg != -1) {
2792    ReplaceNode(
2793        N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
2794                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2795                                  N->getOperand(2), N->getOperand(0)));
2796    return true;
2797  }
2798
2799  // Check if the register was one of those allowed as the pstatefield value in
2800  // the MSR (immediate) instruction. To accept the values allowed in the
2801  // pstatefield for the MSR (immediate) instruction, we also require that an
2802  // immediate value has been provided as an argument, we know that this is
2803  // the case as it has been ensured by semantic checking.
2804  auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
2805  if (PMapper) {
2806    assert (isa<ConstantSDNode>(N->getOperand(2))
2807              && "Expected a constant integer expression.");
2808    unsigned Reg = PMapper->Encoding;
2809    uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
2810    unsigned State;
2811    if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO || Reg == AArch64PState::SSBS) {
2812      assert(Immed < 2 && "Bad imm");
2813      State = AArch64::MSRpstateImm1;
2814    } else {
2815      assert(Immed < 16 && "Bad imm");
2816      State = AArch64::MSRpstateImm4;
2817    }
2818    ReplaceNode(N, CurDAG->getMachineNode(
2819                       State, DL, MVT::Other,
2820                       CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2821                       CurDAG->getTargetConstant(Immed, DL, MVT::i16),
2822                       N->getOperand(0)));
2823    return true;
2824  }
2825
2826  // Use the sysreg mapper to attempt to map the remaining possible strings
2827  // to the value for the register to be used for the MSR (register)
2828  // instruction operand.
2829  auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
2830  if (TheReg && TheReg->Writeable &&
2831      TheReg->haveFeatures(Subtarget->getFeatureBits()))
2832    Reg = TheReg->Encoding;
2833  else
2834    Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
2835  if (Reg != -1) {
2836    ReplaceNode(N, CurDAG->getMachineNode(
2837                       AArch64::MSR, DL, MVT::Other,
2838                       CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2839                       N->getOperand(2), N->getOperand(0)));
2840    return true;
2841  }
2842
2843  return false;
2844}
2845
2846/// We've got special pseudo-instructions for these
2847bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
2848  unsigned Opcode;
2849  EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
2850
2851  // Leave IR for LSE if subtarget supports it.
2852  if (Subtarget->hasLSE()) return false;
2853
2854  if (MemTy == MVT::i8)
2855    Opcode = AArch64::CMP_SWAP_8;
2856  else if (MemTy == MVT::i16)
2857    Opcode = AArch64::CMP_SWAP_16;
2858  else if (MemTy == MVT::i32)
2859    Opcode = AArch64::CMP_SWAP_32;
2860  else if (MemTy == MVT::i64)
2861    Opcode = AArch64::CMP_SWAP_64;
2862  else
2863    llvm_unreachable("Unknown AtomicCmpSwap type");
2864
2865  MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
2866  SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
2867                   N->getOperand(0)};
2868  SDNode *CmpSwap = CurDAG->getMachineNode(
2869      Opcode, SDLoc(N),
2870      CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
2871
2872  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
2873  CurDAG->setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
2874
2875  ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
2876  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
2877  CurDAG->RemoveDeadNode(N);
2878
2879  return true;
2880}
2881
2882bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
2883  if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
2884    const int64_t ImmVal = CNode->getZExtValue();
2885    SDLoc DL(N);
2886
2887    switch (VT.SimpleTy) {
2888    case MVT::i8:
2889      if ((ImmVal & 0xFF) == ImmVal) {
2890        Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
2891        Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
2892        return true;
2893      }
2894      break;
2895    case MVT::i16:
2896    case MVT::i32:
2897    case MVT::i64:
2898      if ((ImmVal & 0xFF) == ImmVal) {
2899        Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
2900        Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
2901        return true;
2902      } else if ((ImmVal & 0xFF00) == ImmVal) {
2903        Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
2904        Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32);
2905        return true;
2906      }
2907      break;
2908    default:
2909      break;
2910    }
2911  }
2912
2913  return false;
2914}
2915
2916bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
2917  if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
2918    int64_t ImmVal = CNode->getSExtValue();
2919    SDLoc DL(N);
2920    if (ImmVal >= -127 && ImmVal < 127) {
2921      Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
2922      return true;
2923    }
2924  }
2925  return false;
2926}
2927
2928bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, SDValue &Imm) {
2929  if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
2930    uint64_t ImmVal = CNode->getSExtValue();
2931    SDLoc DL(N);
2932    ImmVal = ImmVal & 0xFF;
2933    if (ImmVal < 256) {
2934      Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
2935      return true;
2936    }
2937  }
2938  return false;
2939}
2940
2941bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
2942  if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
2943    uint64_t ImmVal = CNode->getZExtValue();
2944    SDLoc DL(N);
2945
2946    // Shift mask depending on type size.
2947    switch (VT.SimpleTy) {
2948      case MVT::i8:
2949        ImmVal &= 0xFF;
2950        ImmVal |= ImmVal << 8;
2951        ImmVal |= ImmVal << 16;
2952        ImmVal |= ImmVal << 32;
2953        break;
2954      case MVT::i16:
2955        ImmVal &= 0xFFFF;
2956        ImmVal |= ImmVal << 16;
2957        ImmVal |= ImmVal << 32;
2958        break;
2959      case MVT::i32:
2960        ImmVal &= 0xFFFFFFFF;
2961        ImmVal |= ImmVal << 32;
2962        break;
2963      case MVT::i64:
2964        break;
2965      default:
2966        llvm_unreachable("Unexpected type");
2967    }
2968
2969    uint64_t encoding;
2970    if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) {
2971      Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64);
2972      return true;
2973    }
2974  }
2975  return false;
2976}
2977
2978bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
2979  // tagp(FrameIndex, IRGstack, tag_offset):
2980  // since the offset between FrameIndex and IRGstack is a compile-time
2981  // constant, this can be lowered to a single ADDG instruction.
2982  if (!(isa<FrameIndexSDNode>(N->getOperand(1)))) {
2983    return false;
2984  }
2985
2986  SDValue IRG_SP = N->getOperand(2);
2987  if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
2988      cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
2989          Intrinsic::aarch64_irg_sp) {
2990    return false;
2991  }
2992
2993  const TargetLowering *TLI = getTargetLowering();
2994  SDLoc DL(N);
2995  int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
2996  SDValue FiOp = CurDAG->getTargetFrameIndex(
2997      FI, TLI->getPointerTy(CurDAG->getDataLayout()));
2998  int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
2999
3000  SDNode *Out = CurDAG->getMachineNode(
3001      AArch64::TAGPstack, DL, MVT::i64,
3002      {FiOp, CurDAG->getTargetConstant(0, DL, MVT::i64), N->getOperand(2),
3003       CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
3004  ReplaceNode(N, Out);
3005  return true;
3006}
3007
3008void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
3009  assert(isa<ConstantSDNode>(N->getOperand(3)) &&
3010         "llvm.aarch64.tagp third argument must be an immediate");
3011  if (trySelectStackSlotTagP(N))
3012    return;
3013  // FIXME: above applies in any case when offset between Op1 and Op2 is a
3014  // compile-time constant, not just for stack allocations.
3015
3016  // General case for unrelated pointers in Op1 and Op2.
3017  SDLoc DL(N);
3018  int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
3019  SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64,
3020                                      {N->getOperand(1), N->getOperand(2)});
3021  SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64,
3022                                      {SDValue(N1, 0), N->getOperand(2)});
3023  SDNode *N3 = CurDAG->getMachineNode(
3024      AArch64::ADDG, DL, MVT::i64,
3025      {SDValue(N2, 0), CurDAG->getTargetConstant(0, DL, MVT::i64),
3026       CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
3027  ReplaceNode(N, N3);
3028}
3029
3030void AArch64DAGToDAGISel::Select(SDNode *Node) {
3031  // If we have a custom node, we already have selected!
3032  if (Node->isMachineOpcode()) {
3033    LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
3034    Node->setNodeId(-1);
3035    return;
3036  }
3037
3038  // Few custom selection stuff.
3039  EVT VT = Node->getValueType(0);
3040
3041  switch (Node->getOpcode()) {
3042  default:
3043    break;
3044
3045  case ISD::ATOMIC_CMP_SWAP:
3046    if (SelectCMP_SWAP(Node))
3047      return;
3048    break;
3049
3050  case ISD::READ_REGISTER:
3051    if (tryReadRegister(Node))
3052      return;
3053    break;
3054
3055  case ISD::WRITE_REGISTER:
3056    if (tryWriteRegister(Node))
3057      return;
3058    break;
3059
3060  case ISD::ADD:
3061    if (tryMLAV64LaneV128(Node))
3062      return;
3063    break;
3064
3065  case ISD::LOAD: {
3066    // Try to select as an indexed load. Fall through to normal processing
3067    // if we can't.
3068    if (tryIndexedLoad(Node))
3069      return;
3070    break;
3071  }
3072
3073  case ISD::SRL:
3074  case ISD::AND:
3075  case ISD::SRA:
3076  case ISD::SIGN_EXTEND_INREG:
3077    if (tryBitfieldExtractOp(Node))
3078      return;
3079    if (tryBitfieldInsertInZeroOp(Node))
3080      return;
3081    LLVM_FALLTHROUGH;
3082  case ISD::ROTR:
3083  case ISD::SHL:
3084    if (tryShiftAmountMod(Node))
3085      return;
3086    break;
3087
3088  case ISD::SIGN_EXTEND:
3089    if (tryBitfieldExtractOpFromSExt(Node))
3090      return;
3091    break;
3092
3093  case ISD::FP_EXTEND:
3094    if (tryHighFPExt(Node))
3095      return;
3096    break;
3097
3098  case ISD::OR:
3099    if (tryBitfieldInsertOp(Node))
3100      return;
3101    break;
3102
3103  case ISD::Constant: {
3104    // Materialize zero constants as copies from WZR/XZR.  This allows
3105    // the coalescer to propagate these into other instructions.
3106    ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
3107    if (ConstNode->isNullValue()) {
3108      if (VT == MVT::i32) {
3109        SDValue New = CurDAG->getCopyFromReg(
3110            CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
3111        ReplaceNode(Node, New.getNode());
3112        return;
3113      } else if (VT == MVT::i64) {
3114        SDValue New = CurDAG->getCopyFromReg(
3115            CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
3116        ReplaceNode(Node, New.getNode());
3117        return;
3118      }
3119    }
3120    break;
3121  }
3122
3123  case ISD::FrameIndex: {
3124    // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
3125    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
3126    unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
3127    const TargetLowering *TLI = getTargetLowering();
3128    SDValue TFI = CurDAG->getTargetFrameIndex(
3129        FI, TLI->getPointerTy(CurDAG->getDataLayout()));
3130    SDLoc DL(Node);
3131    SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
3132                      CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
3133    CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
3134    return;
3135  }
3136  case ISD::INTRINSIC_W_CHAIN: {
3137    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
3138    switch (IntNo) {
3139    default:
3140      break;
3141    case Intrinsic::aarch64_ldaxp:
3142    case Intrinsic::aarch64_ldxp: {
3143      unsigned Op =
3144          IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
3145      SDValue MemAddr = Node->getOperand(2);
3146      SDLoc DL(Node);
3147      SDValue Chain = Node->getOperand(0);
3148
3149      SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
3150                                          MVT::Other, MemAddr, Chain);
3151
3152      // Transfer memoperands.
3153      MachineMemOperand *MemOp =
3154          cast<MemIntrinsicSDNode>(Node)->getMemOperand();
3155      CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
3156      ReplaceNode(Node, Ld);
3157      return;
3158    }
3159    case Intrinsic::aarch64_stlxp:
3160    case Intrinsic::aarch64_stxp: {
3161      unsigned Op =
3162          IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
3163      SDLoc DL(Node);
3164      SDValue Chain = Node->getOperand(0);
3165      SDValue ValLo = Node->getOperand(2);
3166      SDValue ValHi = Node->getOperand(3);
3167      SDValue MemAddr = Node->getOperand(4);
3168
3169      // Place arguments in the right order.
3170      SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
3171
3172      SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
3173      // Transfer memoperands.
3174      MachineMemOperand *MemOp =
3175          cast<MemIntrinsicSDNode>(Node)->getMemOperand();
3176      CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
3177
3178      ReplaceNode(Node, St);
3179      return;
3180    }
3181    case Intrinsic::aarch64_neon_ld1x2:
3182      if (VT == MVT::v8i8) {
3183        SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
3184        return;
3185      } else if (VT == MVT::v16i8) {
3186        SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
3187        return;
3188      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3189        SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
3190        return;
3191      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3192        SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
3193        return;
3194      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3195        SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
3196        return;
3197      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3198        SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
3199        return;
3200      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3201        SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
3202        return;
3203      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3204        SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
3205        return;
3206      }
3207      break;
3208    case Intrinsic::aarch64_neon_ld1x3:
3209      if (VT == MVT::v8i8) {
3210        SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
3211        return;
3212      } else if (VT == MVT::v16i8) {
3213        SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
3214        return;
3215      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3216        SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
3217        return;
3218      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3219        SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
3220        return;
3221      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3222        SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
3223        return;
3224      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3225        SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
3226        return;
3227      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3228        SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
3229        return;
3230      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3231        SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
3232        return;
3233      }
3234      break;
3235    case Intrinsic::aarch64_neon_ld1x4:
3236      if (VT == MVT::v8i8) {
3237        SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
3238        return;
3239      } else if (VT == MVT::v16i8) {
3240        SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
3241        return;
3242      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3243        SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
3244        return;
3245      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3246        SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
3247        return;
3248      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3249        SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
3250        return;
3251      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3252        SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
3253        return;
3254      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3255        SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
3256        return;
3257      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3258        SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
3259        return;
3260      }
3261      break;
3262    case Intrinsic::aarch64_neon_ld2:
3263      if (VT == MVT::v8i8) {
3264        SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
3265        return;
3266      } else if (VT == MVT::v16i8) {
3267        SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
3268        return;
3269      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3270        SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
3271        return;
3272      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3273        SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
3274        return;
3275      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3276        SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
3277        return;
3278      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3279        SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
3280        return;
3281      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3282        SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
3283        return;
3284      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3285        SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
3286        return;
3287      }
3288      break;
3289    case Intrinsic::aarch64_neon_ld3:
3290      if (VT == MVT::v8i8) {
3291        SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
3292        return;
3293      } else if (VT == MVT::v16i8) {
3294        SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
3295        return;
3296      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3297        SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
3298        return;
3299      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3300        SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
3301        return;
3302      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3303        SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
3304        return;
3305      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3306        SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
3307        return;
3308      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3309        SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
3310        return;
3311      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3312        SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
3313        return;
3314      }
3315      break;
3316    case Intrinsic::aarch64_neon_ld4:
3317      if (VT == MVT::v8i8) {
3318        SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
3319        return;
3320      } else if (VT == MVT::v16i8) {
3321        SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
3322        return;
3323      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3324        SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
3325        return;
3326      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3327        SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
3328        return;
3329      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3330        SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
3331        return;
3332      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3333        SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
3334        return;
3335      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3336        SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
3337        return;
3338      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3339        SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
3340        return;
3341      }
3342      break;
3343    case Intrinsic::aarch64_neon_ld2r:
3344      if (VT == MVT::v8i8) {
3345        SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
3346        return;
3347      } else if (VT == MVT::v16i8) {
3348        SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
3349        return;
3350      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3351        SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
3352        return;
3353      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3354        SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
3355        return;
3356      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3357        SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
3358        return;
3359      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3360        SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
3361        return;
3362      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3363        SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
3364        return;
3365      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3366        SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
3367        return;
3368      }
3369      break;
3370    case Intrinsic::aarch64_neon_ld3r:
3371      if (VT == MVT::v8i8) {
3372        SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
3373        return;
3374      } else if (VT == MVT::v16i8) {
3375        SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
3376        return;
3377      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3378        SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
3379        return;
3380      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3381        SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
3382        return;
3383      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3384        SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
3385        return;
3386      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3387        SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
3388        return;
3389      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3390        SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
3391        return;
3392      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3393        SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
3394        return;
3395      }
3396      break;
3397    case Intrinsic::aarch64_neon_ld4r:
3398      if (VT == MVT::v8i8) {
3399        SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
3400        return;
3401      } else if (VT == MVT::v16i8) {
3402        SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
3403        return;
3404      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3405        SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
3406        return;
3407      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3408        SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
3409        return;
3410      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3411        SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
3412        return;
3413      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3414        SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
3415        return;
3416      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3417        SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
3418        return;
3419      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3420        SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
3421        return;
3422      }
3423      break;
3424    case Intrinsic::aarch64_neon_ld2lane:
3425      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3426        SelectLoadLane(Node, 2, AArch64::LD2i8);
3427        return;
3428      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3429                 VT == MVT::v8f16) {
3430        SelectLoadLane(Node, 2, AArch64::LD2i16);
3431        return;
3432      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3433                 VT == MVT::v2f32) {
3434        SelectLoadLane(Node, 2, AArch64::LD2i32);
3435        return;
3436      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3437                 VT == MVT::v1f64) {
3438        SelectLoadLane(Node, 2, AArch64::LD2i64);
3439        return;
3440      }
3441      break;
3442    case Intrinsic::aarch64_neon_ld3lane:
3443      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3444        SelectLoadLane(Node, 3, AArch64::LD3i8);
3445        return;
3446      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3447                 VT == MVT::v8f16) {
3448        SelectLoadLane(Node, 3, AArch64::LD3i16);
3449        return;
3450      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3451                 VT == MVT::v2f32) {
3452        SelectLoadLane(Node, 3, AArch64::LD3i32);
3453        return;
3454      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3455                 VT == MVT::v1f64) {
3456        SelectLoadLane(Node, 3, AArch64::LD3i64);
3457        return;
3458      }
3459      break;
3460    case Intrinsic::aarch64_neon_ld4lane:
3461      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3462        SelectLoadLane(Node, 4, AArch64::LD4i8);
3463        return;
3464      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3465                 VT == MVT::v8f16) {
3466        SelectLoadLane(Node, 4, AArch64::LD4i16);
3467        return;
3468      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3469                 VT == MVT::v2f32) {
3470        SelectLoadLane(Node, 4, AArch64::LD4i32);
3471        return;
3472      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3473                 VT == MVT::v1f64) {
3474        SelectLoadLane(Node, 4, AArch64::LD4i64);
3475        return;
3476      }
3477      break;
3478    }
3479  } break;
3480  case ISD::INTRINSIC_WO_CHAIN: {
3481    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
3482    switch (IntNo) {
3483    default:
3484      break;
3485    case Intrinsic::aarch64_tagp:
3486      SelectTagP(Node);
3487      return;
3488    case Intrinsic::aarch64_neon_tbl2:
3489      SelectTable(Node, 2,
3490                  VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
3491                  false);
3492      return;
3493    case Intrinsic::aarch64_neon_tbl3:
3494      SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
3495                                           : AArch64::TBLv16i8Three,
3496                  false);
3497      return;
3498    case Intrinsic::aarch64_neon_tbl4:
3499      SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
3500                                           : AArch64::TBLv16i8Four,
3501                  false);
3502      return;
3503    case Intrinsic::aarch64_neon_tbx2:
3504      SelectTable(Node, 2,
3505                  VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
3506                  true);
3507      return;
3508    case Intrinsic::aarch64_neon_tbx3:
3509      SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
3510                                           : AArch64::TBXv16i8Three,
3511                  true);
3512      return;
3513    case Intrinsic::aarch64_neon_tbx4:
3514      SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
3515                                           : AArch64::TBXv16i8Four,
3516                  true);
3517      return;
3518    case Intrinsic::aarch64_neon_smull:
3519    case Intrinsic::aarch64_neon_umull:
3520      if (tryMULLV64LaneV128(IntNo, Node))
3521        return;
3522      break;
3523    }
3524    break;
3525  }
3526  case ISD::INTRINSIC_VOID: {
3527    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
3528    if (Node->getNumOperands() >= 3)
3529      VT = Node->getOperand(2)->getValueType(0);
3530    switch (IntNo) {
3531    default:
3532      break;
3533    case Intrinsic::aarch64_neon_st1x2: {
3534      if (VT == MVT::v8i8) {
3535        SelectStore(Node, 2, AArch64::ST1Twov8b);
3536        return;
3537      } else if (VT == MVT::v16i8) {
3538        SelectStore(Node, 2, AArch64::ST1Twov16b);
3539        return;
3540      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3541        SelectStore(Node, 2, AArch64::ST1Twov4h);
3542        return;
3543      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3544        SelectStore(Node, 2, AArch64::ST1Twov8h);
3545        return;
3546      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3547        SelectStore(Node, 2, AArch64::ST1Twov2s);
3548        return;
3549      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3550        SelectStore(Node, 2, AArch64::ST1Twov4s);
3551        return;
3552      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3553        SelectStore(Node, 2, AArch64::ST1Twov2d);
3554        return;
3555      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3556        SelectStore(Node, 2, AArch64::ST1Twov1d);
3557        return;
3558      }
3559      break;
3560    }
3561    case Intrinsic::aarch64_neon_st1x3: {
3562      if (VT == MVT::v8i8) {
3563        SelectStore(Node, 3, AArch64::ST1Threev8b);
3564        return;
3565      } else if (VT == MVT::v16i8) {
3566        SelectStore(Node, 3, AArch64::ST1Threev16b);
3567        return;
3568      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3569        SelectStore(Node, 3, AArch64::ST1Threev4h);
3570        return;
3571      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3572        SelectStore(Node, 3, AArch64::ST1Threev8h);
3573        return;
3574      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3575        SelectStore(Node, 3, AArch64::ST1Threev2s);
3576        return;
3577      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3578        SelectStore(Node, 3, AArch64::ST1Threev4s);
3579        return;
3580      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3581        SelectStore(Node, 3, AArch64::ST1Threev2d);
3582        return;
3583      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3584        SelectStore(Node, 3, AArch64::ST1Threev1d);
3585        return;
3586      }
3587      break;
3588    }
3589    case Intrinsic::aarch64_neon_st1x4: {
3590      if (VT == MVT::v8i8) {
3591        SelectStore(Node, 4, AArch64::ST1Fourv8b);
3592        return;
3593      } else if (VT == MVT::v16i8) {
3594        SelectStore(Node, 4, AArch64::ST1Fourv16b);
3595        return;
3596      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3597        SelectStore(Node, 4, AArch64::ST1Fourv4h);
3598        return;
3599      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3600        SelectStore(Node, 4, AArch64::ST1Fourv8h);
3601        return;
3602      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3603        SelectStore(Node, 4, AArch64::ST1Fourv2s);
3604        return;
3605      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3606        SelectStore(Node, 4, AArch64::ST1Fourv4s);
3607        return;
3608      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3609        SelectStore(Node, 4, AArch64::ST1Fourv2d);
3610        return;
3611      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3612        SelectStore(Node, 4, AArch64::ST1Fourv1d);
3613        return;
3614      }
3615      break;
3616    }
3617    case Intrinsic::aarch64_neon_st2: {
3618      if (VT == MVT::v8i8) {
3619        SelectStore(Node, 2, AArch64::ST2Twov8b);
3620        return;
3621      } else if (VT == MVT::v16i8) {
3622        SelectStore(Node, 2, AArch64::ST2Twov16b);
3623        return;
3624      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3625        SelectStore(Node, 2, AArch64::ST2Twov4h);
3626        return;
3627      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3628        SelectStore(Node, 2, AArch64::ST2Twov8h);
3629        return;
3630      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3631        SelectStore(Node, 2, AArch64::ST2Twov2s);
3632        return;
3633      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3634        SelectStore(Node, 2, AArch64::ST2Twov4s);
3635        return;
3636      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3637        SelectStore(Node, 2, AArch64::ST2Twov2d);
3638        return;
3639      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3640        SelectStore(Node, 2, AArch64::ST1Twov1d);
3641        return;
3642      }
3643      break;
3644    }
3645    case Intrinsic::aarch64_neon_st3: {
3646      if (VT == MVT::v8i8) {
3647        SelectStore(Node, 3, AArch64::ST3Threev8b);
3648        return;
3649      } else if (VT == MVT::v16i8) {
3650        SelectStore(Node, 3, AArch64::ST3Threev16b);
3651        return;
3652      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3653        SelectStore(Node, 3, AArch64::ST3Threev4h);
3654        return;
3655      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3656        SelectStore(Node, 3, AArch64::ST3Threev8h);
3657        return;
3658      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3659        SelectStore(Node, 3, AArch64::ST3Threev2s);
3660        return;
3661      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3662        SelectStore(Node, 3, AArch64::ST3Threev4s);
3663        return;
3664      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3665        SelectStore(Node, 3, AArch64::ST3Threev2d);
3666        return;
3667      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3668        SelectStore(Node, 3, AArch64::ST1Threev1d);
3669        return;
3670      }
3671      break;
3672    }
3673    case Intrinsic::aarch64_neon_st4: {
3674      if (VT == MVT::v8i8) {
3675        SelectStore(Node, 4, AArch64::ST4Fourv8b);
3676        return;
3677      } else if (VT == MVT::v16i8) {
3678        SelectStore(Node, 4, AArch64::ST4Fourv16b);
3679        return;
3680      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3681        SelectStore(Node, 4, AArch64::ST4Fourv4h);
3682        return;
3683      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3684        SelectStore(Node, 4, AArch64::ST4Fourv8h);
3685        return;
3686      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3687        SelectStore(Node, 4, AArch64::ST4Fourv2s);
3688        return;
3689      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3690        SelectStore(Node, 4, AArch64::ST4Fourv4s);
3691        return;
3692      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3693        SelectStore(Node, 4, AArch64::ST4Fourv2d);
3694        return;
3695      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3696        SelectStore(Node, 4, AArch64::ST1Fourv1d);
3697        return;
3698      }
3699      break;
3700    }
3701    case Intrinsic::aarch64_neon_st2lane: {
3702      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3703        SelectStoreLane(Node, 2, AArch64::ST2i8);
3704        return;
3705      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3706                 VT == MVT::v8f16) {
3707        SelectStoreLane(Node, 2, AArch64::ST2i16);
3708        return;
3709      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3710                 VT == MVT::v2f32) {
3711        SelectStoreLane(Node, 2, AArch64::ST2i32);
3712        return;
3713      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3714                 VT == MVT::v1f64) {
3715        SelectStoreLane(Node, 2, AArch64::ST2i64);
3716        return;
3717      }
3718      break;
3719    }
3720    case Intrinsic::aarch64_neon_st3lane: {
3721      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3722        SelectStoreLane(Node, 3, AArch64::ST3i8);
3723        return;
3724      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3725                 VT == MVT::v8f16) {
3726        SelectStoreLane(Node, 3, AArch64::ST3i16);
3727        return;
3728      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3729                 VT == MVT::v2f32) {
3730        SelectStoreLane(Node, 3, AArch64::ST3i32);
3731        return;
3732      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3733                 VT == MVT::v1f64) {
3734        SelectStoreLane(Node, 3, AArch64::ST3i64);
3735        return;
3736      }
3737      break;
3738    }
3739    case Intrinsic::aarch64_neon_st4lane: {
3740      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3741        SelectStoreLane(Node, 4, AArch64::ST4i8);
3742        return;
3743      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3744                 VT == MVT::v8f16) {
3745        SelectStoreLane(Node, 4, AArch64::ST4i16);
3746        return;
3747      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3748                 VT == MVT::v2f32) {
3749        SelectStoreLane(Node, 4, AArch64::ST4i32);
3750        return;
3751      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3752                 VT == MVT::v1f64) {
3753        SelectStoreLane(Node, 4, AArch64::ST4i64);
3754        return;
3755      }
3756      break;
3757    }
3758    }
3759    break;
3760  }
3761  case AArch64ISD::LD2post: {
3762    if (VT == MVT::v8i8) {
3763      SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
3764      return;
3765    } else if (VT == MVT::v16i8) {
3766      SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
3767      return;
3768    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3769      SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
3770      return;
3771    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3772      SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
3773      return;
3774    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3775      SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
3776      return;
3777    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3778      SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
3779      return;
3780    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3781      SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
3782      return;
3783    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3784      SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
3785      return;
3786    }
3787    break;
3788  }
3789  case AArch64ISD::LD3post: {
3790    if (VT == MVT::v8i8) {
3791      SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
3792      return;
3793    } else if (VT == MVT::v16i8) {
3794      SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
3795      return;
3796    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3797      SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
3798      return;
3799    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3800      SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
3801      return;
3802    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3803      SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
3804      return;
3805    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3806      SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
3807      return;
3808    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3809      SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
3810      return;
3811    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3812      SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
3813      return;
3814    }
3815    break;
3816  }
3817  case AArch64ISD::LD4post: {
3818    if (VT == MVT::v8i8) {
3819      SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
3820      return;
3821    } else if (VT == MVT::v16i8) {
3822      SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
3823      return;
3824    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3825      SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
3826      return;
3827    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3828      SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
3829      return;
3830    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3831      SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
3832      return;
3833    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3834      SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
3835      return;
3836    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3837      SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
3838      return;
3839    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3840      SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
3841      return;
3842    }
3843    break;
3844  }
3845  case AArch64ISD::LD1x2post: {
3846    if (VT == MVT::v8i8) {
3847      SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
3848      return;
3849    } else if (VT == MVT::v16i8) {
3850      SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
3851      return;
3852    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3853      SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
3854      return;
3855    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3856      SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
3857      return;
3858    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3859      SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
3860      return;
3861    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3862      SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
3863      return;
3864    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3865      SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
3866      return;
3867    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3868      SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
3869      return;
3870    }
3871    break;
3872  }
3873  case AArch64ISD::LD1x3post: {
3874    if (VT == MVT::v8i8) {
3875      SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
3876      return;
3877    } else if (VT == MVT::v16i8) {
3878      SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
3879      return;
3880    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3881      SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
3882      return;
3883    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3884      SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
3885      return;
3886    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3887      SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
3888      return;
3889    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3890      SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
3891      return;
3892    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3893      SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
3894      return;
3895    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3896      SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
3897      return;
3898    }
3899    break;
3900  }
3901  case AArch64ISD::LD1x4post: {
3902    if (VT == MVT::v8i8) {
3903      SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
3904      return;
3905    } else if (VT == MVT::v16i8) {
3906      SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
3907      return;
3908    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3909      SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
3910      return;
3911    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3912      SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
3913      return;
3914    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3915      SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
3916      return;
3917    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3918      SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
3919      return;
3920    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3921      SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
3922      return;
3923    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3924      SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
3925      return;
3926    }
3927    break;
3928  }
3929  case AArch64ISD::LD1DUPpost: {
3930    if (VT == MVT::v8i8) {
3931      SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
3932      return;
3933    } else if (VT == MVT::v16i8) {
3934      SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
3935      return;
3936    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3937      SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
3938      return;
3939    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3940      SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
3941      return;
3942    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3943      SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
3944      return;
3945    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3946      SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
3947      return;
3948    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3949      SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
3950      return;
3951    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3952      SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
3953      return;
3954    }
3955    break;
3956  }
3957  case AArch64ISD::LD2DUPpost: {
3958    if (VT == MVT::v8i8) {
3959      SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
3960      return;
3961    } else if (VT == MVT::v16i8) {
3962      SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
3963      return;
3964    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3965      SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
3966      return;
3967    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3968      SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
3969      return;
3970    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3971      SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
3972      return;
3973    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3974      SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
3975      return;
3976    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3977      SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
3978      return;
3979    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3980      SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
3981      return;
3982    }
3983    break;
3984  }
3985  case AArch64ISD::LD3DUPpost: {
3986    if (VT == MVT::v8i8) {
3987      SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
3988      return;
3989    } else if (VT == MVT::v16i8) {
3990      SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
3991      return;
3992    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3993      SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
3994      return;
3995    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3996      SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
3997      return;
3998    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3999      SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
4000      return;
4001    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4002      SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
4003      return;
4004    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4005      SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
4006      return;
4007    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4008      SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
4009      return;
4010    }
4011    break;
4012  }
4013  case AArch64ISD::LD4DUPpost: {
4014    if (VT == MVT::v8i8) {
4015      SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
4016      return;
4017    } else if (VT == MVT::v16i8) {
4018      SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
4019      return;
4020    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4021      SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
4022      return;
4023    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4024      SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
4025      return;
4026    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4027      SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
4028      return;
4029    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4030      SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
4031      return;
4032    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4033      SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
4034      return;
4035    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4036      SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
4037      return;
4038    }
4039    break;
4040  }
4041  case AArch64ISD::LD1LANEpost: {
4042    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4043      SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
4044      return;
4045    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4046               VT == MVT::v8f16) {
4047      SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
4048      return;
4049    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4050               VT == MVT::v2f32) {
4051      SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
4052      return;
4053    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4054               VT == MVT::v1f64) {
4055      SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
4056      return;
4057    }
4058    break;
4059  }
4060  case AArch64ISD::LD2LANEpost: {
4061    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4062      SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
4063      return;
4064    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4065               VT == MVT::v8f16) {
4066      SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
4067      return;
4068    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4069               VT == MVT::v2f32) {
4070      SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
4071      return;
4072    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4073               VT == MVT::v1f64) {
4074      SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
4075      return;
4076    }
4077    break;
4078  }
4079  case AArch64ISD::LD3LANEpost: {
4080    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4081      SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
4082      return;
4083    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4084               VT == MVT::v8f16) {
4085      SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
4086      return;
4087    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4088               VT == MVT::v2f32) {
4089      SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
4090      return;
4091    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4092               VT == MVT::v1f64) {
4093      SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
4094      return;
4095    }
4096    break;
4097  }
4098  case AArch64ISD::LD4LANEpost: {
4099    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4100      SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
4101      return;
4102    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4103               VT == MVT::v8f16) {
4104      SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
4105      return;
4106    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4107               VT == MVT::v2f32) {
4108      SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
4109      return;
4110    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4111               VT == MVT::v1f64) {
4112      SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
4113      return;
4114    }
4115    break;
4116  }
4117  case AArch64ISD::ST2post: {
4118    VT = Node->getOperand(1).getValueType();
4119    if (VT == MVT::v8i8) {
4120      SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
4121      return;
4122    } else if (VT == MVT::v16i8) {
4123      SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
4124      return;
4125    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4126      SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
4127      return;
4128    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4129      SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
4130      return;
4131    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4132      SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
4133      return;
4134    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4135      SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
4136      return;
4137    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4138      SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
4139      return;
4140    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4141      SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
4142      return;
4143    }
4144    break;
4145  }
4146  case AArch64ISD::ST3post: {
4147    VT = Node->getOperand(1).getValueType();
4148    if (VT == MVT::v8i8) {
4149      SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
4150      return;
4151    } else if (VT == MVT::v16i8) {
4152      SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
4153      return;
4154    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4155      SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
4156      return;
4157    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4158      SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
4159      return;
4160    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4161      SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
4162      return;
4163    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4164      SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
4165      return;
4166    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4167      SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
4168      return;
4169    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4170      SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
4171      return;
4172    }
4173    break;
4174  }
4175  case AArch64ISD::ST4post: {
4176    VT = Node->getOperand(1).getValueType();
4177    if (VT == MVT::v8i8) {
4178      SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
4179      return;
4180    } else if (VT == MVT::v16i8) {
4181      SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
4182      return;
4183    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4184      SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
4185      return;
4186    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4187      SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
4188      return;
4189    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4190      SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
4191      return;
4192    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4193      SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
4194      return;
4195    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4196      SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
4197      return;
4198    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4199      SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
4200      return;
4201    }
4202    break;
4203  }
4204  case AArch64ISD::ST1x2post: {
4205    VT = Node->getOperand(1).getValueType();
4206    if (VT == MVT::v8i8) {
4207      SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
4208      return;
4209    } else if (VT == MVT::v16i8) {
4210      SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
4211      return;
4212    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4213      SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
4214      return;
4215    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4216      SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
4217      return;
4218    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4219      SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
4220      return;
4221    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4222      SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
4223      return;
4224    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4225      SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
4226      return;
4227    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4228      SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
4229      return;
4230    }
4231    break;
4232  }
4233  case AArch64ISD::ST1x3post: {
4234    VT = Node->getOperand(1).getValueType();
4235    if (VT == MVT::v8i8) {
4236      SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
4237      return;
4238    } else if (VT == MVT::v16i8) {
4239      SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
4240      return;
4241    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4242      SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
4243      return;
4244    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4245      SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
4246      return;
4247    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4248      SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
4249      return;
4250    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4251      SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
4252      return;
4253    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4254      SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
4255      return;
4256    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4257      SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
4258      return;
4259    }
4260    break;
4261  }
4262  case AArch64ISD::ST1x4post: {
4263    VT = Node->getOperand(1).getValueType();
4264    if (VT == MVT::v8i8) {
4265      SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
4266      return;
4267    } else if (VT == MVT::v16i8) {
4268      SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
4269      return;
4270    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4271      SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
4272      return;
4273    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4274      SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
4275      return;
4276    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4277      SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
4278      return;
4279    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4280      SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
4281      return;
4282    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4283      SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
4284      return;
4285    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4286      SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
4287      return;
4288    }
4289    break;
4290  }
4291  case AArch64ISD::ST2LANEpost: {
4292    VT = Node->getOperand(1).getValueType();
4293    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4294      SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
4295      return;
4296    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4297               VT == MVT::v8f16) {
4298      SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
4299      return;
4300    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4301               VT == MVT::v2f32) {
4302      SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
4303      return;
4304    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4305               VT == MVT::v1f64) {
4306      SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
4307      return;
4308    }
4309    break;
4310  }
4311  case AArch64ISD::ST3LANEpost: {
4312    VT = Node->getOperand(1).getValueType();
4313    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4314      SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
4315      return;
4316    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4317               VT == MVT::v8f16) {
4318      SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
4319      return;
4320    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4321               VT == MVT::v2f32) {
4322      SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
4323      return;
4324    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4325               VT == MVT::v1f64) {
4326      SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
4327      return;
4328    }
4329    break;
4330  }
4331  case AArch64ISD::ST4LANEpost: {
4332    VT = Node->getOperand(1).getValueType();
4333    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4334      SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
4335      return;
4336    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4337               VT == MVT::v8f16) {
4338      SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
4339      return;
4340    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4341               VT == MVT::v2f32) {
4342      SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
4343      return;
4344    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4345               VT == MVT::v1f64) {
4346      SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
4347      return;
4348    }
4349    break;
4350  }
4351  }
4352
4353  // Select the default instruction
4354  SelectCode(Node);
4355}
4356
4357/// createAArch64ISelDag - This pass converts a legalized DAG into a
4358/// AArch64-specific DAG, ready for instruction scheduling.
4359FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
4360                                         CodeGenOpt::Level OptLevel) {
4361  return new AArch64DAGToDAGISel(TM, OptLevel);
4362}
4363