//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                         cl::desc("Enable AArch64 logical imm instruction "
                                  "optimization"),
                         cl::init(true));

// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in the future once both implementations are based on MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                cl::desc("Combine extends of AArch64 masked "
                                         "gather intrinsics"),
                                cl::init(true));

// All of XOR, OR and CMP use ALU ports, and the data dependency becomes the
// bottleneck after this transform on high-end CPUs. This limit on the number
// of leaf nodes guards that the cmp+ccmp transform remains profitable.
static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                 cl::desc("Maximum of xors"));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;

static inline EVT getPackedSVEVectorVT(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    return MVT::nxv16i8;
  case MVT::i16:
    return MVT::nxv8i16;
  case MVT::i32:
    return MVT::nxv4i32;
  case MVT::i64:
    return MVT::nxv2i64;
  case MVT::f16:
    return MVT::nxv8f16;
  case MVT::f32:
    return MVT::nxv4f32;
  case MVT::f64:
    return MVT::nxv2f64;
  case MVT::bf16:
    return MVT::nxv8bf16;
  }
}

// NOTE: Currently there's only a need to return integer vector types. If this
// changes then just add an extra "type" parameter.
static inline EVT getPackedSVEVectorVT(ElementCount EC) {
  switch (EC.getKnownMinValue()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 16:
    return MVT::nxv16i8;
  case 8:
    return MVT::nxv8i16;
  case 4:
    return MVT::nxv4i32;
  case 2:
    return MVT::nxv2i64;
  }
}

static inline EVT getPromotedVTForPredicate(EVT VT) {
  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
         "Expected scalable predicate vector type!");
  switch (VT.getVectorMinNumElements()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 2:
    return MVT::nxv2i64;
  case 4:
    return MVT::nxv4i32;
  case 8:
    return MVT::nxv8i16;
  case 16:
    return MVT::nxv16i8;
  }
}

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
}

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::REVH_MERGE_PASSTHRU:
  case AArch64ISD::REVW_MERGE_PASSTHRU:
  case AArch64ISD::REVD_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

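// For illustration only, a unary node of this kind has the shape
// OPC_MERGE_PASSTHRU(Pg, Src, Passthru): active lanes take the operation's
// result and inactive lanes take the passthru value.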
// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    // We guarantee i1 splat_vectors to zero the other lanes by
    // implementing it with ptrue and possibly a punpklo for nxv1i1.
    if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
      return true;
    return false;
  case AArch64ISD::PTRUE:
  case AArch64ISD::SETCC_MERGE_ZERO:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    switch (Op.getConstantOperandVal(0)) {
    default:
      return false;
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
    case Intrinsic::aarch64_sve_facgt:
    case Intrinsic::aarch64_sve_facge:
    case Intrinsic::aarch64_sve_whilege:
    case Intrinsic::aarch64_sve_whilegt:
    case Intrinsic::aarch64_sve_whilehi:
    case Intrinsic::aarch64_sve_whilehs:
    case Intrinsic::aarch64_sve_whilele:
    case Intrinsic::aarch64_sve_whilelo:
    case Intrinsic::aarch64_sve_whilels:
    case Intrinsic::aarch64_sve_whilelt:
    case Intrinsic::aarch64_sve_match:
    case Intrinsic::aarch64_sve_nmatch:
    case Intrinsic::aarch64_sve_whilege_x2:
    case Intrinsic::aarch64_sve_whilegt_x2:
    case Intrinsic::aarch64_sve_whilehi_x2:
    case Intrinsic::aarch64_sve_whilehs_x2:
    case Intrinsic::aarch64_sve_whilele_x2:
    case Intrinsic::aarch64_sve_whilelo_x2:
    case Intrinsic::aarch64_sve_whilels_x2:
    case Intrinsic::aarch64_sve_whilelt_x2:
      return true;
    }
  }
}

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);
    if (Subtarget->hasBF16())
      addDRTypeForNEON(MVT::v4bf16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
    if (Subtarget->hasBF16())
      addQRTypeForNEON(MVT::v8bf16);
  }

  if (Subtarget->hasSVEorSME()) {
    // Add legal sve predicate types
    addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal sve data types
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    if (Subtarget->hasBF16()) {
      addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
    }

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }
  }

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f16, Custom);
  setOperationAction(ISD::SELECT, MVT::bf16, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
  setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operations on f128 are legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
  // aren't handled.

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  if (Subtarget->hasCSSC()) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
    setOperationAction(ISD::CTPOP, MVT::i128, Expand);

    setOperationAction(ISD::PARITY, MVT::i128, Expand);

    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
    setOperationAction(ISD::CTTZ, MVT::i128, Expand);

    setOperationAction(ISD::ABS, MVT::i32, Legal);
    setOperationAction(ISD::ABS, MVT::i64, Legal);

    setOperationAction(ISD::SMAX, MVT::i32, Legal);
    setOperationAction(ISD::SMAX, MVT::i64, Legal);
    setOperationAction(ISD::UMAX, MVT::i32, Legal);
    setOperationAction(ISD::UMAX, MVT::i64, Legal);

    setOperationAction(ISD::SMIN, MVT::i32, Legal);
    setOperationAction(ISD::SMIN, MVT::i64, Legal);
    setOperationAction(ISD::UMIN, MVT::i32, Legal);
    setOperationAction(ISD::UMIN, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::i128, Custom);

    setOperationAction(ISD::PARITY, MVT::i64, Custom);
    setOperationAction(ISD::PARITY, MVT::i128, Custom);

    setOperationAction(ISD::ABS, MVT::i32, Custom);
    setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
  else
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);

  for (auto Op : {ISD::FREM,        ISD::FPOW,         ISD::FPOWI,
                  ISD::FCOS,        ISD::FSIN,         ISD::FSINCOS,
                  ISD::FEXP,        ISD::FEXP2,        ISD::FLOG,
                  ISD::FLOG2,       ISD::FLOG10,       ISD::STRICT_FREM,
                  ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
                  ISD::STRICT_FSIN, ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
                  ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::v4f16, Expand);
    setOperationAction(Op, MVT::v8f16, Expand);
  }

  if (!Subtarget->hasFullFP16()) {
    for (auto Op :
         {ISD::SETCC,          ISD::SELECT_CC,
          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
          ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::f16, Promote);

    // Round-to-integer operations need custom lowering for fp16, as Promote
    // doesn't work because the result type is integer.
    for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
                    ISD::STRICT_LLRINT})
      setOperationAction(Op, MVT::f16, Custom);

    // promote v4f16 to v4f32 when that is known to be safe.
    setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);

    setOperationAction(ISD::FABS,        MVT::v4f16, Expand);
    setOperationAction(ISD::FNEG,        MVT::v4f16, Expand);
    setOperationAction(ISD::FROUND,      MVT::v4f16, Expand);
    setOperationAction(ISD::FROUNDEVEN,  MVT::v4f16, Expand);
    setOperationAction(ISD::FMA,         MVT::v4f16, Expand);
    setOperationAction(ISD::SETCC,       MVT::v4f16, Expand);
    setOperationAction(ISD::BR_CC,       MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT,      MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT_CC,   MVT::v4f16, Expand);
    setOperationAction(ISD::FTRUNC,      MVT::v4f16, Expand);
    setOperationAction(ISD::FCOPYSIGN,   MVT::v4f16, Expand);
    setOperationAction(ISD::FFLOOR,      MVT::v4f16, Expand);
    setOperationAction(ISD::FCEIL,       MVT::v4f16, Expand);
    setOperationAction(ISD::FRINT,       MVT::v4f16, Expand);
    setOperationAction(ISD::FNEARBYINT,  MVT::v4f16, Expand);
    setOperationAction(ISD::FSQRT,       MVT::v4f16, Expand);

    setOperationAction(ISD::FABS,        MVT::v8f16, Expand);
    setOperationAction(ISD::FADD,        MVT::v8f16, Expand);
    setOperationAction(ISD::FCEIL,       MVT::v8f16, Expand);
    setOperationAction(ISD::FCOPYSIGN,   MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV,        MVT::v8f16, Expand);
    setOperationAction(ISD::FFLOOR,      MVT::v8f16, Expand);
    setOperationAction(ISD::FMA,         MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL,        MVT::v8f16, Expand);
    setOperationAction(ISD::FNEARBYINT,  MVT::v8f16, Expand);
    setOperationAction(ISD::FNEG,        MVT::v8f16, Expand);
    setOperationAction(ISD::FROUND,      MVT::v8f16, Expand);
    setOperationAction(ISD::FROUNDEVEN,  MVT::v8f16, Expand);
    setOperationAction(ISD::FRINT,       MVT::v8f16, Expand);
    setOperationAction(ISD::FSQRT,       MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB,        MVT::v8f16, Expand);
    setOperationAction(ISD::FTRUNC,      MVT::v8f16, Expand);
    setOperationAction(ISD::SETCC,       MVT::v8f16, Expand);
    setOperationAction(ISD::BR_CC,       MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT,      MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT_CC,   MVT::v8f16, Expand);
    setOperationAction(ISD::FP_EXTEND,   MVT::v8f16, Expand);
  }

  // AArch64 has implementations of a lot of rounding-like FP operations.
  for (auto Op :
       {ISD::FFLOOR,          ISD::FNEARBYINT,      ISD::FCEIL,
        ISD::FRINT,           ISD::FTRUNC,          ISD::FROUND,
        ISD::FROUNDEVEN,      ISD::FMINNUM,         ISD::FMAXNUM,
        ISD::FMINIMUM,        ISD::FMAXIMUM,        ISD::LROUND,
        ISD::LLROUND,         ISD::LRINT,           ISD::LLRINT,
        ISD::STRICT_FFLOOR,   ISD::STRICT_FCEIL,    ISD::STRICT_FNEARBYINT,
        ISD::STRICT_FRINT,    ISD::STRICT_FTRUNC,   ISD::STRICT_FROUNDEVEN,
        ISD::STRICT_FROUND,   ISD::STRICT_FMINNUM,  ISD::STRICT_FMAXNUM,
        ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
        ISD::STRICT_LLROUND,  ISD::STRICT_LRINT,    ISD::STRICT_LLRINT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Basic strict FP operations are legal
  for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                  ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Strict conversion to a larger type is legal
  for (auto VT : {MVT::f32, MVT::f64})
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // subtarget
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
#define LCALLNAMES(A, B, N)                                                    \
  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
#define LCALLNAME4(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
#define LCALLNAME5(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2)                                                          \
  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
#undef LCALLNAMES
#undef LCALLNAME4
#undef LCALLNAME5
  }

  // 128-bit loads and stores can be done without expanding
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // Aligned 128-bit loads and stores are single-copy atomic according to the
  // v8.4a spec.
  if (Subtarget->hasLSE2()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
  }

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

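    // As an illustration, LCALLNAMES(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas, 4)
    // expands to four setLibcallName calls such as
    //   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_ACQ, "__aarch64_cas4_acq");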
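  // (Each STNP writes a pair of 128-bit registers, so one STNP covers a 256-bit
  // value once it has been split into two q-register halves.)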
  // 256 bit non-temporal loads can be lowered to LDNP. This is done using
  // custom lowering, as there are no un-paired non-temporal loads and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
  setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
                       ISD::UINT_TO_FP});

  setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
                       ISD::FP_TO_UINT_SAT, ISD::FDIV});

  // Try and combine setcc with csel
  setTargetDAGCombine(ISD::SETCC);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
                       ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
                       ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
                       ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::LOAD);

  setTargetDAGCombine(ISD::MSTORE);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});

  setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
                       ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
                       ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});

  setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});

  setTargetDAGCombine(ISD::FP_EXTEND);

  setTargetDAGCombine(ISD::GlobalAddress);

  setTargetDAGCombine(ISD::CTLZ);

  // In case of strict alignment, avoid an excessive number of byte wide stores.
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemset =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;

  MaxGluedStoresPerMemcpy = 4;
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemcpy =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;

  MaxStoresPerMemmoveOptSize = 4;
  MaxStoresPerMemmove = 4;

  MaxLoadsPerMemcmpOptSize = 4;
  MaxLoadsPerMemcmp =
      Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  EnableExtLdPromotion = true;

  // Set required alignment.
  setMinFunctionAlignment(Align(4));
  // Set preferred alignments.
  setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
  setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
  setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));

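  // (The subtarget reports log2 values here, so e.g. a value of 4 yields a
  // 16-byte alignment.)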
  // Only change the limit for entries in a jump table if it is specified by
  // the subtarget, but not at the command line.
  unsigned MaxJT = STI.getMaximumJumpTableSize();
  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
    setMaximumJumpTableSize(MaxJT);

  setHasExtractBitsInsn(true);

  setMaxDivRemBitWidthSupported(128);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    for (auto Op :
         {ISD::SELECT,         ISD::SELECT_CC,      ISD::SETCC,
          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
          ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::v1f64, Expand);

    for (auto Op :
         {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
          ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
          ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
          ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
      setOperationAction(Op, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector ->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);

    // Similarly, there is no direct i32 -> f64 vector conversion instruction,
    // nor a direct i32 -> f16 vector conversion.  Set it to Custom, so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
    for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
                    ISD::STRICT_UINT_TO_FP})
      for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
        setOperationAction(Op, VT, Custom);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

      setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    } else {
      // when AArch64 doesn't have fullfp16 support, promote the input
      // to i32 first.
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
    }

    setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
    for (auto VT : {MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
    }

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    // Saturates
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }

    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                   MVT::v4i32}) {
      setOperationAction(ISD::AVGFLOORS, VT, Legal);
      setOperationAction(ISD::AVGFLOORU, VT, Legal);
      setOperationAction(ISD::AVGCEILS, VT, Legal);
      setOperationAction(ISD::AVGCEILU, VT, Legal);
      setOperationAction(ISD::ABDS, VT, Legal);
      setOperationAction(ISD::ABDU, VT, Legal);
    }

    // Vector reductions
    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
        setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
        setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);

        setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
      }
    }
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
    }
    setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
        setOperationAction(ISD::MULHS, VT, Legal);
        setOperationAction(ISD::MULHU, VT, Legal);
      } else {
        setOperationAction(ISD::MULHS, VT, Expand);
        setOperationAction(ISD::MULHU, VT, Expand);
      }
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::CTTZ, VT, Expand);

      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (auto Op :
         {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
          ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
          ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
          ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
      for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
        setOperationAction(Op, Ty, Legal);
      if (Subtarget->hasFullFP16())
        for (MVT Ty : {MVT::v4f16, MVT::v8f16})
          setOperationAction(Op, Ty, Legal);
    }

    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);

    setLoadExtAction(ISD::EXTLOAD,  MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::EXTLOAD,  MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);

    // ADDP custom lowering
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ADD, VT, Custom);
    // FADDP custom lowering
    for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
      setOperationAction(ISD::FADD, VT, Custom);
  }

  if (Subtarget->hasSME()) {
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  }

  // FIXME: Move lowering for more nodes here if those are common between
  // SVE and SME.
  if (Subtarget->hasSVEorSME()) {
    for (auto VT :
         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
  }

  if (Subtarget->hasSVE()) {
    for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
      setOperationAction(ISD::BITREVERSE, VT, Custom);
      setOperationAction(ISD::BSWAP, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      setOperationAction(ISD::SINT_TO_FP, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::MULHS, VT, Custom);
      setOperationAction(ISD::MULHU, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::ABS, VT, Custom);
      setOperationAction(ISD::ABDS, VT, Custom);
      setOperationAction(ISD::ABDU, VT, Custom);
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);

      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);

      if (Subtarget->hasSVE2()) {
        setOperationAction(ISD::AVGFLOORS, VT, Custom);
        setOperationAction(ISD::AVGFLOORU, VT, Custom);
        setOperationAction(ISD::AVGCEILS, VT, Custom);
        setOperationAction(ISD::AVGCEILU, VT, Custom);
      }
    }

    // Illegal unpacked integer vector types.
    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
    }

    // Legalize unpacked bitcasts to REINTERPRET_CAST.
    for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
                    MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1289      setOperationAction(ISD::BITCAST, VT, Custom);
1290
1291    for (auto VT :
1292         { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1293           MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1294      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
1295
1296    for (auto VT :
1297         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1298      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1299      setOperationAction(ISD::SELECT, VT, Custom);
1300      setOperationAction(ISD::SETCC, VT, Custom);
1301      setOperationAction(ISD::TRUNCATE, VT, Custom);
1302      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1303      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1304      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1305
1306      setOperationAction(ISD::SELECT_CC, VT, Expand);
1307      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1308      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1309
1310      // There are no legal MVT::nxv16f## based types.
1311      if (VT != MVT::nxv16i1) {
1312        setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1313        setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1314      }
1315    }
1316
1317    // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1318    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1319                    MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1320                    MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1321      setOperationAction(ISD::MLOAD, VT, Custom);
1322      setOperationAction(ISD::MSTORE, VT, Custom);
1323      setOperationAction(ISD::MGATHER, VT, Custom);
1324      setOperationAction(ISD::MSCATTER, VT, Custom);
1325    }
1326
    // First, mark all scalable vector extending loads and truncating stores as
    // Expand, covering both integer and floating-point scalable vectors.
1329    for (MVT VT : MVT::scalable_vector_valuetypes()) {
1330      for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1331        setTruncStoreAction(VT, InnerVT, Expand);
1332        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1333        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1334        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1335      }
1336    }
1337
1338    // Then, selectively enable those which we directly support.
1339    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1340    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1341    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1342    setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1343    setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1344    setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1345    for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1346      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1347      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1348      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1349      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1350      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1351      setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1352    }
1353
    // SVE supports truncating stores of 64- and 128-bit vectors.
1355    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1356    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1357    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1358    setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1359    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1360
1361    for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1362                    MVT::nxv4f32, MVT::nxv2f64}) {
1363      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1364      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1365      setOperationAction(ISD::MGATHER, VT, Custom);
1366      setOperationAction(ISD::MSCATTER, VT, Custom);
1367      setOperationAction(ISD::MLOAD, VT, Custom);
1368      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1369      setOperationAction(ISD::SELECT, VT, Custom);
1370      setOperationAction(ISD::FADD, VT, Custom);
1371      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1372      setOperationAction(ISD::FDIV, VT, Custom);
1373      setOperationAction(ISD::FMA, VT, Custom);
1374      setOperationAction(ISD::FMAXIMUM, VT, Custom);
1375      setOperationAction(ISD::FMAXNUM, VT, Custom);
1376      setOperationAction(ISD::FMINIMUM, VT, Custom);
1377      setOperationAction(ISD::FMINNUM, VT, Custom);
1378      setOperationAction(ISD::FMUL, VT, Custom);
1379      setOperationAction(ISD::FNEG, VT, Custom);
1380      setOperationAction(ISD::FSUB, VT, Custom);
1381      setOperationAction(ISD::FCEIL, VT, Custom);
1382      setOperationAction(ISD::FFLOOR, VT, Custom);
1383      setOperationAction(ISD::FNEARBYINT, VT, Custom);
1384      setOperationAction(ISD::FRINT, VT, Custom);
1385      setOperationAction(ISD::FROUND, VT, Custom);
1386      setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1387      setOperationAction(ISD::FTRUNC, VT, Custom);
1388      setOperationAction(ISD::FSQRT, VT, Custom);
1389      setOperationAction(ISD::FABS, VT, Custom);
1390      setOperationAction(ISD::FP_EXTEND, VT, Custom);
1391      setOperationAction(ISD::FP_ROUND, VT, Custom);
1392      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1393      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1394      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1395      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1396      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1397
1398      setOperationAction(ISD::SELECT_CC, VT, Expand);
1399      setOperationAction(ISD::FREM, VT, Expand);
1400      setOperationAction(ISD::FPOW, VT, Expand);
1401      setOperationAction(ISD::FPOWI, VT, Expand);
1402      setOperationAction(ISD::FCOS, VT, Expand);
1403      setOperationAction(ISD::FSIN, VT, Expand);
1404      setOperationAction(ISD::FSINCOS, VT, Expand);
1405      setOperationAction(ISD::FEXP, VT, Expand);
1406      setOperationAction(ISD::FEXP2, VT, Expand);
1407      setOperationAction(ISD::FLOG, VT, Expand);
1408      setOperationAction(ISD::FLOG2, VT, Expand);
1409      setOperationAction(ISD::FLOG10, VT, Expand);
1410
1411      setCondCodeAction(ISD::SETO, VT, Expand);
1412      setCondCodeAction(ISD::SETOLT, VT, Expand);
1413      setCondCodeAction(ISD::SETLT, VT, Expand);
1414      setCondCodeAction(ISD::SETOLE, VT, Expand);
1415      setCondCodeAction(ISD::SETLE, VT, Expand);
1416      setCondCodeAction(ISD::SETULT, VT, Expand);
1417      setCondCodeAction(ISD::SETULE, VT, Expand);
1418      setCondCodeAction(ISD::SETUGE, VT, Expand);
1419      setCondCodeAction(ISD::SETUGT, VT, Expand);
1420      setCondCodeAction(ISD::SETUEQ, VT, Expand);
1421      setCondCodeAction(ISD::SETONE, VT, Expand);
1422    }
1423
1424    for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1425      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1426      setOperationAction(ISD::MGATHER, VT, Custom);
1427      setOperationAction(ISD::MSCATTER, VT, Custom);
1428      setOperationAction(ISD::MLOAD, VT, Custom);
1429      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1430      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1431    }
1432
1433    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1434    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1435
1436    // NEON doesn't support integer divides, but SVE does
1437    for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1438                    MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1439      setOperationAction(ISD::SDIV, VT, Custom);
1440      setOperationAction(ISD::UDIV, VT, Custom);
1441    }
1442
1443    // NEON doesn't support 64-bit vector integer muls, but SVE does.
1444    setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1445    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1446
1447    // NEON doesn't support across-vector reductions, but SVE does.
1448    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1449      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1450
1451    if (Subtarget->forceStreamingCompatibleSVE()) {
1452      setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom);
1453      setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom);
1454      setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom);
1455      setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom);
1456      setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom);
1457      setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom);
1458      setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom);
1459      setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom);
1460      setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
1461      for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1462                     MVT::v4i32, MVT::v1i64, MVT::v2i64})
1463        addTypeForStreamingSVE(VT);
1464
1465      for (MVT VT :
1466           {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1467        addTypeForStreamingSVE(VT);
1468    }
1469
1470    // NOTE: Currently this has to happen after computeRegisterProperties rather
1471    // than the preferred option of combining it with the addRegisterClass call.
1472    if (Subtarget->useSVEForFixedLengthVectors()) {
1473      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1474        if (useSVEForFixedLengthVectorVT(VT))
1475          addTypeForFixedLengthSVE(VT);
1476      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1477        if (useSVEForFixedLengthVectorVT(VT))
1478          addTypeForFixedLengthSVE(VT);
1479
      // 64-bit results can come from an input that is wider than NEON supports.
1481      for (auto VT : {MVT::v8i8, MVT::v4i16})
1482        setOperationAction(ISD::TRUNCATE, VT, Custom);
1483      setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1484
      // 128-bit results imply an input that is wider than NEON supports.
1486      for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1487        setOperationAction(ISD::TRUNCATE, VT, Custom);
1488      for (auto VT : {MVT::v8f16, MVT::v4f32})
1489        setOperationAction(ISD::FP_ROUND, VT, Custom);
1490
1491      // These operations are not supported on NEON but SVE can do them.
1492      setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1493      setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1494      setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1495      setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1496      setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1497      setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1498      setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1499      setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1500      setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1501      setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1502      setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1503      setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1504      setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1505      setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1506      setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1507      setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1508      setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1509      setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1510      setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1511      setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1512
1513      // Int operations with no NEON support.
1514      for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1515                      MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1516        setOperationAction(ISD::BITREVERSE, VT, Custom);
1517        setOperationAction(ISD::CTTZ, VT, Custom);
1518        setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1519        setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1520        setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1521      }
1522
1523
1524      // Use SVE for vectors with more than 2 elements.
1525      for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1526        setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1527    }
1528
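    // Splices of predicate vectors are promoted so that the operation is
    // performed on the equivalently shaped integer data vectors.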
1529    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1530    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1531    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1532    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1533
1534    setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1535  }
1536
1537  if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1538    // Only required for llvm.aarch64.mops.memset.tag
1539    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
1540  }
1541
1542  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1543
1544  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1545
1546  IsStrictFPEnabled = true;
1547}
1548
1549void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1550  assert(VT.isVector() && "VT should be a vector type");
1551
1552  if (VT.isFloatingPoint()) {
1553    MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1554    setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1555    setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1556  }
1557
1558  // Mark vector float intrinsics as expand.
1559  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1560    setOperationAction(ISD::FSIN, VT, Expand);
1561    setOperationAction(ISD::FCOS, VT, Expand);
1562    setOperationAction(ISD::FPOW, VT, Expand);
1563    setOperationAction(ISD::FLOG, VT, Expand);
1564    setOperationAction(ISD::FLOG2, VT, Expand);
1565    setOperationAction(ISD::FLOG10, VT, Expand);
1566    setOperationAction(ISD::FEXP, VT, Expand);
1567    setOperationAction(ISD::FEXP2, VT, Expand);
1568  }
1569
1570  // But we do support custom-lowering for FCOPYSIGN.
1571  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1572      ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1573    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1574
1575  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1576  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1577  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1578  setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1579  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1580  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1581  setOperationAction(ISD::SRA, VT, Custom);
1582  setOperationAction(ISD::SRL, VT, Custom);
1583  setOperationAction(ISD::SHL, VT, Custom);
1584  setOperationAction(ISD::OR, VT, Custom);
1585  setOperationAction(ISD::SETCC, VT, Custom);
1586  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
1587
1588  setOperationAction(ISD::SELECT, VT, Expand);
1589  setOperationAction(ISD::SELECT_CC, VT, Expand);
1590  setOperationAction(ISD::VSELECT, VT, Expand);
1591  for (MVT InnerVT : MVT::all_valuetypes())
1592    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1593
  // CNT only supports byte (B) element sizes, so CTPOP on wider elements is
  // custom-lowered to a byte CNT followed by UADDLP steps to widen the result.
1595  if (VT != MVT::v8i8 && VT != MVT::v16i8)
1596    setOperationAction(ISD::CTPOP, VT, Custom);
1597
1598  setOperationAction(ISD::UDIV, VT, Expand);
1599  setOperationAction(ISD::SDIV, VT, Expand);
1600  setOperationAction(ISD::UREM, VT, Expand);
1601  setOperationAction(ISD::SREM, VT, Expand);
1602  setOperationAction(ISD::FREM, VT, Expand);
1603
1604  for (unsigned Opcode :
1605       {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1606        ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1607    setOperationAction(Opcode, VT, Custom);
1608
1609  if (!VT.isFloatingPoint())
1610    setOperationAction(ISD::ABS, VT, Legal);
1611
1612  // [SU][MIN|MAX] are available for all NEON types apart from i64.
1613  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1614    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1615      setOperationAction(Opcode, VT, Legal);
1616
1617  // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1618  // NEON types.
1619  if (VT.isFloatingPoint() &&
1620      VT.getVectorElementType() != MVT::bf16 &&
1621      (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1622    for (unsigned Opcode :
1623         {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
1624          ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
1625          ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
1626          ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
1627          ISD::STRICT_FSQRT})
1628      setOperationAction(Opcode, VT, Legal);
1629
1630  // Strict fp extend and trunc are legal
1631  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1632    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
1633  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1634    setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
1635
1636  // FIXME: We could potentially make use of the vector comparison instructions
  // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1638  // complications:
1639  //  * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1640  //    so we would need to expand when the condition code doesn't match the
1641  //    kind of comparison.
1642  //  * Some kinds of comparison require more than one FCMXY instruction so
1643  //    would need to be expanded instead.
1644  //  * The lowering of the non-strict versions involves target-specific ISD
1645  //    nodes so we would likely need to add strict versions of all of them and
1646  //    handle them appropriately.
1647  setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
1648  setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
1649
1650  if (Subtarget->isLittleEndian()) {
1651    for (unsigned im = (unsigned)ISD::PRE_INC;
1652         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1653      setIndexedLoadAction(im, VT, Legal);
1654      setIndexedStoreAction(im, VT, Legal);
1655    }
1656  }
1657
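  // FEAT_D128 adds 128-bit system registers (accessed via MRRS/MSRR), so
  // 128-bit READ_REGISTER/WRITE_REGISTER nodes need custom lowering.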
1658  if (Subtarget->hasD128()) {
1659    setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom);
1660    setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom);
1661  }
1662}
1663
1664bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1665                                                          EVT OpVT) const {
1666  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1667  if (!Subtarget->hasSVE())
1668    return true;
1669
1670  // We can only support legal predicate result types. We can use the SVE
1671  // whilelo instruction for generating fixed-width predicates too.
1672  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1673      ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1674      ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1675    return true;
1676
1677  // The whilelo instruction only works with i32 or i64 scalar inputs.
1678  if (OpVT != MVT::i32 && OpVT != MVT::i64)
1679    return true;
1680
1681  return false;
1682}
1683
1684void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) {
1685  // By default set all operations to Expand,
1686  // then change to Legal/Custom if needed.
1687  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1688    setOperationAction(Op, VT, Expand);
1689
1690  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1691
1692  if (VT.isFloatingPoint()) {
1693    setCondCodeAction(ISD::SETO, VT, Expand);
1694    setCondCodeAction(ISD::SETOLT, VT, Expand);
1695    setCondCodeAction(ISD::SETOLE, VT, Expand);
1696    setCondCodeAction(ISD::SETULT, VT, Expand);
1697    setCondCodeAction(ISD::SETULE, VT, Expand);
1698    setCondCodeAction(ISD::SETUGE, VT, Expand);
1699    setCondCodeAction(ISD::SETUGT, VT, Expand);
1700    setCondCodeAction(ISD::SETUEQ, VT, Expand);
1701    setCondCodeAction(ISD::SETONE, VT, Expand);
1702  }
1703
1704  // STORE, LOAD, SCALAR_TO_VECTOR and BITCAST are natively supported,
1705  // so no need to Custom/Expand them.
1706  setOperationAction(ISD::STORE, VT, Legal);
1707  setOperationAction(ISD::LOAD, VT, Legal);
1708  setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
1709  setOperationAction(ISD::BITCAST, VT, Legal);
1710
1711  // Mark integer truncating stores/extending loads as having custom lowering
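  // (e.g. an i32-element VT gets custom actions for its i8- and i16-element
  // counterparts from the loop below).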
1712  if (VT.isInteger()) {
1713    MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1714    while (InnerVT != VT) {
1715      setTruncStoreAction(VT, InnerVT, Custom);
1716      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1717      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1718      InnerVT = InnerVT.changeVectorElementType(
1719          MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1720    }
1721  }
1722
1723  // Mark floating-point truncating stores/extending loads as having custom
1724  // lowering
1725  if (VT.isFloatingPoint()) {
1726    MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1727    while (InnerVT != VT) {
1728      setTruncStoreAction(VT, InnerVT, Custom);
1729      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1730      InnerVT = InnerVT.changeVectorElementType(
1731          MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
1732    }
1733  }
1734
1735  setOperationAction(ISD::ABS, VT, Custom);
1736  setOperationAction(ISD::ADD, VT, Custom);
1737  setOperationAction(ISD::AND, VT, Custom);
1738  setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1739  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1740  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1741  setOperationAction(ISD::CTLZ, VT, Custom);
1742  setOperationAction(ISD::CTPOP, VT, Custom);
1743  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1744  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1745  setOperationAction(ISD::FABS, VT, Custom);
1746  setOperationAction(ISD::FADD, VT, Custom);
1747  setOperationAction(ISD::FCEIL, VT, Custom);
1748  setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1749  setOperationAction(ISD::FDIV, VT, Custom);
1750  setOperationAction(ISD::FFLOOR, VT, Custom);
1751  setOperationAction(ISD::FMA, VT, Custom);
1752  setOperationAction(ISD::FMAXIMUM, VT, Custom);
1753  setOperationAction(ISD::FMAXNUM, VT, Custom);
1754  setOperationAction(ISD::FMINIMUM, VT, Custom);
1755  setOperationAction(ISD::FMINNUM, VT, Custom);
1756  setOperationAction(ISD::FMUL, VT, Custom);
1757  setOperationAction(ISD::FNEARBYINT, VT, Custom);
1758  setOperationAction(ISD::FNEG, VT, Custom);
1759  setOperationAction(ISD::FP_ROUND, VT, Custom);
1760  setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1761  setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1762  setOperationAction(ISD::FRINT, VT, Custom);
1763  setOperationAction(ISD::FROUND, VT, Custom);
1764  setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1765  setOperationAction(ISD::FSQRT, VT, Custom);
1766  setOperationAction(ISD::FSUB, VT, Custom);
1767  setOperationAction(ISD::FTRUNC, VT, Custom);
1768  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1769  setOperationAction(ISD::MLOAD, VT, Custom);
1770  setOperationAction(ISD::MSTORE, VT, Custom);
1771  setOperationAction(ISD::MUL, VT, Custom);
1772  setOperationAction(ISD::MULHS, VT, Custom);
1773  setOperationAction(ISD::MULHU, VT, Custom);
1774  setOperationAction(ISD::OR, VT, Custom);
1775  setOperationAction(ISD::SDIV, VT, Custom);
1776  setOperationAction(ISD::SETCC, VT, Custom);
1777  setOperationAction(ISD::SHL, VT, Custom);
1778  setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1779  setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1780  setOperationAction(ISD::SMAX, VT, Custom);
1781  setOperationAction(ISD::SMIN, VT, Custom);
1782  setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1783  setOperationAction(ISD::SRA, VT, Custom);
1784  setOperationAction(ISD::SRL, VT, Custom);
1785  setOperationAction(ISD::SUB, VT, Custom);
1786  setOperationAction(ISD::TRUNCATE, VT, Custom);
1787  setOperationAction(ISD::UDIV, VT, Custom);
1788  setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1789  setOperationAction(ISD::UMAX, VT, Custom);
1790  setOperationAction(ISD::UMIN, VT, Custom);
1791  setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1792  setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1793  setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1794  setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1795  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1796  setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1797  setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1798  setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1799  setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1800  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1801  setOperationAction(ISD::XOR, VT, Custom);
1802  setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1803}
1804
1805void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1806  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1807
1808  // By default everything must be expanded.
1809  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1810    setOperationAction(Op, VT, Expand);
1811
1812  if (VT.isFloatingPoint()) {
1813    setCondCodeAction(ISD::SETO, VT, Expand);
1814    setCondCodeAction(ISD::SETOLT, VT, Expand);
1815    setCondCodeAction(ISD::SETOLE, VT, Expand);
1816    setCondCodeAction(ISD::SETULT, VT, Expand);
1817    setCondCodeAction(ISD::SETULE, VT, Expand);
1818    setCondCodeAction(ISD::SETUGE, VT, Expand);
1819    setCondCodeAction(ISD::SETUGT, VT, Expand);
1820    setCondCodeAction(ISD::SETUEQ, VT, Expand);
1821    setCondCodeAction(ISD::SETONE, VT, Expand);
1822  }
1823
1824  // Mark integer truncating stores/extending loads as having custom lowering
1825  if (VT.isInteger()) {
1826    MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1827    while (InnerVT != VT) {
1828      setTruncStoreAction(VT, InnerVT, Custom);
1829      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1830      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1831      InnerVT = InnerVT.changeVectorElementType(
1832          MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1833    }
1834  }
1835
1836  // Mark floating-point truncating stores/extending loads as having custom
1837  // lowering
1838  if (VT.isFloatingPoint()) {
1839    MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1840    while (InnerVT != VT) {
1841      setTruncStoreAction(VT, InnerVT, Custom);
1842      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1843      InnerVT = InnerVT.changeVectorElementType(
1844          MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
1845    }
1846  }
1847
1848  // Lower fixed length vector operations to scalable equivalents.
1849  setOperationAction(ISD::ABS, VT, Custom);
1850  setOperationAction(ISD::ADD, VT, Custom);
1851  setOperationAction(ISD::AND, VT, Custom);
1852  setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1853  setOperationAction(ISD::BITCAST, VT, Custom);
1854  setOperationAction(ISD::BITREVERSE, VT, Custom);
1855  setOperationAction(ISD::BSWAP, VT, Custom);
1856  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1857  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1858  setOperationAction(ISD::CTLZ, VT, Custom);
1859  setOperationAction(ISD::CTPOP, VT, Custom);
1860  setOperationAction(ISD::CTTZ, VT, Custom);
1861  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1862  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1863  setOperationAction(ISD::FABS, VT, Custom);
1864  setOperationAction(ISD::FADD, VT, Custom);
1865  setOperationAction(ISD::FCEIL, VT, Custom);
1866  setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1867  setOperationAction(ISD::FDIV, VT, Custom);
1868  setOperationAction(ISD::FFLOOR, VT, Custom);
1869  setOperationAction(ISD::FMA, VT, Custom);
1870  setOperationAction(ISD::FMAXIMUM, VT, Custom);
1871  setOperationAction(ISD::FMAXNUM, VT, Custom);
1872  setOperationAction(ISD::FMINIMUM, VT, Custom);
1873  setOperationAction(ISD::FMINNUM, VT, Custom);
1874  setOperationAction(ISD::FMUL, VT, Custom);
1875  setOperationAction(ISD::FNEARBYINT, VT, Custom);
1876  setOperationAction(ISD::FNEG, VT, Custom);
1877  setOperationAction(ISD::FP_EXTEND, VT, Custom);
1878  setOperationAction(ISD::FP_ROUND, VT, Custom);
1879  setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1880  setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1881  setOperationAction(ISD::FRINT, VT, Custom);
1882  setOperationAction(ISD::FROUND, VT, Custom);
1883  setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1884  setOperationAction(ISD::FSQRT, VT, Custom);
1885  setOperationAction(ISD::FSUB, VT, Custom);
1886  setOperationAction(ISD::FTRUNC, VT, Custom);
1887  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1888  setOperationAction(ISD::LOAD, VT, Custom);
1889  setOperationAction(ISD::MGATHER, VT, Custom);
1890  setOperationAction(ISD::MLOAD, VT, Custom);
1891  setOperationAction(ISD::MSCATTER, VT, Custom);
1892  setOperationAction(ISD::MSTORE, VT, Custom);
1893  setOperationAction(ISD::MUL, VT, Custom);
1894  setOperationAction(ISD::MULHS, VT, Custom);
1895  setOperationAction(ISD::MULHU, VT, Custom);
1896  setOperationAction(ISD::OR, VT, Custom);
1897  setOperationAction(ISD::SDIV, VT, Custom);
1898  setOperationAction(ISD::SELECT, VT, Custom);
1899  setOperationAction(ISD::SETCC, VT, Custom);
1900  setOperationAction(ISD::SHL, VT, Custom);
1901  setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1902  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1903  setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1904  setOperationAction(ISD::SMAX, VT, Custom);
1905  setOperationAction(ISD::SMIN, VT, Custom);
1906  setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1907  setOperationAction(ISD::SRA, VT, Custom);
1908  setOperationAction(ISD::SRL, VT, Custom);
1909  setOperationAction(ISD::STORE, VT, Custom);
1910  setOperationAction(ISD::SUB, VT, Custom);
1911  setOperationAction(ISD::TRUNCATE, VT, Custom);
1912  setOperationAction(ISD::UDIV, VT, Custom);
1913  setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1914  setOperationAction(ISD::UMAX, VT, Custom);
1915  setOperationAction(ISD::UMIN, VT, Custom);
1916  setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1917  setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1918  setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1919  setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1920  setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1921  setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1922  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1923  setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1924  setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1925  setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1926  setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1927  setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1928  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1929  setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1930  setOperationAction(ISD::VSELECT, VT, Custom);
1931  setOperationAction(ISD::XOR, VT, Custom);
1932  setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1933}
1934
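// 64-bit NEON vector types are assigned to the D (FPR64) register class and
// 128-bit types to the Q (FPR128) register class; both then share the common
// addTypeForNEON configuration.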
1935void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1936  addRegisterClass(VT, &AArch64::FPR64RegClass);
1937  addTypeForNEON(VT);
1938}
1939
1940void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1941  addRegisterClass(VT, &AArch64::FPR128RegClass);
1942  addTypeForNEON(VT);
1943}
1944
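// Return the type produced by a SETCC: scalar comparisons yield i32, scalable
// vectors yield an i1 predicate vector with a matching element count, and
// fixed-length vectors yield an integer vector with matching element widths.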
1945EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1946                                              LLVMContext &C, EVT VT) const {
1947  if (!VT.isVector())
1948    return MVT::i32;
1949  if (VT.isScalableVector())
1950    return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1951  return VT.changeVectorElementTypeToInteger();
1952}
1953
1954// isIntImmediate - This method tests to see if the node is a constant
// operand. If so, Imm will receive the value.
1956static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
1957  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
1958    Imm = C->getZExtValue();
1959    return true;
1960  }
1961  return false;
1962}
1963
1964// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has an immediate integer right operand.
// If so, Imm will receive the value.
1967static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
1968                                  uint64_t &Imm) {
1969  return N->getOpcode() == Opc &&
1970         isIntImmediate(N->getOperand(1).getNode(), Imm);
1971}
1972
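// Try to replace the logical immediate used by Op (an AND/OR/XOR of width
// Size) with a value that agrees on all demanded bits but is either trivial
// (all zeros/ones) or encodable as a logical immediate, re-emitting the
// operation as the machine opcode NewOpc when a non-trivial encoding is used.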
1973static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1974                               const APInt &Demanded,
1975                               TargetLowering::TargetLoweringOpt &TLO,
1976                               unsigned NewOpc) {
1977  uint64_t OldImm = Imm, NewImm, Enc;
1978  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1979
1980  // Return if the immediate is already all zeros, all ones, a bimm32 or a
1981  // bimm64.
1982  if (Imm == 0 || Imm == Mask ||
1983      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1984    return false;
1985
1986  unsigned EltSize = Size;
1987  uint64_t DemandedBits = Demanded.getZExtValue();
1988
1989  // Clear bits that are not demanded.
1990  Imm &= DemandedBits;
1991
1992  while (true) {
1993    // The goal here is to set the non-demanded bits in a way that minimizes
    // the number of transitions between 0 and 1. In order to achieve this goal,
1995    // we set the non-demanded bits to the value of the preceding demanded bits.
1996    // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1997    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1998    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1999    // The final result is 0b11000011.
2000    uint64_t NonDemandedBits = ~DemandedBits;
2001    uint64_t InvertedImm = ~Imm & DemandedBits;
2002    uint64_t RotatedImm =
2003        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2004        NonDemandedBits;
2005    uint64_t Sum = RotatedImm + NonDemandedBits;
2006    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2007    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2008    NewImm = (Imm | Ones) & Mask;
2009
2010    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2011    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2012    // we halve the element size and continue the search.
2013    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2014      break;
2015
    // We cannot shrink the element size any further if it is 2 bits.
2017    if (EltSize == 2)
2018      return false;
2019
2020    EltSize /= 2;
2021    Mask >>= EltSize;
2022    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2023
    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2025    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2026      return false;
2027
2028    // Merge the upper and lower halves of Imm and DemandedBits.
2029    Imm |= Hi;
2030    DemandedBits |= DemandedBitsHi;
2031  }
2032
2033  ++NumOptimizedImms;
2034
2035  // Replicate the element across the register width.
2036  while (EltSize < Size) {
2037    NewImm |= NewImm << EltSize;
2038    EltSize *= 2;
2039  }
2040
2041  (void)OldImm;
2042  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2043         "demanded bits should never be altered");
2044  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2045
2046  // Create the new constant immediate node.
2047  EVT VT = Op.getValueType();
2048  SDLoc DL(Op);
2049  SDValue New;
2050
2051  // If the new constant immediate is all-zeros or all-ones, let the target
2052  // independent DAG combine optimize this node.
2053  if (NewImm == 0 || NewImm == OrigMask) {
2054    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2055                          TLO.DAG.getConstant(NewImm, DL, VT));
2056  // Otherwise, create a machine node so that target independent DAG combine
2057  // doesn't undo this optimization.
2058  } else {
2059    Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
2060    SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2061    New = SDValue(
2062        TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2063  }
2064
2065  return TLO.CombineTo(Op, New);
2066}
2067
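// Target hook used when shrinking demanded constants: for scalar AND/OR/XOR
// with a constant operand, try to rewrite the immediate into a form that is
// encodable as a logical immediate (see optimizeLogicalImm above).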
2068bool AArch64TargetLowering::targetShrinkDemandedConstant(
2069    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2070    TargetLoweringOpt &TLO) const {
2071  // Delay this optimization to as late as possible.
2072  if (!TLO.LegalOps)
2073    return false;
2074
2075  if (!EnableOptimizeLogicalImm)
2076    return false;
2077
2078  EVT VT = Op.getValueType();
2079  if (VT.isVector())
2080    return false;
2081
2082  unsigned Size = VT.getSizeInBits();
2083  assert((Size == 32 || Size == 64) &&
2084         "i32 or i64 is expected after legalization.");
2085
2086  // Exit early if we demand all bits.
2087  if (DemandedBits.countPopulation() == Size)
2088    return false;
2089
2090  unsigned NewOpc;
2091  switch (Op.getOpcode()) {
2092  default:
2093    return false;
2094  case ISD::AND:
2095    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2096    break;
2097  case ISD::OR:
2098    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2099    break;
2100  case ISD::XOR:
2101    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2102    break;
2103  }
2104  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2105  if (!C)
2106    return false;
2107  uint64_t Imm = C->getZExtValue();
2108  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2109}
2110
/// computeKnownBitsForTargetNode - Determine which bits of Op are known to be
/// either zero or one and return them in Known.
2113void AArch64TargetLowering::computeKnownBitsForTargetNode(
2114    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2115    const SelectionDAG &DAG, unsigned Depth) const {
2116  switch (Op.getOpcode()) {
2117  default:
2118    break;
2119  case AArch64ISD::DUP: {
2120    SDValue SrcOp = Op.getOperand(0);
2121    Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2122    if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2123      assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2124             "Expected DUP implicit truncation");
2125      Known = Known.trunc(Op.getScalarValueSizeInBits());
2126    }
2127    break;
2128  }
2129  case AArch64ISD::CSEL: {
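    // A CSEL selects one of its two inputs, so only the bits known in common
    // between both operands are known in the result.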
2130    KnownBits Known2;
2131    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2132    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2133    Known = KnownBits::commonBits(Known, Known2);
2134    break;
2135  }
2136  case AArch64ISD::BICi: {
2137    // Compute the bit cleared value.
2138    uint64_t Mask =
2139        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2140    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2141    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2142    break;
2143  }
2144  case AArch64ISD::VLSHR: {
2145    KnownBits Known2;
2146    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2147    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2148    Known = KnownBits::lshr(Known, Known2);
2149    break;
2150  }
2151  case AArch64ISD::VASHR: {
2152    KnownBits Known2;
2153    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2154    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2155    Known = KnownBits::ashr(Known, Known2);
2156    break;
2157  }
2158  case AArch64ISD::MOVI: {
2159    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(0));
2160    Known =
2161        KnownBits::makeConstant(APInt(Known.getBitWidth(), CN->getZExtValue()));
2162    break;
2163  }
2164  case AArch64ISD::LOADgot:
2165  case AArch64ISD::ADDlow: {
2166    if (!Subtarget->isTargetILP32())
2167      break;
2168    // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2169    Known.Zero = APInt::getHighBitsSet(64, 32);
2170    break;
2171  }
2172  case AArch64ISD::ASSERT_ZEXT_BOOL: {
2173    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2174    Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2175    break;
2176  }
2177  case ISD::INTRINSIC_W_CHAIN: {
2178    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
2179    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
2180    switch (IntID) {
2181    default: return;
2182    case Intrinsic::aarch64_ldaxr:
2183    case Intrinsic::aarch64_ldxr: {
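      // LDXR/LDAXR zero-extend a narrow memory value into the destination
      // register, so all bits above the memory width are known to be zero.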
2184      unsigned BitWidth = Known.getBitWidth();
2185      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2186      unsigned MemBits = VT.getScalarSizeInBits();
2187      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2188      return;
2189    }
2190    }
2191    break;
2192  }
2193  case ISD::INTRINSIC_WO_CHAIN:
2194  case ISD::INTRINSIC_VOID: {
2195    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2196    switch (IntNo) {
2197    default:
2198      break;
2199    case Intrinsic::aarch64_neon_umaxv:
2200    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV/UMAXV
      // instructions zero-extend their result, so we can mark all bits above
      // the element width as known zero. 32-bit or larger elements don't need
      // this, as those are legal types and will be handled by isel directly.
2205      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2206      unsigned BitWidth = Known.getBitWidth();
2207      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2208        assert(BitWidth >= 8 && "Unexpected width!");
2209        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2210        Known.Zero |= Mask;
2211      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2212        assert(BitWidth >= 16 && "Unexpected width!");
2213        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2214        Known.Zero |= Mask;
2215      }
2216      break;
    }
2218    }
2219  }
2220  }
2221}
2222
2223MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2224                                                  EVT) const {
2225  return MVT::i64;
2226}
2227
2228bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2229    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2230    unsigned *Fast) const {
2231  if (Subtarget->requiresStrictAlign())
2232    return false;
2233
2234  if (Fast) {
2235    // Some CPUs are fine with unaligned stores except for 128-bit ones.
2236    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2237            // See comments in performSTORECombine() for more details about
2238            // these conditions.
2239
2240            // Code that uses clang vector extensions can mark that it
2241            // wants unaligned accesses to be treated as fast by
2242            // underspecifying alignment to be 1 or 2.
2243            Alignment <= 2 ||
2244
2245            // Disregard v2i64. Memcpy lowering produces those and splitting
2246            // them regresses performance on micro-benchmarks and olden/bh.
2247            VT == MVT::v2i64;
2248  }
2249  return true;
2250}
2251
2252// Same as above but handling LLTs instead.
2253bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2254    LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2255    unsigned *Fast) const {
2256  if (Subtarget->requiresStrictAlign())
2257    return false;
2258
2259  if (Fast) {
2260    // Some CPUs are fine with unaligned stores except for 128-bit ones.
2261    *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2262            Ty.getSizeInBytes() != 16 ||
2263            // See comments in performSTORECombine() for more details about
2264            // these conditions.
2265
2266            // Code that uses clang vector extensions can mark that it
2267            // wants unaligned accesses to be treated as fast by
2268            // underspecifying alignment to be 1 or 2.
2269            Alignment <= 2 ||
2270
2271            // Disregard v2i64. Memcpy lowering produces those and splitting
2272            // them regresses performance on micro-benchmarks and olden/bh.
2273            Ty == LLT::fixed_vector(2, 64);
2274  }
2275  return true;
2276}
2277
2278FastISel *
2279AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2280                                      const TargetLibraryInfo *libInfo) const {
2281  return AArch64::createFastISel(funcInfo, libInfo);
2282}
2283
2284const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2285#define MAKE_CASE(V)                                                           \
2286  case V:                                                                      \
2287    return #V;
2288  switch ((AArch64ISD::NodeType)Opcode) {
2289  case AArch64ISD::FIRST_NUMBER:
2290    break;
2291    MAKE_CASE(AArch64ISD::OBSCURE_COPY)
2292    MAKE_CASE(AArch64ISD::SMSTART)
2293    MAKE_CASE(AArch64ISD::SMSTOP)
2294    MAKE_CASE(AArch64ISD::RESTORE_ZA)
2295    MAKE_CASE(AArch64ISD::CALL)
2296    MAKE_CASE(AArch64ISD::ADRP)
2297    MAKE_CASE(AArch64ISD::ADR)
2298    MAKE_CASE(AArch64ISD::ADDlow)
2299    MAKE_CASE(AArch64ISD::LOADgot)
2300    MAKE_CASE(AArch64ISD::RET_FLAG)
2301    MAKE_CASE(AArch64ISD::BRCOND)
2302    MAKE_CASE(AArch64ISD::CSEL)
2303    MAKE_CASE(AArch64ISD::CSINV)
2304    MAKE_CASE(AArch64ISD::CSNEG)
2305    MAKE_CASE(AArch64ISD::CSINC)
2306    MAKE_CASE(AArch64ISD::THREAD_POINTER)
2307    MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
2308    MAKE_CASE(AArch64ISD::ABDS_PRED)
2309    MAKE_CASE(AArch64ISD::ABDU_PRED)
2310    MAKE_CASE(AArch64ISD::HADDS_PRED)
2311    MAKE_CASE(AArch64ISD::HADDU_PRED)
2312    MAKE_CASE(AArch64ISD::MUL_PRED)
2313    MAKE_CASE(AArch64ISD::MULHS_PRED)
2314    MAKE_CASE(AArch64ISD::MULHU_PRED)
2315    MAKE_CASE(AArch64ISD::RHADDS_PRED)
2316    MAKE_CASE(AArch64ISD::RHADDU_PRED)
2317    MAKE_CASE(AArch64ISD::SDIV_PRED)
2318    MAKE_CASE(AArch64ISD::SHL_PRED)
2319    MAKE_CASE(AArch64ISD::SMAX_PRED)
2320    MAKE_CASE(AArch64ISD::SMIN_PRED)
2321    MAKE_CASE(AArch64ISD::SRA_PRED)
2322    MAKE_CASE(AArch64ISD::SRL_PRED)
2323    MAKE_CASE(AArch64ISD::UDIV_PRED)
2324    MAKE_CASE(AArch64ISD::UMAX_PRED)
2325    MAKE_CASE(AArch64ISD::UMIN_PRED)
2326    MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
2327    MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
2328    MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
2329    MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
2330    MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
2331    MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
2332    MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
2333    MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
2334    MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
2335    MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
2336    MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
2337    MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
2338    MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
2339    MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
2340    MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
2341    MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
2342    MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
2343    MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
2344    MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
2345    MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
2346    MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
2347    MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
2348    MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
2349    MAKE_CASE(AArch64ISD::ADC)
2350    MAKE_CASE(AArch64ISD::SBC)
2351    MAKE_CASE(AArch64ISD::ADDS)
2352    MAKE_CASE(AArch64ISD::SUBS)
2353    MAKE_CASE(AArch64ISD::ADCS)
2354    MAKE_CASE(AArch64ISD::SBCS)
2355    MAKE_CASE(AArch64ISD::ANDS)
2356    MAKE_CASE(AArch64ISD::CCMP)
2357    MAKE_CASE(AArch64ISD::CCMN)
2358    MAKE_CASE(AArch64ISD::FCCMP)
2359    MAKE_CASE(AArch64ISD::FCMP)
2360    MAKE_CASE(AArch64ISD::STRICT_FCMP)
2361    MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2362    MAKE_CASE(AArch64ISD::DUP)
2363    MAKE_CASE(AArch64ISD::DUPLANE8)
2364    MAKE_CASE(AArch64ISD::DUPLANE16)
2365    MAKE_CASE(AArch64ISD::DUPLANE32)
2366    MAKE_CASE(AArch64ISD::DUPLANE64)
2367    MAKE_CASE(AArch64ISD::DUPLANE128)
2368    MAKE_CASE(AArch64ISD::MOVI)
2369    MAKE_CASE(AArch64ISD::MOVIshift)
2370    MAKE_CASE(AArch64ISD::MOVIedit)
2371    MAKE_CASE(AArch64ISD::MOVImsl)
2372    MAKE_CASE(AArch64ISD::FMOV)
2373    MAKE_CASE(AArch64ISD::MVNIshift)
2374    MAKE_CASE(AArch64ISD::MVNImsl)
2375    MAKE_CASE(AArch64ISD::BICi)
2376    MAKE_CASE(AArch64ISD::ORRi)
2377    MAKE_CASE(AArch64ISD::BSP)
2378    MAKE_CASE(AArch64ISD::EXTR)
2379    MAKE_CASE(AArch64ISD::ZIP1)
2380    MAKE_CASE(AArch64ISD::ZIP2)
2381    MAKE_CASE(AArch64ISD::UZP1)
2382    MAKE_CASE(AArch64ISD::UZP2)
2383    MAKE_CASE(AArch64ISD::TRN1)
2384    MAKE_CASE(AArch64ISD::TRN2)
2385    MAKE_CASE(AArch64ISD::REV16)
2386    MAKE_CASE(AArch64ISD::REV32)
2387    MAKE_CASE(AArch64ISD::REV64)
2388    MAKE_CASE(AArch64ISD::EXT)
2389    MAKE_CASE(AArch64ISD::SPLICE)
2390    MAKE_CASE(AArch64ISD::VSHL)
2391    MAKE_CASE(AArch64ISD::VLSHR)
2392    MAKE_CASE(AArch64ISD::VASHR)
2393    MAKE_CASE(AArch64ISD::VSLI)
2394    MAKE_CASE(AArch64ISD::VSRI)
2395    MAKE_CASE(AArch64ISD::CMEQ)
2396    MAKE_CASE(AArch64ISD::CMGE)
2397    MAKE_CASE(AArch64ISD::CMGT)
2398    MAKE_CASE(AArch64ISD::CMHI)
2399    MAKE_CASE(AArch64ISD::CMHS)
2400    MAKE_CASE(AArch64ISD::FCMEQ)
2401    MAKE_CASE(AArch64ISD::FCMGE)
2402    MAKE_CASE(AArch64ISD::FCMGT)
2403    MAKE_CASE(AArch64ISD::CMEQz)
2404    MAKE_CASE(AArch64ISD::CMGEz)
2405    MAKE_CASE(AArch64ISD::CMGTz)
2406    MAKE_CASE(AArch64ISD::CMLEz)
2407    MAKE_CASE(AArch64ISD::CMLTz)
2408    MAKE_CASE(AArch64ISD::FCMEQz)
2409    MAKE_CASE(AArch64ISD::FCMGEz)
2410    MAKE_CASE(AArch64ISD::FCMGTz)
2411    MAKE_CASE(AArch64ISD::FCMLEz)
2412    MAKE_CASE(AArch64ISD::FCMLTz)
2413    MAKE_CASE(AArch64ISD::SADDV)
2414    MAKE_CASE(AArch64ISD::UADDV)
2415    MAKE_CASE(AArch64ISD::SDOT)
2416    MAKE_CASE(AArch64ISD::UDOT)
2417    MAKE_CASE(AArch64ISD::SMINV)
2418    MAKE_CASE(AArch64ISD::UMINV)
2419    MAKE_CASE(AArch64ISD::SMAXV)
2420    MAKE_CASE(AArch64ISD::UMAXV)
2421    MAKE_CASE(AArch64ISD::SADDV_PRED)
2422    MAKE_CASE(AArch64ISD::UADDV_PRED)
2423    MAKE_CASE(AArch64ISD::SMAXV_PRED)
2424    MAKE_CASE(AArch64ISD::UMAXV_PRED)
2425    MAKE_CASE(AArch64ISD::SMINV_PRED)
2426    MAKE_CASE(AArch64ISD::UMINV_PRED)
2427    MAKE_CASE(AArch64ISD::ORV_PRED)
2428    MAKE_CASE(AArch64ISD::EORV_PRED)
2429    MAKE_CASE(AArch64ISD::ANDV_PRED)
2430    MAKE_CASE(AArch64ISD::CLASTA_N)
2431    MAKE_CASE(AArch64ISD::CLASTB_N)
2432    MAKE_CASE(AArch64ISD::LASTA)
2433    MAKE_CASE(AArch64ISD::LASTB)
2434    MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2435    MAKE_CASE(AArch64ISD::LS64_BUILD)
2436    MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2437    MAKE_CASE(AArch64ISD::TBL)
2438    MAKE_CASE(AArch64ISD::FADD_PRED)
2439    MAKE_CASE(AArch64ISD::FADDA_PRED)
2440    MAKE_CASE(AArch64ISD::FADDV_PRED)
2441    MAKE_CASE(AArch64ISD::FDIV_PRED)
2442    MAKE_CASE(AArch64ISD::FMA_PRED)
2443    MAKE_CASE(AArch64ISD::FMAX_PRED)
2444    MAKE_CASE(AArch64ISD::FMAXV_PRED)
2445    MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2446    MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2447    MAKE_CASE(AArch64ISD::FMIN_PRED)
2448    MAKE_CASE(AArch64ISD::FMINV_PRED)
2449    MAKE_CASE(AArch64ISD::FMINNM_PRED)
2450    MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2451    MAKE_CASE(AArch64ISD::FMUL_PRED)
2452    MAKE_CASE(AArch64ISD::FSUB_PRED)
2453    MAKE_CASE(AArch64ISD::RDSVL)
2454    MAKE_CASE(AArch64ISD::BIC)
2455    MAKE_CASE(AArch64ISD::BIT)
2456    MAKE_CASE(AArch64ISD::CBZ)
2457    MAKE_CASE(AArch64ISD::CBNZ)
2458    MAKE_CASE(AArch64ISD::TBZ)
2459    MAKE_CASE(AArch64ISD::TBNZ)
2460    MAKE_CASE(AArch64ISD::TC_RETURN)
2461    MAKE_CASE(AArch64ISD::PREFETCH)
2462    MAKE_CASE(AArch64ISD::SITOF)
2463    MAKE_CASE(AArch64ISD::UITOF)
2464    MAKE_CASE(AArch64ISD::NVCAST)
2465    MAKE_CASE(AArch64ISD::MRS)
2466    MAKE_CASE(AArch64ISD::SQSHL_I)
2467    MAKE_CASE(AArch64ISD::UQSHL_I)
2468    MAKE_CASE(AArch64ISD::SRSHR_I)
2469    MAKE_CASE(AArch64ISD::URSHR_I)
2470    MAKE_CASE(AArch64ISD::SQSHLU_I)
2471    MAKE_CASE(AArch64ISD::WrapperLarge)
2472    MAKE_CASE(AArch64ISD::LD2post)
2473    MAKE_CASE(AArch64ISD::LD3post)
2474    MAKE_CASE(AArch64ISD::LD4post)
2475    MAKE_CASE(AArch64ISD::ST2post)
2476    MAKE_CASE(AArch64ISD::ST3post)
2477    MAKE_CASE(AArch64ISD::ST4post)
2478    MAKE_CASE(AArch64ISD::LD1x2post)
2479    MAKE_CASE(AArch64ISD::LD1x3post)
2480    MAKE_CASE(AArch64ISD::LD1x4post)
2481    MAKE_CASE(AArch64ISD::ST1x2post)
2482    MAKE_CASE(AArch64ISD::ST1x3post)
2483    MAKE_CASE(AArch64ISD::ST1x4post)
2484    MAKE_CASE(AArch64ISD::LD1DUPpost)
2485    MAKE_CASE(AArch64ISD::LD2DUPpost)
2486    MAKE_CASE(AArch64ISD::LD3DUPpost)
2487    MAKE_CASE(AArch64ISD::LD4DUPpost)
2488    MAKE_CASE(AArch64ISD::LD1LANEpost)
2489    MAKE_CASE(AArch64ISD::LD2LANEpost)
2490    MAKE_CASE(AArch64ISD::LD3LANEpost)
2491    MAKE_CASE(AArch64ISD::LD4LANEpost)
2492    MAKE_CASE(AArch64ISD::ST2LANEpost)
2493    MAKE_CASE(AArch64ISD::ST3LANEpost)
2494    MAKE_CASE(AArch64ISD::ST4LANEpost)
2495    MAKE_CASE(AArch64ISD::SMULL)
2496    MAKE_CASE(AArch64ISD::UMULL)
2497    MAKE_CASE(AArch64ISD::PMULL)
2498    MAKE_CASE(AArch64ISD::FRECPE)
2499    MAKE_CASE(AArch64ISD::FRECPS)
2500    MAKE_CASE(AArch64ISD::FRSQRTE)
2501    MAKE_CASE(AArch64ISD::FRSQRTS)
2502    MAKE_CASE(AArch64ISD::STG)
2503    MAKE_CASE(AArch64ISD::STZG)
2504    MAKE_CASE(AArch64ISD::ST2G)
2505    MAKE_CASE(AArch64ISD::STZ2G)
2506    MAKE_CASE(AArch64ISD::SUNPKHI)
2507    MAKE_CASE(AArch64ISD::SUNPKLO)
2508    MAKE_CASE(AArch64ISD::UUNPKHI)
2509    MAKE_CASE(AArch64ISD::UUNPKLO)
2510    MAKE_CASE(AArch64ISD::INSR)
2511    MAKE_CASE(AArch64ISD::PTEST)
2512    MAKE_CASE(AArch64ISD::PTEST_ANY)
2513    MAKE_CASE(AArch64ISD::PTRUE)
2514    MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2515    MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2516    MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2517    MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2518    MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2519    MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2520    MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2521    MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2522    MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2523    MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2524    MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2525    MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2526    MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2527    MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2528    MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2529    MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2530    MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2531    MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2532    MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2533    MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2534    MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2535    MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2536    MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2537    MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2538    MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2539    MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2540    MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2541    MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2542    MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2543    MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2544    MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2545    MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2546    MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2547    MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2548    MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2549    MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2550    MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2551    MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2552    MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2553    MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2554    MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2555    MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2556    MAKE_CASE(AArch64ISD::ST1_PRED)
2557    MAKE_CASE(AArch64ISD::SST1_PRED)
2558    MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2559    MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2560    MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2561    MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2562    MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2563    MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2564    MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2565    MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2566    MAKE_CASE(AArch64ISD::LDP)
2567    MAKE_CASE(AArch64ISD::LDNP)
2568    MAKE_CASE(AArch64ISD::STP)
2569    MAKE_CASE(AArch64ISD::STNP)
2570    MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2571    MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2572    MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
2573    MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
2574    MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
2575    MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2576    MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2577    MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2578    MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2579    MAKE_CASE(AArch64ISD::ADDP)
2580    MAKE_CASE(AArch64ISD::SADDLP)
2581    MAKE_CASE(AArch64ISD::UADDLP)
2582    MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2583    MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
2584    MAKE_CASE(AArch64ISD::MOPS_MEMSET)
2585    MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
2586    MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
2587    MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
2588    MAKE_CASE(AArch64ISD::CALL_BTI)
2589    MAKE_CASE(AArch64ISD::MRRS)
2590    MAKE_CASE(AArch64ISD::MSRR)
2591  }
2592#undef MAKE_CASE
2593  return nullptr;
2594}
2595
2596MachineBasicBlock *
2597AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2598                                    MachineBasicBlock *MBB) const {
2599  // We materialise the F128CSEL pseudo-instruction as some control flow and a
2600  // phi node:
2601
2602  // OrigBB:
2603  //     [... previous instrs leading to comparison ...]
2604  //     b.ne TrueBB
2605  //     b EndBB
2606  // TrueBB:
2607  //     ; Fallthrough
2608  // EndBB:
2609  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2610
2611  MachineFunction *MF = MBB->getParent();
2612  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2613  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2614  DebugLoc DL = MI.getDebugLoc();
2615  MachineFunction::iterator It = ++MBB->getIterator();
2616
2617  Register DestReg = MI.getOperand(0).getReg();
2618  Register IfTrueReg = MI.getOperand(1).getReg();
2619  Register IfFalseReg = MI.getOperand(2).getReg();
2620  unsigned CondCode = MI.getOperand(3).getImm();
2621  bool NZCVKilled = MI.getOperand(4).isKill();
2622
2623  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2624  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2625  MF->insert(It, TrueBB);
2626  MF->insert(It, EndBB);
2627
  // Transfer the rest of the current basic block to EndBB.
2629  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2630                MBB->end());
2631  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2632
2633  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2634  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2635  MBB->addSuccessor(TrueBB);
2636  MBB->addSuccessor(EndBB);
2637
2638  // TrueBB falls through to the end.
2639  TrueBB->addSuccessor(EndBB);
2640
2641  if (!NZCVKilled) {
2642    TrueBB->addLiveIn(AArch64::NZCV);
2643    EndBB->addLiveIn(AArch64::NZCV);
2644  }
2645
2646  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2647      .addReg(IfTrueReg)
2648      .addMBB(TrueBB)
2649      .addReg(IfFalseReg)
2650      .addMBB(MBB);
2651
2652  MI.eraseFromParent();
2653  return EndBB;
2654}
2655
2656MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2657       MachineInstr &MI, MachineBasicBlock *BB) const {
2658  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2659             BB->getParent()->getFunction().getPersonalityFn())) &&
2660         "SEH does not use catchret!");
2661  return BB;
2662}
2663
2664MachineBasicBlock *
2665AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2666                                    MachineInstr &MI,
2667                                    MachineBasicBlock *BB) const {
2668  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2669  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2670
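  // Operand 0 of the pseudo holds the tile number; the ZA tile registers are
  // numbered consecutively, so the destination is BaseReg plus that immediate.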
2671  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2672  MIB.add(MI.getOperand(1)); // slice index register
2673  MIB.add(MI.getOperand(2)); // slice index offset
2674  MIB.add(MI.getOperand(3)); // pg
2675  MIB.add(MI.getOperand(4)); // base
2676  MIB.add(MI.getOperand(5)); // offset
2677
2678  MI.eraseFromParent(); // The pseudo is gone now.
2679  return BB;
2680}
2681
2682MachineBasicBlock *
2683AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2684  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2685  MachineInstrBuilder MIB =
2686      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2687
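  // LDR_ZA uses a single immediate both as the vector-select offset and as
  // the memory offset, which is why operand 1 is added twice below.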
2688  MIB.addReg(AArch64::ZA, RegState::Define);
2689  MIB.add(MI.getOperand(0)); // Vector select register
2690  MIB.add(MI.getOperand(1)); // Vector select offset
2691  MIB.add(MI.getOperand(2)); // Base
2692  MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2693
2694  MI.eraseFromParent(); // The pseudo is gone now.
2695  return BB;
2696}
2697
2698MachineBasicBlock *
2699AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2700                                   MachineInstr &MI,
2701                                   MachineBasicBlock *BB, bool HasTile) const {
2702  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2703  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2704  unsigned StartIdx = 0;
2705
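  // Tile pseudos carry the tile number as operand 0; translate it into the
  // corresponding ZA tile register, which is both defined and used. Pseudos
  // operating on the whole array use the ZA register directly.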
2706  if (HasTile) {
2707    MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2708    MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2709    StartIdx = 1;
2710  } else
2711    MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2712
2713  for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2714    MIB.add(MI.getOperand(I));
2715
2716  MI.eraseFromParent(); // The pseudo is gone now.
2717  return BB;
2718}
2719
2720MachineBasicBlock *
2721AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2722  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2723  MachineInstrBuilder MIB =
2724      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2725  MIB.add(MI.getOperand(0)); // Mask
2726
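  // ZERO_M's mask selects which of the 64-bit tiles ZAD0-ZAD7 are zeroed;
  // mark each selected tile as implicitly defined.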
2727  unsigned Mask = MI.getOperand(0).getImm();
2728  for (unsigned I = 0; I < 8; I++) {
2729    if (Mask & (1 << I))
2730      MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2731  }
2732
2733  MI.eraseFromParent(); // The pseudo is gone now.
2734  return BB;
2735}
2736
2737MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2738    MachineInstr &MI, MachineBasicBlock *BB) const {
2739
2740  int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2741  if (SMEOrigInstr != -1) {
2742    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2743    uint64_t SMEMatrixType =
2744        TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2745    switch (SMEMatrixType) {
2746    case (AArch64::SMEMatrixArray):
2747      return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2748    case (AArch64::SMEMatrixTileB):
2749      return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2750    case (AArch64::SMEMatrixTileH):
2751      return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2752    case (AArch64::SMEMatrixTileS):
2753      return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2754    case (AArch64::SMEMatrixTileD):
2755      return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2756    case (AArch64::SMEMatrixTileQ):
2757      return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2758    }
2759  }
2760
2761  switch (MI.getOpcode()) {
2762  default:
2763#ifndef NDEBUG
2764    MI.dump();
2765#endif
2766    llvm_unreachable("Unexpected instruction for custom inserter!");
2767
2768  case AArch64::F128CSEL:
2769    return EmitF128CSEL(MI, BB);
2770  case TargetOpcode::STATEPOINT:
    // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
    // while the BL call instruction (to which the statepoint is eventually
    // lowered) has an implicit def of LR. That def is early-clobber as it is
    // written at the moment of the call, before any use is read.
    // Add this implicit dead def here as a workaround.
2776    MI.addOperand(*MI.getMF(),
2777                  MachineOperand::CreateReg(
2778                      AArch64::LR, /*isDef*/ true,
2779                      /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2780                      /*isUndef*/ false, /*isEarlyClobber*/ true));
2781    [[fallthrough]];
2782  case TargetOpcode::STACKMAP:
2783  case TargetOpcode::PATCHPOINT:
2784    return emitPatchPoint(MI, BB);
2785
2786  case AArch64::CATCHRET:
2787    return EmitLoweredCatchRet(MI, BB);
2788  case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2789    return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2790  case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2791    return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2792  case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2793    return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2794  case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2795    return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2796  case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2797    return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2798  case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2799    return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2800  case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2801    return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2802  case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2803    return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2804  case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2805    return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2806  case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2807    return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2808  case AArch64::LDR_ZA_PSEUDO:
2809    return EmitFill(MI, BB);
2810  case AArch64::ZERO_M_PSEUDO:
2811    return EmitZero(MI, BB);
2812  }
2813}
2814
2815//===----------------------------------------------------------------------===//
2816// AArch64 Lowering private implementation.
2817//===----------------------------------------------------------------------===//
2818
2819//===----------------------------------------------------------------------===//
2820// Lowering Code
2821//===----------------------------------------------------------------------===//
2822
2823// Forward declarations of SVE fixed length lowering helpers
2824static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
2825static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2826static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2827static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2828                                                SelectionDAG &DAG);
2829static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
2830                                             EVT VT);
2831
2832/// isZerosVector - Check whether SDNode N is a zero-filled vector.
2833static bool isZerosVector(const SDNode *N) {
2834  // Look through a bit convert.
2835  while (N->getOpcode() == ISD::BITCAST)
2836    N = N->getOperand(0).getNode();
2837
2838  if (ISD::isConstantSplatVectorAllZeros(N))
2839    return true;
2840
2841  if (N->getOpcode() != AArch64ISD::DUP)
2842    return false;
2843
2844  auto Opnd0 = N->getOperand(0);
2845  return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
2846}
2847
2848/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2849/// CC
2850static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2851  switch (CC) {
2852  default:
2853    llvm_unreachable("Unknown condition code!");
2854  case ISD::SETNE:
2855    return AArch64CC::NE;
2856  case ISD::SETEQ:
2857    return AArch64CC::EQ;
2858  case ISD::SETGT:
2859    return AArch64CC::GT;
2860  case ISD::SETGE:
2861    return AArch64CC::GE;
2862  case ISD::SETLT:
2863    return AArch64CC::LT;
2864  case ISD::SETLE:
2865    return AArch64CC::LE;
2866  case ISD::SETUGT:
2867    return AArch64CC::HI;
2868  case ISD::SETUGE:
2869    return AArch64CC::HS;
2870  case ISD::SETULT:
2871    return AArch64CC::LO;
2872  case ISD::SETULE:
2873    return AArch64CC::LS;
2874  }
2875}
2876
2877/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2878static void changeFPCCToAArch64CC(ISD::CondCode CC,
2879                                  AArch64CC::CondCode &CondCode,
2880                                  AArch64CC::CondCode &CondCode2) {
2881  CondCode2 = AArch64CC::AL;
2882  switch (CC) {
2883  default:
2884    llvm_unreachable("Unknown FP condition!");
2885  case ISD::SETEQ:
2886  case ISD::SETOEQ:
2887    CondCode = AArch64CC::EQ;
2888    break;
2889  case ISD::SETGT:
2890  case ISD::SETOGT:
2891    CondCode = AArch64CC::GT;
2892    break;
2893  case ISD::SETGE:
2894  case ISD::SETOGE:
2895    CondCode = AArch64CC::GE;
2896    break;
2897  case ISD::SETOLT:
2898    CondCode = AArch64CC::MI;
2899    break;
2900  case ISD::SETOLE:
2901    CondCode = AArch64CC::LS;
2902    break;
2903  case ISD::SETONE:
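    // (a one b) == ((a olt b) || (a ogt b)), so test MI then GT.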
2904    CondCode = AArch64CC::MI;
2905    CondCode2 = AArch64CC::GT;
2906    break;
2907  case ISD::SETO:
2908    CondCode = AArch64CC::VC;
2909    break;
2910  case ISD::SETUO:
2911    CondCode = AArch64CC::VS;
2912    break;
2913  case ISD::SETUEQ:
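    // (a ueq b) == ((a oeq b) || (a uno b)), so test EQ then VS.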
2914    CondCode = AArch64CC::EQ;
2915    CondCode2 = AArch64CC::VS;
2916    break;
2917  case ISD::SETUGT:
2918    CondCode = AArch64CC::HI;
2919    break;
2920  case ISD::SETUGE:
2921    CondCode = AArch64CC::PL;
2922    break;
2923  case ISD::SETLT:
2924  case ISD::SETULT:
2925    CondCode = AArch64CC::LT;
2926    break;
2927  case ISD::SETLE:
2928  case ISD::SETULE:
2929    CondCode = AArch64CC::LE;
2930    break;
2931  case ISD::SETNE:
2932  case ISD::SETUNE:
2933    CondCode = AArch64CC::NE;
2934    break;
2935  }
2936}
2937
2938/// Convert a DAG fp condition code to an AArch64 CC.
2939/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2940/// should be AND'ed instead of OR'ed.
2941static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2942                                     AArch64CC::CondCode &CondCode,
2943                                     AArch64CC::CondCode &CondCode2) {
2944  CondCode2 = AArch64CC::AL;
2945  switch (CC) {
2946  default:
2947    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2948    assert(CondCode2 == AArch64CC::AL);
2949    break;
2950  case ISD::SETONE:
2951    // (a one b)
2952    // == ((a olt b) || (a ogt b))
2953    // == ((a ord b) && (a une b))
2954    CondCode = AArch64CC::VC;
2955    CondCode2 = AArch64CC::NE;
2956    break;
2957  case ISD::SETUEQ:
2958    // (a ueq b)
2959    // == ((a uno b) || (a oeq b))
2960    // == ((a ule b) && (a uge b))
2961    CondCode = AArch64CC::PL;
2962    CondCode2 = AArch64CC::LE;
2963    break;
2964  }
2965}
2966
2967/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2968/// CC usable with the vector instructions. Fewer operations are available
2969/// without a real NZCV register, so we have to use less efficient combinations
2970/// to get the same effect.
2971static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2972                                        AArch64CC::CondCode &CondCode,
2973                                        AArch64CC::CondCode &CondCode2,
2974                                        bool &Invert) {
2975  Invert = false;
2976  switch (CC) {
2977  default:
2978    // Mostly the scalar mappings work fine.
2979    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2980    break;
2981  case ISD::SETUO:
2982    Invert = true;
2983    [[fallthrough]];
2984  case ISD::SETO:
2985    CondCode = AArch64CC::MI;
2986    CondCode2 = AArch64CC::GE;
2987    break;
2988  case ISD::SETUEQ:
2989  case ISD::SETULT:
2990  case ISD::SETULE:
2991  case ISD::SETUGT:
2992  case ISD::SETUGE:
2993    // All of the compare-mask comparisons are ordered, but we can switch
2994    // between the two by a double inversion. E.g. ULE == !OGT.
2995    Invert = true;
2996    changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2997                          CondCode, CondCode2);
2998    break;
2999  }
3000}
3001
3002static bool isLegalArithImmed(uint64_t C) {
3003  // Matches AArch64DAGToDAGISel::SelectArithImmed().
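  // Legal arithmetic immediates are 12-bit values, optionally shifted left by
  // 12 bits: e.g. 0xFFF and 0xABC000 are legal, while 0x1001 is not.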
3004  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3005  LLVM_DEBUG(dbgs() << "Is imm " << C
3006                    << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3007  return IsLegal;
3008}
3009
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If so, then
// everything is fine. If not, then the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
3016//
3017// So, finally, the only LLVM-native comparisons that don't mention C and V
3018// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3019// the absence of information about op2.
3020static bool isCMN(SDValue Op, ISD::CondCode CC) {
3021  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3022         (CC == ISD::SETEQ || CC == ISD::SETNE);
3023}
3024
3025static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3026                                      SelectionDAG &DAG, SDValue Chain,
3027                                      bool IsSignaling) {
3028  EVT VT = LHS.getValueType();
3029  assert(VT != MVT::f128);
3030
3031  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3032
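  // Without full FP16 support, promote f16 comparisons to f32, threading the
  // strict chain through the extends.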
3033  if (VT == MVT::f16 && !FullFP16) {
3034    LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3035                      {Chain, LHS});
3036    RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3037                      {LHS.getValue(1), RHS});
3038    Chain = RHS.getValue(1);
3039    VT = MVT::f32;
3040  }
3041  unsigned Opcode =
3042      IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3043  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3044}
3045
3046static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3047                              const SDLoc &dl, SelectionDAG &DAG) {
3048  EVT VT = LHS.getValueType();
3049  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3050
3051  if (VT.isFloatingPoint()) {
3052    assert(VT != MVT::f128);
3053    if (VT == MVT::f16 && !FullFP16) {
3054      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3055      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3056      VT = MVT::f32;
3057    }
3058    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3059  }
3060
3061  // The CMP instruction is just an alias for SUBS, and representing it as
3062  // SUBS means that it's possible to get CSE with subtract operations.
3063  // A later phase can perform the optimization of setting the destination
3064  // register to WZR/XZR if it ends up being unused.
3065  unsigned Opcode = AArch64ISD::SUBS;
3066
3067  if (isCMN(RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3069    Opcode = AArch64ISD::ADDS;
3070    RHS = RHS.getOperand(1);
3071  } else if (isCMN(LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3074    Opcode = AArch64ISD::ADDS;
3075    LHS = LHS.getOperand(1);
3076  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3077    if (LHS.getOpcode() == ISD::AND) {
3078      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3079      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3080      // of the signed comparisons.
3081      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3082                                           DAG.getVTList(VT, MVT_CC),
3083                                           LHS.getOperand(0),
3084                                           LHS.getOperand(1));
3085      // Replace all users of (and X, Y) with newly generated (ands X, Y)
3086      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3087      return ANDSNode.getValue(1);
3088    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3089      // Use result of ANDS
3090      return LHS.getValue(1);
3091    }
3092  }
3093
3094  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3095      .getValue(1);
3096}
3097
3098/// \defgroup AArch64CCMP CMP;CCMP matching
3099///
3100/// These functions deal with the formation of CMP;CCMP;... sequences.
3101/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3102/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows us to express arbitrary conjunctions, for
3104/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3105/// expressed as:
3106///   cmp A
3107///   ccmp B, inv(CB), CA
3108///   check for CB flags
3109///
3110/// This naturally lets us implement chains of AND operations with SETCC
3111/// operands. And we can even implement some other situations by transforming
3112/// them:
///   - We can implement (NEG SETCC), i.e. negating a single comparison, by
///     negating the flags used in a CCMP/FCCMP operation.
3115///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3116///     by negating the flags we test for afterwards. i.e.
3117///     NEG (CMP CCMP CCCMP ...) can be implemented.
3118///   - Note that we can only ever negate all previously processed results.
///     What we cannot implement by flipping the flags to test is a negation
3120///     of two sub-trees (because the negation affects all sub-trees emitted so
3121///     far, so the 2nd sub-tree we emit would also affect the first).
3122/// With those tools we can implement some OR operations:
3123///   - (OR (SETCC A) (SETCC B)) can be implemented via:
3124///     NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3125///   - After transforming OR to NEG/AND combinations we may be able to use NEG
3126///     elimination rules from earlier to implement the whole thing as a
3127///     CCMP/FCCMP chain.
3128///
3129/// As complete example:
3130///     or (or (setCA (cmp A)) (setCB (cmp B)))
///        (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
///     or (and (setCC (cmp C)) (setCD (cmp D)))
///        (or (setCA (cmp A)) (setCB (cmp B)))
/// can be transformed to:
///     not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
///              (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3138/// which can be implemented as:
3139///   cmp C
3140///   ccmp D, inv(CD), CC
3141///   ccmp A, CA, inv(CD)
3142///   ccmp B, CB, inv(CA)
3143///   check for CB flags
3144///
3145/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
/// can only implement one of the inner (not) operations, but not both!
3148/// @{
3149
3150/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3151static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3152                                         ISD::CondCode CC, SDValue CCOp,
3153                                         AArch64CC::CondCode Predicate,
3154                                         AArch64CC::CondCode OutCC,
3155                                         const SDLoc &DL, SelectionDAG &DAG) {
3156  unsigned Opcode = 0;
3157  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3158
3159  if (LHS.getValueType().isFloatingPoint()) {
3160    assert(LHS.getValueType() != MVT::f128);
3161    if (LHS.getValueType() == MVT::f16 && !FullFP16) {
3162      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3163      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3164    }
3165    Opcode = AArch64ISD::FCCMP;
3166  } else if (RHS.getOpcode() == ISD::SUB) {
3167    SDValue SubOp0 = RHS.getOperand(0);
3168    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3169      // See emitComparison() on why we can only do this for SETEQ and SETNE.
3170      Opcode = AArch64ISD::CCMN;
3171      RHS = RHS.getOperand(1);
3172    }
3173  }
3174  if (Opcode == 0)
3175    Opcode = AArch64ISD::CCMP;
3176
3177  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3178  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3179  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3180  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3181  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3182}
3183
3184/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3185/// expressed as a conjunction. See \ref AArch64CCMP.
3186/// \param CanNegate    Set to true if we can negate the whole sub-tree just by
3187///                     changing the conditions on the SETCC tests.
3188///                     (this means we can call emitConjunctionRec() with
3189///                      Negate==true on this sub-tree)
3190/// \param MustBeFirst  Set to true if this subtree needs to be negated and we
3191///                     cannot do the negation naturally. We are required to
3192///                     emit the subtree first in this case.
/// \param WillNegate   Is true if we are called when the result of this
3194///                     subexpression must be negated. This happens when the
3195///                     outer expression is an OR. We can use this fact to know
3196///                     that we have a double negation (or (or ...) ...) that
3197///                     can be implemented for free.
3198static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3199                               bool &MustBeFirst, bool WillNegate,
3200                               unsigned Depth = 0) {
3201  if (!Val.hasOneUse())
3202    return false;
3203  unsigned Opcode = Val->getOpcode();
3204  if (Opcode == ISD::SETCC) {
3205    if (Val->getOperand(0).getValueType() == MVT::f128)
3206      return false;
3207    CanNegate = true;
3208    MustBeFirst = false;
3209    return true;
3210  }
3211  // Protect against exponential runtime and stack overflow.
3212  if (Depth > 6)
3213    return false;
3214  if (Opcode == ISD::AND || Opcode == ISD::OR) {
3215    bool IsOR = Opcode == ISD::OR;
3216    SDValue O0 = Val->getOperand(0);
3217    SDValue O1 = Val->getOperand(1);
3218    bool CanNegateL;
3219    bool MustBeFirstL;
3220    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3221      return false;
3222    bool CanNegateR;
3223    bool MustBeFirstR;
3224    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3225      return false;
3226
3227    if (MustBeFirstL && MustBeFirstR)
3228      return false;
3229
3230    if (IsOR) {
3231      // For an OR expression we need to be able to naturally negate at least
3232      // one side or we cannot do the transformation at all.
3233      if (!CanNegateL && !CanNegateR)
3234        return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
3237      CanNegate = WillNegate && CanNegateL && CanNegateR;
3238      // If we cannot naturally negate the whole sub-tree, then this must be
3239      // emitted first.
3240      MustBeFirst = !CanNegate;
3241    } else {
3242      assert(Opcode == ISD::AND && "Must be OR or AND");
3243      // We cannot naturally negate an AND operation.
3244      CanNegate = false;
3245      MustBeFirst = MustBeFirstL || MustBeFirstR;
3246    }
3247    return true;
3248  }
3249  return false;
3250}
3251
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested, or
/// returns SDValue() if the transformation was not possible.
/// \p Negate is true if we want this sub-tree to be negated just by changing
/// SETCC conditions.
3260static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3261    AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3262    AArch64CC::CondCode Predicate) {
3263  // We're at a tree leaf, produce a conditional comparison operation.
3264  unsigned Opcode = Val->getOpcode();
3265  if (Opcode == ISD::SETCC) {
3266    SDValue LHS = Val->getOperand(0);
3267    SDValue RHS = Val->getOperand(1);
3268    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3269    bool isInteger = LHS.getValueType().isInteger();
3270    if (Negate)
3271      CC = getSetCCInverse(CC, LHS.getValueType());
3272    SDLoc DL(Val);
3273    // Determine OutCC and handle FP special case.
3274    if (isInteger) {
3275      OutCC = changeIntCCToAArch64CC(CC);
3276    } else {
3277      assert(LHS.getValueType().isFloatingPoint());
3278      AArch64CC::CondCode ExtraCC;
3279      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3280      // Some floating point conditions can't be tested with a single condition
3281      // code. Construct an additional comparison in this case.
3282      if (ExtraCC != AArch64CC::AL) {
3283        SDValue ExtraCmp;
3284        if (!CCOp.getNode())
3285          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3286        else
3287          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3288                                               ExtraCC, DL, DAG);
3289        CCOp = ExtraCmp;
3290        Predicate = ExtraCC;
3291      }
3292    }
3293
3294    // Produce a normal comparison if we are first in the chain
3295    if (!CCOp)
3296      return emitComparison(LHS, RHS, CC, DL, DAG);
3297    // Otherwise produce a ccmp.
3298    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3299                                     DAG);
3300  }
3301  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3302
3303  bool IsOR = Opcode == ISD::OR;
3304
3305  SDValue LHS = Val->getOperand(0);
3306  bool CanNegateL;
3307  bool MustBeFirstL;
3308  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3309  assert(ValidL && "Valid conjunction/disjunction tree");
3310  (void)ValidL;
3311
3312  SDValue RHS = Val->getOperand(1);
3313  bool CanNegateR;
3314  bool MustBeFirstR;
3315  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3316  assert(ValidR && "Valid conjunction/disjunction tree");
3317  (void)ValidR;
3318
3319  // Swap sub-tree that must come first to the right side.
3320  if (MustBeFirstL) {
3321    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3322    std::swap(LHS, RHS);
3323    std::swap(CanNegateL, CanNegateR);
3324    std::swap(MustBeFirstL, MustBeFirstR);
3325  }
3326
3327  bool NegateR;
3328  bool NegateAfterR;
3329  bool NegateL;
3330  bool NegateAfterAll;
3331  if (Opcode == ISD::OR) {
3332    // Swap the sub-tree that we can negate naturally to the left.
3333    if (!CanNegateL) {
3334      assert(CanNegateR && "at least one side must be negatable");
3335      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3336      assert(!Negate);
3337      std::swap(LHS, RHS);
3338      NegateR = false;
3339      NegateAfterR = true;
3340    } else {
3341      // Negate the left sub-tree if possible, otherwise negate the result.
3342      NegateR = CanNegateR;
3343      NegateAfterR = !CanNegateR;
3344    }
3345    NegateL = true;
3346    NegateAfterAll = !Negate;
3347  } else {
3348    assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3349    assert(!Negate && "Valid conjunction/disjunction tree");
3350
3351    NegateL = false;
3352    NegateR = false;
3353    NegateAfterR = false;
3354    NegateAfterAll = false;
3355  }
3356
3357  // Emit sub-trees.
3358  AArch64CC::CondCode RHSCC;
3359  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3360  if (NegateAfterR)
3361    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3362  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3363  if (NegateAfterAll)
3364    OutCC = AArch64CC::getInvertedCondCode(OutCC);
3365  return CmpL;
3366}
3367
/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3369/// In some cases this is even possible with OR operations in the expression.
3370/// See \ref AArch64CCMP.
3371/// \see emitConjunctionRec().
3372static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3373                               AArch64CC::CondCode &OutCC) {
3374  bool DummyCanNegate;
3375  bool DummyMustBeFirst;
3376  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3377    return SDValue();
3378
3379  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3380}
3381
3382/// @}
3383
3384/// Returns how profitable it is to fold a comparison's operand's shift and/or
3385/// extension operations.
3386static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3387  auto isSupportedExtend = [&](SDValue V) {
3388    if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3389      return true;
3390
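    // An AND with 0xFF, 0xFFFF or 0xFFFFFFFF behaves like a uxtb/uxth/uxtw
    // zero-extend and can likewise use the extended-register operand form.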
3391    if (V.getOpcode() == ISD::AND)
3392      if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3393        uint64_t Mask = MaskCst->getZExtValue();
3394        return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3395      }
3396
3397    return false;
3398  };
3399
3400  if (!Op.hasOneUse())
3401    return 0;
3402
3403  if (isSupportedExtend(Op))
3404    return 1;
3405
3406  unsigned Opc = Op.getOpcode();
3407  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3408    if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3409      uint64_t Shift = ShiftCst->getZExtValue();
3410      if (isSupportedExtend(Op.getOperand(0)))
3411        return (Shift <= 4) ? 2 : 1;
3412      EVT VT = Op.getValueType();
3413      if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3414        return 1;
3415    }
3416
3417  return 0;
3418}
3419
3420static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3421                             SDValue &AArch64cc, SelectionDAG &DAG,
3422                             const SDLoc &dl) {
3423  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3424    EVT VT = RHS.getValueType();
3425    uint64_t C = RHSC->getZExtValue();
3426    if (!isLegalArithImmed(C)) {
3427      // Constant does not fit, try adjusting it by one?
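      // For example, (x slt 0x1001) has no encodable immediate, but the
      // equivalent (x sle 0x1000) does.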
3428      switch (CC) {
3429      default:
3430        break;
3431      case ISD::SETLT:
3432      case ISD::SETGE:
3433        if ((VT == MVT::i32 && C != 0x80000000 &&
3434             isLegalArithImmed((uint32_t)(C - 1))) ||
3435            (VT == MVT::i64 && C != 0x80000000ULL &&
3436             isLegalArithImmed(C - 1ULL))) {
3437          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3438          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3439          RHS = DAG.getConstant(C, dl, VT);
3440        }
3441        break;
3442      case ISD::SETULT:
3443      case ISD::SETUGE:
3444        if ((VT == MVT::i32 && C != 0 &&
3445             isLegalArithImmed((uint32_t)(C - 1))) ||
3446            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3447          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3448          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3449          RHS = DAG.getConstant(C, dl, VT);
3450        }
3451        break;
3452      case ISD::SETLE:
3453      case ISD::SETGT:
3454        if ((VT == MVT::i32 && C != INT32_MAX &&
3455             isLegalArithImmed((uint32_t)(C + 1))) ||
3456            (VT == MVT::i64 && C != INT64_MAX &&
3457             isLegalArithImmed(C + 1ULL))) {
3458          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3459          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3460          RHS = DAG.getConstant(C, dl, VT);
3461        }
3462        break;
3463      case ISD::SETULE:
3464      case ISD::SETUGT:
3465        if ((VT == MVT::i32 && C != UINT32_MAX &&
3466             isLegalArithImmed((uint32_t)(C + 1))) ||
3467            (VT == MVT::i64 && C != UINT64_MAX &&
3468             isLegalArithImmed(C + 1ULL))) {
3469          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3470          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3471          RHS = DAG.getConstant(C, dl, VT);
3472        }
3473        break;
3474      }
3475    }
3476  }
3477
3478  // Comparisons are canonicalized so that the RHS operand is simpler than the
3479  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3480  // can fold some shift+extend operations on the RHS operand, so swap the
3481  // operands if that can be done.
3482  //
3483  // For example:
3484  //    lsl     w13, w11, #1
3485  //    cmp     w13, w12
3486  // can be turned into:
3487  //    cmp     w12, w11, lsl #1
3488  if (!isa<ConstantSDNode>(RHS) ||
3489      !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
3490    SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3491
3492    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3493      std::swap(LHS, RHS);
3494      CC = ISD::getSetCCSwappedOperands(CC);
3495    }
3496  }
3497
3498  SDValue Cmp;
3499  AArch64CC::CondCode AArch64CC;
3500  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3501    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3502
3503    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3504    // For the i8 operand, the largest immediate is 255, so this can be easily
3505    // encoded in the compare instruction. For the i16 operand, however, the
3506    // largest immediate cannot be encoded in the compare.
3507    // Therefore, use a sign extending load and cmn to avoid materializing the
3508    // -1 constant. For example,
3509    // movz w1, #65535
3510    // ldrh w0, [x0, #0]
3511    // cmp w0, w1
3512    // >
3513    // ldrsh w0, [x0, #0]
3514    // cmn w0, #1
    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3516    // if and only if (sext LHS) == (sext RHS). The checks are in place to
3517    // ensure both the LHS and RHS are truly zero extended and to make sure the
3518    // transformation is profitable.
3519    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3520        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3521        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3522        LHS.getNode()->hasNUsesOfValue(1, 0)) {
3523      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
3524      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3525        SDValue SExt =
3526            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3527                        DAG.getValueType(MVT::i16));
3528        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3529                                                   RHS.getValueType()),
3530                             CC, dl, DAG);
3531        AArch64CC = changeIntCCToAArch64CC(CC);
3532      }
3533    }
3534
3535    if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3536      if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3537        if ((CC == ISD::SETNE) ^ RHSC->isZero())
3538          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3539      }
3540    }
3541  }
3542
3543  if (!Cmp) {
3544    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3545    AArch64CC = changeIntCCToAArch64CC(CC);
3546  }
3547  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3548  return Cmp;
3549}
3550
3551static std::pair<SDValue, SDValue>
3552getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3553  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3554         "Unsupported value type");
3555  SDValue Value, Overflow;
3556  SDLoc DL(Op);
3557  SDValue LHS = Op.getOperand(0);
3558  SDValue RHS = Op.getOperand(1);
3559  unsigned Opc = 0;
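  // Map the overflow operation onto the corresponding flag-setting AArch64
  // node and the condition that signals overflow: V for the signed forms,
  // carry set (HS) for unsigned add and carry clear (LO) for unsigned sub.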
3560  switch (Op.getOpcode()) {
3561  default:
3562    llvm_unreachable("Unknown overflow instruction!");
3563  case ISD::SADDO:
3564    Opc = AArch64ISD::ADDS;
3565    CC = AArch64CC::VS;
3566    break;
3567  case ISD::UADDO:
3568    Opc = AArch64ISD::ADDS;
3569    CC = AArch64CC::HS;
3570    break;
3571  case ISD::SSUBO:
3572    Opc = AArch64ISD::SUBS;
3573    CC = AArch64CC::VS;
3574    break;
3575  case ISD::USUBO:
3576    Opc = AArch64ISD::SUBS;
3577    CC = AArch64CC::LO;
3578    break;
  // Multiply needs a little bit of extra work.
3580  case ISD::SMULO:
3581  case ISD::UMULO: {
3582    CC = AArch64CC::NE;
3583    bool IsSigned = Op.getOpcode() == ISD::SMULO;
3584    if (Op.getValueType() == MVT::i32) {
3585      // Extend to 64-bits, then perform a 64-bit multiply.
3586      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3587      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3588      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3589      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3590      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3591
3592      // Check that the result fits into a 32-bit integer.
3593      SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3594      if (IsSigned) {
3595        // cmp xreg, wreg, sxtw
3596        SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3597        Overflow =
3598            DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3599      } else {
3600        // tst xreg, #0xffffffff00000000
3601        SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3602        Overflow =
3603            DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3604      }
3605      break;
3606    }
3607    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64-bit multiply case:
3609    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3610    if (IsSigned) {
3611      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3612      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3613                                      DAG.getConstant(63, DL, MVT::i64));
3614      // It is important that LowerBits is last, otherwise the arithmetic
3615      // shift will not be folded into the compare (SUBS).
3616      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3617      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3618                     .getValue(1);
3619    } else {
3620      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3621      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3622      Overflow =
3623          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3624                      DAG.getConstant(0, DL, MVT::i64),
3625                      UpperBits).getValue(1);
3626    }
3627    break;
3628  }
3629  } // switch (...)
3630
3631  if (Opc) {
3632    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3633
3634    // Emit the AArch64 operation with overflow check.
3635    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3636    Overflow = Value.getValue(1);
3637  }
3638  return std::make_pair(Value, Overflow);
3639}
3640
3641SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3642  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3643                                   Subtarget->forceStreamingCompatibleSVE()))
3644    return LowerToScalableOp(Op, DAG);
3645
3646  SDValue Sel = Op.getOperand(0);
3647  SDValue Other = Op.getOperand(1);
3648  SDLoc dl(Sel);
3649
3650  // If the operand is an overflow checking operation, invert the condition
3651  // code and kill the Not operation. I.e., transform:
  // (xor overflow_op_bool, 1)
3653  //   -->
3654  // (csel 1, 0, invert(cc), overflow_op_bool)
3655  // ... which later gets transformed to just a cset instruction with an
3656  // inverted condition code, rather than a cset + eor sequence.
3657  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3658    // Only lower legal XALUO ops.
3659    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3660      return SDValue();
3661
3662    SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3663    SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3664    AArch64CC::CondCode CC;
3665    SDValue Value, Overflow;
3666    std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3667    SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3668    return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3669                       CCVal, Overflow);
3670  }
3671  // If neither operand is a SELECT_CC, give up.
3672  if (Sel.getOpcode() != ISD::SELECT_CC)
3673    std::swap(Sel, Other);
3674  if (Sel.getOpcode() != ISD::SELECT_CC)
3675    return Op;
3676
3677  // The folding we want to perform is:
3678  // (xor x, (select_cc a, b, cc, 0, -1) )
3679  //   -->
3680  // (csel x, (xor x, -1), cc ...)
3681  //
3682  // The latter will get matched to a CSINV instruction.
3683
3684  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3685  SDValue LHS = Sel.getOperand(0);
3686  SDValue RHS = Sel.getOperand(1);
3687  SDValue TVal = Sel.getOperand(2);
3688  SDValue FVal = Sel.getOperand(3);
3689
3690  // FIXME: This could be generalized to non-integer comparisons.
3691  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3692    return Op;
3693
3694  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3695  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3696
3697  // The values aren't constants, this isn't the pattern we're looking for.
3698  if (!CFVal || !CTVal)
3699    return Op;
3700
3701  // We can commute the SELECT_CC by inverting the condition.  This
3702  // might be needed to make this fit into a CSINV pattern.
3703  if (CTVal->isAllOnes() && CFVal->isZero()) {
3704    std::swap(TVal, FVal);
3705    std::swap(CTVal, CFVal);
3706    CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3707  }
3708
3709  // If the constants line up, perform the transform!
3710  if (CTVal->isZero() && CFVal->isAllOnes()) {
3711    SDValue CCVal;
3712    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3713
3714    FVal = Other;
3715    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3716                       DAG.getConstant(-1ULL, dl, Other.getValueType()));
3717
3718    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3719                       CCVal, Cmp);
3720  }
3721
3722  return Op;
3723}
3724
3725// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3726// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3727// sets 'C' bit to 0.
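// This is done as SUBS Value, #1 (C set iff Value != 0) or, when Invert is
// true, SUBS #0, Value (C set iff Value == 0).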
3728static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3729  SDLoc DL(Value);
3730  EVT VT = Value.getValueType();
3731  SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3732  SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3733  SDValue Cmp =
3734      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3735  return Cmp.getValue(1);
3736}
3737
3738// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3739// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3740static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG,
3741                                bool Invert) {
3742  assert(Flag.getResNo() == 1);
3743  SDLoc DL(Flag);
3744  SDValue Zero = DAG.getConstant(0, DL, VT);
3745  SDValue One = DAG.getConstant(1, DL, VT);
3746  unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3747  SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3748  return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
3749}
3750
3751// Value is 1 if 'V' bit of NZCV is 1, else 0
3752static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) {
3753  assert(Flag.getResNo() == 1);
3754  SDLoc DL(Flag);
3755  SDValue Zero = DAG.getConstant(0, DL, VT);
3756  SDValue One = DAG.getConstant(1, DL, VT);
3757  SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3758  return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
3759}
3760
3761// This lowering is inefficient, but it will get cleaned up by
3762// `foldOverflowCheck`
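// The incoming carry value is first materialised into the C flag with a SUBS
// (see valueToCarryFlag), the ADCS/SBCS consumes it, and its flag result is
// then converted back into a value.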
3763static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode,
3764                                bool IsSigned) {
3765  EVT VT0 = Op.getValue(0).getValueType();
3766  EVT VT1 = Op.getValue(1).getValueType();
3767
3768  if (VT0 != MVT::i32 && VT0 != MVT::i64)
3769    return SDValue();
3770
3771  bool InvertCarry = Opcode == AArch64ISD::SBCS;
3772  SDValue OpLHS = Op.getOperand(0);
3773  SDValue OpRHS = Op.getOperand(1);
3774  SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3775
3776  SDLoc DL(Op);
3777  SDVTList VTs = DAG.getVTList(VT0, VT1);
3778
3779  SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3780                            OpRHS, OpCarryIn);
3781
3782  SDValue OutFlag =
3783      IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3784               : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3785
3786  return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3787}
3788
3789static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3790  // Let legalize expand this if it isn't a legal type yet.
3791  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3792    return SDValue();
3793
3794  SDLoc dl(Op);
3795  AArch64CC::CondCode CC;
3796  // The actual operation that sets the overflow or carry flag.
3797  SDValue Value, Overflow;
3798  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3799
3800  // We use 0 and 1 as false and true values.
3801  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3802  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3803
3804  // We use an inverted condition, because the conditional select is inverted
3805  // too. This will allow it to be selected to a single instruction:
3806  // CSINC Wd, WZR, WZR, invert(cond).
3807  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3808  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3809                         CCVal, Overflow);
3810
3811  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3812  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3813}
3814
3815// Prefetch operands are:
3816// 1: Address to prefetch
3817// 2: bool isWrite
3818// 3: int locality (0 = no locality ... 3 = extreme locality)
3819// 4: bool isDataCache
3820static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3821  SDLoc DL(Op);
3822  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3823  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3824  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3825
3826  bool IsStream = !Locality;
3827  // When the locality number is set
3828  if (Locality) {
3829    // The front-end should have filtered out the out-of-range values
3830    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache level: higher locality
    // means a closer (faster) cache. The prfop encoding starts at 0 for L1,
    // so flip the number around.
3834    Locality = 3 - Locality;
3835  }
3836
3837  // Build the mask value encoding the expected behavior.
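  // For example, a read prefetch of the data cache with maximal locality,
  // __builtin_prefetch(p, /*rw=*/0, /*locality=*/3), gives IsWrite=0, IsData=1,
  // Locality=3->0 and IsStream=0, i.e. PrfOp=0b00000 (PLDL1KEEP).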
3838  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
3839                   (!IsData << 3) |     // IsDataCache bit
3840                   (Locality << 1) |    // Cache level bits
3841                   (unsigned)IsStream;  // Stream bit
3842  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3843                     DAG.getTargetConstant(PrfOp, DL, MVT::i32),
3844                     Op.getOperand(1));
3845}
3846
3847SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3848                                              SelectionDAG &DAG) const {
3849  EVT VT = Op.getValueType();
3850  if (VT.isScalableVector())
3851    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3852
3853  if (useSVEForFixedLengthVectorVT(VT))
3854    return LowerFixedLengthFPExtendToSVE(Op, DAG);
3855
3856  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3857  return SDValue();
3858}
3859
3860SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3861                                             SelectionDAG &DAG) const {
3862  if (Op.getValueType().isScalableVector())
3863    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3864
3865  bool IsStrict = Op->isStrictFPOpcode();
3866  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3867  EVT SrcVT = SrcVal.getValueType();
3868
3869  if (useSVEForFixedLengthVectorVT(SrcVT,
3870                                   Subtarget->forceStreamingCompatibleSVE()))
3871    return LowerFixedLengthFPRoundToSVE(Op, DAG);
3872
3873  if (SrcVT != MVT::f128) {
3874    // Expand cases where the input is a vector bigger than NEON.
3875    if (useSVEForFixedLengthVectorVT(SrcVT))
3876      return SDValue();
3877
3878    // It's legal except when f128 is involved
3879    return Op;
3880  }
3881
3882  return SDValue();
3883}
3884
3885SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3886                                                    SelectionDAG &DAG) const {
3887  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3888  // Any additional optimization in this function should be recorded
3889  // in the cost tables.
3890  bool IsStrict = Op->isStrictFPOpcode();
3891  EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
3892  EVT VT = Op.getValueType();
3893
3894  if (VT.isScalableVector()) {
3895    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3896                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
3897                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
3898    return LowerToPredicatedOp(Op, DAG, Opcode);
3899  }
3900
3901  if (useSVEForFixedLengthVectorVT(VT,
3902                                   Subtarget->forceStreamingCompatibleSVE()) ||
3903      useSVEForFixedLengthVectorVT(InVT,
3904                                   Subtarget->forceStreamingCompatibleSVE()))
3905    return LowerFixedLengthFPToIntToSVE(Op, DAG);
3906
3907  unsigned NumElts = InVT.getVectorNumElements();
3908
3909  // f16 conversions are promoted to f32 when full fp16 is not supported.
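  // For example, fptosi v4f16 -> v4i32 without +fullfp16 typically becomes an
  // fcvtl (extend to v4f32) followed by fcvtzs.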
3910  if (InVT.getVectorElementType() == MVT::f16 &&
3911      !Subtarget->hasFullFP16()) {
3912    MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3913    SDLoc dl(Op);
3914    if (IsStrict) {
3915      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
3916                                {Op.getOperand(0), Op.getOperand(1)});
3917      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
3918                         {Ext.getValue(1), Ext.getValue(0)});
3919    }
3920    return DAG.getNode(
3921        Op.getOpcode(), dl, Op.getValueType(),
3922        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3923  }
3924
3925  uint64_t VTSize = VT.getFixedSizeInBits();
3926  uint64_t InVTSize = InVT.getFixedSizeInBits();
3927  if (VTSize < InVTSize) {
3928    SDLoc dl(Op);
3929    if (IsStrict) {
3930      InVT = InVT.changeVectorElementTypeToInteger();
3931      SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
3932                               {Op.getOperand(0), Op.getOperand(1)});
3933      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3934      return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
3935    }
3936    SDValue Cv =
3937        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3938                    Op.getOperand(0));
3939    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3940  }
3941
3942  if (VTSize > InVTSize) {
3943    SDLoc dl(Op);
3944    MVT ExtVT =
3945        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3946                         VT.getVectorNumElements());
3947    if (IsStrict) {
3948      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
3949                                {Op.getOperand(0), Op.getOperand(1)});
3950      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
3951                         {Ext.getValue(1), Ext.getValue(0)});
3952    }
3953    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3954    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3955  }
3956
3957  // Use a scalar operation for conversions between single-element vectors of
3958  // the same size.
3959  if (NumElts == 1) {
3960    SDLoc dl(Op);
3961    SDValue Extract = DAG.getNode(
3962        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
3963        Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
3964    EVT ScalarVT = VT.getScalarType();
3965    if (IsStrict)
3966      return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
3967                         {Op.getOperand(0), Extract});
3968    return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
3969  }
3970
3971  // The remaining same-sized conversions are legal as-is.
3972  return Op;
3973}
3974
3975SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3976                                              SelectionDAG &DAG) const {
3977  bool IsStrict = Op->isStrictFPOpcode();
3978  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3979
3980  if (SrcVal.getValueType().isVector())
3981    return LowerVectorFP_TO_INT(Op, DAG);
3982
3983  // f16 conversions are promoted to f32 when full fp16 is not supported.
3984  if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3985    SDLoc dl(Op);
3986    if (IsStrict) {
3987      SDValue Ext =
3988          DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3989                      {Op.getOperand(0), SrcVal});
3990      return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
3991                         {Ext.getValue(1), Ext.getValue(0)});
3992    }
3993    return DAG.getNode(
3994        Op.getOpcode(), dl, Op.getValueType(),
3995        DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
3996  }
3997
3998  if (SrcVal.getValueType() != MVT::f128) {
3999    // It's legal except when f128 is involved
4000    return Op;
4001  }
4002
4003  return SDValue();
4004}
4005
4006SDValue
4007AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4008                                                SelectionDAG &DAG) const {
4009  // AArch64 FP-to-int conversions saturate to the destination element size, so
4010  // we can lower common saturating conversions to simple instructions.
4011  SDValue SrcVal = Op.getOperand(0);
4012  EVT SrcVT = SrcVal.getValueType();
4013  EVT DstVT = Op.getValueType();
4014  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4015
4016  uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4017  uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4018  uint64_t SatWidth = SatVT.getScalarSizeInBits();
4019  assert(SatWidth <= DstElementWidth &&
4020         "Saturation width cannot exceed result width");
4021
4022  // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4023  // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4024  // types, so this is hard to reach.
4025  if (DstVT.isScalableVector())
4026    return SDValue();
4027
4028  EVT SrcElementVT = SrcVT.getVectorElementType();
4029
4030  // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4031  if (SrcElementVT == MVT::f16 &&
4032      (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
4033    MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4034    SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4035    SrcVT = F32VT;
4036    SrcElementVT = MVT::f32;
4037    SrcElementWidth = 32;
4038  } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4039             SrcElementVT != MVT::f16)
4040    return SDValue();
4041
4042  SDLoc DL(Op);
4043  // Cases that we can emit directly.
4044  if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4045    return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4046                       DAG.getValueType(DstVT.getScalarType()));
4047
4048  // Otherwise we emit a cvt that saturates to a higher bit width and saturate
4049  // the result. This is only valid if the legal cvt is larger than the saturate
4050  // width. For double, as we don't have vector MIN/MAX for 64-bit elements, it
4051  // can be simpler to scalarize (at least until sqxtn is selected).
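  // For example, a v4f32 fptosi.sat with an i16 saturation width is converted
  // to v4i32 with fcvtzs, clamped to [-32768, 32767] with smin/smax, and then
  // truncated to v4i16.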
4052  if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4053    return SDValue();
4054
4055  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4056  SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4057                                  DAG.getValueType(IntVT.getScalarType()));
4058  SDValue Sat;
4059  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4060    SDValue MinC = DAG.getConstant(
4061        APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4062    SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4063    SDValue MaxC = DAG.getConstant(
4064        APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4065    Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4066  } else {
4067    SDValue MinC = DAG.getConstant(
4068        APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT);
4069    Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4070  }
4071
4072  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4073}
4074
4075SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4076                                                  SelectionDAG &DAG) const {
4077  // AArch64 FP-to-int conversions saturate to the destination register size, so
4078  // we can lower common saturating conversions to simple instructions.
4079  SDValue SrcVal = Op.getOperand(0);
4080  EVT SrcVT = SrcVal.getValueType();
4081
4082  if (SrcVT.isVector())
4083    return LowerVectorFP_TO_INT_SAT(Op, DAG);
4084
4085  EVT DstVT = Op.getValueType();
4086  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4087  uint64_t SatWidth = SatVT.getScalarSizeInBits();
4088  uint64_t DstWidth = DstVT.getScalarSizeInBits();
4089  assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4090
4091  // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4092  if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
4093    SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4094    SrcVT = MVT::f32;
4095  } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
4096    return SDValue();
4097
4098  SDLoc DL(Op);
4099  // Cases that we can emit directly.
4100  if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4101       (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4102      DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4103    return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4104                       DAG.getValueType(DstVT));
4105
4106  // Otherwise we emit a cvt that saturates to a higher bit width and saturate
4107  // the result. This is only valid if the legal cvt is larger than the saturate
4108  // width.
4109  if (DstWidth < SatWidth)
4110    return SDValue();
4111
4112  SDValue NativeCvt =
4113      DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4114  SDValue Sat;
4115  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4116    SDValue MinC = DAG.getConstant(
4117        APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4118    SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4119    SDValue MaxC = DAG.getConstant(
4120        APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4121    Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4122  } else {
4123    SDValue MinC = DAG.getConstant(
4124        APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT);
4125    Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4126  }
4127
4128  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4129}
4130
4131SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4132                                                    SelectionDAG &DAG) const {
4133  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4134  // Any additional optimization in this function should be recorded
4135  // in the cost tables.
4136  bool IsStrict = Op->isStrictFPOpcode();
4137  EVT VT = Op.getValueType();
4138  SDLoc dl(Op);
4139  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4140  EVT InVT = In.getValueType();
4141  unsigned Opc = Op.getOpcode();
4142  bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4143
4144  if (VT.isScalableVector()) {
4145    if (InVT.getVectorElementType() == MVT::i1) {
4146      // There is no direct predicate-to-FP conversion; extend to an integer vector first.
4147      unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4148      EVT CastVT = getPromotedVTForPredicate(InVT);
4149      In = DAG.getNode(CastOpc, dl, CastVT, In);
4150      return DAG.getNode(Opc, dl, VT, In);
4151    }
4152
4153    unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4154                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4155    return LowerToPredicatedOp(Op, DAG, Opcode);
4156  }
4157
4158  if (useSVEForFixedLengthVectorVT(VT,
4159                                   Subtarget->forceStreamingCompatibleSVE()) ||
4160      useSVEForFixedLengthVectorVT(InVT,
4161                                   Subtarget->forceStreamingCompatibleSVE()))
4162    return LowerFixedLengthIntToFPToSVE(Op, DAG);
4163
4164  uint64_t VTSize = VT.getFixedSizeInBits();
4165  uint64_t InVTSize = InVT.getFixedSizeInBits();
4166  if (VTSize < InVTSize) {
4167    MVT CastVT =
4168        MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4169                         InVT.getVectorNumElements());
4170    if (IsStrict) {
4171      In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4172                       {Op.getOperand(0), In});
4173      return DAG.getNode(
4174          ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4175          {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4176    }
4177    In = DAG.getNode(Opc, dl, CastVT, In);
4178    return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4179                       DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4180  }
4181
4182  if (VTSize > InVTSize) {
4183    unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4184    EVT CastVT = VT.changeVectorElementTypeToInteger();
4185    In = DAG.getNode(CastOpc, dl, CastVT, In);
4186    if (IsStrict)
4187      return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4188    return DAG.getNode(Opc, dl, VT, In);
4189  }
4190
4191  // Use a scalar operation for conversions between single-element vectors of
4192  // the same size.
4193  if (VT.getVectorNumElements() == 1) {
4194    SDValue Extract = DAG.getNode(
4195        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4196        In, DAG.getConstant(0, dl, MVT::i64));
4197    EVT ScalarVT = VT.getScalarType();
4198    if (IsStrict)
4199      return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4200                         {Op.getOperand(0), Extract});
4201    return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4202  }
4203
4204  return Op;
4205}
4206
4207SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4208                                            SelectionDAG &DAG) const {
4209  if (Op.getValueType().isVector())
4210    return LowerVectorINT_TO_FP(Op, DAG);
4211
4212  bool IsStrict = Op->isStrictFPOpcode();
4213  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4214
4215  // f16 conversions are promoted to f32 when full fp16 is not supported.
4216  if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4217    SDLoc dl(Op);
4218    if (IsStrict) {
4219      SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
4220                                {Op.getOperand(0), SrcVal});
4221      return DAG.getNode(
4222          ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
4223          {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4224    }
4225    return DAG.getNode(
4226        ISD::FP_ROUND, dl, MVT::f16,
4227        DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
4228        DAG.getIntPtrConstant(0, dl));
4229  }
4230
4231  // i128 conversions are libcalls.
4232  if (SrcVal.getValueType() == MVT::i128)
4233    return SDValue();
4234
4235  // Other conversions are legal, unless it's to the completely software-based
4236  // fp128.
4237  if (Op.getValueType() != MVT::f128)
4238    return Op;
4239  return SDValue();
4240}
4241
4242SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4243                                            SelectionDAG &DAG) const {
4244  // For iOS, we want to call an alternative entry point: __sincos_stret,
4245  // which returns the values in two S / D registers.
4246  SDLoc dl(Op);
4247  SDValue Arg = Op.getOperand(0);
4248  EVT ArgVT = Arg.getValueType();
4249  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4250
4251  ArgListTy Args;
4252  ArgListEntry Entry;
4253
4254  Entry.Node = Arg;
4255  Entry.Ty = ArgTy;
4256  Entry.IsSExt = false;
4257  Entry.IsZExt = false;
4258  Args.push_back(Entry);
4259
4260  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4261                                        : RTLIB::SINCOS_STRET_F32;
4262  const char *LibcallName = getLibcallName(LC);
4263  SDValue Callee =
4264      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4265
4266  StructType *RetTy = StructType::get(ArgTy, ArgTy);
4267  TargetLowering::CallLoweringInfo CLI(DAG);
4268  CLI.setDebugLoc(dl)
4269      .setChain(DAG.getEntryNode())
4270      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4271
4272  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4273  return CallResult.first;
4274}
4275
4276static MVT getSVEContainerType(EVT ContentTy);
4277
4278SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4279                                            SelectionDAG &DAG) const {
4280  EVT OpVT = Op.getValueType();
4281  EVT ArgVT = Op.getOperand(0).getValueType();
4282
4283  if (useSVEForFixedLengthVectorVT(OpVT))
4284    return LowerFixedLengthBitcastToSVE(Op, DAG);
4285
4286  if (OpVT.isScalableVector()) {
4287    // Bitcasting between unpacked vector types of different element counts is
4288    // not a NOP because the live elements are laid out differently.
4289    //                01234567
4290    // e.g. nxv2i32 = XX??XX??
4291    //      nxv4f16 = X?X?X?X?
4292    if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4293      return SDValue();
4294
4295    if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4296      assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4297             "Expected int->fp bitcast!");
4298      SDValue ExtResult =
4299          DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4300                      Op.getOperand(0));
4301      return getSVESafeBitCast(OpVT, ExtResult, DAG);
4302    }
4303    return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4304  }
4305
4306  if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4307    return SDValue();
4308
4309  // Bitcasts between f16 and bf16 are legal.
4310  if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4311    return Op;
4312
4313  assert(ArgVT == MVT::i16);
4314  SDLoc DL(Op);
4315
4316  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4317  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4318  return SDValue(
4319      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
4320                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
4321      0);
4322}
4323
4324static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4325  if (OrigVT.getSizeInBits() >= 64)
4326    return OrigVT;
4327
4328  assert(OrigVT.isSimple() && "Expecting a simple value type");
4329
4330  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4331  switch (OrigSimpleTy) {
4332  default: llvm_unreachable("Unexpected Vector Type");
4333  case MVT::v2i8:
4334  case MVT::v2i16:
4335    return MVT::v2i32;
4336  case MVT::v4i8:
4337    return MVT::v4i16;
4338  }
4339}
4340
4341static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4342                                                 const EVT &OrigTy,
4343                                                 const EVT &ExtTy,
4344                                                 unsigned ExtOpcode) {
4345  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4346  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4347  // 64-bits we need to insert a new extension so that it will be 64-bits.
4348  assert(ExtTy.is128BitVector() && "Unexpected extension size");
4349  if (OrigTy.getSizeInBits() >= 64)
4350    return N;
4351
4352  // Must extend size to at least 64 bits to be used as an operand for VMULL.
4353  EVT NewVT = getExtensionTo64Bits(OrigTy);
4354
4355  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4356}
4357
4358// Returns lane if Op extracts from a two-element vector and lane is constant
4359// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4360static std::optional<uint64_t>
4361getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4362  SDNode *OpNode = Op.getNode();
4363  if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4364    return std::nullopt;
4365
4366  EVT VT = OpNode->getOperand(0).getValueType();
4367  ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4368  if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4369    return std::nullopt;
4370
4371  return C->getZExtValue();
4372}
4373
4374static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
4375                                   bool isSigned) {
4376  EVT VT = N->getValueType(0);
4377
4378  if (N->getOpcode() != ISD::BUILD_VECTOR)
4379    return false;
4380
4381  for (const SDValue &Elt : N->op_values()) {
4382    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4383      unsigned EltSize = VT.getScalarSizeInBits();
4384      unsigned HalfSize = EltSize / 2;
4385      if (isSigned) {
4386        if (!isIntN(HalfSize, C->getSExtValue()))
4387          return false;
4388      } else {
4389        if (!isUIntN(HalfSize, C->getZExtValue()))
4390          return false;
4391      }
4392      continue;
4393    }
4394    return false;
4395  }
4396
4397  return true;
4398}
4399
4400static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
4401  if (N->getOpcode() == ISD::SIGN_EXTEND ||
4402      N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
4403    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
4404                                             N->getOperand(0)->getValueType(0),
4405                                             N->getValueType(0),
4406                                             N->getOpcode());
4407
4408  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4409  EVT VT = N->getValueType(0);
4410  SDLoc dl(N);
4411  unsigned EltSize = VT.getScalarSizeInBits() / 2;
4412  unsigned NumElts = VT.getVectorNumElements();
4413  MVT TruncVT = MVT::getIntegerVT(EltSize);
4414  SmallVector<SDValue, 8> Ops;
4415  for (unsigned i = 0; i != NumElts; ++i) {
4416    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
4417    const APInt &CInt = C->getAPIntValue();
4418    // Element types smaller than 32 bits are not legal, so use i32 elements.
4419    // The values are implicitly truncated so sext vs. zext doesn't matter.
4420    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4421  }
4422  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
4423}
4424
4425static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
4426  return N->getOpcode() == ISD::SIGN_EXTEND ||
4427         N->getOpcode() == ISD::ANY_EXTEND ||
4428         isExtendedBUILD_VECTOR(N, DAG, true);
4429}
4430
4431static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
4432  return N->getOpcode() == ISD::ZERO_EXTEND ||
4433         N->getOpcode() == ISD::ANY_EXTEND ||
4434         isExtendedBUILD_VECTOR(N, DAG, false);
4435}
4436
4437static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
4438  unsigned Opcode = N->getOpcode();
4439  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4440    SDNode *N0 = N->getOperand(0).getNode();
4441    SDNode *N1 = N->getOperand(1).getNode();
4442    return N0->hasOneUse() && N1->hasOneUse() &&
4443      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4444  }
4445  return false;
4446}
4447
4448static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
4449  unsigned Opcode = N->getOpcode();
4450  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4451    SDNode *N0 = N->getOperand(0).getNode();
4452    SDNode *N1 = N->getOperand(1).getNode();
4453    return N0->hasOneUse() && N1->hasOneUse() &&
4454      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4455  }
4456  return false;
4457}
4458
4459SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4460                                                 SelectionDAG &DAG) const {
4461  // The rounding mode is in bits 23:22 of the FPCR.
4462  // The FPCR rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
4463  // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
4464  // so that the shift and the AND get folded into a bitfield extract.
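  // For example, FPCR[23:22] == 0b01 (round towards +Inf) gives
  // ((1 + 1) & 3) == 2, the FLT_ROUNDS value for upward rounding.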
4465  SDLoc dl(Op);
4466
4467  SDValue Chain = Op.getOperand(0);
4468  SDValue FPCR_64 = DAG.getNode(
4469      ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4470      {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4471  Chain = FPCR_64.getValue(1);
4472  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4473  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4474                                  DAG.getConstant(1U << 22, dl, MVT::i32));
4475  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4476                              DAG.getConstant(22, dl, MVT::i32));
4477  SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4478                            DAG.getConstant(3, dl, MVT::i32));
4479  return DAG.getMergeValues({AND, Chain}, dl);
4480}
4481
4482SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4483                                                 SelectionDAG &DAG) const {
4484  SDLoc DL(Op);
4485  SDValue Chain = Op->getOperand(0);
4486  SDValue RMValue = Op->getOperand(1);
4487
4488  // The rounding mode is in bits 23:22 of the FPCR.
4489  // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4490  // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4491  // ((arg - 1) & 3) << 22).
4492  //
4493  // The argument of llvm.set.rounding must be within the range [0, 3], so
4494  // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4495  // code that generates the llvm.set.rounding call to ensure this condition.
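  //
  // For example, llvm.set.rounding(1) (round to nearest) computes
  // ((1 - 1) & 3) == 0, i.e. FPCR.RMode == 0b00.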
4496
4497  // Calculate new value of FPCR[23:22].
4498  RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4499                        DAG.getConstant(1, DL, MVT::i32));
4500  RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4501                        DAG.getConstant(0x3, DL, MVT::i32));
4502  RMValue =
4503      DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4504                  DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4505  RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4506
4507  // Get current value of FPCR.
4508  SDValue Ops[] = {
4509      Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4510  SDValue FPCR =
4511      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4512  Chain = FPCR.getValue(1);
4513  FPCR = FPCR.getValue(0);
4514
4515  // Put the new rounding mode into FPCR[23:22].
4516  const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4517  FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4518                     DAG.getConstant(RMMask, DL, MVT::i64));
4519  FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4520  SDValue Ops2[] = {
4521      Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4522      FPCR};
4523  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4524}
4525
4526static unsigned selectUmullSmull(SDNode *&N0, SDNode *&N1, SelectionDAG &DAG,
4527                                 SDLoc DL, bool &IsMLA) {
4528  bool IsN0SExt = isSignExtended(N0, DAG);
4529  bool IsN1SExt = isSignExtended(N1, DAG);
4530  if (IsN0SExt && IsN1SExt)
4531    return AArch64ISD::SMULL;
4532
4533  bool IsN0ZExt = isZeroExtended(N0, DAG);
4534  bool IsN1ZExt = isZeroExtended(N1, DAG);
4535
4536  if (IsN0ZExt && IsN1ZExt)
4537    return AArch64ISD::UMULL;
4538
4539  // Select SMULL if we can replace zext with sext.
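  // For example, (mul (sext X), (zext Y)) can use SMULL when the sign bit of
  // the zero-extended operand is known to be zero, since the zext can then be
  // rewritten as a sext.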
4540  if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4541      !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4542      !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4543    SDValue ZextOperand;
4544    if (IsN0ZExt)
4545      ZextOperand = N0->getOperand(0);
4546    else
4547      ZextOperand = N1->getOperand(0);
4548    if (DAG.SignBitIsZero(ZextOperand)) {
4549      SDNode *NewSext =
4550          DAG.getSExtOrTrunc(ZextOperand, DL, N0->getValueType(0)).getNode();
4551      if (IsN0ZExt)
4552        N0 = NewSext;
4553      else
4554        N1 = NewSext;
4555      return AArch64ISD::SMULL;
4556    }
4557  }
4558
4559  // Select UMULL if we can replace the other operand with an extend.
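  // For example, in (mul (zext v8i8 X), Y) where the top half of Y's elements
  // is known to be zero, Y is truncated and re-extended so that both operands
  // can feed a UMULL.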
4560  if (IsN0ZExt || IsN1ZExt) {
4561    EVT VT = N0->getValueType(0);
4562    APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
4563                                       VT.getScalarSizeInBits() / 2);
4564    if (DAG.MaskedValueIsZero(SDValue(IsN0ZExt ? N1 : N0, 0), Mask)) {
4565      EVT HalfVT;
4566      switch (VT.getSimpleVT().SimpleTy) {
4567      case MVT::v2i64:
4568        HalfVT = MVT::v2i32;
4569        break;
4570      case MVT::v4i32:
4571        HalfVT = MVT::v4i16;
4572        break;
4573      case MVT::v8i16:
4574        HalfVT = MVT::v8i8;
4575        break;
4576      default:
4577        return 0;
4578      }
4579      // Truncate and then extend the result.
4580      SDValue NewExt = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
4581                                   SDValue(IsN0ZExt ? N1 : N0, 0));
4582      NewExt = DAG.getZExtOrTrunc(NewExt, DL, VT);
4583      if (IsN0ZExt)
4584        N1 = NewExt.getNode();
4585      else
4586        N0 = NewExt.getNode();
4587      return AArch64ISD::UMULL;
4588    }
4589  }
4590
4591  if (!IsN1SExt && !IsN1ZExt)
4592    return 0;
4593
4594  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4595  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4596  if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4597    IsMLA = true;
4598    return AArch64ISD::SMULL;
4599  }
4600  if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4601    IsMLA = true;
4602    return AArch64ISD::UMULL;
4603  }
4604  if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4605    std::swap(N0, N1);
4606    IsMLA = true;
4607    return AArch64ISD::UMULL;
4608  }
4609  return 0;
4610}
4611
4612SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4613  EVT VT = Op.getValueType();
4614
4615  // If SVE is available then i64 vector multiplications can also be made legal.
4616  bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64 ||
4617                      Subtarget->forceStreamingCompatibleSVE();
4618
4619  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4620    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4621
4622  // Multiplications are only custom-lowered for 128-bit vectors so that
4623  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
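  // For example, (mul (sext v2i32 A), (sext v2i32 B)) : v2i64 is lowered to
  // AArch64ISD::SMULL, which typically selects to smull v0.2d, v1.2s, v2.2s.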
4624  assert(VT.is128BitVector() && VT.isInteger() &&
4625         "unexpected type for custom-lowering ISD::MUL");
4626  SDNode *N0 = Op.getOperand(0).getNode();
4627  SDNode *N1 = Op.getOperand(1).getNode();
4628  bool isMLA = false;
4629  SDLoc DL(Op);
4630  unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
4631
4632  if (!NewOpc) {
4633    if (VT == MVT::v2i64)
4634      // Fall through to expand this.  It is not legal.
4635      return SDValue();
4636    else
4637      // Other vector multiplications are legal.
4638      return Op;
4639  }
4640
4641  // Legalize to a S/UMULL instruction
4642  SDValue Op0;
4643  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
4644  if (!isMLA) {
4645    Op0 = skipExtensionForVectorMULL(N0, DAG);
4646    assert(Op0.getValueType().is64BitVector() &&
4647           Op1.getValueType().is64BitVector() &&
4648           "unexpected types for extended operands to VMULL");
4649    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
4650  }
4651  // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
4652  // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
4653  // This benefits CPUs with accumulate forwarding, such as Cortex-A53/A57.
4654  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
4655  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
4656  EVT Op1VT = Op1.getValueType();
4657  return DAG.getNode(N0->getOpcode(), DL, VT,
4658                     DAG.getNode(NewOpc, DL, VT,
4659                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
4660                     DAG.getNode(NewOpc, DL, VT,
4661                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
4662}
4663
4664static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
4665                               int Pattern) {
4666  if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
4667    return DAG.getConstant(1, DL, MVT::nxv1i1);
4668  return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
4669                     DAG.getTargetConstant(Pattern, DL, MVT::i32));
4670}
4671
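// Lower an SVE while intrinsic (whilelo, whilelt, ...) with constant operands
// to a PTRUE with a fixed vl<N> pattern when the number of active lanes is
// known and is guaranteed to fit within the minimum SVE vector length. For
// example, whilelo(0, 4) producing <vscale x 4 x i1> becomes ptrue p0.s, vl4,
// since four 32-bit lanes always fit in the 128-bit minimum vector length.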
4672static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned,
4673                             bool IsLess, bool IsEqual) {
4674  if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
4675      !isa<ConstantSDNode>(Op.getOperand(2)))
4676    return SDValue();
4677
4678  SDLoc dl(Op);
4679  APInt X = Op.getConstantOperandAPInt(1);
4680  APInt Y = Op.getConstantOperandAPInt(2);
4681  APInt NumActiveElems;
4682  bool Overflow;
4683  if (IsLess)
4684    NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
4685  else
4686    NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow);
4687
4688  if (Overflow)
4689    return SDValue();
4690
4691  if (IsEqual) {
4692    APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
4693    NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
4694                              : NumActiveElems.uadd_ov(One, Overflow);
4695    if (Overflow)
4696      return SDValue();
4697  }
4698
4699  std::optional<unsigned> PredPattern =
4700      getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
4701  unsigned MinSVEVectorSize = std::max(
4702      DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
4703  unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
4704  if (PredPattern != std::nullopt &&
4705      NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
4706    return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
4707
4708  return SDValue();
4709}
4710
4711// Returns a safe bitcast between two scalable vector predicates, where
4712// any newly created lanes from a widening bitcast are defined as zero.
4713static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
4714  SDLoc DL(Op);
4715  EVT InVT = Op.getValueType();
4716
4717  assert(InVT.getVectorElementType() == MVT::i1 &&
4718         VT.getVectorElementType() == MVT::i1 &&
4719         "Expected a predicate-to-predicate bitcast");
4720  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
4721         InVT.isScalableVector() &&
4722         DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
4723         "Only expect to cast between legal scalable predicate types!");
4724
4725  // Return the operand if the cast isn't changing type,
4726  // e.g. <n x 16 x i1> -> <n x 16 x i1>
4727  if (InVT == VT)
4728    return Op;
4729
4730  SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
4731
4732  // We only have to zero the lanes if new lanes are being defined, e.g. when
4733  // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
4734  // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
4735  // we can return here.
4736  if (InVT.bitsGT(VT))
4737    return Reinterpret;
4738
4739  // Check if the other lanes are already known to be zeroed by
4740  // construction.
4741  if (isZeroingInactiveLanes(Op))
4742    return Reinterpret;
4743
4744  // Zero the newly introduced lanes.
4745  SDValue Mask = DAG.getConstant(1, DL, InVT);
4746  Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
4747  return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
4748}
4749
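// Materialize the current value of PSTATE.SM: a constant 1 for functions with
// a streaming interface or body, a constant 0 for fully non-streaming
// functions, and otherwise (streaming-compatible functions) a call to the
// __arm_sme_state support routine whose result is masked down to bit 0.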
4750SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
4751                                           SMEAttrs Attrs, SDLoc DL,
4752                                           EVT VT) const {
4753  if (Attrs.hasStreamingInterfaceOrBody())
4754    return DAG.getConstant(1, DL, VT);
4755
4756  if (Attrs.hasNonStreamingInterfaceAndBody())
4757    return DAG.getConstant(0, DL, VT);
4758
4759  assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface");
4760
4761  SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
4762                                         getPointerTy(DAG.getDataLayout()));
4763  Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
4764  Type *RetTy = StructType::get(Int64Ty, Int64Ty);
4765  TargetLowering::CallLoweringInfo CLI(DAG);
4766  ArgListTy Args;
4767  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
4768      CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
4769      RetTy, Callee, std::move(Args));
4770  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4771  SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
4772  return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
4773                     Mask);
4774}
4775
4776static std::optional<SMEAttrs> getCalleeAttrsFromExternalFunction(SDValue V) {
4777  if (auto *ES = dyn_cast<ExternalSymbolSDNode>(V)) {
4778    StringRef S(ES->getSymbol());
4779    if (S == "__arm_sme_state" || S == "__arm_tpidr2_save")
4780      return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved);
4781    if (S == "__arm_tpidr2_restore")
4782      return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared);
4783  }
4784  return std::nullopt;
4785}
4786
4787SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
4788                                                   SelectionDAG &DAG) const {
4789  unsigned IntNo = Op.getConstantOperandVal(1);
4790  SDLoc DL(Op);
4791  switch (IntNo) {
4792  default:
4793    return SDValue(); // Don't custom lower most intrinsics.
4794  case Intrinsic::aarch64_prefetch: {
4795    SDValue Chain = Op.getOperand(0);
4796    SDValue Addr = Op.getOperand(2);
4797
4798    unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
4799    unsigned Locality = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
4800    unsigned IsStream = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
4801    unsigned IsData = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
4802    unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
4803                     (!IsData << 3) |    // IsDataCache bit
4804                     (Locality << 1) |   // Cache level bits
4805                     (unsigned)IsStream; // Stream bit
4806
4807    return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
4808                       DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
4809  }
4810  case Intrinsic::aarch64_sme_za_enable:
4811    return DAG.getNode(
4812        AArch64ISD::SMSTART, DL, MVT::Other,
4813        Op->getOperand(0), // Chain
4814        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
4815        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
4816  case Intrinsic::aarch64_sme_za_disable:
4817    return DAG.getNode(
4818        AArch64ISD::SMSTOP, DL, MVT::Other,
4819        Op->getOperand(0), // Chain
4820        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
4821        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
4822  }
4823}
4824
4825SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
4826                                                      SelectionDAG &DAG) const {
4827  unsigned IntNo = Op.getConstantOperandVal(1);
4828  SDLoc DL(Op);
4829  switch (IntNo) {
4830  default:
4831    return SDValue(); // Don't custom lower most intrinsics.
4832  case Intrinsic::aarch64_mops_memset_tag: {
4833    auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
4834    SDValue Chain = Node->getChain();
4835    SDValue Dst = Op.getOperand(2);
4836    SDValue Val = Op.getOperand(3);
4837    Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
4838    SDValue Size = Op.getOperand(4);
4839    auto Alignment = Node->getMemOperand()->getAlign();
4840    bool IsVol = Node->isVolatile();
4841    auto DstPtrInfo = Node->getPointerInfo();
4842
4843    const auto &SDI =
4844        static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
4845    SDValue MS =
4846        SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
4847                     Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
4848
4849    // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
4850    // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
4851    // LowerOperationWrapper will complain that the number of results has
4852    // changed.
4853    return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
4854  }
4855  }
4856}
4857
4858SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
4859                                                     SelectionDAG &DAG) const {
4860  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4861  SDLoc dl(Op);
4862  switch (IntNo) {
4863  default: return SDValue();    // Don't custom lower most intrinsics.
4864  case Intrinsic::thread_pointer: {
4865    EVT PtrVT = getPointerTy(DAG.getDataLayout());
4866    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
4867  }
4868  case Intrinsic::aarch64_neon_abs: {
4869    EVT Ty = Op.getValueType();
4870    if (Ty == MVT::i64) {
4871      SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
4872                                   Op.getOperand(1));
4873      Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
4874      return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
4875    } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
4876      return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
4877    } else {
4878      report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
4879    }
4880  }
4881  case Intrinsic::aarch64_neon_pmull64: {
4882    SDValue LHS = Op.getOperand(1);
4883    SDValue RHS = Op.getOperand(2);
4884
4885    std::optional<uint64_t> LHSLane =
4886        getConstantLaneNumOfExtractHalfOperand(LHS);
4887    std::optional<uint64_t> RHSLane =
4888        getConstantLaneNumOfExtractHalfOperand(RHS);
4889
4890    assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
4891    assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
4892
4893    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
4894    // instructions execute on SIMD registers, so canonicalize i64 to v1i64,
4895    // which ISel recognizes better. For example, this generates an ldr into a
4896    // d* register rather than a GPR load followed by an fmov.
4897    auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
4898                                  std::optional<uint64_t> OtherLane,
4899                                  const SDLoc &dl,
4900                                  SelectionDAG &DAG) -> SDValue {
4901      // If the operand is a higher half itself, rewrite it to
4902      // extract_high_v2i64; this way aarch64_neon_pmull64 can
4903      // reuse the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
4904      if (NLane && *NLane == 1)
4905        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
4906                           N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
4907
4908      // Operand N is not a higher half but the other operand is.
4909      if (OtherLane && *OtherLane == 1) {
4910        // If this operand is a lower half, rewrite it to
4911        // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
4912        // align lanes of two operands. A roundtrip sequence (to move from lane
4913        // 1 to lane 0) is like this:
4914        //   mov x8, v0.d[1]
4915        //   fmov d0, x8
4916        if (NLane && *NLane == 0)
4917          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
4918                             DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
4919                                         N.getOperand(0),
4920                                         DAG.getConstant(0, dl, MVT::i64)),
4921                             DAG.getConstant(1, dl, MVT::i64));
4922
4923        // Otherwise just dup the scalar operand into a vector register.
4924        return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
4925      }
4926
4927      // Neither operand is an extract of the higher half, so codegen may just
4928      // use the non-high form of the PMULL instruction. Use v1i64 to represent i64.
4929      assert(N.getValueType() == MVT::i64 &&
4930             "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
4931      return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
4932    };
4933
4934    LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
4935    RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
4936
4937    return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
4938  }
4939  case Intrinsic::aarch64_neon_smax:
4940    return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
4941                       Op.getOperand(1), Op.getOperand(2));
4942  case Intrinsic::aarch64_neon_umax:
4943    return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
4944                       Op.getOperand(1), Op.getOperand(2));
4945  case Intrinsic::aarch64_neon_smin:
4946    return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
4947                       Op.getOperand(1), Op.getOperand(2));
4948  case Intrinsic::aarch64_neon_umin:
4949    return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
4950                       Op.getOperand(1), Op.getOperand(2));
4951  case Intrinsic::aarch64_neon_scalar_sqxtn:
4952  case Intrinsic::aarch64_neon_scalar_sqxtun:
4953  case Intrinsic::aarch64_neon_scalar_uqxtn: {
4954    assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
4955    if (Op.getValueType() == MVT::i32)
4956      return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
4957                         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
4958                                     Op.getOperand(0),
4959                                     DAG.getNode(ISD::BITCAST, dl, MVT::f64,
4960                                                 Op.getOperand(1))));
4961    return SDValue();
4962  }
4963  case Intrinsic::aarch64_sve_whilelo:
4964    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
4965                         /*IsEqual=*/false);
4966  case Intrinsic::aarch64_sve_whilelt:
4967    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
4968                         /*IsEqual=*/false);
4969  case Intrinsic::aarch64_sve_whilels:
4970    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
4971                         /*IsEqual=*/true);
4972  case Intrinsic::aarch64_sve_whilele:
4973    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
4974                         /*IsEqual=*/true);
4975  case Intrinsic::aarch64_sve_whilege:
4976    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
4977                         /*IsEqual=*/true);
4978  case Intrinsic::aarch64_sve_whilegt:
4979    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
4980                         /*IsEqual=*/false);
4981  case Intrinsic::aarch64_sve_whilehs:
4982    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
4983                         /*IsEqual=*/true);
4984  case Intrinsic::aarch64_sve_whilehi:
4985    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
4986                         /*IsEqual=*/false);
4987  case Intrinsic::aarch64_sve_sunpkhi:
4988    return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
4989                       Op.getOperand(1));
4990  case Intrinsic::aarch64_sve_sunpklo:
4991    return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
4992                       Op.getOperand(1));
4993  case Intrinsic::aarch64_sve_uunpkhi:
4994    return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
4995                       Op.getOperand(1));
4996  case Intrinsic::aarch64_sve_uunpklo:
4997    return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
4998                       Op.getOperand(1));
4999  case Intrinsic::aarch64_sve_clasta_n:
5000    return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5001                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5002  case Intrinsic::aarch64_sve_clastb_n:
5003    return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5004                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5005  case Intrinsic::aarch64_sve_lasta:
5006    return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5007                       Op.getOperand(1), Op.getOperand(2));
5008  case Intrinsic::aarch64_sve_lastb:
5009    return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5010                       Op.getOperand(1), Op.getOperand(2));
5011  case Intrinsic::aarch64_sve_rev:
5012    return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5013                       Op.getOperand(1));
5014  case Intrinsic::aarch64_sve_tbl:
5015    return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5016                       Op.getOperand(1), Op.getOperand(2));
5017  case Intrinsic::aarch64_sve_trn1:
5018    return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5019                       Op.getOperand(1), Op.getOperand(2));
5020  case Intrinsic::aarch64_sve_trn2:
5021    return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5022                       Op.getOperand(1), Op.getOperand(2));
5023  case Intrinsic::aarch64_sve_uzp1:
5024    return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5025                       Op.getOperand(1), Op.getOperand(2));
5026  case Intrinsic::aarch64_sve_uzp2:
5027    return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5028                       Op.getOperand(1), Op.getOperand(2));
5029  case Intrinsic::aarch64_sve_zip1:
5030    return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5031                       Op.getOperand(1), Op.getOperand(2));
5032  case Intrinsic::aarch64_sve_zip2:
5033    return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5034                       Op.getOperand(1), Op.getOperand(2));
5035  case Intrinsic::aarch64_sve_splice:
5036    return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5037                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5038  case Intrinsic::aarch64_sve_ptrue:
5039    return getPTrue(DAG, dl, Op.getValueType(),
5040                    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
5041  case Intrinsic::aarch64_sve_clz:
5042    return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5043                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
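  // RDSVL #1 returns the streaming vector length in bytes; the cntsh, cntsw
  // and cntsd cases below shift that byte count right by 1, 2 and 3 to count
  // halfwords, words and doublewords respectively.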
5044  case Intrinsic::aarch64_sme_cntsb:
5045    return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5046                       DAG.getConstant(1, dl, MVT::i32));
5047  case Intrinsic::aarch64_sme_cntsh: {
5048    SDValue One = DAG.getConstant(1, dl, MVT::i32);
5049    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5050    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5051  }
5052  case Intrinsic::aarch64_sme_cntsw: {
5053    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5054                                DAG.getConstant(1, dl, MVT::i32));
5055    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5056                       DAG.getConstant(2, dl, MVT::i32));
5057  }
5058  case Intrinsic::aarch64_sme_cntsd: {
5059    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5060                                DAG.getConstant(1, dl, MVT::i32));
5061    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5062                       DAG.getConstant(3, dl, MVT::i32));
5063  }
5064  case Intrinsic::aarch64_sve_cnt: {
5065    SDValue Data = Op.getOperand(3);
5066    // CTPOP only supports integer operands.
5067    if (Data.getValueType().isFloatingPoint())
5068      Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5069    return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5070                       Op.getOperand(2), Data, Op.getOperand(1));
5071  }
5072  case Intrinsic::aarch64_sve_dupq_lane:
5073    return LowerDUPQLane(Op, DAG);
5074  case Intrinsic::aarch64_sve_convert_from_svbool:
5075    return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5076  case Intrinsic::aarch64_sve_convert_to_svbool:
5077    return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5078  case Intrinsic::aarch64_sve_fneg:
5079    return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5080                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5081  case Intrinsic::aarch64_sve_frintp:
5082    return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5083                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5084  case Intrinsic::aarch64_sve_frintm:
5085    return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5086                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5087  case Intrinsic::aarch64_sve_frinti:
5088    return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5089                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5090  case Intrinsic::aarch64_sve_frintx:
5091    return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5092                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5093  case Intrinsic::aarch64_sve_frinta:
5094    return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5095                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintn:
    return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
5099  case Intrinsic::aarch64_sve_frintz:
5100    return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5101                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5102  case Intrinsic::aarch64_sve_ucvtf:
5103    return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5104                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5105                       Op.getOperand(1));
5106  case Intrinsic::aarch64_sve_scvtf:
5107    return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5108                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5109                       Op.getOperand(1));
5110  case Intrinsic::aarch64_sve_fcvtzu:
5111    return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5112                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5113                       Op.getOperand(1));
5114  case Intrinsic::aarch64_sve_fcvtzs:
5115    return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5116                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5117                       Op.getOperand(1));
5118  case Intrinsic::aarch64_sve_fsqrt:
5119    return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5120                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5121  case Intrinsic::aarch64_sve_frecpx:
5122    return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5123                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5124  case Intrinsic::aarch64_sve_frecpe_x:
5125    return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5126                       Op.getOperand(1));
5127  case Intrinsic::aarch64_sve_frecps_x:
5128    return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5129                       Op.getOperand(1), Op.getOperand(2));
5130  case Intrinsic::aarch64_sve_frsqrte_x:
5131    return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5132                       Op.getOperand(1));
5133  case Intrinsic::aarch64_sve_frsqrts_x:
5134    return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5135                       Op.getOperand(1), Op.getOperand(2));
5136  case Intrinsic::aarch64_sve_fabs:
5137    return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5138                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5139  case Intrinsic::aarch64_sve_abs:
5140    return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5141                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5142  case Intrinsic::aarch64_sve_neg:
5143    return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5144                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5145  case Intrinsic::aarch64_sve_insr: {
5146    SDValue Scalar = Op.getOperand(2);
5147    EVT ScalarTy = Scalar.getValueType();
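    // i8 and i16 are not legal scalar types on AArch64, so widen the scalar
    // to i32 before forming the INSR node.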
5148    if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5149      Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5150
5151    return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5152                       Op.getOperand(1), Scalar);
5153  }
5154  case Intrinsic::aarch64_sve_rbit:
5155    return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5156                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5157                       Op.getOperand(1));
5158  case Intrinsic::aarch64_sve_revb:
5159    return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5160                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5161  case Intrinsic::aarch64_sve_revh:
5162    return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5163                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5164  case Intrinsic::aarch64_sve_revw:
5165    return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5166                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5167  case Intrinsic::aarch64_sve_revd:
5168    return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5169                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5170  case Intrinsic::aarch64_sve_sxtb:
5171    return DAG.getNode(
5172        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5173        Op.getOperand(2), Op.getOperand(3),
5174        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5175        Op.getOperand(1));
5176  case Intrinsic::aarch64_sve_sxth:
5177    return DAG.getNode(
5178        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5179        Op.getOperand(2), Op.getOperand(3),
5180        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5181        Op.getOperand(1));
5182  case Intrinsic::aarch64_sve_sxtw:
5183    return DAG.getNode(
5184        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5185        Op.getOperand(2), Op.getOperand(3),
5186        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5187        Op.getOperand(1));
5188  case Intrinsic::aarch64_sve_uxtb:
5189    return DAG.getNode(
5190        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5191        Op.getOperand(2), Op.getOperand(3),
5192        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5193        Op.getOperand(1));
5194  case Intrinsic::aarch64_sve_uxth:
5195    return DAG.getNode(
5196        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5197        Op.getOperand(2), Op.getOperand(3),
5198        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5199        Op.getOperand(1));
5200  case Intrinsic::aarch64_sve_uxtw:
5201    return DAG.getNode(
5202        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5203        Op.getOperand(2), Op.getOperand(3),
5204        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5205        Op.getOperand(1));
5206  case Intrinsic::localaddress: {
5207    const auto &MF = DAG.getMachineFunction();
5208    const auto *RegInfo = Subtarget->getRegisterInfo();
5209    unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5210    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5211                              Op.getSimpleValueType());
5212  }
5213
5214  case Intrinsic::eh_recoverfp: {
    // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer to
    // D53541 for more details.
5218    SDValue FnOp = Op.getOperand(1);
5219    SDValue IncomingFPOp = Op.getOperand(2);
5220    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5221    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5222    if (!Fn)
5223      report_fatal_error(
5224          "llvm.eh.recoverfp must take a function as the first argument");
5225    return IncomingFPOp;
5226  }
5227
5228  case Intrinsic::aarch64_neon_vsri:
5229  case Intrinsic::aarch64_neon_vsli: {
5230    EVT Ty = Op.getValueType();
5231
5232    if (!Ty.isVector())
5233      report_fatal_error("Unexpected type for aarch64_neon_vsli");
5234
5235    assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5236
5237    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
5238    unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5239    return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5240                       Op.getOperand(3));
5241  }
5242
5243  case Intrinsic::aarch64_neon_srhadd:
5244  case Intrinsic::aarch64_neon_urhadd:
5245  case Intrinsic::aarch64_neon_shadd:
5246  case Intrinsic::aarch64_neon_uhadd: {
5247    bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5248                        IntNo == Intrinsic::aarch64_neon_shadd);
5249    bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5250                          IntNo == Intrinsic::aarch64_neon_urhadd);
5251    unsigned Opcode = IsSignedAdd
5252                          ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5253                          : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5254    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5255                       Op.getOperand(2));
5256  }
5257  case Intrinsic::aarch64_neon_sabd:
5258  case Intrinsic::aarch64_neon_uabd: {
5259    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
5260                                                            : ISD::ABDS;
5261    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5262                       Op.getOperand(2));
5263  }
5264  case Intrinsic::aarch64_neon_saddlp:
5265  case Intrinsic::aarch64_neon_uaddlp: {
5266    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5267                          ? AArch64ISD::UADDLP
5268                          : AArch64ISD::SADDLP;
5269    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5270  }
5271  case Intrinsic::aarch64_neon_sdot:
5272  case Intrinsic::aarch64_neon_udot:
5273  case Intrinsic::aarch64_sve_sdot:
5274  case Intrinsic::aarch64_sve_udot: {
5275    unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5276                       IntNo == Intrinsic::aarch64_sve_udot)
5277                          ? AArch64ISD::UDOT
5278                          : AArch64ISD::SDOT;
5279    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5280                       Op.getOperand(2), Op.getOperand(3));
5281  }
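  // Lower the generic active-lane-mask intrinsic directly onto the SVE
  // WHILELO intrinsic, which computes the same predicate.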
5282  case Intrinsic::get_active_lane_mask: {
5283    SDValue ID =
5284        DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5285    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5286                       Op.getOperand(1), Op.getOperand(2));
5287  }
5288  }
5289}
5290
5291bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5292  if (VT.getVectorElementType() == MVT::i8 ||
5293      VT.getVectorElementType() == MVT::i16) {
5294    EltTy = MVT::i32;
5295    return true;
5296  }
5297  return false;
5298}
5299
5300bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
5301                                                          EVT DataVT) const {
5302  // SVE only supports implicit extension of 32-bit indices.
5303  if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5304    return false;
5305
5306  // Indices cannot be smaller than the main data type.
5307  if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5308    return false;
5309
5310  // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5311  // element container type, which would violate the previous clause.
5312  return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5313}
5314
5315bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5316  return ExtVal.getValueType().isScalableVector() ||
5317         useSVEForFixedLengthVectorVT(
5318             ExtVal.getValueType(),
5319             /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
5320}
5321
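/// Select the opcode for a gather load with a zeroing merge, based on whether
/// the index is scaled and whether a 32-bit index needs to be sign- or
/// zero-extended. Unextended (64-bit) indices use the same opcode regardless
/// of signedness.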
5322unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5323  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5324      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5325       AArch64ISD::GLD1_MERGE_ZERO},
5326      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5327       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5328      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5329       AArch64ISD::GLD1_MERGE_ZERO},
5330      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5331       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5332      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5333       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5334      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5335       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5336      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5337       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5338      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5339       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5340  };
5341  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5342  return AddrModes.find(Key)->second;
5343}
5344
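/// Map a gather opcode to the equivalent opcode that sign-extends the loaded
/// elements.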
5345unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5346  switch (Opcode) {
5347  default:
5348    llvm_unreachable("unimplemented opcode");
5349    return Opcode;
5350  case AArch64ISD::GLD1_MERGE_ZERO:
5351    return AArch64ISD::GLD1S_MERGE_ZERO;
5352  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5353    return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5354  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5355    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5356  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5357    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5358  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5359    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5360  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5361    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5362  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5363    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5364  }
5365}
5366
5367SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5368                                            SelectionDAG &DAG) const {
5369  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5370
5371  SDLoc DL(Op);
5372  SDValue Chain = MGT->getChain();
5373  SDValue PassThru = MGT->getPassThru();
5374  SDValue Mask = MGT->getMask();
5375  SDValue BasePtr = MGT->getBasePtr();
5376  SDValue Index = MGT->getIndex();
5377  SDValue Scale = MGT->getScale();
5378  EVT VT = Op.getValueType();
5379  EVT MemVT = MGT->getMemoryVT();
5380  ISD::LoadExtType ExtType = MGT->getExtensionType();
5381  ISD::MemIndexType IndexType = MGT->getIndexType();
5382
  // SVE supports zero (and so undef) passthrough values only; everything else
  // must be handled manually by an explicit select on the load's output.
5385  if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5386    SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5387    SDValue Load =
5388        DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5389                            MGT->getMemOperand(), IndexType, ExtType);
5390    SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5391    return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5392  }
5393
5394  bool IsScaled = MGT->isIndexScaled();
5395  bool IsSigned = MGT->isIndexSigned();
5396
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
5399  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
5400  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5401    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5402    EVT IndexVT = Index.getValueType();
5403    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5404                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5405    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5406
5407    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5408    return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5409                               MGT->getMemOperand(), IndexType, ExtType);
5410  }
5411
5412  // Lower fixed length gather to a scalable equivalent.
5413  if (VT.isFixedLengthVector()) {
5414    assert(Subtarget->useSVEForFixedLengthVectors() &&
5415           "Cannot lower when not using SVE for fixed vectors!");
5416
5417    // NOTE: Handle floating-point as if integer then bitcast the result.
5418    EVT DataVT = VT.changeVectorElementTypeToInteger();
5419    MemVT = MemVT.changeVectorElementTypeToInteger();
5420
5421    // Find the smallest integer fixed length vector we can use for the gather.
5422    EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5423    if (DataVT.getVectorElementType() == MVT::i64 ||
5424        Index.getValueType().getVectorElementType() == MVT::i64 ||
5425        Mask.getValueType().getVectorElementType() == MVT::i64)
5426      PromotedVT = VT.changeVectorElementType(MVT::i64);
5427
5428    // Promote vector operands except for passthrough, which we know is either
5429    // undef or zero, and thus best constructed directly.
5430    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5431    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5432    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5433
5434    // A promoted result type forces the need for an extending load.
5435    if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5436      ExtType = ISD::EXTLOAD;
5437
5438    EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5439
5440    // Convert fixed length vector operands to scalable.
5441    MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5442    Index = convertToScalableVector(DAG, ContainerVT, Index);
5443    Mask = convertFixedMaskToScalableVector(Mask, DAG);
5444    PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5445                                   : DAG.getConstant(0, DL, ContainerVT);
5446
5447    // Emit equivalent scalable vector gather.
5448    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5449    SDValue Load =
5450        DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5451                            Ops, MGT->getMemOperand(), IndexType, ExtType);
5452
5453    // Extract fixed length data then convert to the required result type.
5454    SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5455    Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5456    if (VT.isFloatingPoint())
5457      Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5458
5459    return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5460  }
5461
5462  // Everything else is legal.
5463  return Op;
5464}
5465
5466SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5467                                             SelectionDAG &DAG) const {
5468  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5469
5470  SDLoc DL(Op);
5471  SDValue Chain = MSC->getChain();
5472  SDValue StoreVal = MSC->getValue();
5473  SDValue Mask = MSC->getMask();
5474  SDValue BasePtr = MSC->getBasePtr();
5475  SDValue Index = MSC->getIndex();
5476  SDValue Scale = MSC->getScale();
5477  EVT VT = StoreVal.getValueType();
5478  EVT MemVT = MSC->getMemoryVT();
5479  ISD::MemIndexType IndexType = MSC->getIndexType();
5480  bool Truncating = MSC->isTruncatingStore();
5481
5482  bool IsScaled = MSC->isIndexScaled();
5483  bool IsSigned = MSC->isIndexSigned();
5484
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
5487  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
5488  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5489    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5490    EVT IndexVT = Index.getValueType();
5491    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5492                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5493    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5494
5495    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5496    return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5497                                MSC->getMemOperand(), IndexType, Truncating);
5498  }
5499
5500  // Lower fixed length scatter to a scalable equivalent.
5501  if (VT.isFixedLengthVector()) {
5502    assert(Subtarget->useSVEForFixedLengthVectors() &&
5503           "Cannot lower when not using SVE for fixed vectors!");
5504
5505    // Once bitcast we treat floating-point scatters as if integer.
5506    if (VT.isFloatingPoint()) {
5507      VT = VT.changeVectorElementTypeToInteger();
5508      MemVT = MemVT.changeVectorElementTypeToInteger();
5509      StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
5510    }
5511
5512    // Find the smallest integer fixed length vector we can use for the scatter.
5513    EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5514    if (VT.getVectorElementType() == MVT::i64 ||
5515        Index.getValueType().getVectorElementType() == MVT::i64 ||
5516        Mask.getValueType().getVectorElementType() == MVT::i64)
5517      PromotedVT = VT.changeVectorElementType(MVT::i64);
5518
5519    // Promote vector operands.
5520    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5521    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5522    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5523    StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
5524
5525    // A promoted value type forces the need for a truncating store.
5526    if (PromotedVT != VT)
5527      Truncating = true;
5528
5529    EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5530
5531    // Convert fixed length vector operands to scalable.
5532    MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5533    Index = convertToScalableVector(DAG, ContainerVT, Index);
5534    Mask = convertFixedMaskToScalableVector(Mask, DAG);
5535    StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
5536
5537    // Emit equivalent scalable vector scatter.
5538    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5539    return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5540                                MSC->getMemOperand(), IndexType, Truncating);
5541  }
5542
5543  // Everything else is legal.
5544  return Op;
5545}
5546
5547SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
5548  SDLoc DL(Op);
5549  MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
5550  assert(LoadNode && "Expected custom lowering of a masked load node");
5551  EVT VT = Op->getValueType(0);
5552
5553  if (useSVEForFixedLengthVectorVT(
5554          VT,
5555          /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5556    return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
5557
5558  SDValue PassThru = LoadNode->getPassThru();
5559  SDValue Mask = LoadNode->getMask();
5560
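  // As with gathers, SVE masked loads support only zero (and so undef)
  // passthrough values; anything else requires an explicit select on the
  // loaded value.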
5561  if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
5562    return Op;
5563
5564  SDValue Load = DAG.getMaskedLoad(
5565      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
5566      LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
5567      LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
5568      LoadNode->getExtensionType());
5569
5570  SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5571
5572  return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5573}
5574
5575// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
5576static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
5577                                        EVT VT, EVT MemVT,
5578                                        SelectionDAG &DAG) {
5579  assert(VT.isVector() && "VT should be a vector type");
5580  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
5581
5582  SDValue Value = ST->getValue();
5583
  // We first extend the promoted v4i16 to v8i16, truncate it to v8i8, and
  // extract the word lane which represents the v4i8 subvector.  This optimizes
  // the store to:
5587  //
5588  //   xtn  v0.8b, v0.8h
5589  //   str  s0, [x0]
5590
5591  SDValue Undef = DAG.getUNDEF(MVT::i16);
5592  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
5593                                        {Undef, Undef, Undef, Undef});
5594
5595  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
5596                                 Value, UndefVec);
5597  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
5598
5599  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
5600  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
5601                                     Trunc, DAG.getConstant(0, DL, MVT::i64));
5602
5603  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
5604                      ST->getBasePtr(), ST->getMemOperand());
5605}
5606
// Custom lowering for stores, vector or scalar, truncating or not.  Currently
// this handles truncating stores from v4i16 to v4i8, 256-bit non-temporal
// vector stores, volatile stores of i128, i64x8 (LS64) stores, and fixed
// length vector stores that must use SVE.
5610SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
5611                                          SelectionDAG &DAG) const {
5612  SDLoc Dl(Op);
5613  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  assert(StoreNode && "Can only custom lower store nodes");
5615
5616  SDValue Value = StoreNode->getValue();
5617
5618  EVT VT = Value.getValueType();
5619  EVT MemVT = StoreNode->getMemoryVT();
5620
5621  if (VT.isVector()) {
5622    if (useSVEForFixedLengthVectorVT(
5623            VT,
5624            /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5625      return LowerFixedLengthVectorStoreToSVE(Op, DAG);
5626
5627    unsigned AS = StoreNode->getAddressSpace();
5628    Align Alignment = StoreNode->getAlign();
5629    if (Alignment < MemVT.getStoreSize() &&
5630        !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
5631                                        StoreNode->getMemOperand()->getFlags(),
5632                                        nullptr)) {
5633      return scalarizeVectorStore(StoreNode, DAG);
5634    }
5635
5636    if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
5637        MemVT == MVT::v4i8) {
5638      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
5639    }
    // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
    // the custom lowering, as there are no unpaired non-temporal stores and
    // legalization will break up 256-bit inputs.
5643    ElementCount EC = MemVT.getVectorElementCount();
5644    if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
5645        EC.isKnownEven() &&
5646        ((MemVT.getScalarSizeInBits() == 8u ||
5647          MemVT.getScalarSizeInBits() == 16u ||
5648          MemVT.getScalarSizeInBits() == 32u ||
5649          MemVT.getScalarSizeInBits() == 64u))) {
5650      SDValue Lo =
5651          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5652                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5653                      StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
5654      SDValue Hi =
5655          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5656                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5657                      StoreNode->getValue(),
5658                      DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
5659      SDValue Result = DAG.getMemIntrinsicNode(
5660          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
5661          {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5662          StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5663      return Result;
5664    }
5665  } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
5666    return LowerStore128(Op, DAG);
5667  } else if (MemVT == MVT::i64x8) {
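    // LS64 stores are expanded into eight consecutive 8-byte stores of the
    // individual i64 sub-parts extracted from the value.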
5668    SDValue Value = StoreNode->getValue();
5669    assert(Value->getValueType(0) == MVT::i64x8);
5670    SDValue Chain = StoreNode->getChain();
5671    SDValue Base = StoreNode->getBasePtr();
5672    EVT PtrVT = Base.getValueType();
5673    for (unsigned i = 0; i < 8; i++) {
5674      SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
5675                                 Value, DAG.getConstant(i, Dl, MVT::i32));
5676      SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
5677                                DAG.getConstant(i * 8, Dl, PtrVT));
5678      Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
5679                           StoreNode->getOriginalAlign());
5680    }
5681    return Chain;
5682  }
5683
5684  return SDValue();
5685}
5686
5687/// Lower atomic or volatile 128-bit stores to a single STP instruction.
5688SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
5689                                             SelectionDAG &DAG) const {
5690  MemSDNode *StoreNode = cast<MemSDNode>(Op);
5691  assert(StoreNode->getMemoryVT() == MVT::i128);
5692  assert(StoreNode->isVolatile() || StoreNode->isAtomic());
5693  assert(!StoreNode->isAtomic() ||
5694         StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
5695         StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
5696
5697  SDValue Value = StoreNode->getOpcode() == ISD::STORE
5698                      ? StoreNode->getOperand(1)
5699                      : StoreNode->getOperand(2);
5700  SDLoc DL(Op);
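  // Split the 128-bit value into its two 64-bit halves and emit them as a
  // single STP.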
5701  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
5702                           DAG.getConstant(0, DL, MVT::i64));
5703  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
5704                           DAG.getConstant(1, DL, MVT::i64));
5705  SDValue Result = DAG.getMemIntrinsicNode(
5706      AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
5707      {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5708      StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5709  return Result;
5710}
5711
5712SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
5713                                         SelectionDAG &DAG) const {
5714  SDLoc DL(Op);
5715  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
5716  assert(LoadNode && "Expected custom lowering of a load node");
5717
5718  if (LoadNode->getMemoryVT() == MVT::i64x8) {
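    // LS64 loads are expanded into eight consecutive 8-byte loads, whose
    // results are recombined with an LS64_BUILD node.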
5719    SmallVector<SDValue, 8> Ops;
5720    SDValue Base = LoadNode->getBasePtr();
5721    SDValue Chain = LoadNode->getChain();
5722    EVT PtrVT = Base.getValueType();
5723    for (unsigned i = 0; i < 8; i++) {
5724      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
5725                                DAG.getConstant(i * 8, DL, PtrVT));
5726      SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
5727                                 LoadNode->getPointerInfo(),
5728                                 LoadNode->getOriginalAlign());
5729      Ops.push_back(Part);
5730      Chain = SDValue(Part.getNode(), 1);
5731    }
5732    SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
5733    return DAG.getMergeValues({Loaded, Chain}, DL);
5734  }
5735
5736  // Custom lowering for extending v4i8 vector loads.
5737  EVT VT = Op->getValueType(0);
5738  assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
5739
5740  if (LoadNode->getMemoryVT() != MVT::v4i8)
5741    return SDValue();
5742
5743  unsigned ExtType;
5744  if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
5745    ExtType = ISD::SIGN_EXTEND;
5746  else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
5747           LoadNode->getExtensionType() == ISD::EXTLOAD)
5748    ExtType = ISD::ZERO_EXTEND;
5749  else
5750    return SDValue();
5751
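  // Load the four bytes as a single f32, move it into a vector register,
  // bitcast to v8i8 and extend to v8i16, then keep the low v4i16 half
  // (extending once more if a v4i32 result is required).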
5752  SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
5753                             LoadNode->getBasePtr(), MachinePointerInfo());
5754  SDValue Chain = Load.getValue(1);
5755  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
5756  SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
5757  SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
5758  Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
5759                    DAG.getConstant(0, DL, MVT::i64));
5760  if (VT == MVT::v4i32)
5761    Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
5762  return DAG.getMergeValues({Ext, Chain}, DL);
5763}
5764
5765// Generate SUBS and CSEL for integer abs.
5766SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5767  MVT VT = Op.getSimpleValueType();
5768
5769  if (VT.isVector())
5770    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
5771
5772  SDLoc DL(Op);
5773  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
5774                            Op.getOperand(0));
  // Compare the value against zero and select between it and its negation
  // based on the PL (non-negative) condition.
5776  SDValue Cmp =
5777      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
5778                  Op.getOperand(0), DAG.getConstant(0, DL, VT));
5779  return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
5780                     DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
5781                     Cmp.getValue(1));
5782}
5783
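// Try to lower the branch condition as a conjunction of compares feeding a
// single AArch64 conditional branch; otherwise return SDValue() so the
// default lowering is used.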
5784static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
5785  SDValue Chain = Op.getOperand(0);
5786  SDValue Cond = Op.getOperand(1);
5787  SDValue Dest = Op.getOperand(2);
5788
5789  AArch64CC::CondCode CC;
5790  if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
5791    SDLoc dl(Op);
5792    SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
5793    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
5794                       Cmp);
5795  }
5796
5797  return SDValue();
5798}
5799
5800SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
5801                                              SelectionDAG &DAG) const {
5802  LLVM_DEBUG(dbgs() << "Custom lowering: ");
5803  LLVM_DEBUG(Op.dump());
5804
5805  switch (Op.getOpcode()) {
5806  default:
5807    llvm_unreachable("unimplemented operand");
5808    return SDValue();
5809  case ISD::BITCAST:
5810    return LowerBITCAST(Op, DAG);
5811  case ISD::GlobalAddress:
5812    return LowerGlobalAddress(Op, DAG);
5813  case ISD::GlobalTLSAddress:
5814    return LowerGlobalTLSAddress(Op, DAG);
5815  case ISD::SETCC:
5816  case ISD::STRICT_FSETCC:
5817  case ISD::STRICT_FSETCCS:
5818    return LowerSETCC(Op, DAG);
5819  case ISD::SETCCCARRY:
5820    return LowerSETCCCARRY(Op, DAG);
5821  case ISD::BRCOND:
5822    return LowerBRCOND(Op, DAG);
5823  case ISD::BR_CC:
5824    return LowerBR_CC(Op, DAG);
5825  case ISD::SELECT:
5826    return LowerSELECT(Op, DAG);
5827  case ISD::SELECT_CC:
5828    return LowerSELECT_CC(Op, DAG);
5829  case ISD::JumpTable:
5830    return LowerJumpTable(Op, DAG);
5831  case ISD::BR_JT:
5832    return LowerBR_JT(Op, DAG);
5833  case ISD::ConstantPool:
5834    return LowerConstantPool(Op, DAG);
5835  case ISD::BlockAddress:
5836    return LowerBlockAddress(Op, DAG);
5837  case ISD::VASTART:
5838    return LowerVASTART(Op, DAG);
5839  case ISD::VACOPY:
5840    return LowerVACOPY(Op, DAG);
5841  case ISD::VAARG:
5842    return LowerVAARG(Op, DAG);
5843  case ISD::ADDCARRY:
5844    return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
5845  case ISD::SUBCARRY:
5846    return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
5847  case ISD::SADDO_CARRY:
5848    return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
5849  case ISD::SSUBO_CARRY:
5850    return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
5851  case ISD::SADDO:
5852  case ISD::UADDO:
5853  case ISD::SSUBO:
5854  case ISD::USUBO:
5855  case ISD::SMULO:
5856  case ISD::UMULO:
5857    return LowerXALUO(Op, DAG);
5858  case ISD::FADD:
5859    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
5860  case ISD::FSUB:
5861    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
5862  case ISD::FMUL:
5863    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
5864  case ISD::FMA:
5865    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
5866  case ISD::FDIV:
5867    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
5868  case ISD::FNEG:
5869    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
5870  case ISD::FCEIL:
5871    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
5872  case ISD::FFLOOR:
5873    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
5874  case ISD::FNEARBYINT:
5875    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
5876  case ISD::FRINT:
5877    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
5878  case ISD::FROUND:
5879    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
5880  case ISD::FROUNDEVEN:
5881    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
5882  case ISD::FTRUNC:
5883    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
5884  case ISD::FSQRT:
5885    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
5886  case ISD::FABS:
5887    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
5888  case ISD::FP_ROUND:
5889  case ISD::STRICT_FP_ROUND:
5890    return LowerFP_ROUND(Op, DAG);
5891  case ISD::FP_EXTEND:
5892    return LowerFP_EXTEND(Op, DAG);
5893  case ISD::FRAMEADDR:
5894    return LowerFRAMEADDR(Op, DAG);
5895  case ISD::SPONENTRY:
5896    return LowerSPONENTRY(Op, DAG);
5897  case ISD::RETURNADDR:
5898    return LowerRETURNADDR(Op, DAG);
5899  case ISD::ADDROFRETURNADDR:
5900    return LowerADDROFRETURNADDR(Op, DAG);
5901  case ISD::CONCAT_VECTORS:
5902    return LowerCONCAT_VECTORS(Op, DAG);
5903  case ISD::INSERT_VECTOR_ELT:
5904    return LowerINSERT_VECTOR_ELT(Op, DAG);
5905  case ISD::EXTRACT_VECTOR_ELT:
5906    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
5907  case ISD::BUILD_VECTOR:
5908    return LowerBUILD_VECTOR(Op, DAG);
5909  case ISD::ZERO_EXTEND_VECTOR_INREG:
5910    return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
5911  case ISD::VECTOR_SHUFFLE:
5912    return LowerVECTOR_SHUFFLE(Op, DAG);
5913  case ISD::SPLAT_VECTOR:
5914    return LowerSPLAT_VECTOR(Op, DAG);
5915  case ISD::EXTRACT_SUBVECTOR:
5916    return LowerEXTRACT_SUBVECTOR(Op, DAG);
5917  case ISD::INSERT_SUBVECTOR:
5918    return LowerINSERT_SUBVECTOR(Op, DAG);
5919  case ISD::SDIV:
5920  case ISD::UDIV:
5921    return LowerDIV(Op, DAG);
5922  case ISD::SMIN:
5923  case ISD::UMIN:
5924  case ISD::SMAX:
5925  case ISD::UMAX:
5926    return LowerMinMax(Op, DAG);
5927  case ISD::SRA:
5928  case ISD::SRL:
5929  case ISD::SHL:
5930    return LowerVectorSRA_SRL_SHL(Op, DAG);
5931  case ISD::SHL_PARTS:
5932  case ISD::SRL_PARTS:
5933  case ISD::SRA_PARTS:
5934    return LowerShiftParts(Op, DAG);
5935  case ISD::CTPOP:
5936  case ISD::PARITY:
5937    return LowerCTPOP_PARITY(Op, DAG);
5938  case ISD::FCOPYSIGN:
5939    return LowerFCOPYSIGN(Op, DAG);
5940  case ISD::OR:
5941    return LowerVectorOR(Op, DAG);
5942  case ISD::XOR:
5943    return LowerXOR(Op, DAG);
5944  case ISD::PREFETCH:
5945    return LowerPREFETCH(Op, DAG);
5946  case ISD::SINT_TO_FP:
5947  case ISD::UINT_TO_FP:
5948  case ISD::STRICT_SINT_TO_FP:
5949  case ISD::STRICT_UINT_TO_FP:
5950    return LowerINT_TO_FP(Op, DAG);
5951  case ISD::FP_TO_SINT:
5952  case ISD::FP_TO_UINT:
5953  case ISD::STRICT_FP_TO_SINT:
5954  case ISD::STRICT_FP_TO_UINT:
5955    return LowerFP_TO_INT(Op, DAG);
5956  case ISD::FP_TO_SINT_SAT:
5957  case ISD::FP_TO_UINT_SAT:
5958    return LowerFP_TO_INT_SAT(Op, DAG);
5959  case ISD::FSINCOS:
5960    return LowerFSINCOS(Op, DAG);
5961  case ISD::GET_ROUNDING:
5962    return LowerGET_ROUNDING(Op, DAG);
5963  case ISD::SET_ROUNDING:
5964    return LowerSET_ROUNDING(Op, DAG);
5965  case ISD::MUL:
5966    return LowerMUL(Op, DAG);
5967  case ISD::MULHS:
5968    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
5969  case ISD::MULHU:
5970    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
5971  case ISD::INTRINSIC_W_CHAIN:
5972    return LowerINTRINSIC_W_CHAIN(Op, DAG);
5973  case ISD::INTRINSIC_WO_CHAIN:
5974    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5975  case ISD::INTRINSIC_VOID:
5976    return LowerINTRINSIC_VOID(Op, DAG);
5977  case ISD::ATOMIC_STORE:
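    // i128 atomic stores are custom lowered only when LSE2 is available, which
    // allows them to be emitted as a single STP (see LowerStore128).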
5978    if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
5979      assert(Subtarget->hasLSE2());
5980      return LowerStore128(Op, DAG);
5981    }
5982    return SDValue();
5983  case ISD::STORE:
5984    return LowerSTORE(Op, DAG);
5985  case ISD::MSTORE:
5986    return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
5987  case ISD::MGATHER:
5988    return LowerMGATHER(Op, DAG);
5989  case ISD::MSCATTER:
5990    return LowerMSCATTER(Op, DAG);
5991  case ISD::VECREDUCE_SEQ_FADD:
5992    return LowerVECREDUCE_SEQ_FADD(Op, DAG);
5993  case ISD::VECREDUCE_ADD:
5994  case ISD::VECREDUCE_AND:
5995  case ISD::VECREDUCE_OR:
5996  case ISD::VECREDUCE_XOR:
5997  case ISD::VECREDUCE_SMAX:
5998  case ISD::VECREDUCE_SMIN:
5999  case ISD::VECREDUCE_UMAX:
6000  case ISD::VECREDUCE_UMIN:
6001  case ISD::VECREDUCE_FADD:
6002  case ISD::VECREDUCE_FMAX:
6003  case ISD::VECREDUCE_FMIN:
6004    return LowerVECREDUCE(Op, DAG);
6005  case ISD::ATOMIC_LOAD_SUB:
6006    return LowerATOMIC_LOAD_SUB(Op, DAG);
6007  case ISD::ATOMIC_LOAD_AND:
6008    return LowerATOMIC_LOAD_AND(Op, DAG);
6009  case ISD::DYNAMIC_STACKALLOC:
6010    return LowerDYNAMIC_STACKALLOC(Op, DAG);
6011  case ISD::VSCALE:
6012    return LowerVSCALE(Op, DAG);
6013  case ISD::ANY_EXTEND:
6014  case ISD::SIGN_EXTEND:
6015  case ISD::ZERO_EXTEND:
6016    return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6017  case ISD::SIGN_EXTEND_INREG: {
    // Only custom lower when ExtraVT has a legal byte-based element type.
6019    EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6020    EVT ExtraEltVT = ExtraVT.getVectorElementType();
6021    if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6022        (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6023      return SDValue();
6024
6025    return LowerToPredicatedOp(Op, DAG,
6026                               AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
6027  }
6028  case ISD::TRUNCATE:
6029    return LowerTRUNCATE(Op, DAG);
6030  case ISD::MLOAD:
6031    return LowerMLOAD(Op, DAG);
6032  case ISD::LOAD:
6033    if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6034                                     Subtarget->forceStreamingCompatibleSVE()))
6035      return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6036    return LowerLOAD(Op, DAG);
6037  case ISD::ADD:
6038  case ISD::AND:
6039  case ISD::SUB:
6040    return LowerToScalableOp(Op, DAG);
6041  case ISD::FMAXIMUM:
6042    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6043  case ISD::FMAXNUM:
6044    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6045  case ISD::FMINIMUM:
6046    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6047  case ISD::FMINNUM:
6048    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6049  case ISD::VSELECT:
6050    return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6051  case ISD::ABS:
6052    return LowerABS(Op, DAG);
6053  case ISD::ABDS:
6054    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6055  case ISD::ABDU:
6056    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6057  case ISD::AVGFLOORS:
6058    return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED);
6059  case ISD::AVGFLOORU:
6060    return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED);
6061  case ISD::AVGCEILS:
6062    return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED);
6063  case ISD::AVGCEILU:
6064    return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED);
6065  case ISD::BITREVERSE:
6066    return LowerBitreverse(Op, DAG);
6067  case ISD::BSWAP:
6068    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6069  case ISD::CTLZ:
6070    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6071  case ISD::CTTZ:
6072    return LowerCTTZ(Op, DAG);
6073  case ISD::VECTOR_SPLICE:
6074    return LowerVECTOR_SPLICE(Op, DAG);
6075  case ISD::STRICT_LROUND:
6076  case ISD::STRICT_LLROUND:
6077  case ISD::STRICT_LRINT:
6078  case ISD::STRICT_LLRINT: {
6079    assert(Op.getOperand(1).getValueType() == MVT::f16 &&
6080           "Expected custom lowering of rounding operations only for f16");
6081    SDLoc DL(Op);
6082    SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6083                              {Op.getOperand(0), Op.getOperand(1)});
6084    return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6085                       {Ext.getValue(1), Ext.getValue(0)});
6086  }
6087  case ISD::WRITE_REGISTER: {
6088    assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6089           "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6090    SDLoc DL(Op);
6091
6092    SDValue Chain = Op.getOperand(0);
6093    SDValue SysRegName = Op.getOperand(1);
6094    SDValue Pair = Op.getOperand(2);
6095
6096    SDValue PairLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Pair,
6097                                 DAG.getConstant(0, DL, MVT::i32));
6098    SDValue PairHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Pair,
6099                                 DAG.getConstant(1, DL, MVT::i32));
6100
6101    // chain = MSRR(chain, sysregname, lo, hi)
6102    SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6103                                 SysRegName, PairLo, PairHi);
6104
6105    return Result;
6106  }
6107  }
6108}
6109
6110bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
6111  return !Subtarget->useSVEForFixedLengthVectors();
6112}
6113
6114bool AArch64TargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
6115  return true;
6116}
6117
6118bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
6119    EVT VT, bool OverrideNEON) const {
6120  if (!VT.isFixedLengthVector() || !VT.isSimple())
6121    return false;
6122
6123  // Don't use SVE for vectors we cannot scalarize if required.
6124  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6125  // Fixed length predicates should be promoted to i8.
  // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) work.
6127  case MVT::i1:
6128  default:
6129    return false;
6130  case MVT::i8:
6131  case MVT::i16:
6132  case MVT::i32:
6133  case MVT::i64:
6134  case MVT::f16:
6135  case MVT::f32:
6136  case MVT::f64:
6137    break;
6138  }
6139
6140  // All SVE implementations support NEON sized vectors.
6141  if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6142    return Subtarget->hasSVE();
6143
6144  // Ensure NEON MVTs only belong to a single register class.
6145  if (VT.getFixedSizeInBits() <= 128)
6146    return false;
6147
6148  // Ensure wider than NEON code generation is enabled.
6149  if (!Subtarget->useSVEForFixedLengthVectors())
6150    return false;
6151
6152  // Don't use SVE for types that don't fit.
6153  if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6154    return false;
6155
6156  // TODO: Perhaps an artificial restriction, but worth having whilst getting
6157  // the base fixed length SVE support in place.
6158  if (!VT.isPow2VectorType())
6159    return false;
6160
6161  return true;
6162}
6163
6164//===----------------------------------------------------------------------===//
6165//                      Calling Convention Implementation
6166//===----------------------------------------------------------------------===//
6167
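/// Return the intrinsic ID of an ISD::INTRINSIC_WO_CHAIN node, or
/// Intrinsic::not_intrinsic if the node is not an intrinsic call.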
6168static unsigned getIntrinsicID(const SDNode *N) {
6169  unsigned Opcode = N->getOpcode();
6170  switch (Opcode) {
6171  default:
6172    return Intrinsic::not_intrinsic;
6173  case ISD::INTRINSIC_WO_CHAIN: {
6174    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
6175    if (IID < Intrinsic::num_intrinsics)
6176      return IID;
6177    return Intrinsic::not_intrinsic;
6178  }
6179  }
6180}
6181
6182bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
6183                                                SDValue N1) const {
6184  if (!N0.hasOneUse())
6185    return false;
6186
6187  unsigned IID = getIntrinsicID(N1.getNode());
6188  // Avoid reassociating expressions that can be lowered to smlal/umlal.
6189  if (IID == Intrinsic::aarch64_neon_umull ||
6190      N1.getOpcode() == AArch64ISD::UMULL ||
6191      IID == Intrinsic::aarch64_neon_smull ||
6192      N1.getOpcode() == AArch64ISD::SMULL)
6193    return N0.getOpcode() != ISD::ADD;
6194
6195  return true;
6196}
6197
6198/// Selects the correct CCAssignFn for a given CallingConvention value.
6199CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
6200                                                     bool IsVarArg) const {
6201  switch (CC) {
6202  default:
6203    report_fatal_error("Unsupported calling convention.");
6204  case CallingConv::WebKit_JS:
6205    return CC_AArch64_WebKit_JS;
6206  case CallingConv::GHC:
6207    return CC_AArch64_GHC;
6208  case CallingConv::C:
6209  case CallingConv::Fast:
6210  case CallingConv::PreserveMost:
6211  case CallingConv::CXX_FAST_TLS:
6212  case CallingConv::Swift:
6213  case CallingConv::SwiftTail:
6214  case CallingConv::Tail:
6215    if (Subtarget->isTargetWindows() && IsVarArg) {
6216      if (Subtarget->isWindowsArm64EC())
6217        return CC_AArch64_Arm64EC_VarArg;
6218      return CC_AArch64_Win64_VarArg;
6219    }
6220    if (!Subtarget->isTargetDarwin())
6221      return CC_AArch64_AAPCS;
6222    if (!IsVarArg)
6223      return CC_AArch64_DarwinPCS;
6224    return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
6225                                      : CC_AArch64_DarwinPCS_VarArg;
  case CallingConv::Win64:
    if (IsVarArg) {
      if (Subtarget->isWindowsArm64EC())
        return CC_AArch64_Arm64EC_VarArg;
      return CC_AArch64_Win64_VarArg;
    }
    return CC_AArch64_AAPCS;
  case CallingConv::CFGuard_Check:
    return CC_AArch64_Win64_CFGuard_Check;
  case CallingConv::AArch64_VectorCall:
  case CallingConv::AArch64_SVE_VectorCall:
  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
    return CC_AArch64_AAPCS;
6240  }
6241}
6242
6243CCAssignFn *
6244AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
6245  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
6246                                      : RetCC_AArch64_AAPCS;
6247}
6248
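/// Allocate an SVL.B * SVL.B lazy-save buffer for ZA on the stack, together
/// with a 16-byte TPIDR2 block that records the buffer's address, and return
/// the frame index of the TPIDR2 block.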
6250unsigned
6251AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6252                                              SelectionDAG &DAG) const {
6253  MachineFunction &MF = DAG.getMachineFunction();
6254  MachineFrameInfo &MFI = MF.getFrameInfo();
6255
  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst case).
6257  SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6258                          DAG.getConstant(1, DL, MVT::i32));
6259  SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6260  SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6261  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6262  SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6263  Chain = Buffer.getValue(1);
6264  MFI.CreateVariableSizedObject(Align(1), nullptr);
6265
6266  // Allocate an additional TPIDR2 object on the stack (16 bytes)
6267  unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6268
6269  // Store the buffer pointer to the TPIDR2 stack object.
6270  MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
6271  SDValue Ptr = DAG.getFrameIndex(
6272      TPIDR2Obj,
6273      DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
6274  Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6275
6276  return TPIDR2Obj;
6277}
6278
6279SDValue AArch64TargetLowering::LowerFormalArguments(
6280    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6281    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6282    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6283  MachineFunction &MF = DAG.getMachineFunction();
6284  const Function &F = MF.getFunction();
6285  MachineFrameInfo &MFI = MF.getFrameInfo();
6286  bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6287  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6288
6289  SmallVector<ISD::OutputArg, 4> Outs;
6290  GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6291                DAG.getTargetLoweringInfo(), MF.getDataLayout());
  if (any_of(Outs,
             [](ISD::OutputArg &Out) { return Out.VT.isScalableVector(); }))
6293    FuncInfo->setIsSVECC(true);
6294
6295  // Assign locations to all of the incoming arguments.
6296  SmallVector<CCValAssign, 16> ArgLocs;
6297  DenseMap<unsigned, SDValue> CopiedRegs;
6298  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6299
6300  // At this point, Ins[].VT may already be promoted to i32. To correctly
6301  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6302  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6303  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6304  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6305  // LocVT.
6306  unsigned NumArgs = Ins.size();
6307  Function::const_arg_iterator CurOrigArg = F.arg_begin();
6308  unsigned CurArgIdx = 0;
6309  for (unsigned i = 0; i != NumArgs; ++i) {
6310    MVT ValVT = Ins[i].VT;
6311    if (Ins[i].isOrigArg()) {
6312      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6313      CurArgIdx = Ins[i].getOrigArgIndex();
6314
6315      // Get type of the original argument.
6316      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6317                                  /*AllowUnknown*/ true);
6318      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6319      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6320      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6321        ValVT = MVT::i8;
6322      else if (ActualMVT == MVT::i16)
6323        ValVT = MVT::i16;
6324    }
6325    bool UseVarArgCC = false;
6326    if (IsWin64)
6327      UseVarArgCC = isVarArg;
6328    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6329    bool Res =
6330        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6331    assert(!Res && "Call operand has unhandled type");
6332    (void)Res;
6333  }
6334
6335  SMEAttrs Attrs(MF.getFunction());
6336  bool IsLocallyStreaming =
6337      !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6338  assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6339  SDValue Glue = Chain.getValue(1);
6340
6341  SmallVector<SDValue, 16> ArgValues;
6342  unsigned ExtraArgLocs = 0;
6343  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6344    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6345
6346    if (Ins[i].Flags.isByVal()) {
6347      // Byval is used for HFAs in the PCS, but the system should work in a
6348      // non-compliant manner for larger structs.
6349      EVT PtrVT = getPointerTy(DAG.getDataLayout());
6350      int Size = Ins[i].Flags.getByValSize();
6351      unsigned NumRegs = (Size + 7) / 8;
6352
      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types.
6355      unsigned FrameIdx =
6356        MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
6357      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
6358      InVals.push_back(FrameIdxN);
6359
6360      continue;
6361    }
6362
6363    if (Ins[i].Flags.isSwiftAsync())
6364      MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6365
6366    SDValue ArgValue;
6367    if (VA.isRegLoc()) {
6368      // Arguments stored in registers.
6369      EVT RegVT = VA.getLocVT();
6370      const TargetRegisterClass *RC;
6371
6372      if (RegVT == MVT::i32)
6373        RC = &AArch64::GPR32RegClass;
6374      else if (RegVT == MVT::i64)
6375        RC = &AArch64::GPR64RegClass;
6376      else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
6377        RC = &AArch64::FPR16RegClass;
6378      else if (RegVT == MVT::f32)
6379        RC = &AArch64::FPR32RegClass;
6380      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
6381        RC = &AArch64::FPR64RegClass;
6382      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
6383        RC = &AArch64::FPR128RegClass;
6384      else if (RegVT.isScalableVector() &&
6385               RegVT.getVectorElementType() == MVT::i1) {
6386        FuncInfo->setIsSVECC(true);
6387        RC = &AArch64::PPRRegClass;
6388      } else if (RegVT.isScalableVector()) {
6389        FuncInfo->setIsSVECC(true);
6390        RC = &AArch64::ZPRRegClass;
6391      } else
6392        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
6393
6394      // Transform the arguments in physical registers into virtual ones.
6395      Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
6396
6397      if (IsLocallyStreaming) {
6398        // LocallyStreamingFunctions must insert the SMSTART in the correct
6399        // position, so we use Glue to ensure no instructions can be scheduled
6400        // between the chain of:
6401        //        t0: ch,glue = EntryNode
6402        //      t1:  res,ch,glue = CopyFromReg
6403        //     ...
6404        //   tn: res,ch,glue = CopyFromReg t(n-1), ..
6405        // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
6406        // ^^^^^^
6407        // This will be the new Chain/Root node.
6408        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
6409        Glue = ArgValue.getValue(2);
6410      } else
6411        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
6412
6413      // If this is an 8, 16 or 32-bit value, it is really passed promoted
6414      // to 64 bits.  Insert an assert[sz]ext to capture this, then
6415      // truncate to the right size.
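      // For example (rough sketch): an i32 value packed into the upper half
      // of an X register (AExtUpper) is recovered below as
      //     (zext_or_trunc (srl Xn, 32))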
6416      switch (VA.getLocInfo()) {
6417      default:
6418        llvm_unreachable("Unknown loc info!");
6419      case CCValAssign::Full:
6420        break;
6421      case CCValAssign::Indirect:
6422        assert((VA.getValVT().isScalableVector() ||
6423                Subtarget->isWindowsArm64EC()) &&
6424               "Indirect arguments should be scalable on most subtargets");
6425        break;
6426      case CCValAssign::BCvt:
6427        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
6428        break;
6429      case CCValAssign::AExt:
6430      case CCValAssign::SExt:
6431      case CCValAssign::ZExt:
6432        break;
6433      case CCValAssign::AExtUpper:
6434        ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
6435                               DAG.getConstant(32, DL, RegVT));
6436        ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
6437        break;
6438      }
6439    } else { // VA.isRegLoc()
6440      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
6441      unsigned ArgOffset = VA.getLocMemOffset();
6442      unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
6443                              ? VA.getLocVT().getSizeInBits()
6444                              : VA.getValVT().getSizeInBits()) / 8;
6445
6446      uint32_t BEAlign = 0;
6447      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
6448          !Ins[i].Flags.isInConsecutiveRegs())
6449        BEAlign = 8 - ArgSize;
6450
6451      SDValue FIN;
6452      MachinePointerInfo PtrInfo;
6453      if (isVarArg && Subtarget->isWindowsArm64EC()) {
6454        // In the ARM64EC varargs convention, fixed arguments on the stack are
6455        // accessed relative to x4, not sp.
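        // For example (sketch): a fixed argument at stack offset 16 is read
        // from [x4 + 16] here rather than from [sp + 16].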
6456        unsigned ObjOffset = ArgOffset + BEAlign;
6457        Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
6458        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6459        FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
6460                          DAG.getConstant(ObjOffset, DL, MVT::i64));
6461        PtrInfo = MachinePointerInfo::getUnknownStack(MF);
6462      } else {
6463        int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
6464
6465        // Create load nodes to retrieve arguments from the stack.
6466        FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
6467        PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6468      }
6469
      // For a NON_EXTLOAD, generic code in getLoad asserts ValVT == MemVT.
6471      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
6472      MVT MemVT = VA.getValVT();
6473
6474      switch (VA.getLocInfo()) {
6475      default:
6476        break;
6477      case CCValAssign::Trunc:
6478      case CCValAssign::BCvt:
6479        MemVT = VA.getLocVT();
6480        break;
6481      case CCValAssign::Indirect:
6482        assert((VA.getValVT().isScalableVector() ||
6483                Subtarget->isWindowsArm64EC()) &&
6484               "Indirect arguments should be scalable on most subtargets");
6485        MemVT = VA.getLocVT();
6486        break;
6487      case CCValAssign::SExt:
6488        ExtType = ISD::SEXTLOAD;
6489        break;
6490      case CCValAssign::ZExt:
6491        ExtType = ISD::ZEXTLOAD;
6492        break;
6493      case CCValAssign::AExt:
6494        ExtType = ISD::EXTLOAD;
6495        break;
6496      }
6497
6498      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
6499                                MemVT);
6500    }
6501
6502    if (VA.getLocInfo() == CCValAssign::Indirect) {
6503      assert(
6504          (VA.getValVT().isScalableVector() || Subtarget->isWindowsArm64EC()) &&
6505          "Indirect arguments should be scalable on most subtargets");
6506
6507      uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
6508      unsigned NumParts = 1;
6509      if (Ins[i].Flags.isInConsecutiveRegs()) {
6510        assert(!Ins[i].Flags.isInConsecutiveRegsLast());
6511        while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
6512          ++NumParts;
6513      }
6514
6515      MVT PartLoad = VA.getValVT();
6516      SDValue Ptr = ArgValue;
6517
6518      // Ensure we generate all loads for each tuple part, whilst updating the
6519      // pointer after each load correctly using vscale.
6520      while (NumParts > 0) {
6521        ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
6522        InVals.push_back(ArgValue);
6523        NumParts--;
6524        if (NumParts > 0) {
6525          SDValue BytesIncrement;
6526          if (PartLoad.isScalableVector()) {
6527            BytesIncrement = DAG.getVScale(
6528                DL, Ptr.getValueType(),
6529                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
6530          } else {
6531            BytesIncrement = DAG.getConstant(
6532                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
6533                Ptr.getValueType());
6534          }
6535          SDNodeFlags Flags;
6536          Flags.setNoUnsignedWrap(true);
6537          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6538                            BytesIncrement, Flags);
6539          ExtraArgLocs++;
6540          i++;
6541        }
6542      }
6543    } else {
6544      if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
6545        ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
6546                               ArgValue, DAG.getValueType(MVT::i32));
6547
6548      // i1 arguments are zero-extended to i8 by the caller. Emit a
6549      // hint to reflect this.
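      // Sketch: for a parameter whose IR type is i1 and that lacks the
      // zeroext flag, the incoming value is wrapped as
      //     (AArch64ISD::ASSERT_ZEXT_BOOL x)
      // recording the AAPCS guarantee that the caller zero-extended the i1
      // to i8, i.e. bits [7:1] of x are known to be zero.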
6550      if (Ins[i].isOrigArg()) {
6551        Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
6552        if (OrigArg->getType()->isIntegerTy(1)) {
6553          if (!Ins[i].Flags.isZExt()) {
6554            ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
6555                                   ArgValue.getValueType(), ArgValue);
6556          }
6557        }
6558      }
6559
6560      InVals.push_back(ArgValue);
6561    }
6562  }
6563  assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
6564
6565  // Insert the SMSTART if this is a locally streaming function and
6566  // make sure it is Glued to the last CopyFromReg value.
6567  if (IsLocallyStreaming) {
6568    const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6569    Chain = DAG.getNode(
6570        AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6571        {DAG.getRoot(),
         DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
6573         DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
6574         DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue});
    // Ensure that the SMSTART's chain result is used by routing each formal
    // argument through a CopyToReg/CopyFromReg pair chained to it, so that
    // all uses of the arguments are ordered after the SMSTART.
    for (unsigned I = 0; I < InVals.size(); ++I) {
6578      Register Reg = MF.getRegInfo().createVirtualRegister(
6579          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
6580      Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
6581      InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
6582                                     InVals[I].getValueType());
6583    }
6584  }
6585
6586  // varargs
6587  if (isVarArg) {
6588    if (!Subtarget->isTargetDarwin() || IsWin64) {
6589      // The AAPCS variadic function ABI is identical to the non-variadic
6590      // one. As a result there may be more arguments in registers and we should
6591      // save them for future reference.
      // Win64 variadic functions also pass arguments in registers, but all
      // float arguments are passed in integer registers.
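      // Illustrative example (assumed C source): for
      //     int sum(int n, ...);
      // only x0 carries a fixed argument, so x1-x7 (and q0-q7 when FP
      // registers are saved for varargs) are spilled to the register save
      // area so that va_arg can later walk the remaining arguments in memory.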
6594      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
6595    }
6596
6597    // This will point to the next argument passed via stack.
6598    unsigned StackOffset = CCInfo.getNextStackOffset();
6599    // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
6600    StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
6601    FuncInfo->setVarArgsStackOffset(StackOffset);
6602    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
6603
6604    if (MFI.hasMustTailInVarArgFunc()) {
6605      SmallVector<MVT, 2> RegParmTypes;
6606      RegParmTypes.push_back(MVT::i64);
6607      RegParmTypes.push_back(MVT::f128);
6608      // Compute the set of forwarded registers. The rest are scratch.
6609      SmallVectorImpl<ForwardedRegister> &Forwards =
6610                                       FuncInfo->getForwardedMustTailRegParms();
6611      CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
6612                                               CC_AArch64_AAPCS);
6613
6614      // Conservatively forward X8, since it might be used for aggregate return.
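      // Sketch: a musttail thunk whose callee returns a large struct
      // indirectly receives the sret pointer in x8, so x8 is forwarded here
      // unless the normal argument analysis has already claimed it.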
6615      if (!CCInfo.isAllocated(AArch64::X8)) {
6616        Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
6617        Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
6618      }
6619    }
6620  }
6621
6622  // On Windows, InReg pointers must be returned, so record the pointer in a
6623  // virtual register at the start of the function so it can be returned in the
6624  // epilogue.
6625  if (IsWin64) {
6626    for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
6627      if (Ins[I].Flags.isInReg() && Ins[I].Flags.isSRet()) {
6628        assert(!FuncInfo->getSRetReturnReg());
6629
6630        MVT PtrTy = getPointerTy(DAG.getDataLayout());
6631        Register Reg =
6632            MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
6633        FuncInfo->setSRetReturnReg(Reg);
6634
6635        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
6636        Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
6637        break;
6638      }
6639    }
6640  }
6641
6642  unsigned StackArgSize = CCInfo.getNextStackOffset();
6643  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
6644  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
6645    // This is a non-standard ABI so by fiat I say we're allowed to make full
6646    // use of the stack area to be popped, which must be aligned to 16 bytes in
6647    // any case:
6648    StackArgSize = alignTo(StackArgSize, 16);
6649
6650    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
6651    // a multiple of 16.
6652    FuncInfo->setArgumentStackToRestore(StackArgSize);
6653
6654    // This realignment carries over to the available bytes below. Our own
6655    // callers will guarantee the space is free by giving an aligned value to
6656    // CALLSEQ_START.
6657  }
6658  // Even if we're not expected to free up the space, it's useful to know how
6659  // much is there while considering tail calls (because we can reuse it).
6660  FuncInfo->setBytesInStackArgArea(StackArgSize);
6661
6662  if (Subtarget->hasCustomCallingConv())
6663    Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
6664
6665  // Conservatively assume the function requires the lazy-save mechanism.
6666  if (SMEAttrs(MF.getFunction()).hasZAState()) {
6667    unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
6668    FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
6669  }
6670
6671  return Chain;
6672}
6673
6674void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
6675                                                SelectionDAG &DAG,
6676                                                const SDLoc &DL,
6677                                                SDValue &Chain) const {
6678  MachineFunction &MF = DAG.getMachineFunction();
6679  MachineFrameInfo &MFI = MF.getFrameInfo();
6680  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6681  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool IsWin64 =
      Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
6683
6684  SmallVector<SDValue, 8> MemOps;
6685
6686  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
6687                                          AArch64::X3, AArch64::X4, AArch64::X5,
6688                                          AArch64::X6, AArch64::X7 };
6689  unsigned NumGPRArgRegs = std::size(GPRArgRegs);
6690  if (Subtarget->isWindowsArm64EC()) {
6691    // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
6692    // functions.
6693    NumGPRArgRegs = 4;
6694  }
6695  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
6696
6697  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
6698  int GPRIdx = 0;
6699  if (GPRSaveSize != 0) {
6700    if (IsWin64) {
6701      GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
6702      if (GPRSaveSize & 15)
6703        // The extra size here, if triggered, will always be 8.
        MFI.CreateFixedObject(16 - (GPRSaveSize & 15),
                              -(int)alignTo(GPRSaveSize, 16), false);
6705    } else
6706      GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
6707
6708    SDValue FIN;
6709    if (Subtarget->isWindowsArm64EC()) {
6710      // With the Arm64EC ABI, we reserve the save area as usual, but we
6711      // compute its address relative to x4.  For a normal AArch64->AArch64
6712      // call, x4 == sp on entry, but calls from an entry thunk can pass in a
6713      // different address.
6714      Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
6715      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6716      FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
6717                        DAG.getConstant(GPRSaveSize, DL, MVT::i64));
6718    } else {
6719      FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
6720    }
6721
6722    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
6723      Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
6724      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6725      SDValue Store =
6726          DAG.getStore(Val.getValue(1), DL, Val, FIN,
6727                       IsWin64 ? MachinePointerInfo::getFixedStack(
6728                                     MF, GPRIdx, (i - FirstVariadicGPR) * 8)
6729                               : MachinePointerInfo::getStack(MF, i * 8));
6730      MemOps.push_back(Store);
6731      FIN =
6732          DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
6733    }
6734  }
6735  FuncInfo->setVarArgsGPRIndex(GPRIdx);
6736  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
6737
6738  if (Subtarget->hasFPARMv8() && !IsWin64) {
6739    static const MCPhysReg FPRArgRegs[] = {
6740        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
6741        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
6742    static const unsigned NumFPRArgRegs = std::size(FPRArgRegs);
6743    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
6744
6745    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
6746    int FPRIdx = 0;
6747    if (FPRSaveSize != 0) {
6748      FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
6749
6750      SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
6751
6752      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
6753        Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
6754        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
6755
6756        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
6757                                     MachinePointerInfo::getStack(MF, i * 16));
6758        MemOps.push_back(Store);
6759        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
6760                          DAG.getConstant(16, DL, PtrVT));
6761      }
6762    }
6763    FuncInfo->setVarArgsFPRIndex(FPRIdx);
6764    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
6765  }
6766
6767  if (!MemOps.empty()) {
6768    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
6769  }
6770}
6771
6772/// LowerCallResult - Lower the result values of a call into the
6773/// appropriate copies out of appropriate physical registers.
6774SDValue AArch64TargetLowering::LowerCallResult(
6775    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
6776    const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
6777    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
6778    SDValue ThisVal) const {
6779  DenseMap<unsigned, SDValue> CopiedRegs;
6780  // Copy all of the result registers out of their specified physreg.
6781  for (unsigned i = 0; i != RVLocs.size(); ++i) {
6782    CCValAssign VA = RVLocs[i];
6783
6784    // Pass 'this' value directly from the argument to return value, to avoid
6785    // reg unit interference
6786    if (i == 0 && isThisReturn) {
6787      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
6788             "unexpected return calling convention register assignment");
6789      InVals.push_back(ThisVal);
6790      continue;
6791    }
6792
    // Avoid copying a physreg twice since RegAllocFast only allows one use of
    // a physreg per block.
6795    SDValue Val = CopiedRegs.lookup(VA.getLocReg());
6796    if (!Val) {
6797      Val =
6798          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
6799      Chain = Val.getValue(1);
6800      InFlag = Val.getValue(2);
6801      CopiedRegs[VA.getLocReg()] = Val;
6802    }
6803
6804    switch (VA.getLocInfo()) {
6805    default:
6806      llvm_unreachable("Unknown loc info!");
6807    case CCValAssign::Full:
6808      break;
6809    case CCValAssign::BCvt:
6810      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
6811      break;
6812    case CCValAssign::AExtUpper:
6813      Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
6814                        DAG.getConstant(32, DL, VA.getLocVT()));
6815      [[fallthrough]];
6816    case CCValAssign::AExt:
6817      [[fallthrough]];
6818    case CCValAssign::ZExt:
6819      Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
6820      break;
6821    }
6822
6823    InVals.push_back(Val);
6824  }
6825
6826  return Chain;
6827}
6828
6829/// Return true if the calling convention is one that we can guarantee TCO for.
6830static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
6831  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
6832         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
6833}
6834
6835/// Return true if we might ever do TCO for calls with this calling convention.
6836static bool mayTailCallThisCC(CallingConv::ID CC) {
6837  switch (CC) {
6838  case CallingConv::C:
6839  case CallingConv::AArch64_SVE_VectorCall:
6840  case CallingConv::PreserveMost:
6841  case CallingConv::Swift:
6842  case CallingConv::SwiftTail:
6843  case CallingConv::Tail:
6844  case CallingConv::Fast:
6845    return true;
6846  default:
6847    return false;
6848  }
6849}
6850
6851static void analyzeCallOperands(const AArch64TargetLowering &TLI,
6852                                const AArch64Subtarget *Subtarget,
6853                                const TargetLowering::CallLoweringInfo &CLI,
6854                                CCState &CCInfo) {
6855  const SelectionDAG &DAG = CLI.DAG;
6856  CallingConv::ID CalleeCC = CLI.CallConv;
6857  bool IsVarArg = CLI.IsVarArg;
6858  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6859  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
6860
6861  unsigned NumArgs = Outs.size();
6862  for (unsigned i = 0; i != NumArgs; ++i) {
6863    MVT ArgVT = Outs[i].VT;
6864    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
6865
6866    bool UseVarArgCC = false;
6867    if (IsVarArg) {
6868      // On Windows, the fixed arguments in a vararg call are passed in GPRs
6869      // too, so use the vararg CC to force them to integer registers.
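      // For example (illustrative): in a Win64 call to printf("%f", 1.0)
      // the double travels in x1 rather than in d0/q0.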
6870      if (IsCalleeWin64) {
6871        UseVarArgCC = true;
6872      } else {
6873        UseVarArgCC = !Outs[i].IsFixed;
6874      }
6875    }
6876
6877    if (!UseVarArgCC) {
6878      // Get type of the original argument.
      EVT ActualVT =
          TLI.getValueType(DAG.getDataLayout(),
                           CLI.Args[Outs[i].OrigArgIndex].Ty,
                           /*AllowUnknown*/ true);
6882      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
6883      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6884      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6885        ArgVT = MVT::i8;
6886      else if (ActualMVT == MVT::i16)
6887        ArgVT = MVT::i16;
6888    }
6889
6890    CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
6891    bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
6892    assert(!Res && "Call operand has unhandled type");
6893    (void)Res;
6894  }
6895}
6896
6897bool AArch64TargetLowering::isEligibleForTailCallOptimization(
6898    const CallLoweringInfo &CLI) const {
6899  CallingConv::ID CalleeCC = CLI.CallConv;
6900  if (!mayTailCallThisCC(CalleeCC))
6901    return false;
6902
6903  SDValue Callee = CLI.Callee;
6904  bool IsVarArg = CLI.IsVarArg;
6905  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6906  const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
6907  const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
6908  const SelectionDAG &DAG = CLI.DAG;
6909  MachineFunction &MF = DAG.getMachineFunction();
6910  const Function &CallerF = MF.getFunction();
6911  CallingConv::ID CallerCC = CallerF.getCallingConv();
6912
6913  // SME Streaming functions are not eligible for TCO as they may require
6914  // the streaming mode or ZA to be restored after returning from the call.
6915  SMEAttrs CallerAttrs(MF.getFunction());
6916  auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
6917  if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
6918      CallerAttrs.requiresLazySave(CalleeAttrs))
6919    return false;
6920
6921  // Functions using the C or Fast calling convention that have an SVE signature
6922  // preserve more registers and should assume the SVE_VectorCall CC.
6923  // The check for matching callee-saved regs will determine whether it is
6924  // eligible for TCO.
6925  if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
6926      MF.getInfo<AArch64FunctionInfo>()->isSVECC())
6927    CallerCC = CallingConv::AArch64_SVE_VectorCall;
6928
6929  bool CCMatch = CallerCC == CalleeCC;
6930
6931  // When using the Windows calling convention on a non-windows OS, we want
6932  // to back up and restore X18 in such functions; we can't do a tail call
6933  // from those functions.
6934  if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
6935      CalleeCC != CallingConv::Win64)
6936    return false;
6937
6938  // Byval parameters hand the function a pointer directly into the stack area
6939  // we want to reuse during a tail call. Working around this *is* possible (see
6940  // X86) but less efficient and uglier in LowerCall.
6941  for (Function::const_arg_iterator i = CallerF.arg_begin(),
6942                                    e = CallerF.arg_end();
6943       i != e; ++i) {
6944    if (i->hasByValAttr())
6945      return false;
6946
6947    // On Windows, "inreg" attributes signify non-aggregate indirect returns.
6948    // In this case, it is necessary to save/restore X0 in the callee. Tail
6949    // call opt interferes with this. So we disable tail call opt when the
6950    // caller has an argument with "inreg" attribute.
6951
6952    // FIXME: Check whether the callee also has an "inreg" argument.
6953    if (i->hasInRegAttr())
6954      return false;
6955  }
6956
  if (canGuaranteeTCO(CalleeCC,
                      getTargetMachine().Options.GuaranteedTailCallOpt))
6958    return CCMatch;
6959
6960  // Externally-defined functions with weak linkage should not be
6961  // tail-called on AArch64 when the OS does not support dynamic
6962  // pre-emption of symbols, as the AAELF spec requires normal calls
6963  // to undefined weak functions to be replaced with a NOP or jump to the
6964  // next instruction. The behaviour of branch instructions in this
6965  // situation (as used for tail calls) is implementation-defined, so we
6966  // cannot rely on the linker replacing the tail call with a return.
6967  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
6968    const GlobalValue *GV = G->getGlobal();
6969    const Triple &TT = getTargetMachine().getTargetTriple();
6970    if (GV->hasExternalWeakLinkage() &&
6971        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
6972      return false;
6973  }
6974
6975  // Now we search for cases where we can use a tail call without changing the
6976  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
6977  // concept.
6978
6979  // I want anyone implementing a new calling convention to think long and hard
6980  // about this assert.
6981  assert((!IsVarArg || CalleeCC == CallingConv::C) &&
6982         "Unexpected variadic calling convention");
6983
6984  LLVMContext &C = *DAG.getContext();
6985  // Check that the call results are passed in the same way.
6986  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
6987                                  CCAssignFnForCall(CalleeCC, IsVarArg),
6988                                  CCAssignFnForCall(CallerCC, IsVarArg)))
6989    return false;
6990  // The callee has to preserve all registers the caller needs to preserve.
6991  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6992  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
6993  if (!CCMatch) {
6994    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
6995    if (Subtarget->hasCustomCallingConv()) {
6996      TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
6997      TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
6998    }
6999    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7000      return false;
7001  }
7002
7003  // Nothing more to check if the callee is taking no arguments
7004  if (Outs.empty())
7005    return true;
7006
7007  SmallVector<CCValAssign, 16> ArgLocs;
7008  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7009
7010  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7011
7012  if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
    // When we are musttail, additional checks have been done and we can
    // safely ignore this check.
7014    // At least two cases here: if caller is fastcc then we can't have any
7015    // memory arguments (we'd be expected to clean up the stack afterwards). If
7016    // caller is C then we could potentially use its argument area.
7017
7018    // FIXME: for now we take the most conservative of these in both cases:
7019    // disallow all variadic memory operands.
7020    for (const CCValAssign &ArgLoc : ArgLocs)
7021      if (!ArgLoc.isRegLoc())
7022        return false;
7023  }
7024
7025  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7026
  // If any of the arguments is passed indirectly, it must be SVE, so the
  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
  // allocate space on the stack. That is why we determine explicitly here that
  // the call cannot be a tail call.
7031  if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7032        assert((A.getLocInfo() != CCValAssign::Indirect ||
7033                A.getValVT().isScalableVector() ||
7034                Subtarget->isWindowsArm64EC()) &&
7035               "Expected value to be scalable");
7036        return A.getLocInfo() == CCValAssign::Indirect;
7037      }))
7038    return false;
7039
7040  // If the stack arguments for this call do not fit into our own save area then
7041  // the call cannot be made tail.
7042  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
7043    return false;
7044
7045  const MachineRegisterInfo &MRI = MF.getRegInfo();
7046  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7047    return false;
7048
7049  return true;
7050}
7051
7052SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7053                                                   SelectionDAG &DAG,
7054                                                   MachineFrameInfo &MFI,
7055                                                   int ClobberedFI) const {
7056  SmallVector<SDValue, 8> ArgChains;
7057  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7058  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7059
7060  // Include the original chain at the beginning of the list. When this is
7061  // used by target LowerCall hooks, this helps legalize find the
7062  // CALLSEQ_BEGIN node.
7063  ArgChains.push_back(Chain);
7064
7065  // Add a chain value for each stack argument corresponding
7066  for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7067    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7068      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7069        if (FI->getIndex() < 0) {
7070          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7071          int64_t InLastByte = InFirstByte;
7072          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7073
7074          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7075              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7076            ArgChains.push_back(SDValue(L, 1));
7077        }
7078
7079  // Build a tokenfactor for all the chains.
7080  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7081}
7082
7083bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7084                                                   bool TailCallOpt) const {
7085  return (CallCC == CallingConv::Fast && TailCallOpt) ||
7086         CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7087}
7088
7089// Check if the value is zero-extended from i1 to i8
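// For example (sketch): an i32 produced by (and x, 1), or one whose known
// bits show bits [7:1] to be zero, satisfies RequiredZero (0xFE) below and
// needs no extra trunc/zext before being passed as an i1 argument.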
7090static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7091  unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7092  if (SizeInBits < 8)
7093    return false;
7094
  APInt RequiredZero(SizeInBits, 0xFE);
  KnownBits Bits = DAG.computeKnownBits(Arg, 4);
  bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7098  return ZExtBool;
7099}
7100
7101SDValue AArch64TargetLowering::changeStreamingMode(
7102    SelectionDAG &DAG, SDLoc DL, bool Enable,
7103    SDValue Chain, SDValue InFlag, SDValue PStateSM, bool Entry) const {
7104  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7105  SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7106  SDValue MSROp =
7107      DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7108
7109  SDValue ExpectedSMVal =
7110      DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
7111  SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};
7112
7113  if (InFlag)
7114    Ops.push_back(InFlag);
7115
7116  unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7117  return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7118}
7119
7120/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7121/// and add input and output parameter nodes.
7122SDValue
7123AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7124                                 SmallVectorImpl<SDValue> &InVals) const {
7125  SelectionDAG &DAG = CLI.DAG;
7126  SDLoc &DL = CLI.DL;
7127  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7128  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7129  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7130  SDValue Chain = CLI.Chain;
7131  SDValue Callee = CLI.Callee;
7132  bool &IsTailCall = CLI.IsTailCall;
7133  CallingConv::ID &CallConv = CLI.CallConv;
7134  bool IsVarArg = CLI.IsVarArg;
7135
7136  MachineFunction &MF = DAG.getMachineFunction();
7137  MachineFunction::CallSiteInfo CSInfo;
7138  bool IsThisReturn = false;
7139
7140  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7141  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7142  bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7143  bool IsSibCall = false;
7144  bool GuardWithBTI = false;
7145
7146  if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
7147      !Subtarget->noBTIAtReturnTwice()) {
7148    GuardWithBTI = FuncInfo->branchTargetEnforcement();
7149  }
7150
7151  // Analyze operands of the call, assigning locations to each operand.
7152  SmallVector<CCValAssign, 16> ArgLocs;
7153  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7154
7155  if (IsVarArg) {
7156    unsigned NumArgs = Outs.size();
7157
7158    for (unsigned i = 0; i != NumArgs; ++i) {
7159      if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7160        report_fatal_error("Passing SVE types to variadic functions is "
7161                           "currently not supported");
7162    }
7163  }
7164
7165  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7166
7167  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7168  // Assign locations to each value returned by this call.
7169  SmallVector<CCValAssign, 16> RVLocs;
7170  CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7171                    *DAG.getContext());
7172  RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7173
7174  // Check callee args/returns for SVE registers and set calling convention
7175  // accordingly.
7176  if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7177    auto HasSVERegLoc = [](CCValAssign &Loc) {
7178      if (!Loc.isRegLoc())
7179        return false;
7180      return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7181             AArch64::PPRRegClass.contains(Loc.getLocReg());
7182    };
7183    if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7184      CallConv = CallingConv::AArch64_SVE_VectorCall;
7185  }
7186
7187  if (IsTailCall) {
7188    // Check if it's really possible to do a tail call.
7189    IsTailCall = isEligibleForTailCallOptimization(CLI);
7190
7191    // A sibling call is one where we're under the usual C ABI and not planning
7192    // to change that but can still do a tail call:
7193    if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7194        CallConv != CallingConv::SwiftTail)
7195      IsSibCall = true;
7196
7197    if (IsTailCall)
7198      ++NumTailCalls;
7199  }
7200
7201  if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7202    report_fatal_error("failed to perform tail call elimination on a call "
7203                       "site marked musttail");
7204
7205  // Get a count of how many bytes are to be pushed on the stack.
7206  unsigned NumBytes = CCInfo.getNextStackOffset();
7207
7208  if (IsSibCall) {
7209    // Since we're not changing the ABI to make this a tail call, the memory
7210    // operands are already available in the caller's incoming argument space.
7211    NumBytes = 0;
7212  }
7213
7214  // FPDiff is the byte offset of the call's argument area from the callee's.
7215  // Stores to callee stack arguments will be placed in FixedStackSlots offset
7216  // by this amount for a tail call. In a sibling call it must be 0 because the
7217  // caller will deallocate the entire stack and the callee still expects its
7218  // arguments to begin at SP+0. Completely unused for non-tail calls.
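  // For example (sketch): if the caller reserved 32 bytes of incoming
  // argument space and this tail call needs 48 bytes, FPDiff is -16 and
  // TailCallReservedStack grows accordingly; with only 16 bytes needed it
  // is +16 and the existing area is simply reused.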
7219  int FPDiff = 0;
7220
7221  if (IsTailCall && !IsSibCall) {
7222    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7223
7224    // Since callee will pop argument stack as a tail call, we must keep the
7225    // popped size 16-byte aligned.
7226    NumBytes = alignTo(NumBytes, 16);
7227
7228    // FPDiff will be negative if this tail call requires more space than we
7229    // would automatically have in our incoming argument space. Positive if we
7230    // can actually shrink the stack.
7231    FPDiff = NumReusableBytes - NumBytes;
7232
7233    // Update the required reserved area if this is the tail call requiring the
7234    // most argument stack space.
7235    if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7236      FuncInfo->setTailCallReservedStack(-FPDiff);
7237
7238    // The stack pointer must be 16-byte aligned at all times it's used for a
7239    // memory operation, which in practice means at *all* times and in
7240    // particular across call boundaries. Therefore our own arguments started at
7241    // a 16-byte aligned SP and the delta applied for the tail call should
7242    // satisfy the same constraint.
7243    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7244  }
7245
7246  // Determine whether we need any streaming mode changes.
7247  SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7248  if (CLI.CB)
7249    CalleeAttrs = SMEAttrs(*CLI.CB);
7250  else if (std::optional<SMEAttrs> Attrs =
7251               getCalleeAttrsFromExternalFunction(CLI.Callee))
7252    CalleeAttrs = *Attrs;
7253
7254  bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
7255
7256  MachineFrameInfo &MFI = MF.getFrameInfo();
7257  if (RequiresLazySave) {
7258    // Set up a lazy save mechanism by storing the runtime live slices
7259    // (worst-case N*N) to the TPIDR2 stack object.
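    // Rough shape of the sequence emitted below (sketch of the code that
    // follows):
    //   %n  = RDSVL #1
    //   %nn = %n * %n
    //   store the low 16 bits of %nn to [TPIDR2Obj + 8]
    //   call the aarch64_sme_set_tpidr2 intrinsic with the object's address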
7260    SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7261                            DAG.getConstant(1, DL, MVT::i32));
7262    SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
7263    unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
7264
7265    MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
7266    SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
7267        DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7268    SDValue BufferPtrAddr =
7269        DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
7270                    DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
7271    Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16);
7272    Chain = DAG.getNode(
7273        ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
7274        DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7275        TPIDR2ObjAddr);
7276  }
7277
7278  SDValue PStateSM;
7279  std::optional<bool> RequiresSMChange =
7280      CallerAttrs.requiresSMChange(CalleeAttrs);
7281  if (RequiresSMChange)
7282    PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64);
7283
7284  // Adjust the stack pointer for the new arguments...
7285  // These operations are automatically eliminated by the prolog/epilog pass
7286  if (!IsSibCall)
7287    Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
7288
7289  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
7290                                        getPointerTy(DAG.getDataLayout()));
7291
7292  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
7293  SmallSet<unsigned, 8> RegsUsed;
7294  SmallVector<SDValue, 8> MemOpChains;
7295  auto PtrVT = getPointerTy(DAG.getDataLayout());
7296
7297  if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
7298    const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
7299    for (const auto &F : Forwards) {
7300      SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
      RegsToPass.emplace_back(F.PReg, Val);
7302    }
7303  }
7304
7305  // Walk the register/memloc assignments, inserting copies/loads.
7306  unsigned ExtraArgLocs = 0;
7307  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
7308    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7309    SDValue Arg = OutVals[i];
7310    ISD::ArgFlagsTy Flags = Outs[i].Flags;
7311
7312    // Promote the value if needed.
7313    switch (VA.getLocInfo()) {
7314    default:
7315      llvm_unreachable("Unknown loc info!");
7316    case CCValAssign::Full:
7317      break;
7318    case CCValAssign::SExt:
7319      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
7320      break;
7321    case CCValAssign::ZExt:
7322      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
7323      break;
7324    case CCValAssign::AExt:
7325      if (Outs[i].ArgVT == MVT::i1) {
7326        // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
7327        //
7328        // Check if we actually have to do this, because the value may
7329        // already be zero-extended.
7330        //
7331        // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
7332        // and rely on DAGCombiner to fold this, because the following
7333        // (anyext i32) is combined with (zext i8) in DAG.getNode:
7334        //
7335        //   (ext (zext x)) -> (zext x)
7336        //
7337        // This will give us (zext i32), which we cannot remove, so
7338        // try to check this beforehand.
7339        if (!checkZExtBool(Arg, DAG)) {
7340          Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
7341          Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
7342        }
7343      }
7344      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
7345      break;
7346    case CCValAssign::AExtUpper:
7347      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
7348      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
7349      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
7350                        DAG.getConstant(32, DL, VA.getLocVT()));
7351      break;
7352    case CCValAssign::BCvt:
7353      Arg = DAG.getBitcast(VA.getLocVT(), Arg);
7354      break;
7355    case CCValAssign::Trunc:
7356      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7357      break;
7358    case CCValAssign::FPExt:
7359      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
7360      break;
7361    case CCValAssign::Indirect:
7362      bool isScalable = VA.getValVT().isScalableVector();
7363      assert((isScalable || Subtarget->isWindowsArm64EC()) &&
7364             "Indirect arguments should be scalable on most subtargets");
7365
7366      uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
7367      uint64_t PartSize = StoreSize;
7368      unsigned NumParts = 1;
7369      if (Outs[i].Flags.isInConsecutiveRegs()) {
7370        assert(!Outs[i].Flags.isInConsecutiveRegsLast());
7371        while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7372          ++NumParts;
7373        StoreSize *= NumParts;
7374      }
7375
7376      Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
7377      Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
7378      int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
7379      if (isScalable)
7380        MFI.setStackID(FI, TargetStackID::ScalableVector);
7381
7382      MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
7383      SDValue Ptr = DAG.getFrameIndex(
7384          FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7385      SDValue SpillSlot = Ptr;
7386
7387      // Ensure we generate all stores for each tuple part, whilst updating the
7388      // pointer after each store correctly using vscale.
7389      while (NumParts) {
7390        Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
7391        NumParts--;
7392        if (NumParts > 0) {
7393          SDValue BytesIncrement;
7394          if (isScalable) {
7395            BytesIncrement = DAG.getVScale(
7396                DL, Ptr.getValueType(),
7397                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7398          } else {
7399            BytesIncrement = DAG.getConstant(
7400                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7401                Ptr.getValueType());
7402          }
7403          SDNodeFlags Flags;
7404          Flags.setNoUnsignedWrap(true);
7405
7406          MPI = MachinePointerInfo(MPI.getAddrSpace());
7407          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7408                            BytesIncrement, Flags);
7409          ExtraArgLocs++;
7410          i++;
7411        }
7412      }
7413
7414      Arg = SpillSlot;
7415      break;
7416    }
7417
7418    if (VA.isRegLoc()) {
7419      if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
7420          Outs[0].VT == MVT::i64) {
7421        assert(VA.getLocVT() == MVT::i64 &&
7422               "unexpected calling convention register assignment");
7423        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
7424               "unexpected use of 'returned'");
7425        IsThisReturn = true;
7426      }
7427      if (RegsUsed.count(VA.getLocReg())) {
7428        // If this register has already been used then we're trying to pack
7429        // parts of an [N x i32] into an X-register. The extension type will
        // take care of putting the two halves in the right place but we have
        // to combine them.
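        // For example (sketch): both halves of an [2 x i32] may be assigned
        // to x0; the first occupies bits [31:0] and the second, shifted by
        // the AExtUpper handling above, bits [63:32], so the OR below merges
        // them into a single register value.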
7432        SDValue &Bits =
7433            llvm::find_if(RegsToPass,
7434                          [=](const std::pair<unsigned, SDValue> &Elt) {
7435                            return Elt.first == VA.getLocReg();
7436                          })
7437                ->second;
7438        Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
7439        // Call site info is used for function's parameter entry value
7440        // tracking. For now we track only simple cases when parameter
7441        // is transferred through whole register.
7442        llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
7443          return ArgReg.Reg == VA.getLocReg();
7444        });
7445      } else {
7446        // Add an extra level of indirection for streaming mode changes by
7447        // using a pseudo copy node that cannot be rematerialised between a
7448        // smstart/smstop and the call by the simple register coalescer.
7449        if (RequiresSMChange && isa<FrameIndexSDNode>(Arg))
7450          Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg);
7451        RegsToPass.emplace_back(VA.getLocReg(), Arg);
7452        RegsUsed.insert(VA.getLocReg());
7453        const TargetOptions &Options = DAG.getTarget().Options;
7454        if (Options.EmitCallSiteInfo)
7455          CSInfo.emplace_back(VA.getLocReg(), i);
7456      }
7457    } else {
7458      assert(VA.isMemLoc());
7459
7460      SDValue DstAddr;
7461      MachinePointerInfo DstInfo;
7462
7463      // FIXME: This works on big-endian for composite byvals, which are the
7464      // common case. It should also work for fundamental types too.
7465      uint32_t BEAlign = 0;
7466      unsigned OpSize;
7467      if (VA.getLocInfo() == CCValAssign::Indirect ||
7468          VA.getLocInfo() == CCValAssign::Trunc)
7469        OpSize = VA.getLocVT().getFixedSizeInBits();
7470      else
7471        OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
7472                                 : VA.getValVT().getSizeInBits();
7473      OpSize = (OpSize + 7) / 8;
7474      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
7475          !Flags.isInConsecutiveRegs()) {
7476        if (OpSize < 8)
7477          BEAlign = 8 - OpSize;
7478      }
7479      unsigned LocMemOffset = VA.getLocMemOffset();
7480      int32_t Offset = LocMemOffset + BEAlign;
7481      SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
7482      PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
7483
7484      if (IsTailCall) {
7485        Offset = Offset + FPDiff;
7486        int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
7487
7488        DstAddr = DAG.getFrameIndex(FI, PtrVT);
7489        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
7490
7491        // Make sure any stack arguments overlapping with where we're storing
7492        // are loaded before this eventual operation. Otherwise they'll be
7493        // clobbered.
7494        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
7495      } else {
7496        SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
7497
7498        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
7499        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
7500      }
7501
7502      if (Outs[i].Flags.isByVal()) {
7503        SDValue SizeNode =
7504            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
7505        SDValue Cpy = DAG.getMemcpy(
7506            Chain, DL, DstAddr, Arg, SizeNode,
7507            Outs[i].Flags.getNonZeroByValAlign(),
7508            /*isVol = */ false, /*AlwaysInline = */ false,
7509            /*isTailCall = */ false, DstInfo, MachinePointerInfo());
7510
7511        MemOpChains.push_back(Cpy);
7512      } else {
7513        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
7514        // promoted to a legal register type i32, we should truncate Arg back to
7515        // i1/i8/i16.
7516        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
7517            VA.getValVT() == MVT::i16)
7518          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
7519
7520        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
7521        MemOpChains.push_back(Store);
7522      }
7523    }
7524  }
7525
7526  if (IsVarArg && Subtarget->isWindowsArm64EC()) {
7527    // For vararg calls, the Arm64EC ABI requires values in x4 and x5
7528    // describing the argument list.  x4 contains the address of the
7529    // first stack parameter. x5 contains the size in bytes of all parameters
7530    // passed on the stack.
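    // Sketch: for a call placing 24 bytes of arguments on the stack, x4 gets
    // the SP copy (the address of the first stack argument) and x5 gets 24.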
7531    RegsToPass.emplace_back(AArch64::X4, StackPtr);
7532    RegsToPass.emplace_back(AArch64::X5,
7533                            DAG.getConstant(NumBytes, DL, MVT::i64));
7534  }
7535
7536  if (!MemOpChains.empty())
7537    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
7538
7539  SDValue InFlag;
7540  if (RequiresSMChange) {
7541    SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain,
7542                                           InFlag, PStateSM, true);
7543    Chain = NewChain.getValue(0);
7544    InFlag = NewChain.getValue(1);
7545  }
7546
7547  // Build a sequence of copy-to-reg nodes chained together with token chain
7548  // and flag operands which copy the outgoing args into the appropriate regs.
7549  for (auto &RegToPass : RegsToPass) {
7550    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
7551                             RegToPass.second, InFlag);
7552    InFlag = Chain.getValue(1);
7553  }
7554
7555  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
7556  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
7557  // node so that legalize doesn't hack it.
7558  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7559    auto GV = G->getGlobal();
7560    unsigned OpFlags =
7561        Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
7562    if (OpFlags & AArch64II::MO_GOT) {
7563      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
7564      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
7565    } else {
7566      const GlobalValue *GV = G->getGlobal();
7567      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
7568    }
7569  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
7570    if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7571        Subtarget->isTargetMachO()) {
7572      const char *Sym = S->getSymbol();
7573      Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
7574      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
7575    } else {
7576      const char *Sym = S->getSymbol();
7577      Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
7578    }
7579  }
7580
7581  // We don't usually want to end the call-sequence here because we would tidy
7582  // the frame up *after* the call, however in the ABI-changing tail-call case
7583  // we've carefully laid out the parameters so that when sp is reset they'll be
7584  // in the correct location.
7585  if (IsTailCall && !IsSibCall) {
7586    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, DL);
7587    InFlag = Chain.getValue(1);
7588  }
7589
7590  std::vector<SDValue> Ops;
7591  Ops.push_back(Chain);
7592  Ops.push_back(Callee);
7593
7594  if (IsTailCall) {
7595    // Each tail call may have to adjust the stack by a different amount, so
7596    // this information must travel along with the operation for eventual
7597    // consumption by emitEpilogue.
7598    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
7599  }
7600
7601  // Add argument registers to the end of the list so that they are known live
7602  // into the call.
7603  for (auto &RegToPass : RegsToPass)
7604    Ops.push_back(DAG.getRegister(RegToPass.first,
7605                                  RegToPass.second.getValueType()));
7606
7607  // Add a register mask operand representing the call-preserved registers.
7608  const uint32_t *Mask;
7609  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7610  if (IsThisReturn) {
7611    // For 'this' returns, use the X0-preserving mask if applicable
7612    Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
7613    if (!Mask) {
7614      IsThisReturn = false;
7615      Mask = TRI->getCallPreservedMask(MF, CallConv);
7616    }
7617  } else
7618    Mask = TRI->getCallPreservedMask(MF, CallConv);
7619
7620  if (Subtarget->hasCustomCallingConv())
7621    TRI->UpdateCustomCallPreservedMask(MF, &Mask);
7622
7623  if (TRI->isAnyArgRegReserved(MF))
7624    TRI->emitReservedArgRegCallError(MF);
7625
7626  assert(Mask && "Missing call preserved mask for calling convention");
7627  Ops.push_back(DAG.getRegisterMask(Mask));
7628
7629  if (InFlag.getNode())
7630    Ops.push_back(InFlag);
7631
7632  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7633
  // If we're doing a tail call, use a TC_RETURN here rather than an
7635  // actual call instruction.
7636  if (IsTailCall) {
7637    MF.getFrameInfo().setHasTailCall();
7638    SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
7639
7640    if (IsCFICall)
7641      Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
7642
7643    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
7644    return Ret;
7645  }
7646
7647  unsigned CallOpc = AArch64ISD::CALL;
7648  // Calls with operand bundle "clang.arc.attachedcall" are special. They should
7649  // be expanded to the call, directly followed by a special marker sequence and
7650  // a call to an ObjC library function.  Use CALL_RVMARKER to do that.
7651  if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
7652    assert(!IsTailCall &&
7653           "tail calls cannot be marked with clang.arc.attachedcall");
7654    CallOpc = AArch64ISD::CALL_RVMARKER;
7655
7656    // Add a target global address for the retainRV/claimRV runtime function
7657    // just before the call target.
7658    Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
7659    auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
7660    Ops.insert(Ops.begin() + 1, GA);
7661  } else if (GuardWithBTI)
7662    CallOpc = AArch64ISD::CALL_BTI;
7663
7664  // Returns a chain and a flag for retval copy to use.
7665  Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
7666
7667  if (IsCFICall)
7668    Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
7669
7670  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
7671  InFlag = Chain.getValue(1);
7672  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
7673
7674  uint64_t CalleePopBytes =
7675      DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
7676
7677  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, DL);
7678  InFlag = Chain.getValue(1);
7679
7680  // Handle result values, copying them out of physregs into vregs that we
7681  // return.
7682  SDValue Result = LowerCallResult(Chain, InFlag, CallConv, IsVarArg, RVLocs,
7683                                   DL, DAG, InVals, IsThisReturn,
7684                                   IsThisReturn ? OutVals[0] : SDValue());
7685
7686  if (!Ins.empty())
7687    InFlag = Result.getValue(Result->getNumValues() - 1);
7688
7689  if (RequiresSMChange) {
7690    assert(PStateSM && "Expected a PStateSM to be set");
7691    Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InFlag,
7692                                 PStateSM, false);
7693  }
7694
7695  if (RequiresLazySave) {
7696    // Unconditionally resume ZA.
7697    Result = DAG.getNode(
7698        AArch64ISD::SMSTART, DL, MVT::Other, Result,
7699        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
7700        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
7701
7702    // Conditionally restore the lazy save using a pseudo node.
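    // Rough sketch of what the RESTORE_ZA pseudo is expected to become much
    // later (per the SME lazy-save ABI; the exact expansion is emitted
    // elsewhere): the restore routine only runs if the callee committed the
    // lazy save, i.e. if it cleared TPIDR2_EL0.
    //   mrs  x8, TPIDR2_EL0
    //   cbnz x8, 1f
    //   bl   __arm_tpidr2_restore     ; x0 = address of the TPIDR2 block
    // 1: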
7703    unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
7704    SDValue RegMask = DAG.getRegisterMask(
7705        TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
7706    SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
7707        "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
7708    SDValue TPIDR2_EL0 = DAG.getNode(
7709        ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
7710        DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
7711
7712    // Copy the address of the TPIDR2 block into X0 before 'calling' the
7713    // RESTORE_ZA pseudo.
7714    SDValue Glue;
7715    SDValue TPIDR2Block = DAG.getFrameIndex(
7716        FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7717    Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
7718    Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
7719                         {Result, TPIDR2_EL0,
7720                          DAG.getRegister(AArch64::X0, MVT::i64),
7721                          RestoreRoutine,
7722                          RegMask,
7723                          Result.getValue(1)});
7724
7725    // Finally reset the TPIDR2_EL0 register to 0.
7726    Result = DAG.getNode(
7727        ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
7728        DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7729        DAG.getConstant(0, DL, MVT::i64));
7730  }
7731
7732  if (RequiresSMChange || RequiresLazySave) {
7733    for (unsigned I = 0; I < InVals.size(); ++I) {
7734      // The smstart/smstop is chained as part of the call, but when the
7735      // resulting chain is discarded (which happens when the call is not part
7736      // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
7737      // smstart/smstop is chained to the result value. We can do that by doing
7738      // a vreg -> vreg copy.
7739      Register Reg = MF.getRegInfo().createVirtualRegister(
7740          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7741      SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
7742      InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
7743                                     InVals[I].getValueType());
7744    }
7745  }
7746
7747  return Result;
7748}
7749
7750bool AArch64TargetLowering::CanLowerReturn(
7751    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
7752    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
7753  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7754  SmallVector<CCValAssign, 16> RVLocs;
7755  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
7756  return CCInfo.CheckReturn(Outs, RetCC);
7757}
7758
7759SDValue
7760AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
7761                                   bool isVarArg,
7762                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
7763                                   const SmallVectorImpl<SDValue> &OutVals,
7764                                   const SDLoc &DL, SelectionDAG &DAG) const {
7765  auto &MF = DAG.getMachineFunction();
7766  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7767
7768  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7769  SmallVector<CCValAssign, 16> RVLocs;
7770  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
7771  CCInfo.AnalyzeReturn(Outs, RetCC);
7772
7773  // Copy the result values into the output registers.
7774  SDValue Flag;
7775  SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
7776  SmallSet<unsigned, 4> RegsUsed;
7777  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
7778       ++i, ++realRVLocIdx) {
7779    CCValAssign &VA = RVLocs[i];
7780    assert(VA.isRegLoc() && "Can only return in registers!");
7781    SDValue Arg = OutVals[realRVLocIdx];
7782
7783    switch (VA.getLocInfo()) {
7784    default:
7785      llvm_unreachable("Unknown loc info!");
7786    case CCValAssign::Full:
7787      if (Outs[i].ArgVT == MVT::i1) {
7788        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
7789        // value. This is strictly redundant on Darwin (which uses "zeroext
7790        // i1"), but will be optimised out before ISel.
7791        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
7792        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
7793      }
7794      break;
7795    case CCValAssign::BCvt:
7796      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
7797      break;
7798    case CCValAssign::AExt:
7799    case CCValAssign::ZExt:
7800      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7801      break;
7802    case CCValAssign::AExtUpper:
7803      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
7804      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7805      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
7806                        DAG.getConstant(32, DL, VA.getLocVT()));
7807      break;
7808    }
7809
7810    if (RegsUsed.count(VA.getLocReg())) {
7811      SDValue &Bits =
7812          llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
7813            return Elt.first == VA.getLocReg();
7814          })->second;
7815      Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
7816    } else {
7817      RetVals.emplace_back(VA.getLocReg(), Arg);
7818      RegsUsed.insert(VA.getLocReg());
7819    }
7820  }
7821
7822  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7823
7824  // Emit SMSTOP before returning from a locally streaming function
7825  SMEAttrs FuncAttrs(MF.getFunction());
7826  if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
7827    Chain = DAG.getNode(
7828        AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
7829        DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
7830        DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64),
7831        DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
7832    Flag = Chain.getValue(1);
7833  }
7834
7835  SmallVector<SDValue, 4> RetOps(1, Chain);
7836  for (auto &RetVal : RetVals) {
7837    Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
7838    Flag = Chain.getValue(1);
7839    RetOps.push_back(
7840        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
7841  }
7842
  // The Windows AArch64 ABI requires that when returning a struct by value we
  // copy the sret argument into X0 for the return.
7845  // We saved the argument into a virtual register in the entry block,
7846  // so now we copy the value out and into X0.
7847  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
7848    SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
7849                                     getPointerTy(MF.getDataLayout()));
7850
7851    unsigned RetValReg = AArch64::X0;
7852    Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
7853    Flag = Chain.getValue(1);
7854
7855    RetOps.push_back(
7856      DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
7857  }
7858
7859  const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
7860  if (I) {
7861    for (; *I; ++I) {
7862      if (AArch64::GPR64RegClass.contains(*I))
7863        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
7864      else if (AArch64::FPR64RegClass.contains(*I))
7865        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
7866      else
7867        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
7868    }
7869  }
7870
7871  RetOps[0] = Chain; // Update chain.
7872
7873  // Add the flag if we have it.
7874  if (Flag.getNode())
7875    RetOps.push_back(Flag);
7876
7877  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
7878}
7879
7880//===----------------------------------------------------------------------===//
7881//  Other Lowering Code
7882//===----------------------------------------------------------------------===//
7883
7884SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
7885                                             SelectionDAG &DAG,
7886                                             unsigned Flag) const {
7887  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
7888                                    N->getOffset(), Flag);
7889}
7890
7891SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
7892                                             SelectionDAG &DAG,
7893                                             unsigned Flag) const {
7894  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
7895}
7896
7897SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
7898                                             SelectionDAG &DAG,
7899                                             unsigned Flag) const {
7900  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
7901                                   N->getOffset(), Flag);
7902}
7903
7904SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
7905                                             SelectionDAG &DAG,
7906                                             unsigned Flag) const {
7907  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
7908}
7909
7910// (loadGOT sym)
7911template <class NodeTy>
7912SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
7913                                      unsigned Flags) const {
7914  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
7915  SDLoc DL(N);
7916  EVT Ty = getPointerTy(DAG.getDataLayout());
7917  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
7918  // FIXME: Once remat is capable of dealing with instructions with register
7919  // operands, expand this into two nodes instead of using a wrapper node.
7920  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
7921}
7922
7923// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
7924template <class NodeTy>
7925SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
7926                                            unsigned Flags) const {
7927  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
7928  SDLoc DL(N);
7929  EVT Ty = getPointerTy(DAG.getDataLayout());
7930  const unsigned char MO_NC = AArch64II::MO_NC;
7931  return DAG.getNode(
7932      AArch64ISD::WrapperLarge, DL, Ty,
7933      getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
7934      getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
7935      getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
7936      getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
7937}
7938
7939// (addlow (adrp %hi(sym)) %lo(sym))
7940template <class NodeTy>
7941SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
7942                                       unsigned Flags) const {
7943  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
7944  SDLoc DL(N);
7945  EVT Ty = getPointerTy(DAG.getDataLayout());
7946  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
7947  SDValue Lo = getTargetNode(N, Ty, DAG,
7948                             AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
7949  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
7950  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
7951}
7952
7953// (adr sym)
7954template <class NodeTy>
7955SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
7956                                           unsigned Flags) const {
7957  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
7958  SDLoc DL(N);
7959  EVT Ty = getPointerTy(DAG.getDataLayout());
7960  SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
7961  return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
7962}
7963
7964SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
7965                                                  SelectionDAG &DAG) const {
7966  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
7967  const GlobalValue *GV = GN->getGlobal();
7968  unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7969
7970  if (OpFlags != AArch64II::MO_NO_FLAG)
7971    assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
7972           "unexpected offset in global node");
7973
7974  // This also catches the large code model case for Darwin, and tiny code
7975  // model with got relocations.
7976  if ((OpFlags & AArch64II::MO_GOT) != 0) {
7977    return getGOT(GN, DAG, OpFlags);
7978  }
7979
7980  SDValue Result;
7981  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
7982    Result = getAddrLarge(GN, DAG, OpFlags);
7983  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7984    Result = getAddrTiny(GN, DAG, OpFlags);
7985  } else {
7986    Result = getAddr(GN, DAG, OpFlags);
7987  }
7988  EVT PtrVT = getPointerTy(DAG.getDataLayout());
7989  SDLoc DL(GN);
7990  if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX |
7991                 AArch64II::MO_COFFSTUB))
7992    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
7993                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
7994  return Result;
7995}
7996
7997/// Convert a TLS address reference into the correct sequence of loads
7998/// and calls to compute the variable's address (for Darwin, currently) and
7999/// return an SDValue containing the final node.
8000
8001/// Darwin only has one TLS scheme which must be capable of dealing with the
8002/// fully general situation, in the worst case. This means:
8003///     + "extern __thread" declaration.
8004///     + Defined in a possibly unknown dynamic library.
8005///
8006/// The general system is that each __thread variable has a [3 x i64] descriptor
8007/// which contains information used by the runtime to calculate the address. The
8008/// only part of this the compiler needs to know about is the first xword, which
8009/// contains a function pointer that must be called with the address of the
8010/// entire descriptor in "x0".
8011///
8012/// Since this descriptor may be in a different unit, in general even the
8013/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8014/// is:
8015///     adrp x0, _var@TLVPPAGE
8016///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
8017///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
8018///                                      ; the function pointer
8019///     blr x1                           ; Uses descriptor address in x0
8020///     ; Address of _var is now in x0.
8021///
8022/// If the address of _var's descriptor *is* known to the linker, then it can
8023/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8024/// a slight efficiency gain.
8025SDValue
8026AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8027                                                   SelectionDAG &DAG) const {
8028  assert(Subtarget->isTargetDarwin() &&
8029         "This function expects a Darwin target");
8030
8031  SDLoc DL(Op);
8032  MVT PtrVT = getPointerTy(DAG.getDataLayout());
8033  MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8034  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8035
8036  SDValue TLVPAddr =
8037      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8038  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8039
8040  // The first entry in the descriptor is a function pointer that we must call
8041  // to obtain the address of the variable.
8042  SDValue Chain = DAG.getEntryNode();
8043  SDValue FuncTLVGet = DAG.getLoad(
8044      PtrMemVT, DL, Chain, DescAddr,
8045      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
8046      Align(PtrMemVT.getSizeInBits() / 8),
8047      MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8048  Chain = FuncTLVGet.getValue(1);
8049
8050  // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8051  FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8052
8053  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8054  MFI.setAdjustsStack(true);
8055
8056  // TLS calls preserve all registers except those that absolutely must be
8057  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8058  // silly).
8059  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8060  const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8061  if (Subtarget->hasCustomCallingConv())
8062    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8063
8064  // Finally, we can make the call. This is just a degenerate version of a
8065  // normal AArch64 call node: x0 takes the address of the descriptor, and
8066  // returns the address of the variable in this thread.
8067  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8068  Chain =
8069      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8070                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8071                  DAG.getRegisterMask(Mask), Chain.getValue(1));
8072  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8073}
8074
8075/// Convert a thread-local variable reference into a sequence of instructions to
8076/// compute the variable's address for the local exec TLS model of ELF targets.
8077/// The sequence depends on the maximum TLS area size.
8078SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8079                                                    SDValue ThreadBase,
8080                                                    const SDLoc &DL,
8081                                                    SelectionDAG &DAG) const {
8082  EVT PtrVT = getPointerTy(DAG.getDataLayout());
8083  SDValue TPOff, Addr;
8084
8085  switch (DAG.getTarget().Options.TLSSize) {
8086  default:
8087    llvm_unreachable("Unexpected TLS size");
8088
8089  case 12: {
8090    // mrs   x0, TPIDR_EL0
8091    // add   x0, x0, :tprel_lo12:a
8092    SDValue Var = DAG.getTargetGlobalAddress(
8093        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8094    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8095                                      Var,
8096                                      DAG.getTargetConstant(0, DL, MVT::i32)),
8097                   0);
8098  }
8099
8100  case 24: {
8101    // mrs   x0, TPIDR_EL0
8102    // add   x0, x0, :tprel_hi12:a
8103    // add   x0, x0, :tprel_lo12_nc:a
8104    SDValue HiVar = DAG.getTargetGlobalAddress(
8105        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8106    SDValue LoVar = DAG.getTargetGlobalAddress(
8107        GV, DL, PtrVT, 0,
8108        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8109    Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8110                                      HiVar,
8111                                      DAG.getTargetConstant(0, DL, MVT::i32)),
8112                   0);
8113    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8114                                      LoVar,
8115                                      DAG.getTargetConstant(0, DL, MVT::i32)),
8116                   0);
8117  }
8118
8119  case 32: {
8120    // mrs   x1, TPIDR_EL0
8121    // movz  x0, #:tprel_g1:a
8122    // movk  x0, #:tprel_g0_nc:a
8123    // add   x0, x1, x0
8124    SDValue HiVar = DAG.getTargetGlobalAddress(
8125        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
8126    SDValue LoVar = DAG.getTargetGlobalAddress(
8127        GV, DL, PtrVT, 0,
8128        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8129    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8130                                       DAG.getTargetConstant(16, DL, MVT::i32)),
8131                    0);
8132    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8133                                       DAG.getTargetConstant(0, DL, MVT::i32)),
8134                    0);
8135    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8136  }
8137
8138  case 48: {
8139    // mrs   x1, TPIDR_EL0
8140    // movz  x0, #:tprel_g2:a
8141    // movk  x0, #:tprel_g1_nc:a
8142    // movk  x0, #:tprel_g0_nc:a
8143    // add   x0, x1, x0
8144    SDValue HiVar = DAG.getTargetGlobalAddress(
8145        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
8146    SDValue MiVar = DAG.getTargetGlobalAddress(
8147        GV, DL, PtrVT, 0,
8148        AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
8149    SDValue LoVar = DAG.getTargetGlobalAddress(
8150        GV, DL, PtrVT, 0,
8151        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8152    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8153                                       DAG.getTargetConstant(32, DL, MVT::i32)),
8154                    0);
8155    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
8156                                       DAG.getTargetConstant(16, DL, MVT::i32)),
8157                    0);
8158    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8159                                       DAG.getTargetConstant(0, DL, MVT::i32)),
8160                    0);
8161    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8162  }
8163  }
8164}
8165
8166/// When accessing thread-local variables under either the general-dynamic or
8167/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
8168/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
8169/// is a function pointer to carry out the resolution.
8170///
8171/// The sequence is:
8172///    adrp  x0, :tlsdesc:var
8173///    ldr   x1, [x0, #:tlsdesc_lo12:var]
8174///    add   x0, x0, #:tlsdesc_lo12:var
8175///    .tlsdesccall var
8176///    blr   x1
8177///    (TPIDR_EL0 offset now in x0)
8178///
///  The above sequence must be produced unscheduled so that the linker can
///  optimize/relax it.
///  Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
///  above sequence, and is expanded very late in the compilation flow to
///  ensure it is emitted exactly as specified.
8184SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
8185                                                      const SDLoc &DL,
8186                                                      SelectionDAG &DAG) const {
8187  EVT PtrVT = getPointerTy(DAG.getDataLayout());
8188
8189  SDValue Chain = DAG.getEntryNode();
8190  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8191
8192  Chain =
8193      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
8194  SDValue Glue = Chain.getValue(1);
8195
8196  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
8197}
8198
8199SDValue
8200AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
8201                                                SelectionDAG &DAG) const {
8202  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
8203
8204  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8205
8206  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
8207
8208  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
8209    if (Model == TLSModel::LocalDynamic)
8210      Model = TLSModel::GeneralDynamic;
8211  }
8212
8213  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8214      Model != TLSModel::LocalExec)
8215    report_fatal_error("ELF TLS only supported in small memory model or "
8216                       "in local exec TLS model");
8217  // Different choices can be made for the maximum size of the TLS area for a
8218  // module. For the small address model, the default TLS size is 16MiB and the
8219  // maximum TLS size is 4GiB.
8220  // FIXME: add tiny and large code model support for TLS access models other
8221  // than local exec. We currently generate the same code as small for tiny,
8222  // which may be larger than needed.
8223
8224  SDValue TPOff;
8225  EVT PtrVT = getPointerTy(DAG.getDataLayout());
8226  SDLoc DL(Op);
8227  const GlobalValue *GV = GA->getGlobal();
8228
8229  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
8230
8231  if (Model == TLSModel::LocalExec) {
8232    return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
8233  } else if (Model == TLSModel::InitialExec) {
8234    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8235    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
8236  } else if (Model == TLSModel::LocalDynamic) {
8237    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
8238    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
8239    // the beginning of the module's TLS region, followed by a DTPREL offset
8240    // calculation.
8241
8242    // These accesses will need deduplicating if there's more than one.
8243    AArch64FunctionInfo *MFI =
8244        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
8245    MFI->incNumLocalDynamicTLSAccesses();
8246
8247    // The call needs a relocation too for linker relaxation. It doesn't make
8248    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
8249    // the address.
8250    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
8251                                                  AArch64II::MO_TLS);
8252
8253    // Now we can calculate the offset from TPIDR_EL0 to this module's
8254    // thread-local area.
8255    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
8256
8257    // Now use :dtprel_whatever: operations to calculate this variable's offset
8258    // in its thread-storage area.
8259    SDValue HiVar = DAG.getTargetGlobalAddress(
8260        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8261    SDValue LoVar = DAG.getTargetGlobalAddress(
8262        GV, DL, MVT::i64, 0,
8263        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8264
8265    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
8266                                       DAG.getTargetConstant(0, DL, MVT::i32)),
8267                    0);
8268    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
8269                                       DAG.getTargetConstant(0, DL, MVT::i32)),
8270                    0);
8271  } else if (Model == TLSModel::GeneralDynamic) {
8272    // The call needs a relocation too for linker relaxation. It doesn't make
8273    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
8274    // the address.
8275    SDValue SymAddr =
8276        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8277
8278    // Finally we can make a call to calculate the offset from tpidr_el0.
8279    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
8280  } else
8281    llvm_unreachable("Unsupported ELF TLS access model");
8282
8283  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8284}
8285
8286SDValue
8287AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
8288                                                    SelectionDAG &DAG) const {
8289  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
8290
8291  SDValue Chain = DAG.getEntryNode();
8292  EVT PtrVT = getPointerTy(DAG.getDataLayout());
8293  SDLoc DL(Op);
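  // Illustrative shape of the access sequence built below (register numbers
  // are arbitrary; relocation spellings depend on the COFF asm printer):
  //   ldr  x8, [x18, #0x58]          ; TLS array pointer from the TEB
  //   adrp x9, _tls_index
  //   ldr  w9, [x9, :lo12:_tls_index]
  //   ldr  x8, [x8, x9, lsl #3]      ; base of this module's TLS data
  //   <add the variable's offset from the start of the .tls section>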
8294
8295  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
8296
8297  // Load the ThreadLocalStoragePointer from the TEB
8298  // A pointer to the TLS array is located at offset 0x58 from the TEB.
8299  SDValue TLSArray =
8300      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
8301  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
8302  Chain = TLSArray.getValue(1);
8303
  // Load the TLS index from the C runtime.
8305  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
8306  // This also does the same as LOADgot, but using a generic i32 load,
8307  // while LOADgot only loads i64.
8308  SDValue TLSIndexHi =
8309      DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
8310  SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
8311      "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8312  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
8313  SDValue TLSIndex =
8314      DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
8315  TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
8316  Chain = TLSIndex.getValue(1);
8317
8318  // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
8319  // offset into the TLSArray.
8320  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
8321  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
8322                             DAG.getConstant(3, DL, PtrVT));
8323  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
8324                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
8325                            MachinePointerInfo());
8326  Chain = TLS.getValue(1);
8327
8328  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8329  const GlobalValue *GV = GA->getGlobal();
8330  SDValue TGAHi = DAG.getTargetGlobalAddress(
8331      GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8332  SDValue TGALo = DAG.getTargetGlobalAddress(
8333      GV, DL, PtrVT, 0,
8334      AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8335
8336  // Add the offset from the start of the .tls section (section base).
8337  SDValue Addr =
8338      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
8339                                 DAG.getTargetConstant(0, DL, MVT::i32)),
8340              0);
8341  Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
8342  return Addr;
8343}
8344
8345SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
8346                                                     SelectionDAG &DAG) const {
8347  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8348  if (DAG.getTarget().useEmulatedTLS())
8349    return LowerToTLSEmulatedModel(GA, DAG);
8350
8351  if (Subtarget->isTargetDarwin())
8352    return LowerDarwinGlobalTLSAddress(Op, DAG);
8353  if (Subtarget->isTargetELF())
8354    return LowerELFGlobalTLSAddress(Op, DAG);
8355  if (Subtarget->isTargetWindows())
8356    return LowerWindowsGlobalTLSAddress(Op, DAG);
8357
8358  llvm_unreachable("Unexpected platform trying to use TLS");
8359}
8360
// Looks through \p Val to determine the bit that can be used to
// check the sign of the value. It returns the unextended value and
// the sign bit position.
8364std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
8365  if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
8366    return {Val.getOperand(0),
8367            cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
8368                1};
8369
8370  if (Val.getOpcode() == ISD::SIGN_EXTEND)
8371    return {Val.getOperand(0),
8372            Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
8373
8374  return {Val, Val.getValueSizeInBits() - 1};
8375}
8376
8377SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
8378  SDValue Chain = Op.getOperand(0);
8379  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
8380  SDValue LHS = Op.getOperand(2);
8381  SDValue RHS = Op.getOperand(3);
8382  SDValue Dest = Op.getOperand(4);
8383  SDLoc dl(Op);
8384
8385  MachineFunction &MF = DAG.getMachineFunction();
8386  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
8387  // will not be produced, as they are conditional branch instructions that do
8388  // not set flags.
8389  bool ProduceNonFlagSettingCondBr =
8390      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
8391
8392  // Handle f128 first, since lowering it will result in comparing the return
8393  // value of a libcall against zero, which is just what the rest of LowerBR_CC
8394  // is expecting to deal with.
8395  if (LHS.getValueType() == MVT::f128) {
8396    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
8397
8398    // If softenSetCCOperands returned a scalar, we need to compare the result
8399    // against zero to select between true and false values.
8400    if (!RHS.getNode()) {
8401      RHS = DAG.getConstant(0, dl, LHS.getValueType());
8402      CC = ISD::SETNE;
8403    }
8404  }
8405
8406  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
8407  // instruction.
8408  if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
8409      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8410    // Only lower legal XALUO ops.
8411    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
8412      return SDValue();
8413
8414    // The actual operation with overflow check.
8415    AArch64CC::CondCode OFCC;
8416    SDValue Value, Overflow;
8417    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
8418
8419    if (CC == ISD::SETNE)
8420      OFCC = getInvertedCondCode(OFCC);
8421    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
8422
8423    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
8424                       Overflow);
8425  }
8426
8427  if (LHS.getValueType().isInteger()) {
8428    assert((LHS.getValueType() == RHS.getValueType()) &&
8429           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
8430
8431    // If the RHS of the comparison is zero, we can potentially fold this
8432    // to a specialized branch.
8433    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
8434    if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
8435      if (CC == ISD::SETEQ) {
8436        // See if we can use a TBZ to fold in an AND as well.
8437        // TBZ has a smaller branch displacement than CBZ.  If the offset is
8438        // out of bounds, a late MI-layer pass rewrites branches.
8439        // 403.gcc is an example that hits this case.
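        // For example, (brcond (seteq (and w0, 4), 0), dest) is emitted as
        // roughly:
        //   tbz w0, #2, dest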
8440        if (LHS.getOpcode() == ISD::AND &&
8441            isa<ConstantSDNode>(LHS.getOperand(1)) &&
8442            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
8443          SDValue Test = LHS.getOperand(0);
8444          uint64_t Mask = LHS.getConstantOperandVal(1);
8445          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
8446                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
8447                             Dest);
8448        }
8449
8450        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
8451      } else if (CC == ISD::SETNE) {
8452        // See if we can use a TBZ to fold in an AND as well.
8453        // TBZ has a smaller branch displacement than CBZ.  If the offset is
8454        // out of bounds, a late MI-layer pass rewrites branches.
8455        // 403.gcc is an example that hits this case.
8456        if (LHS.getOpcode() == ISD::AND &&
8457            isa<ConstantSDNode>(LHS.getOperand(1)) &&
8458            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
8459          SDValue Test = LHS.getOperand(0);
8460          uint64_t Mask = LHS.getConstantOperandVal(1);
8461          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
8462                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
8463                             Dest);
8464        }
8465
8466        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
8467      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
8468        // Don't combine AND since emitComparison converts the AND to an ANDS
8469        // (a.k.a. TST) and the test in the test bit and branch instruction
8470        // becomes redundant.  This would also increase register pressure.
8471        uint64_t SignBitPos;
8472        std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
8473        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
8474                           DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
8475      }
8476    }
8477    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
8478        LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
8479      // Don't combine AND since emitComparison converts the AND to an ANDS
8480      // (a.k.a. TST) and the test in the test bit and branch instruction
8481      // becomes redundant.  This would also increase register pressure.
8482      uint64_t SignBitPos;
8483      std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
8484      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
8485                         DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
8486    }
8487
8488    SDValue CCVal;
8489    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
8490    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
8491                       Cmp);
8492  }
8493
8494  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
8495         LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
8496
8497  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
8498  // clean.  Some of them require two branches to implement.
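  // For example, an ordered not-equal (SETONE) comparison maps to two AArch64
  // condition codes (roughly MI and GT), producing something like:
  //   fcmp s0, s1
  //   b.mi dest
  //   b.gt dest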
8499  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8500  AArch64CC::CondCode CC1, CC2;
8501  changeFPCCToAArch64CC(CC, CC1, CC2);
8502  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8503  SDValue BR1 =
8504      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
8505  if (CC2 != AArch64CC::AL) {
8506    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8507    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
8508                       Cmp);
8509  }
8510
8511  return BR1;
8512}
8513
8514SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
8515                                              SelectionDAG &DAG) const {
8516  if (!Subtarget->hasNEON())
8517    return SDValue();
8518
8519  EVT VT = Op.getValueType();
8520  EVT IntVT = VT.changeTypeToInteger();
8521  SDLoc DL(Op);
8522
8523  SDValue In1 = Op.getOperand(0);
8524  SDValue In2 = Op.getOperand(1);
8525  EVT SrcVT = In2.getValueType();
8526
8527  if (!SrcVT.bitsEq(VT))
8528    In2 = DAG.getFPExtendOrRound(In2, DL, VT);
8529
8530  if (VT.isScalableVector())
8531    IntVT =
8532        getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
8533
  if (VT.isFixedLengthVector() &&
      useSVEForFixedLengthVectorVT(
          VT, Subtarget->forceStreamingCompatibleSVE())) {
8536    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
8537
8538    In1 = convertToScalableVector(DAG, ContainerVT, In1);
8539    In2 = convertToScalableVector(DAG, ContainerVT, In2);
8540
8541    SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
8542    return convertFromScalableVector(DAG, VT, Res);
8543  }
8544
8545  auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
8546    if (VT.isScalableVector())
8547      return getSVESafeBitCast(VT, Op, DAG);
8548
8549    return DAG.getBitcast(VT, Op);
8550  };
8551
8552  SDValue VecVal1, VecVal2;
8553  EVT VecVT;
8554  auto SetVecVal = [&](int Idx = -1) {
8555    if (!VT.isVector()) {
8556      VecVal1 =
8557          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
8558      VecVal2 =
8559          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
8560    } else {
8561      VecVal1 = BitCast(VecVT, In1, DAG);
8562      VecVal2 = BitCast(VecVT, In2, DAG);
8563    }
8564  };
8565  if (VT.isVector()) {
8566    VecVT = IntVT;
8567    SetVecVal();
8568  } else if (VT == MVT::f64) {
8569    VecVT = MVT::v2i64;
8570    SetVecVal(AArch64::dsub);
8571  } else if (VT == MVT::f32) {
8572    VecVT = MVT::v4i32;
8573    SetVecVal(AArch64::ssub);
8574  } else if (VT == MVT::f16) {
8575    VecVT = MVT::v8i16;
8576    SetVecVal(AArch64::hsub);
8577  } else {
8578    llvm_unreachable("Invalid type for copysign!");
8579  }
8580
8581  unsigned BitWidth = In1.getScalarValueSizeInBits();
8582  SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
8583
8584  // We want to materialize a mask with every bit but the high bit set, but the
8585  // AdvSIMD immediate moves cannot materialize that in a single instruction for
8586  // 64-bit elements. Instead, materialize all bits set and then negate that.
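  // For f64/v2f64 the 0x7fff...ffff mask therefore ends up as roughly:
  //   movi v1.2d, #0xffffffffffffffff
  //   fneg v1.2d, v1.2d             ; flips (clears) each lane's sign bit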
8587  if (VT == MVT::f64 || VT == MVT::v2f64) {
8588    SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
8589    SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
8590    SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
8591    SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
8592  }
8593
8594  SDValue BSP =
8595      DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
8596  if (VT == MVT::f16)
8597    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
8598  if (VT == MVT::f32)
8599    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
8600  if (VT == MVT::f64)
8601    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
8602
8603  return BitCast(VT, BSP, DAG);
8604}
8605
8606SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
8607                                                 SelectionDAG &DAG) const {
8608  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
8609          Attribute::NoImplicitFloat))
8610    return SDValue();
8611
8612  if (!Subtarget->hasNEON())
8613    return SDValue();
8614
8615  bool IsParity = Op.getOpcode() == ISD::PARITY;
8616  SDValue Val = Op.getOperand(0);
8617  SDLoc DL(Op);
8618  EVT VT = Op.getValueType();
8619
  // For i32, a general parity computation using EORs is more efficient than
  // going through the floating-point/AdvSIMD path below.
8622  if (VT == MVT::i32 && IsParity)
8623    return SDValue();
8624
  // If there is no CNT instruction available, a GPR popcount can
  // be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions, as long as the copies to/from
  // the AdvSIMD registers are cheap.
8629  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
8630  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
8631  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
8632  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
8633  if (VT == MVT::i32 || VT == MVT::i64) {
8634    if (VT == MVT::i32)
8635      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
8636    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
8637
8638    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
8639    SDValue UaddLV = DAG.getNode(
8640        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
8641        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
8642
8643    if (IsParity)
8644      UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
8645                           DAG.getConstant(1, DL, MVT::i32));
8646
8647    if (VT == MVT::i64)
8648      UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
8649    return UaddLV;
8650  } else if (VT == MVT::i128) {
8651    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
8652
8653    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
8654    SDValue UaddLV = DAG.getNode(
8655        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
8656        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
8657
8658    if (IsParity)
8659      UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
8660                           DAG.getConstant(1, DL, MVT::i32));
8661
8662    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
8663  }
8664
8665  assert(!IsParity && "ISD::PARITY of vector types not supported");
8666
8667  if (VT.isScalableVector() ||
8668      useSVEForFixedLengthVectorVT(VT,
8669                                   Subtarget->forceStreamingCompatibleSVE()))
8670    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
8671
8672  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
8673          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
8674         "Unexpected type for custom ctpop lowering");
8675
8676  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
8677  Val = DAG.getBitcast(VT8Bit, Val);
8678  Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
8679
8680  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
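  // For example, for VT == v4i32 this builds roughly:
  //   cnt    v0.16b, v0.16b
  //   uaddlp v0.8h, v0.16b
  //   uaddlp v0.4s, v0.8h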
8681  unsigned EltSize = 8;
8682  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
8683  while (EltSize != VT.getScalarSizeInBits()) {
8684    EltSize *= 2;
8685    NumElts /= 2;
8686    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
8687    Val = DAG.getNode(
8688        ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
8689        DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
8690  }
8691
8692  return Val;
8693}
8694
8695SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
8696  EVT VT = Op.getValueType();
8697  assert(VT.isScalableVector() ||
8698         useSVEForFixedLengthVectorVT(
8699             VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
8700
8701  SDLoc DL(Op);
8702  SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
8703  return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
8704}
8705
8706SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
8707                                           SelectionDAG &DAG) const {
8708
8709  EVT VT = Op.getValueType();
8710  SDLoc DL(Op);
8711  unsigned Opcode = Op.getOpcode();
8712  ISD::CondCode CC;
8713  switch (Opcode) {
8714  default:
8715    llvm_unreachable("Wrong instruction");
8716  case ISD::SMAX:
8717    CC = ISD::SETGT;
8718    break;
8719  case ISD::SMIN:
8720    CC = ISD::SETLT;
8721    break;
8722  case ISD::UMAX:
8723    CC = ISD::SETUGT;
8724    break;
8725  case ISD::UMIN:
8726    CC = ISD::SETULT;
8727    break;
8728  }
8729
8730  if (VT.isScalableVector() ||
8731      useSVEForFixedLengthVectorVT(
8732          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
8733    switch (Opcode) {
8734    default:
8735      llvm_unreachable("Wrong instruction");
8736    case ISD::SMAX:
8737      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
8738    case ISD::SMIN:
8739      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
8740    case ISD::UMAX:
8741      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
8742    case ISD::UMIN:
8743      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
8744    }
8745  }
8746
8747  SDValue Op0 = Op.getOperand(0);
8748  SDValue Op1 = Op.getOperand(1);
8749  SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
8750  return DAG.getSelect(DL, VT, Cond, Op0, Op1);
8751}
8752
8753SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
8754                                               SelectionDAG &DAG) const {
8755  EVT VT = Op.getValueType();
8756
8757  if (VT.isScalableVector() ||
8758      useSVEForFixedLengthVectorVT(
8759          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
8760    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
8761
8762  SDLoc DL(Op);
8763  SDValue REVB;
8764  MVT VST;
8765
8766  switch (VT.getSimpleVT().SimpleTy) {
8767  default:
8768    llvm_unreachable("Invalid type for bitreverse!");
8769
8770  case MVT::v2i32: {
8771    VST = MVT::v8i8;
8772    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
8773
8774    break;
8775  }
8776
8777  case MVT::v4i32: {
8778    VST = MVT::v16i8;
8779    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
8780
8781    break;
8782  }
8783
8784  case MVT::v1i64: {
8785    VST = MVT::v8i8;
8786    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
8787
8788    break;
8789  }
8790
8791  case MVT::v2i64: {
8792    VST = MVT::v16i8;
8793    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
8794
8795    break;
8796  }
8797  }
8798
8799  return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
8800                     DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
8801}
8802
// Check whether \p N forms a continuous comparison sequence, i.e. a chain of
// ORs whose leaves are (possibly zero-extended) XORs. The XOR operand pairs
// are collected into \p WorkList.
8804static bool
8805isOrXorChain(SDValue N, unsigned &Num,
8806             SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
8807  if (Num == MaxXors)
8808    return false;
8809
8810  // Skip the one-use zext
8811  if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
8812    N = N->getOperand(0);
8813
8814  // The leaf node must be XOR
8815  if (N->getOpcode() == ISD::XOR) {
8816    WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
8817    Num++;
8818    return true;
8819  }
8820
8821  // All the non-leaf nodes must be OR.
8822  if (N->getOpcode() != ISD::OR || !N->hasOneUse())
8823    return false;
8824
8825  if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
8826      isOrXorChain(N->getOperand(1), Num, WorkList))
8827    return true;
8828  return false;
8829}
8830
// Transform chains of ORs and XORs, which usually come out of memcmp/bcmp
// expansion.
8832static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
8833  SDValue LHS = N->getOperand(0);
8834  SDValue RHS = N->getOperand(1);
8835  SDLoc DL(N);
8836  EVT VT = N->getValueType(0);
8837  SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
8838
8839  // Only handle integer compares.
8840  if (N->getOpcode() != ISD::SETCC)
8841    return SDValue();
8842
8843  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
8844  // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
8845  // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
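  // For example, (seteq (or (xor a0, b0), (xor a1, b1)), 0) is rewritten into
  // per-pair compares that are expected to later fold into a CCMP chain,
  // roughly:
  //   cmp  x0, x2
  //   ccmp x1, x3, #0, eq
  //   cset w0, eq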
8846  unsigned NumXors = 0;
8847  if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
8848      LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
8849      isOrXorChain(LHS, NumXors, WorkList)) {
8850    SDValue XOR0, XOR1;
8851    std::tie(XOR0, XOR1) = WorkList[0];
8852    unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
8853    SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
8854    for (unsigned I = 1; I < WorkList.size(); I++) {
8855      std::tie(XOR0, XOR1) = WorkList[I];
8856      SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
8857      Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
8858    }
8859
    // Exit early by inverting the condition, which helps reduce indentation.
8861    return Cmp;
8862  }
8863
8864  return SDValue();
8865}
8866
8867SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
8868
8869  if (Op.getValueType().isVector())
8870    return LowerVSETCC(Op, DAG);
8871
8872  bool IsStrict = Op->isStrictFPOpcode();
8873  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
8874  unsigned OpNo = IsStrict ? 1 : 0;
8875  SDValue Chain;
8876  if (IsStrict)
8877    Chain = Op.getOperand(0);
8878  SDValue LHS = Op.getOperand(OpNo + 0);
8879  SDValue RHS = Op.getOperand(OpNo + 1);
8880  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
8881  SDLoc dl(Op);
8882
8883  // We chose ZeroOrOneBooleanContents, so use zero and one.
8884  EVT VT = Op.getValueType();
8885  SDValue TVal = DAG.getConstant(1, dl, VT);
8886  SDValue FVal = DAG.getConstant(0, dl, VT);
8887
8888  // Handle f128 first, since one possible outcome is a normal integer
8889  // comparison which gets picked up by the next if statement.
8890  if (LHS.getValueType() == MVT::f128) {
8891    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
8892                        IsSignaling);
8893
8894    // If softenSetCCOperands returned a scalar, use it.
8895    if (!RHS.getNode()) {
8896      assert(LHS.getValueType() == Op.getValueType() &&
8897             "Unexpected setcc expansion!");
8898      return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
8899    }
8900  }
8901
8902  if (LHS.getValueType().isInteger()) {
8903    SDValue CCVal;
8904    SDValue Cmp = getAArch64Cmp(
8905        LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
8906
8907    // Note that we inverted the condition above, so we reverse the order of
8908    // the true and false operands here.  This will allow the setcc to be
8909    // matched to a single CSINC instruction.
8910    SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
8911    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
8912  }
8913
8914  // Now we know we're dealing with FP values.
8915  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
8916         LHS.getValueType() == MVT::f64);
8917
8918  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
8919  // and do the comparison.
8920  SDValue Cmp;
8921  if (IsStrict)
8922    Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
8923  else
8924    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8925
8926  AArch64CC::CondCode CC1, CC2;
8927  changeFPCCToAArch64CC(CC, CC1, CC2);
8928  SDValue Res;
8929  if (CC2 == AArch64CC::AL) {
8930    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
8931                          CC2);
8932    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8933
8934    // Note that we inverted the condition above, so we reverse the order of
8935    // the true and false operands here.  This will allow the setcc to be
8936    // matched to a single CSINC instruction.
8937    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
8938  } else {
8939    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
8940    // totally clean.  Some of them require two CSELs to implement.  As is in
8941    // this case, we emit the first CSEL and then emit a second using the output
8942    // of the first as the RHS.  We're effectively OR'ing the two CC's together.
8943
8944    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
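    // For example, SETUEQ (unordered or equal) is assumed here to map to EQ
    // and VS, giving roughly (<t>/<f> stand for the true/false values):
    //   fcmp s0, s1
    //   csel w8, <t>, <f>, eq
    //   csel w0, <t>, w8, vs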
8945    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8946    SDValue CS1 =
8947        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
8948
8949    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8950    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
8951  }
8952  return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
8953}
8954
8955SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
8956                                               SelectionDAG &DAG) const {
8957
8958  SDValue LHS = Op.getOperand(0);
8959  SDValue RHS = Op.getOperand(1);
8960  EVT VT = LHS.getValueType();
8961  if (VT != MVT::i32 && VT != MVT::i64)
8962    return SDValue();
8963
8964  SDLoc DL(Op);
8965  SDValue Carry = Op.getOperand(2);
8966  // SBCS uses a carry not a borrow so the carry flag should be inverted first.
8967  SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
8968  SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
8969                            LHS, RHS, InvCarry);
8970
8971  EVT OpVT = Op.getValueType();
8972  SDValue TVal = DAG.getConstant(1, DL, OpVT);
8973  SDValue FVal = DAG.getConstant(0, DL, OpVT);
8974
8975  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
8976  ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
8977  SDValue CCVal =
8978      DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
8979  // Inputs are swapped because the condition is inverted. This will allow
8980  // matching with a single CSINC instruction.
8981  return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
8982                     Cmp.getValue(1));
8983}
8984
8985SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
8986                                              SDValue RHS, SDValue TVal,
8987                                              SDValue FVal, const SDLoc &dl,
8988                                              SelectionDAG &DAG) const {
8989  // Handle f128 first, because it will result in a comparison of some RTLIB
8990  // call result against zero.
8991  if (LHS.getValueType() == MVT::f128) {
8992    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
8993
8994    // If softenSetCCOperands returned a scalar, we need to compare the result
8995    // against zero to select between true and false values.
8996    if (!RHS.getNode()) {
8997      RHS = DAG.getConstant(0, dl, LHS.getValueType());
8998      CC = ISD::SETNE;
8999    }
9000  }
9001
9002  // Also handle f16, for which we need to do an f32 comparison.
9003  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
9004    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9005    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9006  }
9007
9008  // Next, handle integers.
9009  if (LHS.getValueType().isInteger()) {
9010    assert((LHS.getValueType() == RHS.getValueType()) &&
9011           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9012
9013    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9014    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9015    ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9016    // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9017    // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
9018    // supported types.
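    // For example (an illustrative i32 case):
    //   (select_cc setgt, %x, -1, 1, -1)
    // can be emitted as
    //   asr wT, wX, #31      ; 0 when %x >= 0, all ones when %x < 0
    //   orr wD, wT, #1       ; 1 when %x >= 0, -1 when %x < 0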
9019    if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9020        CTVal->isOne() && CFVal->isAllOnes() &&
9021        LHS.getValueType() == TVal.getValueType()) {
9022      EVT VT = LHS.getValueType();
9023      SDValue Shift =
9024          DAG.getNode(ISD::SRA, dl, VT, LHS,
9025                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9026      return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9027    }
9028
9029    unsigned Opcode = AArch64ISD::CSEL;
9030
9031    // If both the TVal and the FVal are constants, see if we can swap them in
9032    // order to form a CSINV or CSINC out of them.
9033    if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9034      std::swap(TVal, FVal);
9035      std::swap(CTVal, CFVal);
9036      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9037    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9038      std::swap(TVal, FVal);
9039      std::swap(CTVal, CFVal);
9040      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9041    } else if (TVal.getOpcode() == ISD::XOR) {
9042      // If TVal is a NOT we want to swap TVal and FVal so that we can match
9043      // with a CSINV rather than a CSEL.
9044      if (isAllOnesConstant(TVal.getOperand(1))) {
9045        std::swap(TVal, FVal);
9046        std::swap(CTVal, CFVal);
9047        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9048      }
9049    } else if (TVal.getOpcode() == ISD::SUB) {
9050      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9051      // that we can match with a CSNEG rather than a CSEL.
9052      if (isNullConstant(TVal.getOperand(0))) {
9053        std::swap(TVal, FVal);
9054        std::swap(CTVal, CFVal);
9055        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9056      }
9057    } else if (CTVal && CFVal) {
9058      const int64_t TrueVal = CTVal->getSExtValue();
9059      const int64_t FalseVal = CFVal->getSExtValue();
9060      bool Swap = false;
9061
9062      // If both TVal and FVal are constants, see if FVal is the
9063      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9064      // instead of a CSEL in that case.
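      // For example (a sketch, not the exact emitted code): with TVal = 5 and
      // FVal = -6 (that is, ~5) we can emit
      //   mov   wT, #5
      //   csinv wD, wT, wT, <cond>   ; 5 when <cond> holds, ~5 = -6 otherwise
      // instead of materializing both constants for a plain CSEL.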
9065      if (TrueVal == ~FalseVal) {
9066        Opcode = AArch64ISD::CSINV;
9067      } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9068                 TrueVal == -FalseVal) {
9069        Opcode = AArch64ISD::CSNEG;
9070      } else if (TVal.getValueType() == MVT::i32) {
9071        // If our operands are only 32-bit wide, make sure we use 32-bit
9072        // arithmetic for the check whether we can use CSINC. This ensures that
9073        // the addition in the check will wrap around properly in case there is
9074        // an overflow (which would not be the case if we do the check with
9075        // 64-bit arithmetic).
9076        const uint32_t TrueVal32 = CTVal->getZExtValue();
9077        const uint32_t FalseVal32 = CFVal->getZExtValue();
9078
9079        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9080          Opcode = AArch64ISD::CSINC;
9081
9082          if (TrueVal32 > FalseVal32) {
9083            Swap = true;
9084          }
9085        }
9086      } else {
9087        // 64-bit check whether we can use CSINC.
9088        const uint64_t TrueVal64 = TrueVal;
9089        const uint64_t FalseVal64 = FalseVal;
9090
9091        if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9092          Opcode = AArch64ISD::CSINC;
9093
9094          if (TrueVal > FalseVal) {
9095            Swap = true;
9096          }
9097        }
9098      }
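      // For example (a sketch of the CSINC path): for "cond ? 2 : 1" the values
      // differ by one, so we swap to "!cond ? 1 : 2" and emit roughly
      //   mov   wT, #1
      //   csinc wD, wT, wT, <!cond>   ; 1 when !cond holds, 1 + 1 = 2 otherwise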
9099
9100      // Swap TVal and FVal if necessary.
9101      if (Swap) {
9102        std::swap(TVal, FVal);
9103        std::swap(CTVal, CFVal);
9104        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9105      }
9106
9107      if (Opcode != AArch64ISD::CSEL) {
9108        // Drop FVal since we can get its value by simply inverting/negating
9109        // TVal.
9110        FVal = TVal;
9111      }
9112    }
9113
9114    // Avoid materializing a constant when possible by reusing a known value in
9115    // a register.  However, don't perform this optimization if the known value
9116    // is one, zero or negative one in the case of a CSEL.  We can always
9117    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9118    // FVal, respectively.
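    // For example (illustrative): 1, 0 and -1 are already cheap to select via
    //   cset  wD, <cond>   ; <cond> ? 1 : 0   (CSINC with wzr operands)
    //   csetm wD, <cond>   ; <cond> ? -1 : 0  (CSINV with wzr operands)
    // so reusing the known register only helps for other constant values.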
9119    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
9120    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9121        !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9122      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9123      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9124      // "a != C ? x : a" to avoid materializing C.
9125      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9126        TVal = LHS;
9127      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9128        FVal = LHS;
9129    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
9130      assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
9131      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9132      // avoid materializing C.
9133      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9134      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9135        Opcode = AArch64ISD::CSINV;
9136        TVal = LHS;
9137        FVal = DAG.getConstant(0, dl, FVal.getValueType());
9138      }
9139    }
9140
9141    SDValue CCVal;
9142    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9143    EVT VT = TVal.getValueType();
9144    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
9145  }
9146
9147  // Now we know we're dealing with FP values.
9148  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9149         LHS.getValueType() == MVT::f64);
9150  assert(LHS.getValueType() == RHS.getValueType());
9151  EVT VT = TVal.getValueType();
9152  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9153
9154  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9155  // clean.  Some of them require two CSELs to implement.
9156  AArch64CC::CondCode CC1, CC2;
9157  changeFPCCToAArch64CC(CC, CC1, CC2);
9158
9159  if (DAG.getTarget().Options.UnsafeFPMath) {
9160    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
9161    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
9162    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
9163    if (RHSVal && RHSVal->isZero()) {
9164      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
9165      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
9166
9167      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
9168          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
9169        TVal = LHS;
9170      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
9171               CFVal && CFVal->isZero() &&
9172               FVal.getValueType() == LHS.getValueType())
9173        FVal = LHS;
9174    }
9175  }
9176
9177  // Emit first, and possibly only, CSEL.
9178  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9179  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9180
9181  // If we need a second CSEL, emit it, using the output of the first as the
9182  // RHS.  We're effectively OR'ing the two CC's together.
9183  if (CC2 != AArch64CC::AL) {
9184    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9185    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9186  }
9187
9188  // Otherwise, return the output of the first CSEL.
9189  return CS1;
9190}
9191
9192SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
9193                                                  SelectionDAG &DAG) const {
9194  EVT Ty = Op.getValueType();
9195  auto Idx = Op.getConstantOperandAPInt(2);
9196  int64_t IdxVal = Idx.getSExtValue();
9197  assert(Ty.isScalableVector() &&
9198         "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
9199
9200  // We can use the splice instruction for certain index values where we are
9201  // able to efficiently generate the correct predicate. The index will be
9202  // inverted and used directly as the input to the ptrue instruction, i.e.
9203  // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
9204  // splice predicate. However, we can only do this if we can guarantee that
9205  // there are enough elements in the vector, hence we check that the magnitude
9206  // of the index is <= the minimum number of elements.
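  // For example (an illustrative sketch): splicing two nxv4i32 vectors with
  // index -2 might be selected roughly as
  //   ptrue  p0.s, vl2
  //   rev    p0.s, p0.s
  //   splice z0.s, p0, z0.s, z1.s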
9207  std::optional<unsigned> PredPattern;
9208  if (Ty.isScalableVector() && IdxVal < 0 &&
9209      (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
9210          std::nullopt) {
9211    SDLoc DL(Op);
9212
9213    // Create a predicate where all but the last -IdxVal elements are false.
9214    EVT PredVT = Ty.changeVectorElementType(MVT::i1);
9215    SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
9216    Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
9217
9218    // Now splice the two inputs together using the predicate.
9219    return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
9220                       Op.getOperand(1));
9221  }
9222
9223  // This will select to an EXT instruction, which has a maximum immediate
9224  // value of 255, hence 2048-bits is the maximum value we can lower.
9225  if (IdxVal >= 0 &&
9226      IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
9227    return Op;
9228
9229  return SDValue();
9230}
9231
9232SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
9233                                              SelectionDAG &DAG) const {
9234  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
9235  SDValue LHS = Op.getOperand(0);
9236  SDValue RHS = Op.getOperand(1);
9237  SDValue TVal = Op.getOperand(2);
9238  SDValue FVal = Op.getOperand(3);
9239  SDLoc DL(Op);
9240  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
9241}
9242
9243SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
9244                                           SelectionDAG &DAG) const {
9245  SDValue CCVal = Op->getOperand(0);
9246  SDValue TVal = Op->getOperand(1);
9247  SDValue FVal = Op->getOperand(2);
9248  SDLoc DL(Op);
9249
9250  EVT Ty = Op.getValueType();
9251  if (Ty.isScalableVector()) {
9252    SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
9253    MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
9254    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
9255    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
9256  }
9257
9258  if (useSVEForFixedLengthVectorVT(Ty)) {
9259    // FIXME: Ideally this would be the same as above using i1 types, however
9260    // for the moment we can't deal with fixed i1 vector types properly, so
9261    // instead extend the predicate to a result type sized integer vector.
9262    MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
9263    MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
9264    SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
9265    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
9266    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
9267  }
9268
9269  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
9270  // instruction.
9271  if (ISD::isOverflowIntrOpRes(CCVal)) {
9272    // Only lower legal XALUO ops.
9273    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
9274      return SDValue();
9275
9276    AArch64CC::CondCode OFCC;
9277    SDValue Value, Overflow;
9278    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
9279    SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
9280
9281    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
9282                       CCVal, Overflow);
9283  }
9284
9285  // Lower it the same way as we would lower a SELECT_CC node.
9286  ISD::CondCode CC;
9287  SDValue LHS, RHS;
9288  if (CCVal.getOpcode() == ISD::SETCC) {
9289    LHS = CCVal.getOperand(0);
9290    RHS = CCVal.getOperand(1);
9291    CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
9292  } else {
9293    LHS = CCVal;
9294    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
9295    CC = ISD::SETNE;
9296  }
9297
9298  // If we are lowering an f16 or bf16 and do not have full fp16 support,
9299  // convert to f32 in order to use FCSELSrrr.
9300  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
9301    TVal = SDValue(
9302        DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
9303                           DAG.getUNDEF(MVT::f32), TVal,
9304                           DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
9305        0);
9306    FVal = SDValue(
9307        DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
9308                           DAG.getUNDEF(MVT::f32), FVal,
9309                           DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
9310        0);
9311  }
9312
9313  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
9314
9315  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
9316    Res = SDValue(
9317        DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, Ty, Res,
9318                           DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
9319        0);
9320  }
9321
9322  return Res;
9323}
9324
9325SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
9326                                              SelectionDAG &DAG) const {
9327  // Jump table entries are PC-relative offsets. No additional tweaking
9328  // is necessary here. Just get the address of the jump table.
9329  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
9330
9331  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9332      !Subtarget->isTargetMachO()) {
9333    return getAddrLarge(JT, DAG);
9334  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9335    return getAddrTiny(JT, DAG);
9336  }
9337  return getAddr(JT, DAG);
9338}
9339
9340SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
9341                                          SelectionDAG &DAG) const {
9342  // Jump table entries are PC-relative offsets. No additional tweaking
9343  // is necessary here. Just get the address of the jump table.
9344  SDLoc DL(Op);
9345  SDValue JT = Op.getOperand(1);
9346  SDValue Entry = Op.getOperand(2);
9347  int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
9348
9349  auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9350  AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
9351
9352  SDNode *Dest =
9353      DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
9354                         Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
9355  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
9356                     SDValue(Dest, 0));
9357}
9358
9359SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
9360                                                 SelectionDAG &DAG) const {
9361  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
9362
9363  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
9364    // Use the GOT for the large code model on iOS.
9365    if (Subtarget->isTargetMachO()) {
9366      return getGOT(CP, DAG);
9367    }
9368    return getAddrLarge(CP, DAG);
9369  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9370    return getAddrTiny(CP, DAG);
9371  } else {
9372    return getAddr(CP, DAG);
9373  }
9374}
9375
9376SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
9377                                               SelectionDAG &DAG) const {
9378  BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
9379  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9380      !Subtarget->isTargetMachO()) {
9381    return getAddrLarge(BA, DAG);
9382  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9383    return getAddrTiny(BA, DAG);
9384  }
9385  return getAddr(BA, DAG);
9386}
9387
9388SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
9389                                                 SelectionDAG &DAG) const {
9390  AArch64FunctionInfo *FuncInfo =
9391      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9392
9393  SDLoc DL(Op);
9394  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
9395                                 getPointerTy(DAG.getDataLayout()));
9396  FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
9397  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9398  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9399                      MachinePointerInfo(SV));
9400}
9401
9402SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
9403                                                  SelectionDAG &DAG) const {
9404  MachineFunction &MF = DAG.getMachineFunction();
9405  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9406
9407  SDLoc DL(Op);
9408  SDValue FR;
9409  if (Subtarget->isWindowsArm64EC()) {
9410    // With the Arm64EC ABI, we compute the address of the varargs save area
9411    // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
9412    // but calls from an entry thunk can pass in a different address.
9413    Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9414    SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
9415    uint64_t StackOffset;
9416    if (FuncInfo->getVarArgsGPRSize() > 0)
9417      StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
9418    else
9419      StackOffset = FuncInfo->getVarArgsStackOffset();
9420    FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
9421                     DAG.getConstant(StackOffset, DL, MVT::i64));
9422  } else {
9423    FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
9424                               ? FuncInfo->getVarArgsGPRIndex()
9425                               : FuncInfo->getVarArgsStackIndex(),
9426                           getPointerTy(DAG.getDataLayout()));
9427  }
9428  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9429  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9430                      MachinePointerInfo(SV));
9431}
9432
9433SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
9434                                                  SelectionDAG &DAG) const {
9435  // The layout of the va_list struct is specified in the AArch64 Procedure Call
9436  // Standard, section B.3.
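  // Roughly, assuming the usual AAPCS64 layout (the PCS is authoritative):
  //   struct va_list {
  //     void *__stack;   // next stack argument
  //     void *__gr_top;  // end of the GP register save area
  //     void *__vr_top;  // end of the FP/SIMD register save area
  //     int   __gr_offs; // negative offset from __gr_top to the next GP arg
  //     int   __vr_offs; // negative offset from __vr_top to the next FP arg
  //   };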
9437  MachineFunction &MF = DAG.getMachineFunction();
9438  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9439  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
9440  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9441  auto PtrVT = getPointerTy(DAG.getDataLayout());
9442  SDLoc DL(Op);
9443
9444  SDValue Chain = Op.getOperand(0);
9445  SDValue VAList = Op.getOperand(1);
9446  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9447  SmallVector<SDValue, 4> MemOps;
9448
9449  // void *__stack at offset 0
9450  unsigned Offset = 0;
9451  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
9452  Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
9453  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
9454                                MachinePointerInfo(SV), Align(PtrSize)));
9455
9456  // void *__gr_top at offset 8 (4 on ILP32)
9457  Offset += PtrSize;
9458  int GPRSize = FuncInfo->getVarArgsGPRSize();
9459  if (GPRSize > 0) {
9460    SDValue GRTop, GRTopAddr;
9461
9462    GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9463                            DAG.getConstant(Offset, DL, PtrVT));
9464
9465    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
9466    GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
9467                        DAG.getConstant(GPRSize, DL, PtrVT));
9468    GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
9469
9470    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
9471                                  MachinePointerInfo(SV, Offset),
9472                                  Align(PtrSize)));
9473  }
9474
9475  // void *__vr_top at offset 16 (8 on ILP32)
9476  Offset += PtrSize;
9477  int FPRSize = FuncInfo->getVarArgsFPRSize();
9478  if (FPRSize > 0) {
9479    SDValue VRTop, VRTopAddr;
9480    VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9481                            DAG.getConstant(Offset, DL, PtrVT));
9482
9483    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
9484    VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
9485                        DAG.getConstant(FPRSize, DL, PtrVT));
9486    VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
9487
9488    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
9489                                  MachinePointerInfo(SV, Offset),
9490                                  Align(PtrSize)));
9491  }
9492
9493  // int __gr_offs at offset 24 (12 on ILP32)
9494  Offset += PtrSize;
9495  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9496                                   DAG.getConstant(Offset, DL, PtrVT));
9497  MemOps.push_back(
9498      DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
9499                   GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
9500
9501  // int __vr_offs at offset 28 (16 on ILP32)
9502  Offset += 4;
9503  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9504                                   DAG.getConstant(Offset, DL, PtrVT));
9505  MemOps.push_back(
9506      DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
9507                   VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
9508
9509  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
9510}
9511
9512SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
9513                                            SelectionDAG &DAG) const {
9514  MachineFunction &MF = DAG.getMachineFunction();
9515
9516  if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
9517    return LowerWin64_VASTART(Op, DAG);
9518  else if (Subtarget->isTargetDarwin())
9519    return LowerDarwin_VASTART(Op, DAG);
9520  else
9521    return LowerAAPCS_VASTART(Op, DAG);
9522}
9523
9524SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
9525                                           SelectionDAG &DAG) const {
9526  // AAPCS has three pointers and two ints (= 32 bytes); Darwin and Windows
9527  // have a single pointer.
9528  SDLoc DL(Op);
9529  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
9530  unsigned VaListSize =
9531      (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
9532          ? PtrSize
9533          : Subtarget->isTargetILP32() ? 20 : 32;
9534  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
9535  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9536
9537  return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
9538                       DAG.getConstant(VaListSize, DL, MVT::i32),
9539                       Align(PtrSize), false, false, false,
9540                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
9541}
9542
9543SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
9544  assert(Subtarget->isTargetDarwin() &&
9545         "automatic va_arg instruction only works on Darwin");
9546
9547  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9548  EVT VT = Op.getValueType();
9549  SDLoc DL(Op);
9550  SDValue Chain = Op.getOperand(0);
9551  SDValue Addr = Op.getOperand(1);
9552  MaybeAlign Align(Op.getConstantOperandVal(3));
9553  unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
9554  auto PtrVT = getPointerTy(DAG.getDataLayout());
9555  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9556  SDValue VAList =
9557      DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
9558  Chain = VAList.getValue(1);
9559  VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
9560
9561  if (VT.isScalableVector())
9562    report_fatal_error("Passing SVE types to variadic functions is "
9563                       "currently not supported");
9564
9565  if (Align && *Align > MinSlotSize) {
9566    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9567                         DAG.getConstant(Align->value() - 1, DL, PtrVT));
9568    VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
9569                         DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
9570  }
9571
9572  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
9573  unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
9574
9575  // Scalar integer and FP values smaller than 64 bits are implicitly extended
9576  // up to 64 bits.  At the very least, we have to increase the striding of the
9577  // vaargs list to match this, and for FP values we need to introduce
9578  // FP_ROUND nodes as well.
9579  if (VT.isInteger() && !VT.isVector())
9580    ArgSize = std::max(ArgSize, MinSlotSize);
9581  bool NeedFPTrunc = false;
9582  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
9583    ArgSize = 8;
9584    NeedFPTrunc = true;
9585  }
9586
9587  // Increment the pointer, VAList, to the next vaarg
9588  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9589                               DAG.getConstant(ArgSize, DL, PtrVT));
9590  VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
9591
9592  // Store the incremented VAList to the legalized pointer
9593  SDValue APStore =
9594      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
9595
9596  // Load the actual argument out of the pointer VAList
9597  if (NeedFPTrunc) {
9598    // Load the value as an f64.
9599    SDValue WideFP =
9600        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
9601    // Round the value down to an f32.
9602    SDValue NarrowFP =
9603        DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
9604                    DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
9605    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
9606    // Merge the rounded value with the chain output of the load.
9607    return DAG.getMergeValues(Ops, DL);
9608  }
9609
9610  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
9611}
9612
9613SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
9614                                              SelectionDAG &DAG) const {
9615  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9616  MFI.setFrameAddressIsTaken(true);
9617
9618  EVT VT = Op.getValueType();
9619  SDLoc DL(Op);
9620  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9621  SDValue FrameAddr =
9622      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
9623  while (Depth--)
9624    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
9625                            MachinePointerInfo());
9626
9627  if (Subtarget->isTargetILP32())
9628    FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
9629                            DAG.getValueType(VT));
9630
9631  return FrameAddr;
9632}
9633
9634SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
9635                                              SelectionDAG &DAG) const {
9636  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9637
9638  EVT VT = getPointerTy(DAG.getDataLayout());
9639  SDLoc DL(Op);
9640  int FI = MFI.CreateFixedObject(4, 0, false);
9641  return DAG.getFrameIndex(FI, VT);
9642}
9643
9644#define GET_REGISTER_MATCHER
9645#include "AArch64GenAsmMatcher.inc"
9646
9647// FIXME? Maybe this could be a TableGen attribute on some registers and
9648// this table could be generated automatically from RegInfo.
9649Register AArch64TargetLowering::
9650getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
9651  Register Reg = MatchRegisterName(RegName);
9652  if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
9653    const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
9654    unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
9655    if (!Subtarget->isXRegisterReserved(DwarfRegNum))
9656      Reg = 0;
9657  }
9658  if (Reg)
9659    return Reg;
9660  report_fatal_error(Twine("Invalid register name \""
9661                              + StringRef(RegName)  + "\"."));
9662}
9663
9664SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
9665                                                     SelectionDAG &DAG) const {
9666  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
9667
9668  EVT VT = Op.getValueType();
9669  SDLoc DL(Op);
9670
9671  SDValue FrameAddr =
9672      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
9673  SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
9674
9675  return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
9676}
9677
9678SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
9679                                               SelectionDAG &DAG) const {
9680  MachineFunction &MF = DAG.getMachineFunction();
9681  MachineFrameInfo &MFI = MF.getFrameInfo();
9682  MFI.setReturnAddressIsTaken(true);
9683
9684  EVT VT = Op.getValueType();
9685  SDLoc DL(Op);
9686  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9687  SDValue ReturnAddress;
9688  if (Depth) {
9689    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
9690    SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
9691    ReturnAddress = DAG.getLoad(
9692        VT, DL, DAG.getEntryNode(),
9693        DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
9694  } else {
9695    // Return LR, which contains the return address. Mark it an implicit
9696    // live-in.
9697    Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
9698    ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
9699  }
9700
9701  // The XPACLRI instruction assembles to a hint-space instruction before
9702  // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
9703  // On Armv8.3-A and onwards, XPACI is available, so use that
9704  // instead.
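  // For example (a sketch): with PAuth available this is simply
  //   xpaci xN
  // and otherwise
  //   mov x30, xN
  //   xpaclri          ; encoded in the hint space, a NOP before Armv8.3-A
  // with the stripped return address read back from x30.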
9705  SDNode *St;
9706  if (Subtarget->hasPAuth()) {
9707    St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
9708  } else {
9709    // XPACLRI operates on LR therefore we must move the operand accordingly.
9710    SDValue Chain =
9711        DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
9712    St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
9713  }
9714  return SDValue(St, 0);
9715}
9716
9717/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
9718/// i32 values and take a 2 x i32 value to shift plus a shift amount.
9719SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
9720                                               SelectionDAG &DAG) const {
9721  SDValue Lo, Hi;
9722  expandShiftParts(Op.getNode(), Lo, Hi, DAG);
9723  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
9724}
9725
9726bool AArch64TargetLowering::isOffsetFoldingLegal(
9727    const GlobalAddressSDNode *GA) const {
9728  // Offsets are folded in the DAG combine rather than here so that we can
9729  // intelligently choose an offset based on the uses.
9730  return false;
9731}
9732
9733bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
9734                                         bool OptForSize) const {
9735  bool IsLegal = false;
9736  // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
9737  // and for the 16-bit case when the target has full fp16 support.
9738  // FIXME: We should be able to handle f128 as well with a clever lowering.
9739  const APInt ImmInt = Imm.bitcastToAPInt();
9740  if (VT == MVT::f64)
9741    IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
9742  else if (VT == MVT::f32)
9743    IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
9744  else if (VT == MVT::f16 && Subtarget->hasFullFP16())
9745    IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
9746  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
9747  //       generate that fmov.
9748
9749  // If we cannot materialize the value in the immediate field of an fmov, check
9750  // whether it can be encoded as the immediate operand of a logical instruction.
9751  // The immediate value will be created with either MOVZ, MOVN, or ORR.
9752  if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
9753    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
9754    // however the mov+fmov sequence is always better because of the reduced
9755    // cache pressure. The timings are still the same if you consider
9756    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
9757    // movw+movk is fused). So we limit this to at most 2 instructions.
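    // For example (illustrative): +1.0 fits the 8-bit fmov immediate, so
    //   fmov d0, #1.0
    // is a single instruction, whereas an arbitrary double needs either a
    // literal load (adrp + ldr) or a mov/movk sequence followed by
    //   fmov d0, xN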
9758    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
9759    AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
9760                              Insn);
9761    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
9762    IsLegal = Insn.size() <= Limit;
9763  }
9764
9765  LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
9766                    << " imm value: "; Imm.dump(););
9767  return IsLegal;
9768}
9769
9770//===----------------------------------------------------------------------===//
9771//                          AArch64 Optimization Hooks
9772//===----------------------------------------------------------------------===//
9773
9774static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
9775                           SDValue Operand, SelectionDAG &DAG,
9776                           int &ExtraSteps) {
9777  EVT VT = Operand.getValueType();
9778  if ((ST->hasNEON() &&
9779       (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
9780        VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
9781        VT == MVT::v4f32)) ||
9782      (ST->hasSVE() &&
9783       (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
9784    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
9785      // For the reciprocal estimates, convergence is quadratic, so the number
9786      // of digits is doubled after each iteration.  In ARMv8, the accuracy of
9787      // the initial estimate is 2^-8.  Thus the number of extra steps to refine
9788      // the result for float (23 mantissa bits) is 2 and for double (52
9789      // mantissa bits) is 3.
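      // For example (rough arithmetic): starting from ~8 correct bits, the
      // steps give about 8 -> 16 -> 32 bits (2 steps cover f32's 23-bit
      // mantissa) and 8 -> 16 -> 32 -> 64 bits (3 steps cover f64's 52 bits).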
9790      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
9791
9792    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
9793  }
9794
9795  return SDValue();
9796}
9797
9798SDValue
9799AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
9800                                        const DenormalMode &Mode) const {
9801  SDLoc DL(Op);
9802  EVT VT = Op.getValueType();
9803  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
9804  SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
9805  return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
9806}
9807
9808SDValue
9809AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
9810                                                   SelectionDAG &DAG) const {
9811  return Op;
9812}
9813
9814SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
9815                                               SelectionDAG &DAG, int Enabled,
9816                                               int &ExtraSteps,
9817                                               bool &UseOneConst,
9818                                               bool Reciprocal) const {
9819  if (Enabled == ReciprocalEstimate::Enabled ||
9820      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
9821    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
9822                                       DAG, ExtraSteps)) {
9823      SDLoc DL(Operand);
9824      EVT VT = Operand.getValueType();
9825
9826      SDNodeFlags Flags;
9827      Flags.setAllowReassociation(true);
9828
9829      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
9830      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
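      // A rough NEON (4 x f32) rendering of one step, assuming X is in v0 and
      // the current estimate E is in v1 (illustrative only):
      //   fmul    v2.4s, v1.4s, v1.4s   ; E * E
      //   frsqrts v2.4s, v0.4s, v2.4s   ; 0.5 * (3 - X * E * E)
      //   fmul    v1.4s, v1.4s, v2.4s   ; refined estimate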
9831      for (int i = ExtraSteps; i > 0; --i) {
9832        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
9833                                   Flags);
9834        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
9835        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
9836      }
9837      if (!Reciprocal)
9838        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
9839
9840      ExtraSteps = 0;
9841      return Estimate;
9842    }
9843
9844  return SDValue();
9845}
9846
9847SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
9848                                                SelectionDAG &DAG, int Enabled,
9849                                                int &ExtraSteps) const {
9850  if (Enabled == ReciprocalEstimate::Enabled)
9851    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
9852                                       DAG, ExtraSteps)) {
9853      SDLoc DL(Operand);
9854      EVT VT = Operand.getValueType();
9855
9856      SDNodeFlags Flags;
9857      Flags.setAllowReassociation(true);
9858
9859      // Newton reciprocal iteration: E * (2 - X * E)
9860      // AArch64 reciprocal iteration instruction: (2 - M * N)
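      // A rough NEON (4 x f32) rendering of one step, assuming X is in v0 and
      // the current estimate E is in v1 (illustrative only):
      //   frecps v2.4s, v0.4s, v1.4s    ; 2 - X * E
      //   fmul   v1.4s, v1.4s, v2.4s    ; refined estimate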
9861      for (int i = ExtraSteps; i > 0; --i) {
9862        SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
9863                                   Estimate, Flags);
9864        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
9865      }
9866
9867      ExtraSteps = 0;
9868      return Estimate;
9869    }
9870
9871  return SDValue();
9872}
9873
9874//===----------------------------------------------------------------------===//
9875//                          AArch64 Inline Assembly Support
9876//===----------------------------------------------------------------------===//
9877
9878// Table of Constraints
9879// TODO: This is the current set of constraints supported by ARM for the
9880// compiler; not all of them may make sense.
9881//
9882// r - A general register
9883// w - An FP/SIMD register of some size in the range v0-v31
9884// x - An FP/SIMD register of some size in the range v0-v15
9885// I - Constant that can be used with an ADD instruction
9886// J - Constant that can be used with a SUB instruction
9887// K - Constant that can be used with a 32-bit logical instruction
9888// L - Constant that can be used with a 64-bit logical instruction
9889// M - Constant that can be used as a 32-bit MOV immediate
9890// N - Constant that can be used as a 64-bit MOV immediate
9891// Q - A memory reference with base register and no offset
9892// S - A symbolic address
9893// Y - Floating point constant zero
9894// Z - Integer constant zero
9895//
9896//   Note that general register operands will be output using their 64-bit x
9897// register name, whatever the size of the variable, unless the asm operand
9898// is prefixed by the %w modifier. Floating-point and SIMD register operands
9899// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
9900// %q modifier.
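// For example (an illustrative use of these constraints, not from this file):
//   int add_one(int v) {
//     int r;
//     asm("add %w0, %w1, #1" : "=r"(r) : "r"(v));
//     return r;
//   }
//   float fadd(float a, float b) {
//     float r;
//     asm("fadd %s0, %s1, %s2" : "=w"(r) : "w"(a), "w"(b));
//     return r;
//   }
// where %w selects the 32-bit integer register name and %s the
// single-precision FP register name.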
9901const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
9902  // At this point, we have to lower this constraint to something else, so we
9903  // lower it to an "r" or "w". However, by doing this we will force the result
9904  // to be in a register, while the X constraint is much more permissive.
9905  //
9906  // Although we are correct (we are free to emit anything, without
9907  // constraints), we might break use cases that would expect us to be more
9908  // efficient and emit something else.
9909  if (!Subtarget->hasFPARMv8())
9910    return "r";
9911
9912  if (ConstraintVT.isFloatingPoint())
9913    return "w";
9914
9915  if (ConstraintVT.isVector() &&
9916     (ConstraintVT.getSizeInBits() == 64 ||
9917      ConstraintVT.getSizeInBits() == 128))
9918    return "w";
9919
9920  return "r";
9921}
9922
9923enum PredicateConstraint {
9924  Upl,
9925  Upa,
9926  Invalid
9927};
9928
9929static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
9930  PredicateConstraint P = PredicateConstraint::Invalid;
9931  if (Constraint == "Upa")
9932    P = PredicateConstraint::Upa;
9933  if (Constraint == "Upl")
9934    P = PredicateConstraint::Upl;
9935  return P;
9936}
9937
9938/// getConstraintType - Given a constraint letter, return the type of
9939/// constraint it is for this target.
9940AArch64TargetLowering::ConstraintType
9941AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
9942  if (Constraint.size() == 1) {
9943    switch (Constraint[0]) {
9944    default:
9945      break;
9946    case 'x':
9947    case 'w':
9948    case 'y':
9949      return C_RegisterClass;
9950    // An address with a single base register. Due to the way we
9951    // currently handle addresses, it is the same as 'r'.
9952    case 'Q':
9953      return C_Memory;
9954    case 'I':
9955    case 'J':
9956    case 'K':
9957    case 'L':
9958    case 'M':
9959    case 'N':
9960    case 'Y':
9961    case 'Z':
9962      return C_Immediate;
9963    case 'z':
9964    case 'S': // A symbolic address
9965      return C_Other;
9966    }
9967  } else if (parsePredicateConstraint(Constraint) !=
9968             PredicateConstraint::Invalid)
9969      return C_RegisterClass;
9970  return TargetLowering::getConstraintType(Constraint);
9971}
9972
9973/// Examine constraint type and operand type and determine a weight value.
9974/// This object must already have been set up with the operand type
9975/// and the current alternative constraint selected.
9976TargetLowering::ConstraintWeight
9977AArch64TargetLowering::getSingleConstraintMatchWeight(
9978    AsmOperandInfo &info, const char *constraint) const {
9979  ConstraintWeight weight = CW_Invalid;
9980  Value *CallOperandVal = info.CallOperandVal;
9981  // If we don't have a value, we can't do a match,
9982  // but allow it at the lowest weight.
9983  if (!CallOperandVal)
9984    return CW_Default;
9985  Type *type = CallOperandVal->getType();
9986  // Look at the constraint type.
9987  switch (*constraint) {
9988  default:
9989    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
9990    break;
9991  case 'x':
9992  case 'w':
9993  case 'y':
9994    if (type->isFloatingPointTy() || type->isVectorTy())
9995      weight = CW_Register;
9996    break;
9997  case 'z':
9998    weight = CW_Constant;
9999    break;
10000  case 'U':
10001    if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
10002      weight = CW_Register;
10003    break;
10004  }
10005  return weight;
10006}
10007
10008std::pair<unsigned, const TargetRegisterClass *>
10009AArch64TargetLowering::getRegForInlineAsmConstraint(
10010    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10011  if (Constraint.size() == 1) {
10012    switch (Constraint[0]) {
10013    case 'r':
10014      if (VT.isScalableVector())
10015        return std::make_pair(0U, nullptr);
10016      if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
10017        return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
10018      if (VT.getFixedSizeInBits() == 64)
10019        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
10020      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
10021    case 'w': {
10022      if (!Subtarget->hasFPARMv8())
10023        break;
10024      if (VT.isScalableVector()) {
10025        if (VT.getVectorElementType() != MVT::i1)
10026          return std::make_pair(0U, &AArch64::ZPRRegClass);
10027        return std::make_pair(0U, nullptr);
10028      }
10029      uint64_t VTSize = VT.getFixedSizeInBits();
10030      if (VTSize == 16)
10031        return std::make_pair(0U, &AArch64::FPR16RegClass);
10032      if (VTSize == 32)
10033        return std::make_pair(0U, &AArch64::FPR32RegClass);
10034      if (VTSize == 64)
10035        return std::make_pair(0U, &AArch64::FPR64RegClass);
10036      if (VTSize == 128)
10037        return std::make_pair(0U, &AArch64::FPR128RegClass);
10038      break;
10039    }
10040    // The instructions that this constraint is designed for can
10041    // only take 128-bit registers so just use that regclass.
10042    case 'x':
10043      if (!Subtarget->hasFPARMv8())
10044        break;
10045      if (VT.isScalableVector())
10046        return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
10047      if (VT.getSizeInBits() == 128)
10048        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
10049      break;
10050    case 'y':
10051      if (!Subtarget->hasFPARMv8())
10052        break;
10053      if (VT.isScalableVector())
10054        return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
10055      break;
10056    }
10057  } else {
10058    PredicateConstraint PC = parsePredicateConstraint(Constraint);
10059    if (PC != PredicateConstraint::Invalid) {
10060      if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
10061        return std::make_pair(0U, nullptr);
10062      bool restricted = (PC == PredicateConstraint::Upl);
10063      return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
10064                        : std::make_pair(0U, &AArch64::PPRRegClass);
10065    }
10066  }
10067  if (StringRef("{cc}").equals_insensitive(Constraint))
10068    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
10069
10070  // Use the default implementation in TargetLowering to convert the register
10071  // constraint into a member of a register class.
10072  std::pair<unsigned, const TargetRegisterClass *> Res;
10073  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
10074
10075  // Not found as a standard register?
10076  if (!Res.second) {
10077    unsigned Size = Constraint.size();
10078    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
10079        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
10080      int RegNo;
10081      bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
10082      if (!Failed && RegNo >= 0 && RegNo <= 31) {
10083        // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
10084        // By default we'll emit v0-v31 for this unless there's a modifier, in
10085        // which case we'll emit the correct register form instead.
10086        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
10087          Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
10088          Res.second = &AArch64::FPR64RegClass;
10089        } else {
10090          Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
10091          Res.second = &AArch64::FPR128RegClass;
10092        }
10093      }
10094    }
10095  }
10096
10097  if (Res.second && !Subtarget->hasFPARMv8() &&
10098      !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
10099      !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
10100    return std::make_pair(0U, nullptr);
10101
10102  return Res;
10103}
10104
10105EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
10106                                                  llvm::Type *Ty,
10107                                                  bool AllowUnknown) const {
10108  if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
10109    return EVT(MVT::i64x8);
10110
10111  return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
10112}
10113
10114/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
10115/// vector.  If it is invalid, don't add anything to Ops.
10116void AArch64TargetLowering::LowerAsmOperandForConstraint(
10117    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
10118    SelectionDAG &DAG) const {
10119  SDValue Result;
10120
10121  // Currently only support length 1 constraints.
10122  if (Constraint.length() != 1)
10123    return;
10124
10125  char ConstraintLetter = Constraint[0];
10126  switch (ConstraintLetter) {
10127  default:
10128    break;
10129
10130  // This set of constraints deals with valid constants for various instructions.
10131  // Validate and return a target constant for them if we can.
10132  case 'z': {
10133    // 'z' maps to xzr or wzr so it needs an input of 0.
10134    if (!isNullConstant(Op))
10135      return;
10136
10137    if (Op.getValueType() == MVT::i64)
10138      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
10139    else
10140      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
10141    break;
10142  }
10143  case 'S': {
10144    // An absolute symbolic address or label reference.
10145    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
10146      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
10147                                          GA->getValueType(0));
10148    } else if (const BlockAddressSDNode *BA =
10149                   dyn_cast<BlockAddressSDNode>(Op)) {
10150      Result =
10151          DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
10152    } else
10153      return;
10154    break;
10155  }
10156
10157  case 'I':
10158  case 'J':
10159  case 'K':
10160  case 'L':
10161  case 'M':
10162  case 'N':
10163    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
10164    if (!C)
10165      return;
10166
10167    // Grab the value and do some validation.
10168    uint64_t CVal = C->getZExtValue();
10169    switch (ConstraintLetter) {
10170    // The I constraint applies only to simple ADD or SUB immediate operands:
10171    // i.e. 0 to 4095 with optional shift by 12
10172    // The J constraint applies only to ADD or SUB immediates that would be
10173    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
10174    // instruction [or vice versa], in other words -1 to -4095 with optional
10175    // left shift by 12.
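    // For example (illustrative): "I" accepts 4095 and 0x123000 (0x123 << 12)
    // but not 4097; "J" is the negated form, e.g. -17 or -0x123000.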
10176    case 'I':
10177      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
10178        break;
10179      return;
10180    case 'J': {
10181      uint64_t NVal = -C->getSExtValue();
10182      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
10183        CVal = C->getSExtValue();
10184        break;
10185      }
10186      return;
10187    }
10188    // The K and L constraints apply *only* to logical immediates, including
10189    // what used to be the MOVI alias for ORR (though the MOVI alias has now
10190    // been removed and MOV should be used). So these constraints have to
10191    // distinguish between bit patterns that are valid 32-bit or 64-bit
10192    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
10193    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
10194    // versa.
10195    case 'K':
10196      if (AArch64_AM::isLogicalImmediate(CVal, 32))
10197        break;
10198      return;
10199    case 'L':
10200      if (AArch64_AM::isLogicalImmediate(CVal, 64))
10201        break;
10202      return;
10203    // The M and N constraints are a superset of K and L respectively, for use
10204    // with the MOV (immediate) alias. As well as the logical immediates they
10205    // also match 32 or 64-bit immediates that can be loaded either using a
10206    // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
10207    // (M) or 64-bit 0x1234000000000000 (N) etc.
10208    // As a note some of this code is liberally stolen from the asm parser.
10209    case 'M': {
10210      if (!isUInt<32>(CVal))
10211        return;
10212      if (AArch64_AM::isLogicalImmediate(CVal, 32))
10213        break;
10214      if ((CVal & 0xFFFF) == CVal)
10215        break;
10216      if ((CVal & 0xFFFF0000ULL) == CVal)
10217        break;
10218      uint64_t NCVal = ~(uint32_t)CVal;
10219      if ((NCVal & 0xFFFFULL) == NCVal)
10220        break;
10221      if ((NCVal & 0xFFFF0000ULL) == NCVal)
10222        break;
10223      return;
10224    }
10225    case 'N': {
10226      if (AArch64_AM::isLogicalImmediate(CVal, 64))
10227        break;
10228      if ((CVal & 0xFFFFULL) == CVal)
10229        break;
10230      if ((CVal & 0xFFFF0000ULL) == CVal)
10231        break;
10232      if ((CVal & 0xFFFF00000000ULL) == CVal)
10233        break;
10234      if ((CVal & 0xFFFF000000000000ULL) == CVal)
10235        break;
10236      uint64_t NCVal = ~CVal;
10237      if ((NCVal & 0xFFFFULL) == NCVal)
10238        break;
10239      if ((NCVal & 0xFFFF0000ULL) == NCVal)
10240        break;
10241      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
10242        break;
10243      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
10244        break;
10245      return;
10246    }
10247    default:
10248      return;
10249    }
10250
10251    // All assembler immediates are 64-bit integers.
10252    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
10253    break;
10254  }
10255
10256  if (Result.getNode()) {
10257    Ops.push_back(Result);
10258    return;
10259  }
10260
10261  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
10262}
10263
10264//===----------------------------------------------------------------------===//
10265//                     AArch64 Advanced SIMD Support
10266//===----------------------------------------------------------------------===//
10267
10268/// WidenVector - Given a value in the V64 register class, produce the
10269/// equivalent value in the V128 register class.
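/// For example, a v2i32 value is returned as the low half of a v4i32 whose
/// upper half is undef.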
10270static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
10271  EVT VT = V64Reg.getValueType();
10272  unsigned NarrowSize = VT.getVectorNumElements();
10273  MVT EltTy = VT.getVectorElementType().getSimpleVT();
10274  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
10275  SDLoc DL(V64Reg);
10276
10277  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
10278                     V64Reg, DAG.getConstant(0, DL, MVT::i64));
10279}
10280
10281/// getExtFactor - Determine the adjustment factor for the position when
10282/// generating an "extract from vector registers" instruction.
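/// For example, the factor is 1 for i8 elements, 2 for i16/f16 and 4 for
/// i32/f32 elements.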
10283static unsigned getExtFactor(SDValue &V) {
10284  EVT EltType = V.getValueType().getVectorElementType();
10285  return EltType.getSizeInBits() / 8;
10286}
10287
10288/// NarrowVector - Given a value in the V128 register class, produce the
10289/// equivalent value in the V64 register class.
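/// For example, a v4i32 value is narrowed to the v2i32 in its low 64-bit
/// (dsub) subregister.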
10290static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
10291  EVT VT = V128Reg.getValueType();
10292  unsigned WideSize = VT.getVectorNumElements();
10293  MVT EltTy = VT.getVectorElementType().getSimpleVT();
10294  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
10295  SDLoc DL(V128Reg);
10296
10297  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
10298}
10299
10300// Gather data to see if the operation can be modelled as a
10301// shuffle in combination with VEXTs.
10302SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
10303                                                  SelectionDAG &DAG) const {
10304  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
10305  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
10306  SDLoc dl(Op);
10307  EVT VT = Op.getValueType();
10308  assert(!VT.isScalableVector() &&
10309         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
10310  unsigned NumElts = VT.getVectorNumElements();
10311
10312  struct ShuffleSourceInfo {
10313    SDValue Vec;
10314    unsigned MinElt;
10315    unsigned MaxElt;
10316
10317    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
10318    // be compatible with the shuffle we intend to construct. As a result
10319    // ShuffleVec will be some sliding window into the original Vec.
10320    SDValue ShuffleVec;
10321
10322    // Code should guarantee that element i in Vec starts at element "WindowBase
10323    // + i * WindowScale" in ShuffleVec.
10324    int WindowBase;
10325    int WindowScale;
10326
10327    ShuffleSourceInfo(SDValue Vec)
10328      : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
10329          ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
10330
10331    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
10332  };
10333
10334  // First gather all vectors used as an immediate source for this BUILD_VECTOR
10335  // node.
10336  SmallVector<ShuffleSourceInfo, 2> Sources;
10337  for (unsigned i = 0; i < NumElts; ++i) {
10338    SDValue V = Op.getOperand(i);
10339    if (V.isUndef())
10340      continue;
10341    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10342             !isa<ConstantSDNode>(V.getOperand(1)) ||
10343             V.getOperand(0).getValueType().isScalableVector()) {
10344      LLVM_DEBUG(
10345          dbgs() << "Reshuffle failed: "
10346                    "a shuffle can only come from building a vector from "
10347                    "various elements of other fixed-width vectors, provided "
10348                    "their indices are constant\n");
10349      return SDValue();
10350    }
10351
10352    // Add this element source to the list if it's not already there.
10353    SDValue SourceVec = V.getOperand(0);
10354    auto Source = find(Sources, SourceVec);
10355    if (Source == Sources.end())
10356      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
10357
10358    // Update the minimum and maximum lane number seen.
10359    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
10360    Source->MinElt = std::min(Source->MinElt, EltNo);
10361    Source->MaxElt = std::max(Source->MaxElt, EltNo);
10362  }
10363
10364  // If we have 3 or 4 sources, try to generate a TBL, which will at least be
10365  // better than moving to/from gpr registers for larger vectors.
10366  if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
10367    // Construct a mask for the tbl. We may need to adjust the index for types
10368    // larger than i8.
10369    SmallVector<unsigned, 16> Mask;
10370    unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
10371    for (unsigned I = 0; I < NumElts; ++I) {
10372      SDValue V = Op.getOperand(I);
10373      if (V.isUndef()) {
10374        for (unsigned OF = 0; OF < OutputFactor; OF++)
10375          Mask.push_back(-1);
10376        continue;
10377      }
10378      // Set the Mask lanes adjusted for the size of the input and output
10379      // lanes. The Mask is always i8, so it will set OutputFactor lanes per
10380      // output element, adjusted in their positions per input and output types.
10381      unsigned Lane = V.getConstantOperandVal(1);
10382      for (unsigned S = 0; S < Sources.size(); S++) {
10383        if (V.getOperand(0) == Sources[S].Vec) {
10384          unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
10385          unsigned InputBase = 16 * S + Lane * InputSize / 8;
10386          for (unsigned OF = 0; OF < OutputFactor; OF++)
10387            Mask.push_back(InputBase + OF);
10388          break;
10389        }
10390      }
10391    }
10392
10393    // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
10394    // v16i8, and the TBLMask.
10395    SmallVector<SDValue, 16> TBLOperands;
10396    TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
10397                                              ? Intrinsic::aarch64_neon_tbl3
10398                                              : Intrinsic::aarch64_neon_tbl4,
10399                                          dl, MVT::i32));
10400    for (unsigned i = 0; i < Sources.size(); i++) {
10401      SDValue Src = Sources[i].Vec;
10402      EVT SrcVT = Src.getValueType();
10403      Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
10404      assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
10405             "Expected a legally typed vector");
10406      if (SrcVT.is64BitVector())
10407        Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
10408                          DAG.getUNDEF(MVT::v8i8));
10409      TBLOperands.push_back(Src);
10410    }
10411
10412    SmallVector<SDValue, 16> TBLMask;
10413    for (unsigned i = 0; i < Mask.size(); i++)
10414      TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
10415    assert((Mask.size() == 8 || Mask.size() == 16) &&
10416           "Expected a v8i8 or v16i8 Mask");
10417    TBLOperands.push_back(DAG.getBuildVector(
10418        Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
10419
10420    SDValue Shuffle =
10421        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
10422                    Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
10423    return DAG.getBitcast(VT, Shuffle);
10424  }
10425
10426  if (Sources.size() > 2) {
10427    LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
10428                      << "sensible when at most two source vectors are "
10429                      << "involved\n");
10430    return SDValue();
10431  }
10432
10433  // Find out the smallest element size among the result and the two sources,
10434  // and use it as the element size to build the shuffle_vector.
10435  EVT SmallestEltTy = VT.getVectorElementType();
10436  for (auto &Source : Sources) {
10437    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
10438    if (SrcEltTy.bitsLT(SmallestEltTy)) {
10439      SmallestEltTy = SrcEltTy;
10440    }
10441  }
10442  unsigned ResMultiplier =
10443      VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
10444  uint64_t VTSize = VT.getFixedSizeInBits();
10445  NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
10446  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
10447
10448  // If the source vector is too wide or too narrow, we may nevertheless be able
10449  // to construct a compatible shuffle either by concatenating it with UNDEF or
10450  // extracting a suitable range of elements.
10451  for (auto &Src : Sources) {
10452    EVT SrcVT = Src.ShuffleVec.getValueType();
10453
10454    TypeSize SrcVTSize = SrcVT.getSizeInBits();
10455    if (SrcVTSize == TypeSize::Fixed(VTSize))
10456      continue;
10457
10458    // This stage of the search produces a source with the same element type as
10459    // the original, but with a total width matching the BUILD_VECTOR output.
10460    EVT EltVT = SrcVT.getVectorElementType();
10461    unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
10462    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
10463
10464    if (SrcVTSize.getFixedValue() < VTSize) {
10465      assert(2 * SrcVTSize == VTSize);
10466      // We can pad out the smaller vector for free, so if it's part of a
10467      // shuffle we just concatenate it with UNDEF to reach the result width.
10468      Src.ShuffleVec =
10469          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
10470                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
10471      continue;
10472    }
10473
10474    if (SrcVTSize.getFixedValue() != 2 * VTSize) {
10475      LLVM_DEBUG(
10476          dbgs() << "Reshuffle failed: result vector too small to extract\n");
10477      return SDValue();
10478    }
10479
10480    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
10481      LLVM_DEBUG(
10482          dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
10483      return SDValue();
10484    }
10485
10486    if (Src.MinElt >= NumSrcElts) {
10487      // The extraction can just take the second half
10488      Src.ShuffleVec =
10489          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10490                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
10491      Src.WindowBase = -NumSrcElts;
10492    } else if (Src.MaxElt < NumSrcElts) {
10493      // The extraction can just take the first half
10494      Src.ShuffleVec =
10495          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10496                      DAG.getConstant(0, dl, MVT::i64));
10497    } else {
10498      // An actual VEXT is needed
10499      SDValue VEXTSrc1 =
10500          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10501                      DAG.getConstant(0, dl, MVT::i64));
10502      SDValue VEXTSrc2 =
10503          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10504                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
10505      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
10506
10507      if (!SrcVT.is64BitVector()) {
10508        LLVM_DEBUG(
10509          dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
10510                    "for SVE vectors.");
10511        return SDValue();
10512      }
10513
10514      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
10515                                   VEXTSrc2,
10516                                   DAG.getConstant(Imm, dl, MVT::i32));
10517      Src.WindowBase = -Src.MinElt;
10518    }
10519  }
10520
10521  // Another possible incompatibility occurs from the vector element types. We
10522  // can fix this by bitcasting the source vectors to the same type we intend
10523  // for the shuffle.
10524  for (auto &Src : Sources) {
10525    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
10526    if (SrcEltTy == SmallestEltTy)
10527      continue;
10528    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
10529    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
10530    Src.WindowScale =
10531        SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
10532    Src.WindowBase *= Src.WindowScale;
10533  }
10534
10535  // Final check before we try to actually produce a shuffle.
10536  LLVM_DEBUG(for (auto Src
10537                  : Sources)
10538                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
10539
10540  // The stars all align, our next step is to produce the mask for the shuffle.
10541  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
10542  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
10543  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
10544    SDValue Entry = Op.getOperand(i);
10545    if (Entry.isUndef())
10546      continue;
10547
10548    auto Src = find(Sources, Entry.getOperand(0));
10549    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
10550
10551    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
10552    // trunc. So only the low std::min(SrcBits, DestBits) bits actually get
10553    // defined in this segment.
10554    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
10555    int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
10556                               VT.getScalarSizeInBits());
10557    int LanesDefined = BitsDefined / BitsPerShuffleLane;
10558
10559    // This source is expected to fill ResMultiplier lanes of the final shuffle,
10560    // starting at the appropriate offset.
10561    int *LaneMask = &Mask[i * ResMultiplier];
10562
10563    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
10564    ExtractBase += NumElts * (Src - Sources.begin());
10565    for (int j = 0; j < LanesDefined; ++j)
10566      LaneMask[j] = ExtractBase + j;
10567  }
10568
10569  // Final check before we try to produce nonsense...
10570  if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
10571    LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
10572    return SDValue();
10573  }
10574
10575  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
10576  for (unsigned i = 0; i < Sources.size(); ++i)
10577    ShuffleOps[i] = Sources[i].ShuffleVec;
10578
10579  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
10580                                         ShuffleOps[1], Mask);
10581  SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
10582
10583  LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
10584             dbgs() << "Reshuffle, creating node: "; V.dump(););
10585
10586  return V;
10587}
10588
10589// Check if an EXT instruction can handle the shuffle mask when the
10590// vector sources of the shuffle are the same.
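// For example, for v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> is handled with
// Imm == 3.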
10591static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
10592  unsigned NumElts = VT.getVectorNumElements();
10593
10594  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
10595  if (M[0] < 0)
10596    return false;
10597
10598  Imm = M[0];
10599
10600  // If this is a VEXT shuffle, the immediate value is the index of the first
10601  // element.  The other shuffle indices must be the successive elements after
10602  // the first one.
10603  unsigned ExpectedElt = Imm;
10604  for (unsigned i = 1; i < NumElts; ++i) {
10605    // Increment the expected index.  If it wraps around, just follow it
10606    // back to index zero and keep going.
10607    ++ExpectedElt;
10608    if (ExpectedElt == NumElts)
10609      ExpectedElt = 0;
10610
10611    if (M[i] < 0)
10612      continue; // ignore UNDEF indices
10613    if (ExpectedElt != static_cast<unsigned>(M[i]))
10614      return false;
10615  }
10616
10617  return true;
10618}
10619
10620// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
10621// v4i16s or v4i32s. This is really a truncate, which we can construct out of
10622// (legal) concats and truncate nodes.
10623static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
10624  if (V.getValueType() != MVT::v16i8)
10625    return SDValue();
10626  assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
10627
10628  for (unsigned X = 0; X < 4; X++) {
10629    // Check the first item in each group is an extract from lane 0 of a v4i32
10630    // or v4i16.
10631    SDValue BaseExt = V.getOperand(X * 4);
10632    if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10633        (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
10634         BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
10635        !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
10636        BaseExt.getConstantOperandVal(1) != 0)
10637      return SDValue();
10638    SDValue Base = BaseExt.getOperand(0);
10639    // And check the other items are extracts from the same vector.
10640    for (unsigned Y = 1; Y < 4; Y++) {
10641      SDValue Ext = V.getOperand(X * 4 + Y);
10642      if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10643          Ext.getOperand(0) != Base ||
10644          !isa<ConstantSDNode>(Ext.getOperand(1)) ||
10645          Ext.getConstantOperandVal(1) != Y)
10646        return SDValue();
10647    }
10648  }
10649
10650  // Turn the buildvector into a series of truncates and concats, which will
10651  // become uzp1s. Any v4i32s we found get truncated to v4i16, which are
10652  // concatenated together to produce 2 v8i16s. These are both truncated and
10653  // concatenated together.
10654  SDLoc DL(V);
10655  SDValue Trunc[4] = {
10656      V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
10657      V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
10658  for (SDValue &V : Trunc)
10659    if (V.getValueType() == MVT::v4i32)
10660      V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
10661  SDValue Concat0 =
10662      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
10663  SDValue Concat1 =
10664      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
10665  SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
10666  SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
10667  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
10668}
10669
10670/// Check if a vector shuffle corresponds to a DUP instruction with a larger
10671/// element width than the vector lane type. If that is the case, the function
10672/// returns true and writes the value of the DUP instruction lane operand into
10673/// DupLaneOp.
10674static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
10675                          unsigned &DupLaneOp) {
10676  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
10677         "Only possible block sizes for wide DUP are: 16, 32, 64");
10678
10679  if (BlockSize <= VT.getScalarSizeInBits())
10680    return false;
10681  if (BlockSize % VT.getScalarSizeInBits() != 0)
10682    return false;
10683  if (VT.getSizeInBits() % BlockSize != 0)
10684    return false;
10685
10686  size_t SingleVecNumElements = VT.getVectorNumElements();
10687  size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
10688  size_t NumBlocks = VT.getSizeInBits() / BlockSize;
10689
10690  // We are looking for masks like
10691  // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
10692  // might be replaced by 'undefined'. BlockElts will eventually contain the
10693  // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
10694  // for the above examples).
10695  SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
10696  for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
10697    for (size_t I = 0; I < NumEltsPerBlock; I++) {
10698      int Elt = M[BlockIndex * NumEltsPerBlock + I];
10699      if (Elt < 0)
10700        continue;
10701      // For now we don't support shuffles that use the second operand
10702      if ((unsigned)Elt >= SingleVecNumElements)
10703        return false;
10704      if (BlockElts[I] < 0)
10705        BlockElts[I] = Elt;
10706      else if (BlockElts[I] != Elt)
10707        return false;
10708    }
10709
10710  // We found a candidate block (possibly with some undefs). It must be a
10711  // sequence of consecutive integers starting with a value divisible by
10712  // NumEltsPerBlock with some values possibly replaced by undef-s.
10713
10714  // Find first non-undef element
10715  auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
10716  assert(FirstRealEltIter != BlockElts.end() &&
10717         "Shuffle with all-undefs must have been caught by previous cases, "
10718         "e.g. isSplat()");
10719  if (FirstRealEltIter == BlockElts.end()) {
10720    DupLaneOp = 0;
10721    return true;
10722  }
10723
10724  // Index of FirstRealElt in BlockElts
10725  size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
10726
10727  if ((unsigned)*FirstRealEltIter < FirstRealIndex)
10728    return false;
10729  // BlockElts[0] must have the following value if it isn't undef:
10730  size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
10731
10732  // Check the first element
10733  if (Elt0 % NumEltsPerBlock != 0)
10734    return false;
10735  // Check that the sequence indeed consists of consecutive integers (modulo
10736  // undefs)
10737  for (size_t I = 0; I < NumEltsPerBlock; I++)
10738    if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
10739      return false;
10740
10741  DupLaneOp = Elt0 / NumEltsPerBlock;
10742  return true;
10743}
10744
10745// Check if an EXT instruction can handle the shuffle mask when the
10746// vector sources of the shuffle are different.
10747static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
10748                      unsigned &Imm) {
10749  // Look for the first non-undef element.
10750  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
10751
10752  // Benefit from APInt to handle overflow when calculating expected element.
10753  unsigned NumElts = VT.getVectorNumElements();
10754  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
10755  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
10756  // The following shuffle indices must be the successive elements after the
10757  // first real element.
10758  bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
10759    return Elt != ExpectedElt++ && Elt != -1;
10760  });
10761  if (FoundWrongElt)
10762    return false;
10763
10764  // The index of an EXT is the first element if it is not UNDEF.
10765  // Watch out for the beginning UNDEFs. The EXT index should be the expected
10766  // value of the first element.  E.g.
10767  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
10768  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
10769  // ExpectedElt is the last mask index plus 1.
10770  Imm = ExpectedElt.getZExtValue();
10771
10772  // There are two different cases that require reversing the input vectors.
10773  // For example, for vector <4 x i32> we have the following cases,
10774  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
10775  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
10776  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
10777  // to reverse two input vectors.
10778  if (Imm < NumElts)
10779    ReverseEXT = true;
10780  else
10781    Imm -= NumElts;
10782
10783  return true;
10784}
10785
10786/// isREVMask - Check if a vector shuffle corresponds to a REV
10787/// instruction with the specified blocksize.  (The order of the elements
10788/// within each block of the vector is reversed.)
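/// For example, with v8i16 and BlockSize == 32, the mask
/// <1, 0, 3, 2, 5, 4, 7, 6> is a REV32 mask.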
10789static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
10790  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
10791          BlockSize == 128) &&
10792         "Only possible block sizes for REV are: 16, 32, 64, 128");
10793
10794  unsigned EltSz = VT.getScalarSizeInBits();
10795  unsigned NumElts = VT.getVectorNumElements();
10796  unsigned BlockElts = M[0] + 1;
10797  // If the first shuffle index is UNDEF, be optimistic.
10798  if (M[0] < 0)
10799    BlockElts = BlockSize / EltSz;
10800
10801  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
10802    return false;
10803
10804  for (unsigned i = 0; i < NumElts; ++i) {
10805    if (M[i] < 0)
10806      continue; // ignore UNDEF indices
10807    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
10808      return false;
10809  }
10810
10811  return true;
10812}
10813
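// Check if a vector shuffle corresponds to a ZIP instruction; e.g. for v4i32
// the mask <0, 4, 1, 5> selects ZIP1 and <2, 6, 3, 7> selects ZIP2.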
10814static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10815  unsigned NumElts = VT.getVectorNumElements();
10816  if (NumElts % 2 != 0)
10817    return false;
10818  WhichResult = (M[0] == 0 ? 0 : 1);
10819  unsigned Idx = WhichResult * NumElts / 2;
10820  for (unsigned i = 0; i != NumElts; i += 2) {
10821    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
10822        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
10823      return false;
10824    Idx += 1;
10825  }
10826
10827  return true;
10828}
10829
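// Check if a vector shuffle corresponds to a UZP instruction; e.g. for v4i32
// the mask <0, 2, 4, 6> selects UZP1 and <1, 3, 5, 7> selects UZP2.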
10830static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10831  unsigned NumElts = VT.getVectorNumElements();
10832  WhichResult = (M[0] == 0 ? 0 : 1);
10833  for (unsigned i = 0; i != NumElts; ++i) {
10834    if (M[i] < 0)
10835      continue; // ignore UNDEF indices
10836    if ((unsigned)M[i] != 2 * i + WhichResult)
10837      return false;
10838  }
10839
10840  return true;
10841}
10842
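// Check if a vector shuffle corresponds to a TRN instruction; e.g. for v4i32
// the mask <0, 4, 2, 6> selects TRN1 and <1, 5, 3, 7> selects TRN2.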
10843static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10844  unsigned NumElts = VT.getVectorNumElements();
10845  if (NumElts % 2 != 0)
10846    return false;
10847  WhichResult = (M[0] == 0 ? 0 : 1);
10848  for (unsigned i = 0; i < NumElts; i += 2) {
10849    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
10850        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
10851      return false;
10852  }
10853  return true;
10854}
10855
10856/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
10857/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
10858/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
10859static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10860  unsigned NumElts = VT.getVectorNumElements();
10861  if (NumElts % 2 != 0)
10862    return false;
10863  WhichResult = (M[0] == 0 ? 0 : 1);
10864  unsigned Idx = WhichResult * NumElts / 2;
10865  for (unsigned i = 0; i != NumElts; i += 2) {
10866    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
10867        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
10868      return false;
10869    Idx += 1;
10870  }
10871
10872  return true;
10873}
10874
10875/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
10876/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
10877/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
10878static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10879  unsigned Half = VT.getVectorNumElements() / 2;
10880  WhichResult = (M[0] == 0 ? 0 : 1);
10881  for (unsigned j = 0; j != 2; ++j) {
10882    unsigned Idx = WhichResult;
10883    for (unsigned i = 0; i != Half; ++i) {
10884      int MIdx = M[i + j * Half];
10885      if (MIdx >= 0 && (unsigned)MIdx != Idx)
10886        return false;
10887      Idx += 2;
10888    }
10889  }
10890
10891  return true;
10892}
10893
10894/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
10895/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
10896/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
10897static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10898  unsigned NumElts = VT.getVectorNumElements();
10899  if (NumElts % 2 != 0)
10900    return false;
10901  WhichResult = (M[0] == 0 ? 0 : 1);
10902  for (unsigned i = 0; i < NumElts; i += 2) {
10903    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
10904        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
10905      return false;
10906  }
10907  return true;
10908}
10909
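// Check for an INS-style mask: all lanes but one match an identity copy of
// one input (the destination), and the single mismatching lane ("Anomaly") is
// the one that gets overwritten by the inserted element.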
10910static bool isINSMask(ArrayRef<int> M, int NumInputElements,
10911                      bool &DstIsLeft, int &Anomaly) {
10912  if (M.size() != static_cast<size_t>(NumInputElements))
10913    return false;
10914
10915  int NumLHSMatch = 0, NumRHSMatch = 0;
10916  int LastLHSMismatch = -1, LastRHSMismatch = -1;
10917
10918  for (int i = 0; i < NumInputElements; ++i) {
10919    if (M[i] == -1) {
10920      ++NumLHSMatch;
10921      ++NumRHSMatch;
10922      continue;
10923    }
10924
10925    if (M[i] == i)
10926      ++NumLHSMatch;
10927    else
10928      LastLHSMismatch = i;
10929
10930    if (M[i] == i + NumInputElements)
10931      ++NumRHSMatch;
10932    else
10933      LastRHSMismatch = i;
10934  }
10935
10936  if (NumLHSMatch == NumInputElements - 1) {
10937    DstIsLeft = true;
10938    Anomaly = LastLHSMismatch;
10939    return true;
10940  } else if (NumRHSMatch == NumInputElements - 1) {
10941    DstIsLeft = false;
10942    Anomaly = LastRHSMismatch;
10943    return true;
10944  }
10945
10946  return false;
10947}
10948
10949static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
10950  if (VT.getSizeInBits() != 128)
10951    return false;
10952
10953  unsigned NumElts = VT.getVectorNumElements();
10954
10955  for (int I = 0, E = NumElts / 2; I != E; I++) {
10956    if (Mask[I] != I)
10957      return false;
10958  }
10959
10960  int Offset = NumElts / 2;
10961  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
10962    if (Mask[I] != I + SplitLHS * Offset)
10963      return false;
10964  }
10965
10966  return true;
10967}
10968
10969static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
10970  SDLoc DL(Op);
10971  EVT VT = Op.getValueType();
10972  SDValue V0 = Op.getOperand(0);
10973  SDValue V1 = Op.getOperand(1);
10974  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
10975
10976  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
10977      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
10978    return SDValue();
10979
10980  bool SplitV0 = V0.getValueSizeInBits() == 128;
10981
10982  if (!isConcatMask(Mask, VT, SplitV0))
10983    return SDValue();
10984
10985  EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
10986  if (SplitV0) {
10987    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
10988                     DAG.getConstant(0, DL, MVT::i64));
10989  }
10990  if (V1.getValueSizeInBits() == 128) {
10991    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
10992                     DAG.getConstant(0, DL, MVT::i64));
10993  }
10994  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
10995}
10996
10997/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
10998/// the specified operations to build the shuffle. ID is the perfect-shuffle
10999/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect
11000/// shuffle table entry and LHS/RHS are the immediate inputs for this stage of
11001/// the shuffle.
11002static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
11003                                      SDValue V2, unsigned PFEntry, SDValue LHS,
11004                                      SDValue RHS, SelectionDAG &DAG,
11005                                      const SDLoc &dl) {
11006  unsigned OpNum = (PFEntry >> 26) & 0x0F;
11007  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
11008  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
11009
11010  enum {
11011    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
11012    OP_VREV,
11013    OP_VDUP0,
11014    OP_VDUP1,
11015    OP_VDUP2,
11016    OP_VDUP3,
11017    OP_VEXT1,
11018    OP_VEXT2,
11019    OP_VEXT3,
11020    OP_VUZPL,  // VUZP, left result
11021    OP_VUZPR,  // VUZP, right result
11022    OP_VZIPL,  // VZIP, left result
11023    OP_VZIPR,  // VZIP, right result
11024    OP_VTRNL,  // VTRN, left result
11025    OP_VTRNR,  // VTRN, right result
11026    OP_MOVLANE // Move lane. RHSID is the lane to move into
11027  };
11028
11029  if (OpNum == OP_COPY) {
11030    if (LHSID == (1 * 9 + 2) * 9 + 3)
11031      return LHS;
11032    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
11033    return RHS;
11034  }
11035
11036  if (OpNum == OP_MOVLANE) {
11037    // Decompose a PerfectShuffle ID to get the Mask for lane Elt
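    // Each ID packs the four lanes as base-9 digits (lane 0 is the most
    // significant digit); digits 0-7 are lane indices and 8 means undef.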
11038    auto getPFIDLane = [](unsigned ID, int Elt) -> int {
11039      assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
11040      Elt = 3 - Elt;
11041      while (Elt > 0) {
11042        ID /= 9;
11043        Elt--;
11044      }
11045      return (ID % 9 == 8) ? -1 : ID % 9;
11046    };
11047
11048    // For OP_MOVLANE shuffles, the RHSID represents the lane to move into.
11049    // The lane to move from is taken from the PFID, which always refers to
11050    // the original vectors (V1 or V2).
11051    SDValue OpLHS = GeneratePerfectShuffle(
11052        LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
11053    EVT VT = OpLHS.getValueType();
11054    assert(RHSID < 8 && "Expected a lane index for RHSID!");
11055    unsigned ExtLane = 0;
11056    SDValue Input;
11057
11058    // An OP_MOVLANE is either a D mov (if bit 0x4 is set) or an S mov. D movs
11059    // convert into a higher type.
11060    if (RHSID & 0x4) {
11061      int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
11062      if (MaskElt == -1)
11063        MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
11064      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
11065      ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
11066      Input = MaskElt < 2 ? V1 : V2;
11067      if (VT.getScalarSizeInBits() == 16) {
11068        Input = DAG.getBitcast(MVT::v2f32, Input);
11069        OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
11070      } else {
11071        assert(VT.getScalarSizeInBits() == 32 &&
11072               "Expected 16 or 32 bit shuffle elements");
11073        Input = DAG.getBitcast(MVT::v2f64, Input);
11074        OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
11075      }
11076    } else {
11077      int MaskElt = getPFIDLane(ID, RHSID);
11078      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
11079      ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
11080      Input = MaskElt < 4 ? V1 : V2;
11081      // Be careful about creating illegal types. Use f16 instead of i16.
11082      if (VT == MVT::v4i16) {
11083        Input = DAG.getBitcast(MVT::v4f16, Input);
11084        OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
11085      }
11086    }
11087    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
11088                              Input.getValueType().getVectorElementType(),
11089                              Input, DAG.getVectorIdxConstant(ExtLane, dl));
11090    SDValue Ins =
11091        DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
11092                    Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
11093    return DAG.getBitcast(VT, Ins);
11094  }
11095
11096  SDValue OpLHS, OpRHS;
11097  OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
11098                                 RHS, DAG, dl);
11099  OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
11100                                 RHS, DAG, dl);
11101  EVT VT = OpLHS.getValueType();
11102
11103  switch (OpNum) {
11104  default:
11105    llvm_unreachable("Unknown shuffle opcode!");
11106  case OP_VREV:
11107    // VREV divides the vector in half and swaps within the half.
11108    if (VT.getVectorElementType() == MVT::i32 ||
11109        VT.getVectorElementType() == MVT::f32)
11110      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
11111    // vrev <4 x i16> -> REV32
11112    if (VT.getVectorElementType() == MVT::i16 ||
11113        VT.getVectorElementType() == MVT::f16 ||
11114        VT.getVectorElementType() == MVT::bf16)
11115      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
11116    // vrev <4 x i8> -> REV16
11117    assert(VT.getVectorElementType() == MVT::i8);
11118    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
11119  case OP_VDUP0:
11120  case OP_VDUP1:
11121  case OP_VDUP2:
11122  case OP_VDUP3: {
11123    EVT EltTy = VT.getVectorElementType();
11124    unsigned Opcode;
11125    if (EltTy == MVT::i8)
11126      Opcode = AArch64ISD::DUPLANE8;
11127    else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
11128      Opcode = AArch64ISD::DUPLANE16;
11129    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
11130      Opcode = AArch64ISD::DUPLANE32;
11131    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
11132      Opcode = AArch64ISD::DUPLANE64;
11133    else
11134      llvm_unreachable("Invalid vector element type?");
11135
11136    if (VT.getSizeInBits() == 64)
11137      OpLHS = WidenVector(OpLHS, DAG);
11138    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
11139    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
11140  }
11141  case OP_VEXT1:
11142  case OP_VEXT2:
11143  case OP_VEXT3: {
11144    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
11145    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
11146                       DAG.getConstant(Imm, dl, MVT::i32));
11147  }
11148  case OP_VUZPL:
11149    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
11150                       OpRHS);
11151  case OP_VUZPR:
11152    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
11153                       OpRHS);
11154  case OP_VZIPL:
11155    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
11156                       OpRHS);
11157  case OP_VZIPR:
11158    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
11159                       OpRHS);
11160  case OP_VTRNL:
11161    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
11162                       OpRHS);
11163  case OP_VTRNR:
11164    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
11165                       OpRHS);
11166  }
11167}
11168
11169static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
11170                           SelectionDAG &DAG) {
11171  // Check to see if we can use the TBL instruction.
11172  SDValue V1 = Op.getOperand(0);
11173  SDValue V2 = Op.getOperand(1);
11174  SDLoc DL(Op);
11175
11176  EVT EltVT = Op.getValueType().getVectorElementType();
11177  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
11178
11179  bool Swap = false;
11180  if (V1.isUndef() || isZerosVector(V1.getNode())) {
11181    std::swap(V1, V2);
11182    Swap = true;
11183  }
11184
11185  // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
11186  // out-of-range values with 0s. We do need to make sure that any out-of-range
11187  // values are really out-of-range for a v16i8 vector.
11188  bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
11189  MVT IndexVT = MVT::v8i8;
11190  unsigned IndexLen = 8;
11191  if (Op.getValueSizeInBits() == 128) {
11192    IndexVT = MVT::v16i8;
11193    IndexLen = 16;
11194  }
11195
11196  SmallVector<SDValue, 8> TBLMask;
11197  for (int Val : ShuffleMask) {
11198    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
11199      unsigned Offset = Byte + Val * BytesPerElt;
11200      if (Swap)
11201        Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
11202      if (IsUndefOrZero && Offset >= IndexLen)
11203        Offset = 255;
11204      TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
11205    }
11206  }
11207
11208  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
11209  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
11210
11211  SDValue Shuffle;
11212  if (IsUndefOrZero) {
11213    if (IndexLen == 8)
11214      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
11215    Shuffle = DAG.getNode(
11216        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
11217        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
11218        DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
11219  } else {
11220    if (IndexLen == 8) {
11221      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
11222      Shuffle = DAG.getNode(
11223          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
11224          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
11225          DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
11226    } else {
11227      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
11228      // cannot currently represent the register constraints on the input
11229      // table registers.
11230      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
11231      //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
11232      //                   IndexLen));
11233      Shuffle = DAG.getNode(
11234          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
11235          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
11236          V2Cst,
11237          DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
11238    }
11239  }
11240  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
11241}
11242
11243static unsigned getDUPLANEOp(EVT EltType) {
11244  if (EltType == MVT::i8)
11245    return AArch64ISD::DUPLANE8;
11246  if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
11247    return AArch64ISD::DUPLANE16;
11248  if (EltType == MVT::i32 || EltType == MVT::f32)
11249    return AArch64ISD::DUPLANE32;
11250  if (EltType == MVT::i64 || EltType == MVT::f64)
11251    return AArch64ISD::DUPLANE64;
11252
11253  llvm_unreachable("Invalid vector element type?");
11254}
11255
11256static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
11257                            unsigned Opcode, SelectionDAG &DAG) {
11258  // Try to eliminate a bitcasted extract subvector before a DUPLANE.
11259  auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
11260    // Match: dup (bitcast (extract_subv X, C)), LaneC
11261    if (BitCast.getOpcode() != ISD::BITCAST ||
11262        BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
11263      return false;
11264
11265    // The extract index must align in the destination type. That may not
11266    // happen if the bitcast is from narrow to wide type.
11267    SDValue Extract = BitCast.getOperand(0);
11268    unsigned ExtIdx = Extract.getConstantOperandVal(1);
11269    unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
11270    unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
11271    unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
11272    if (ExtIdxInBits % CastedEltBitWidth != 0)
11273      return false;
11274
11275    // Can't handle cases where vector size is not 128-bit
11276    if (!Extract.getOperand(0).getValueType().is128BitVector())
11277      return false;
11278
11279    // Update the lane value by offsetting with the scaled extract index.
11280    LaneC += ExtIdxInBits / CastedEltBitWidth;
11281
11282    // Determine the casted vector type of the wide vector input.
11283    // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
11284    // Examples:
11285    // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
11286    // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
11287    unsigned SrcVecNumElts =
11288        Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
11289    CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
11290                              SrcVecNumElts);
11291    return true;
11292  };
11293  MVT CastVT;
11294  if (getScaledOffsetDup(V, Lane, CastVT)) {
11295    V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
11296  } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
11297             V.getOperand(0).getValueType().is128BitVector()) {
11298    // The lane is incremented by the index of the extract.
11299    // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
11300    Lane += V.getConstantOperandVal(1);
11301    V = V.getOperand(0);
11302  } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
11303    // The lane is decremented if we are splatting from the 2nd operand.
11304    // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
11305    unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
11306    Lane -= Idx * VT.getVectorNumElements() / 2;
11307    V = WidenVector(V.getOperand(Idx), DAG);
11308  } else if (VT.getSizeInBits() == 64) {
11309    // Widen the operand to 128-bit register with undef.
11310    V = WidenVector(V, DAG);
11311  }
11312  return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
11313}
11314
11315// Return true if we can get a new shuffle mask by checking whether every pair
11316// of adjacent mask values in the parameter array is consecutive and starts
11317// from an even number.
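// For example, <6, 7, 2, 3> widens to <3, 1>, while <0, 1, 5, 4> is rejected.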
11318static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
11319                           SmallVectorImpl<int> &NewMask) {
11320  unsigned NumElts = VT.getVectorNumElements();
11321  if (NumElts % 2 != 0)
11322    return false;
11323
11324  NewMask.clear();
11325  for (unsigned i = 0; i < NumElts; i += 2) {
11326    int M0 = M[i];
11327    int M1 = M[i + 1];
11328
11329    // If both elements are undef, new mask is undef too.
11330    if (M0 == -1 && M1 == -1) {
11331      NewMask.push_back(-1);
11332      continue;
11333    }
11334
11335    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
11336      NewMask.push_back(M1 / 2);
11337      continue;
11338    }
11339
11340    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
11341      NewMask.push_back(M0 / 2);
11342      continue;
11343    }
11344
11345    NewMask.clear();
11346    return false;
11347  }
11348
11349  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
11350  return true;
11351}
11352
11353// Try to widen element type to get a new mask value for a better permutation
11354// sequence, so that we can use NEON shuffle instructions, such as ZIP1/2,
11355// UZP1/2, TRN1/2, REV, INS, etc.
11356// For example:
11357//  shufflevector <4 x i32> %a, <4 x i32> %b,
11358//                <4 x i32> <i32 6, i32 7, i32 2, i32 3>
11359// is equivalent to:
11360//  shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
11361// Finally, we can get:
11362//  mov     v0.d[0], v1.d[1]
11363static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
11364  SDLoc DL(Op);
11365  EVT VT = Op.getValueType();
11366  EVT ScalarVT = VT.getVectorElementType();
11367  unsigned ElementSize = ScalarVT.getFixedSizeInBits();
11368  SDValue V0 = Op.getOperand(0);
11369  SDValue V1 = Op.getOperand(1);
11370  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
11371
11372  // When combining adjacent elements, like two i16s -> i32 or two i32s -> i64,
11373  // we need to make sure the wider element type is legal. Thus, ElementSize
11374  // should be no larger than 32 bits, and the i1 type should also be excluded.
11375  if (ElementSize > 32 || ElementSize == 1)
11376    return SDValue();
11377
11378  SmallVector<int, 8> NewMask;
11379  if (isWideTypeMask(Mask, VT, NewMask)) {
11380    MVT NewEltVT = VT.isFloatingPoint()
11381                       ? MVT::getFloatingPointVT(ElementSize * 2)
11382                       : MVT::getIntegerVT(ElementSize * 2);
11383    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11384    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11385      V0 = DAG.getBitcast(NewVT, V0);
11386      V1 = DAG.getBitcast(NewVT, V1);
11387      return DAG.getBitcast(VT,
11388                            DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
11389    }
11390  }
11391
11392  return SDValue();
11393}
11394
11395// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
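// Mask indices that select from the second tbl2 are rebased by 32, since that
// tbl2's table registers become the third and fourth table registers of the
// tbl4.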
11396static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
11397                                               ArrayRef<int> ShuffleMask,
11398                                               SelectionDAG &DAG) {
11399  SDValue Tbl1 = Op->getOperand(0);
11400  SDValue Tbl2 = Op->getOperand(1);
11401  SDLoc dl(Op);
11402  SDValue Tbl2ID =
11403      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
11404
11405  EVT VT = Op.getValueType();
11406  if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
11407      Tbl1->getOperand(0) != Tbl2ID ||
11408      Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
11409      Tbl2->getOperand(0) != Tbl2ID)
11410    return SDValue();
11411
11412  if (Tbl1->getValueType(0) != MVT::v16i8 ||
11413      Tbl2->getValueType(0) != MVT::v16i8)
11414    return SDValue();
11415
11416  SDValue Mask1 = Tbl1->getOperand(3);
11417  SDValue Mask2 = Tbl2->getOperand(3);
11418  SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
11419  for (unsigned I = 0; I < 16; I++) {
11420    if (ShuffleMask[I] < 16)
11421      TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
11422    else {
11423      auto *C =
11424          dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
11425      if (!C)
11426        return SDValue();
11427      TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
11428    }
11429  }
11430
11431  SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
11432  SDValue ID =
11433      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
11434
11435  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
11436                     {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
11437                      Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
11438}
11439
11440// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend in zeros,
11441// but we don't have an appropriate instruction,
11442// so custom-lower it as ZIP1-with-zeros.
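// For example, (v4i16 zero_extend_vector_inreg (v8i8 X)) becomes a bitcast of
// (v8i8 ZIP1 X, zeroes) to v4i16.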
11443SDValue
11444AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
11445                                                     SelectionDAG &DAG) const {
11446  SDLoc dl(Op);
11447  EVT VT = Op.getValueType();
11448  SDValue SrcOp = Op.getOperand(0);
11449  EVT SrcVT = SrcOp.getValueType();
11450  assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
11451         "Unexpected extension factor.");
11452  unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
11453  // FIXME: support multi-step zipping?
11454  if (Scale != 2)
11455    return SDValue();
11456  SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
11457  return DAG.getBitcast(VT,
11458                        DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
11459}
11460
11461SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
11462                                                   SelectionDAG &DAG) const {
11463  SDLoc dl(Op);
11464  EVT VT = Op.getValueType();
11465
11466  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
11467
11468  if (useSVEForFixedLengthVectorVT(VT,
11469                                   Subtarget->forceStreamingCompatibleSVE()))
11470    return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
11471
11472  // Convert shuffles that are directly supported on NEON to target-specific
11473  // DAG nodes, instead of keeping them as shuffles and matching them again
11474  // during code selection.  This is more efficient and avoids the possibility
11475  // of inconsistencies between legalization and selection.
11476  ArrayRef<int> ShuffleMask = SVN->getMask();
11477
11478  SDValue V1 = Op.getOperand(0);
11479  SDValue V2 = Op.getOperand(1);
11480
11481  assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
11482  assert(ShuffleMask.size() == VT.getVectorNumElements() &&
11483         "Unexpected VECTOR_SHUFFLE mask size!");
11484
11485  if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
11486    return Res;
11487
11488  if (SVN->isSplat()) {
11489    int Lane = SVN->getSplatIndex();
11490    // If this is an undef splat, generate it via "just" vdup, if possible.
11491    if (Lane == -1)
11492      Lane = 0;
11493
11494    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
11495      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
11496                         V1.getOperand(0));
11497    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
11498    // constant. If so, we can just reference the lane's definition directly.
11499    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
11500        !isa<ConstantSDNode>(V1.getOperand(Lane)))
11501      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
11502
11503    // Otherwise, duplicate from the lane of the input vector.
11504    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
11505    return constructDup(V1, Lane, dl, VT, Opcode, DAG);
11506  }
11507
11508  // Check if the mask matches a DUP for a wider element
11509  for (unsigned LaneSize : {64U, 32U, 16U}) {
11510    unsigned Lane = 0;
11511    if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
11512      unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
11513                                       : LaneSize == 32 ? AArch64ISD::DUPLANE32
11514                                                        : AArch64ISD::DUPLANE16;
11515      // Cast V1 to an integer vector with required lane size
11516      MVT NewEltTy = MVT::getIntegerVT(LaneSize);
11517      unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
11518      MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
11519      V1 = DAG.getBitcast(NewVecTy, V1);
11520      // Construct the DUP instruction
11521      V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
11522      // Cast back to the original type
11523      return DAG.getBitcast(VT, V1);
11524    }
11525  }
11526
11527  if (isREVMask(ShuffleMask, VT, 64))
11528    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
11529  if (isREVMask(ShuffleMask, VT, 32))
11530    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
11531  if (isREVMask(ShuffleMask, VT, 16))
11532    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
11533
11534  if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
11535       (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
11536      ShuffleVectorInst::isReverseMask(ShuffleMask)) {
11537    SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
11538    return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
11539                       DAG.getConstant(8, dl, MVT::i32));
11540  }
11541
11542  bool ReverseEXT = false;
11543  unsigned Imm;
11544  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
11545    if (ReverseEXT)
11546      std::swap(V1, V2);
11547    Imm *= getExtFactor(V1);
11548    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
11549                       DAG.getConstant(Imm, dl, MVT::i32));
11550  } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
11551    Imm *= getExtFactor(V1);
11552    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
11553                       DAG.getConstant(Imm, dl, MVT::i32));
11554  }
11555
11556  unsigned WhichResult;
11557  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
11558    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
11559    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
11560  }
11561  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
11562    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
11563    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
11564  }
11565  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
11566    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
11567    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
11568  }
11569
11570  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
11571    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
11572    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
11573  }
11574  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
11575    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
11576    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
11577  }
11578  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
11579    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
11580    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
11581  }
11582
11583  if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
11584    return Concat;
11585
11586  bool DstIsLeft;
11587  int Anomaly;
11588  int NumInputElements = V1.getValueType().getVectorNumElements();
11589  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
11590    SDValue DstVec = DstIsLeft ? V1 : V2;
11591    SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
11592
11593    SDValue SrcVec = V1;
11594    int SrcLane = ShuffleMask[Anomaly];
11595    if (SrcLane >= NumInputElements) {
11596      SrcVec = V2;
11597      SrcLane -= VT.getVectorNumElements();
11598    }
11599    SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
11600
11601    EVT ScalarVT = VT.getVectorElementType();
11602
11603    if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
11604      ScalarVT = MVT::i32;
11605
11606    return DAG.getNode(
11607        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
11608        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
11609        DstLaneV);
11610  }
11611
11612  if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
11613    return NewSD;
11614
11615  // If the shuffle is not directly supported and it has 4 elements, use
11616  // the PerfectShuffle-generated table to synthesize it from other shuffles.
11617  unsigned NumElts = VT.getVectorNumElements();
11618  if (NumElts == 4) {
11619    unsigned PFIndexes[4];
11620    for (unsigned i = 0; i != 4; ++i) {
11621      if (ShuffleMask[i] < 0)
11622        PFIndexes[i] = 8;
11623      else
11624        PFIndexes[i] = ShuffleMask[i];
11625    }
11626
11627    // Compute the index in the perfect shuffle table.
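    // Each mask entry is one of nine values (lanes 0-7 of the two inputs, or
    // 8 for undef), so the index is just the four entries read as a base-9
    // number.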
11628    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
11629                            PFIndexes[2] * 9 + PFIndexes[3];
11630    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
11631    return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
11632                                  dl);
11633  }
11634
11635  return GenerateTBL(Op, ShuffleMask, DAG);
11636}
11637
11638SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
11639                                                 SelectionDAG &DAG) const {
11640  EVT VT = Op.getValueType();
11641
11642  if (useSVEForFixedLengthVectorVT(VT,
11643                                   Subtarget->forceStreamingCompatibleSVE()))
11644    return LowerToScalableOp(Op, DAG);
11645
11646  assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
11647         "Unexpected vector type!");
11648
11649  // We can handle the constant cases during isel.
11650  if (isa<ConstantSDNode>(Op.getOperand(0)))
11651    return Op;
11652
11653  // There isn't a natural way to handle the general i1 case, so we use some
11654  // trickery with whilelo.
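  // A true splat value sign-extends to all-ones in i64 (i.e. UINT64_MAX), so
  // whilelo(0, UINT64_MAX) produces an all-active predicate, while a false
  // splat value gives zero and an all-inactive predicate.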
11655  SDLoc DL(Op);
11656  SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
11657  SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
11658                         DAG.getValueType(MVT::i1));
11659  SDValue ID =
11660      DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
11661  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11662  if (VT == MVT::nxv1i1)
11663    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
11664                       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
11665                                   Zero, SplatVal),
11666                       Zero);
11667  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
11668}
11669
11670SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
11671                                             SelectionDAG &DAG) const {
11672  SDLoc DL(Op);
11673
11674  EVT VT = Op.getValueType();
11675  if (!isTypeLegal(VT) || !VT.isScalableVector())
11676    return SDValue();
11677
11678  // Current lowering only supports the SVE-ACLE types.
11679  if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
11680    return SDValue();
11681
  // The DUPQ operation is independent of element type so normalise to i64s.
11683  SDValue Idx128 = Op.getOperand(2);
11684
11685  // DUPQ can be used when idx is in range.
11686  auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
11687  if (CIdx && (CIdx->getZExtValue() <= 3)) {
11688    SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
11689    return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
11690  }
11691
11692  SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
11693
11694  // The ACLE says this must produce the same result as:
11695  //   svtbl(data, svadd_x(svptrue_b64(),
11696  //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
11697  //                       index * 2))
11698  SDValue One = DAG.getConstant(1, DL, MVT::i64);
11699  SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
11700
11701  // create the vector 0,1,0,1,...
11702  SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
11703  SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
11704
11705  // create the vector idx64,idx64+1,idx64,idx64+1,...
11706  SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
11707  SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
11708  SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
11709
11710  // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
11711  SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
11712  return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
11713}
11714
11715
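// Collect the constant and undef bits of a splat BUILD_VECTOR into two
// vector-wide masks. Returns false if BVN is not a constant splat.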
11716static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
11717                               APInt &UndefBits) {
11718  EVT VT = BVN->getValueType(0);
11719  APInt SplatBits, SplatUndef;
11720  unsigned SplatBitSize;
11721  bool HasAnyUndefs;
11722  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
11723    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
11724
11725    for (unsigned i = 0; i < NumSplats; ++i) {
11726      CnstBits <<= SplatBitSize;
11727      UndefBits <<= SplatBitSize;
11728      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
11729      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
11730    }
11731
11732    return true;
11733  }
11734
11735  return false;
11736}
11737
11738// Try 64-bit splatted SIMD immediate.
11739static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11740                                 const APInt &Bits) {
11741  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11742    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11743    EVT VT = Op.getValueType();
11744    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
11745
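    // ModImm type 10 is a 64-bit value in which every byte is either 0x00 or
    // 0xff, which is the only 64-bit pattern MOVI can materialize directly.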
11746    if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
11747      Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
11748
11749      SDLoc dl(Op);
11750      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11751                                DAG.getConstant(Value, dl, MVT::i32));
11752      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11753    }
11754  }
11755
11756  return SDValue();
11757}
11758
11759// Try 32-bit splatted SIMD immediate.
11760static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11761                                  const APInt &Bits,
11762                                  const SDValue *LHS = nullptr) {
11763  EVT VT = Op.getValueType();
11764  if (VT.isFixedLengthVector() &&
11765      DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE())
11766    return SDValue();
11767
11768  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11769    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11770    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
11771    bool isAdvSIMDModImm = false;
11772    uint64_t Shift;
11773
11774    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
11775      Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
11776      Shift = 0;
11777    }
11778    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
11779      Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
11780      Shift = 8;
11781    }
11782    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
11783      Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
11784      Shift = 16;
11785    }
11786    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
11787      Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
11788      Shift = 24;
11789    }
11790
11791    if (isAdvSIMDModImm) {
11792      SDLoc dl(Op);
11793      SDValue Mov;
11794
11795      if (LHS)
11796        Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
11797                          DAG.getConstant(Value, dl, MVT::i32),
11798                          DAG.getConstant(Shift, dl, MVT::i32));
11799      else
11800        Mov = DAG.getNode(NewOp, dl, MovTy,
11801                          DAG.getConstant(Value, dl, MVT::i32),
11802                          DAG.getConstant(Shift, dl, MVT::i32));
11803
11804      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11805    }
11806  }
11807
11808  return SDValue();
11809}
11810
11811// Try 16-bit splatted SIMD immediate.
11812static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11813                                  const APInt &Bits,
11814                                  const SDValue *LHS = nullptr) {
11815  EVT VT = Op.getValueType();
11816  if (VT.isFixedLengthVector() &&
11817      DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE())
11818    return SDValue();
11819
11820  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11821    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11822    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
11823    bool isAdvSIMDModImm = false;
11824    uint64_t Shift;
11825
11826    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
11827      Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
11828      Shift = 0;
11829    }
11830    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
11831      Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
11832      Shift = 8;
11833    }
11834
11835    if (isAdvSIMDModImm) {
11836      SDLoc dl(Op);
11837      SDValue Mov;
11838
11839      if (LHS)
11840        Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
11841                          DAG.getConstant(Value, dl, MVT::i32),
11842                          DAG.getConstant(Shift, dl, MVT::i32));
11843      else
11844        Mov = DAG.getNode(NewOp, dl, MovTy,
11845                          DAG.getConstant(Value, dl, MVT::i32),
11846                          DAG.getConstant(Shift, dl, MVT::i32));
11847
11848      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11849    }
11850  }
11851
11852  return SDValue();
11853}
11854
11855// Try 32-bit splatted SIMD immediate with shifted ones.
11856static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
11857                                    SelectionDAG &DAG, const APInt &Bits) {
11858  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11859    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11860    EVT VT = Op.getValueType();
11861    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
11862    bool isAdvSIMDModImm = false;
11863    uint64_t Shift;
11864
11865    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
11866      Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
11867      Shift = 264;
11868    }
11869    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
11870      Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
11871      Shift = 272;
11872    }
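    // Note that 264 (0x100 + 8) and 272 (0x100 + 16) are not plain shift
    // amounts; they select the MSL #8 and MSL #16 "shifting ones" forms
    // matched by the MOVImsl/MVNImsl patterns.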
11873
11874    if (isAdvSIMDModImm) {
11875      SDLoc dl(Op);
11876      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11877                                DAG.getConstant(Value, dl, MVT::i32),
11878                                DAG.getConstant(Shift, dl, MVT::i32));
11879      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11880    }
11881  }
11882
11883  return SDValue();
11884}
11885
11886// Try 8-bit splatted SIMD immediate.
11887static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11888                                 const APInt &Bits) {
11889  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11890    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11891    EVT VT = Op.getValueType();
11892    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
11893
11894    if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
11895      Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
11896
11897      SDLoc dl(Op);
11898      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11899                                DAG.getConstant(Value, dl, MVT::i32));
11900      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11901    }
11902  }
11903
11904  return SDValue();
11905}
11906
11907// Try FP splatted SIMD immediate.
11908static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11909                                  const APInt &Bits) {
11910  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11911    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11912    EVT VT = Op.getValueType();
11913    bool isWide = (VT.getSizeInBits() == 128);
11914    MVT MovTy;
11915    bool isAdvSIMDModImm = false;
11916
11917    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
11918      Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
11919      MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
11920    }
11921    else if (isWide &&
11922             (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
11923      Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
11924      MovTy = MVT::v2f64;
11925    }
11926
11927    if (isAdvSIMDModImm) {
11928      SDLoc dl(Op);
11929      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11930                                DAG.getConstant(Value, dl, MVT::i32));
11931      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11932    }
11933  }
11934
11935  return SDValue();
11936}
11937
// Specialized code to quickly find if PotentialBVec is a BuildVector whose
// elements are all the same constant integer; if so, that value is returned
// in the reference arg ConstVal.
11941static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
11942                                     uint64_t &ConstVal) {
11943  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
11944  if (!Bvec)
11945    return false;
11946  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
11947  if (!FirstElt)
11948    return false;
11949  EVT VT = Bvec->getValueType(0);
11950  unsigned NumElts = VT.getVectorNumElements();
11951  for (unsigned i = 1; i < NumElts; ++i)
11952    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
11953      return false;
11954  ConstVal = FirstElt->getZExtValue();
11955  return true;
11956}
11957
11958// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
11959// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTOR with constant element C1, C2 is a constant, and:
11961//   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
11962//   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
11963// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
11964static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
11965  EVT VT = N->getValueType(0);
11966
11967  if (!VT.isVector())
11968    return SDValue();
11969
11970  SDLoc DL(N);
11971
11972  SDValue And;
11973  SDValue Shift;
11974
11975  SDValue FirstOp = N->getOperand(0);
11976  unsigned FirstOpc = FirstOp.getOpcode();
11977  SDValue SecondOp = N->getOperand(1);
11978  unsigned SecondOpc = SecondOp.getOpcode();
11979
11980  // Is one of the operands an AND or a BICi? The AND may have been optimised to
11981  // a BICi in order to use an immediate instead of a register.
  // Is the other operand a shl or lshr? This will have been turned into:
11983  // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
11984  if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
11985      (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
11986    And = FirstOp;
11987    Shift = SecondOp;
11988
11989  } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
11990             (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
11991    And = SecondOp;
11992    Shift = FirstOp;
11993  } else
11994    return SDValue();
11995
11996  bool IsAnd = And.getOpcode() == ISD::AND;
11997  bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
11998
11999  // Is the shift amount constant?
12000  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
12001  if (!C2node)
12002    return SDValue();
12003
12004  uint64_t C1;
12005  if (IsAnd) {
12006    // Is the and mask vector all constant?
12007    if (!isAllConstantBuildVector(And.getOperand(1), C1))
12008      return SDValue();
12009  } else {
12010    // Reconstruct the corresponding AND immediate from the two BICi immediates.
12011    ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
12012    ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
12013    assert(C1nodeImm && C1nodeShift);
12014    C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
12015  }
12016
12017  // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
12018  // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
12019  // how much one can shift elements of a particular size?
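  // For example, with 8-bit elements and C2 == 3 the SLI form requires
  // C1 == 0b00000111: the AND keeps the low three bits of X and the shifted Y
  // supplies the rest, which is exactly what SLI X, Y, #3 computes.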
12020  uint64_t C2 = C2node->getZExtValue();
12021  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
12022  if (C2 > ElemSizeInBits)
12023    return SDValue();
12024
12025  APInt C1AsAPInt(ElemSizeInBits, C1);
12026  APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
12027                                  : APInt::getLowBitsSet(ElemSizeInBits, C2);
12028  if (C1AsAPInt != RequiredC1)
12029    return SDValue();
12030
12031  SDValue X = And.getOperand(0);
12032  SDValue Y = Shift.getOperand(0);
12033
12034  unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
12035  SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
12036
12037  LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
12038  LLVM_DEBUG(N->dump(&DAG));
12039  LLVM_DEBUG(dbgs() << "into: \n");
12040  LLVM_DEBUG(ResultSLI->dump(&DAG));
12041
12042  ++NumShiftInserts;
12043  return ResultSLI;
12044}
12045
12046SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
12047                                             SelectionDAG &DAG) const {
12048  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
12049                                   Subtarget->forceStreamingCompatibleSVE()))
12050    return LowerToScalableOp(Op, DAG);
12051
12052  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
12053  if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
12054    return Res;
12055
12056  EVT VT = Op.getValueType();
12057
12058  SDValue LHS = Op.getOperand(0);
12059  BuildVectorSDNode *BVN =
12060      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
12061  if (!BVN) {
12062    // OR commutes, so try swapping the operands.
12063    LHS = Op.getOperand(1);
12064    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
12065  }
12066  if (!BVN)
12067    return Op;
12068
12069  APInt DefBits(VT.getSizeInBits(), 0);
12070  APInt UndefBits(VT.getSizeInBits(), 0);
12071  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
12072    SDValue NewOp;
12073
12074    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
12075                                    DefBits, &LHS)) ||
12076        (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
12077                                    DefBits, &LHS)))
12078      return NewOp;
12079
12080    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
12081                                    UndefBits, &LHS)) ||
12082        (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
12083                                    UndefBits, &LHS)))
12084      return NewOp;
12085  }
12086
12087  // We can always fall back to a non-immediate OR.
12088  return Op;
12089}
12090
12091// Normalize the operands of BUILD_VECTOR. The value of constant operands will
12092// be truncated to fit element width.
12093static SDValue NormalizeBuildVector(SDValue Op,
12094                                    SelectionDAG &DAG) {
12095  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12096  SDLoc dl(Op);
12097  EVT VT = Op.getValueType();
  EVT EltTy = VT.getVectorElementType();
12099
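  // Only vectors with sub-32-bit integer elements need normalizing: type
  // legalization promoted their operands to i32, so constants may carry set
  // bits above the element width.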
12100  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
12101    return Op;
12102
12103  SmallVector<SDValue, 16> Ops;
12104  for (SDValue Lane : Op->ops()) {
12105    // For integer vectors, type legalization would have promoted the
12106    // operands already. Otherwise, if Op is a floating-point splat
12107    // (with operands cast to integers), then the only possibilities
12108    // are constants and UNDEFs.
12109    if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
12110      APInt LowBits(EltTy.getSizeInBits(),
12111                    CstLane->getZExtValue());
12112      Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
12113    } else if (Lane.getNode()->isUndef()) {
12114      Lane = DAG.getUNDEF(MVT::i32);
12115    } else {
12116      assert(Lane.getValueType() == MVT::i32 &&
12117             "Unexpected BUILD_VECTOR operand type");
12118    }
12119    Ops.push_back(Lane);
12120  }
12121  return DAG.getBuildVector(VT, dl, Ops);
12122}
12123
12124static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
12125  EVT VT = Op.getValueType();
12126
12127  APInt DefBits(VT.getSizeInBits(), 0);
12128  APInt UndefBits(VT.getSizeInBits(), 0);
12129  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
12130  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
12131    SDValue NewOp;
12132    if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
12133        (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
12134        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
12135        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
12136        (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
12137        (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
12138      return NewOp;
12139
12140    DefBits = ~DefBits;
12141    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
12142        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
12143        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
12144      return NewOp;
12145
12146    DefBits = UndefBits;
12147    if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
12148        (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
12149        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
12150        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
12151        (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
12152        (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
12153      return NewOp;
12154
12155    DefBits = ~UndefBits;
12156    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
12157        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
12158        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
12159      return NewOp;
12160  }
12161
12162  return SDValue();
12163}
12164
12165SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
12166                                                 SelectionDAG &DAG) const {
12167  EVT VT = Op.getValueType();
12168
12169  if (useSVEForFixedLengthVectorVT(VT,
12170                                   Subtarget->forceStreamingCompatibleSVE())) {
12171    if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
12172      SDLoc DL(Op);
12173      EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
12174      SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
12175      SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
12176      SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
12177      return convertFromScalableVector(DAG, Op.getValueType(), Seq);
12178    }
12179
12180    // Revert to common legalisation for all other variants.
12181    return SDValue();
12182  }
12183
12184  // Try to build a simple constant vector.
12185  Op = NormalizeBuildVector(Op, DAG);
  // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
  // abort.
12188  if (Op.getOpcode() != ISD::BUILD_VECTOR)
12189    return SDValue();
12190
12191  if (VT.isInteger()) {
12192    // Certain vector constants, used to express things like logical NOT and
12193    // arithmetic NEG, are passed through unmodified.  This allows special
12194    // patterns for these operations to match, which will lower these constants
12195    // to whatever is proven necessary.
12196    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
12197    if (BVN->isConstant())
12198      if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
12199        unsigned BitSize = VT.getVectorElementType().getSizeInBits();
12200        APInt Val(BitSize,
12201                  Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
12202        if (Val.isZero() || Val.isAllOnes())
12203          return Op;
12204      }
12205  }
12206
12207  if (SDValue V = ConstantBuildVector(Op, DAG))
12208    return V;
12209
12210  // Scan through the operands to find some interesting properties we can
12211  // exploit:
12212  //   1) If only one value is used, we can use a DUP, or
12213  //   2) if only the low element is not undef, we can just insert that, or
12214  //   3) if only one constant value is used (w/ some non-constant lanes),
12215  //      we can splat the constant value into the whole vector then fill
12216  //      in the non-constant lanes.
12217  //   4) FIXME: If different constant values are used, but we can intelligently
12218  //             select the values we'll be overwriting for the non-constant
12219  //             lanes such that we can directly materialize the vector
12220  //             some other way (MOVI, e.g.), we can be sneaky.
12221  //   5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
12222  SDLoc dl(Op);
12223  unsigned NumElts = VT.getVectorNumElements();
12224  bool isOnlyLowElement = true;
12225  bool usesOnlyOneValue = true;
12226  bool usesOnlyOneConstantValue = true;
12227  bool isConstant = true;
12228  bool AllLanesExtractElt = true;
12229  unsigned NumConstantLanes = 0;
12230  unsigned NumDifferentLanes = 0;
12231  unsigned NumUndefLanes = 0;
12232  SDValue Value;
12233  SDValue ConstantValue;
12234  for (unsigned i = 0; i < NumElts; ++i) {
12235    SDValue V = Op.getOperand(i);
12236    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12237      AllLanesExtractElt = false;
12238    if (V.isUndef()) {
12239      ++NumUndefLanes;
12240      continue;
12241    }
12242    if (i > 0)
12243      isOnlyLowElement = false;
12244    if (!isIntOrFPConstant(V))
12245      isConstant = false;
12246
12247    if (isIntOrFPConstant(V)) {
12248      ++NumConstantLanes;
12249      if (!ConstantValue.getNode())
12250        ConstantValue = V;
12251      else if (ConstantValue != V)
12252        usesOnlyOneConstantValue = false;
12253    }
12254
12255    if (!Value.getNode())
12256      Value = V;
12257    else if (V != Value) {
12258      usesOnlyOneValue = false;
12259      ++NumDifferentLanes;
12260    }
12261  }
12262
12263  if (!Value.getNode()) {
12264    LLVM_DEBUG(
12265        dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
12266    return DAG.getUNDEF(VT);
12267  }
12268
12269  // Convert BUILD_VECTOR where all elements but the lowest are undef into
12270  // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
12271  // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
12272  if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
12273    LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
12274                         "SCALAR_TO_VECTOR node\n");
12275    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
12276  }
12277
12278  if (AllLanesExtractElt) {
12279    SDNode *Vector = nullptr;
12280    bool Even = false;
12281    bool Odd = false;
12282    // Check whether the extract elements match the Even pattern <0,2,4,...> or
12283    // the Odd pattern <1,3,5,...>.
12284    for (unsigned i = 0; i < NumElts; ++i) {
12285      SDValue V = Op.getOperand(i);
12286      const SDNode *N = V.getNode();
12287      if (!isa<ConstantSDNode>(N->getOperand(1)))
12288        break;
12289      SDValue N0 = N->getOperand(0);
12290
12291      // All elements are extracted from the same vector.
12292      if (!Vector) {
12293        Vector = N0.getNode();
12294        // Check that the type of EXTRACT_VECTOR_ELT matches the type of
12295        // BUILD_VECTOR.
12296        if (VT.getVectorElementType() !=
12297            N0.getValueType().getVectorElementType())
12298          break;
12299      } else if (Vector != N0.getNode()) {
12300        Odd = false;
12301        Even = false;
12302        break;
12303      }
12304
12305      // Extracted values are either at Even indices <0,2,4,...> or at Odd
12306      // indices <1,3,5,...>.
12307      uint64_t Val = N->getConstantOperandVal(1);
12308      if (Val == 2 * i) {
12309        Even = true;
12310        continue;
12311      }
12312      if (Val - 1 == 2 * i) {
12313        Odd = true;
12314        continue;
12315      }
12316
12317      // Something does not match: abort.
12318      Odd = false;
12319      Even = false;
12320      break;
12321    }
12322    if (Even || Odd) {
12323      SDValue LHS =
12324          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
12325                      DAG.getConstant(0, dl, MVT::i64));
12326      SDValue RHS =
12327          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
12328                      DAG.getConstant(NumElts, dl, MVT::i64));
12329
12330      if (Even && !Odd)
12331        return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
12332                           RHS);
12333      if (Odd && !Even)
12334        return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
12335                           RHS);
12336    }
12337  }
12338
12339  // Use DUP for non-constant splats. For f32 constant splats, reduce to
12340  // i32 and try again.
12341  if (usesOnlyOneValue) {
12342    if (!isConstant) {
12343      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12344          Value.getValueType() != VT) {
12345        LLVM_DEBUG(
12346            dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
12347        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
12348      }
12349
12350      // This is actually a DUPLANExx operation, which keeps everything vectory.
12351
12352      SDValue Lane = Value.getOperand(1);
12353      Value = Value.getOperand(0);
12354      if (Value.getValueSizeInBits() == 64) {
12355        LLVM_DEBUG(
12356            dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
12357                      "widening it\n");
12358        Value = WidenVector(Value, DAG);
12359      }
12360
12361      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
12362      return DAG.getNode(Opcode, dl, VT, Value, Lane);
12363    }
12364
12365    if (VT.getVectorElementType().isFloatingPoint()) {
12366      SmallVector<SDValue, 8> Ops;
12367      EVT EltTy = VT.getVectorElementType();
      assert((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
              EltTy == MVT::f64) && "Unsupported floating-point vector type");
12370      LLVM_DEBUG(
12371          dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
12372                    "BITCASTS, and try again\n");
12373      MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
12374      for (unsigned i = 0; i < NumElts; ++i)
12375        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
12376      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
12377      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
12378      LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
12379                 Val.dump(););
12380      Val = LowerBUILD_VECTOR(Val, DAG);
12381      if (Val.getNode())
12382        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
12383    }
12384  }
12385
12386  // If we need to insert a small number of different non-constant elements and
12387  // the vector width is sufficiently large, prefer using DUP with the common
12388  // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
12389  // skip the constant lane handling below.
12390  bool PreferDUPAndInsert =
12391      !isConstant && NumDifferentLanes >= 1 &&
12392      NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
12393      NumDifferentLanes >= NumConstantLanes;
12394
  // If only one constant value was used, across more than one lane, start by
  // splatting that value, then replace the non-constant lanes. This
12397  // is better than the default, which will perform a separate initialization
12398  // for each lane.
12399  if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
12400    // Firstly, try to materialize the splat constant.
12401    SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
12402            Val = ConstantBuildVector(Vec, DAG);
12403    if (!Val) {
12404      // Otherwise, materialize the constant and splat it.
12405      Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
12406      DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
12407    }
12408
12409    // Now insert the non-constant lanes.
12410    for (unsigned i = 0; i < NumElts; ++i) {
12411      SDValue V = Op.getOperand(i);
12412      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
12413      if (!isIntOrFPConstant(V))
12414        // Note that type legalization likely mucked about with the VT of the
12415        // source operand, so we may have to convert it here before inserting.
12416        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
12417    }
12418    return Val;
12419  }
12420
12421  // This will generate a load from the constant pool.
12422  if (isConstant) {
12423    LLVM_DEBUG(
12424        dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
12425                  "expansion\n");
12426    return SDValue();
12427  }
12428
12429  // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
12430  // v4i32s. This is really a truncate, which we can construct out of (legal)
12431  // concats and truncate nodes.
12432  if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
12433    return M;
12434
12435  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
12436  if (NumElts >= 4) {
12437    if (SDValue shuffle = ReconstructShuffle(Op, DAG))
12438      return shuffle;
12439  }
12440
12441  if (PreferDUPAndInsert) {
12442    // First, build a constant vector with the common element.
12443    SmallVector<SDValue, 8> Ops(NumElts, Value);
12444    SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
12445    // Next, insert the elements that do not match the common value.
12446    for (unsigned I = 0; I < NumElts; ++I)
12447      if (Op.getOperand(I) != Value)
12448        NewVector =
12449            DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
12450                        Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
12451
12452    return NewVector;
12453  }
12454
  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, the default is
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target); for everything else it is
  // materialization element by element on the stack followed by a load.
12461  if (!isConstant && !usesOnlyOneValue) {
12462    LLVM_DEBUG(
12463        dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
12464                  "of INSERT_VECTOR_ELT\n");
12465
12466    SDValue Vec = DAG.getUNDEF(VT);
12467    SDValue Op0 = Op.getOperand(0);
12468    unsigned i = 0;
12469
12470    // Use SCALAR_TO_VECTOR for lane zero to
12471    // a) Avoid a RMW dependency on the full vector register, and
12472    // b) Allow the register coalescer to fold away the copy if the
12473    //    value is already in an S or D register, and we're forced to emit an
12474    //    INSERT_SUBREG that we can't fold anywhere.
12475    //
12476    // We also allow types like i8 and i16 which are illegal scalar but legal
12477    // vector element types. After type-legalization the inserted value is
12478    // extended (i32) and it is safe to cast them to the vector type by ignoring
12479    // the upper bits of the lowest lane (e.g. v8i8, v4i16).
12480    if (!Op0.isUndef()) {
12481      LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
12482      Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
12483      ++i;
12484    }
12485    LLVM_DEBUG(if (i < NumElts) dbgs()
12486                   << "Creating nodes for the other vector elements:\n";);
12487    for (; i < NumElts; ++i) {
12488      SDValue V = Op.getOperand(i);
12489      if (V.isUndef())
12490        continue;
12491      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
12492      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
12493    }
12494    return Vec;
12495  }
12496
12497  LLVM_DEBUG(
12498      dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
12499                "better alternative\n");
12500  return SDValue();
12501}
12502
12503SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
12504                                                   SelectionDAG &DAG) const {
12505  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
12506                                   Subtarget->forceStreamingCompatibleSVE()))
12507    return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
12508
12509  assert(Op.getValueType().isScalableVector() &&
12510         isTypeLegal(Op.getValueType()) &&
12511         "Expected legal scalable vector type!");
12512
12513  if (isTypeLegal(Op.getOperand(0).getValueType())) {
12514    unsigned NumOperands = Op->getNumOperands();
12515    assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
12516           "Unexpected number of operands in CONCAT_VECTORS");
12517
12518    if (NumOperands == 2)
12519      return Op;
12520
12521    // Concat each pair of subvectors and pack into the lower half of the array.
12522    SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
12523    while (ConcatOps.size() > 1) {
12524      for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
12525        SDValue V1 = ConcatOps[I];
12526        SDValue V2 = ConcatOps[I + 1];
12527        EVT SubVT = V1.getValueType();
12528        EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
12529        ConcatOps[I / 2] =
12530            DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
12531      }
12532      ConcatOps.resize(ConcatOps.size() / 2);
12533    }
12534    return ConcatOps[0];
12535  }
12536
12537  return SDValue();
12538}
12539
12540SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12541                                                      SelectionDAG &DAG) const {
12542  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
12543
12544  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
12545                                   Subtarget->forceStreamingCompatibleSVE()))
12546    return LowerFixedLengthInsertVectorElt(Op, DAG);
12547
12548  // Check for non-constant or out of range lane.
12549  EVT VT = Op.getOperand(0).getValueType();
12550
12551  if (VT.getScalarType() == MVT::i1) {
12552    EVT VectorVT = getPromotedVTForPredicate(VT);
12553    SDLoc DL(Op);
12554    SDValue ExtendedVector =
12555        DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
12556    SDValue ExtendedValue =
12557        DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
12558                             VectorVT.getScalarType().getSizeInBits() < 32
12559                                 ? MVT::i32
12560                                 : VectorVT.getScalarType());
12561    ExtendedVector =
12562        DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
12563                    ExtendedValue, Op.getOperand(2));
12564    return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
12565  }
12566
12567  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
12568  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
12569    return SDValue();
12570
12571  // Insertion/extraction are legal for V128 types.
12572  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
12573      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
12574      VT == MVT::v8f16 || VT == MVT::v8bf16)
12575    return Op;
12576
12577  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
12578      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
12579      VT != MVT::v4bf16)
12580    return SDValue();
12581
12582  // For V64 types, we perform insertion by expanding the value
  // to a V128 type and performing the insertion on that.
12584  SDLoc DL(Op);
12585  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
12586  EVT WideTy = WideVec.getValueType();
12587
12588  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
12589                             Op.getOperand(1), Op.getOperand(2));
12590  // Re-narrow the resultant vector.
12591  return NarrowVector(Node, DAG);
12592}
12593
12594SDValue
12595AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12596                                               SelectionDAG &DAG) const {
12597  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
12598  EVT VT = Op.getOperand(0).getValueType();
12599
12600  if (VT.getScalarType() == MVT::i1) {
12601    // We can't directly extract from an SVE predicate; extend it first.
12602    // (This isn't the only possible lowering, but it's straightforward.)
12603    EVT VectorVT = getPromotedVTForPredicate(VT);
12604    SDLoc DL(Op);
12605    SDValue Extend =
12606        DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
12607    MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
12608    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
12609                                  Extend, Op.getOperand(1));
12610    return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
12611  }
12612
12613  if (useSVEForFixedLengthVectorVT(VT,
12614                                   Subtarget->forceStreamingCompatibleSVE()))
12615    return LowerFixedLengthExtractVectorElt(Op, DAG);
12616
12617  // Check for non-constant or out of range lane.
12618  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
12619  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
12620    return SDValue();
12621
12622  // Insertion/extraction are legal for V128 types.
12623  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
12624      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
12625      VT == MVT::v8f16 || VT == MVT::v8bf16)
12626    return Op;
12627
12628  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
12629      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
12630      VT != MVT::v4bf16)
12631    return SDValue();
12632
12633  // For V64 types, we perform extraction by expanding the value
  // to a V128 type and performing the extraction on that.
12635  SDLoc DL(Op);
12636  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
12637  EVT WideTy = WideVec.getValueType();
12638
12639  EVT ExtrTy = WideTy.getVectorElementType();
12640  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
12641    ExtrTy = MVT::i32;
12642
12643  // For extractions, we just return the result directly.
12644  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
12645                     Op.getOperand(1));
12646}
12647
12648SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
12649                                                      SelectionDAG &DAG) const {
12650  assert(Op.getValueType().isFixedLengthVector() &&
12651         "Only cases that extract a fixed length vector are supported!");
12652
12653  EVT InVT = Op.getOperand(0).getValueType();
12654  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12655  unsigned Size = Op.getValueSizeInBits();
12656
12657  // If we don't have legal types yet, do nothing
12658  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
12659    return SDValue();
12660
12661  if (InVT.isScalableVector()) {
12662    // This will be matched by custom code during ISelDAGToDAG.
12663    if (Idx == 0 && isPackedVectorType(InVT, DAG))
12664      return Op;
12665
12666    return SDValue();
12667  }
12668
12669  // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
12670  if (Idx == 0 && InVT.getSizeInBits() <= 128)
12671    return Op;
12672
12673  // If this is extracting the upper 64-bits of a 128-bit vector, we match
12674  // that directly.
12675  if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
12676      InVT.getSizeInBits() == 128 && !Subtarget->forceStreamingCompatibleSVE())
12677    return Op;
12678
12679  if (useSVEForFixedLengthVectorVT(InVT,
12680                                   Subtarget->forceStreamingCompatibleSVE())) {
12681    SDLoc DL(Op);
12682
12683    EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
12684    SDValue NewInVec =
12685        convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
12686
12687    SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
12688                                 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
12689    return convertFromScalableVector(DAG, Op.getValueType(), Splice);
12690  }
12691
12692  return SDValue();
12693}
12694
12695SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
12696                                                     SelectionDAG &DAG) const {
12697  assert(Op.getValueType().isScalableVector() &&
12698         "Only expect to lower inserts into scalable vectors!");
12699
12700  EVT InVT = Op.getOperand(1).getValueType();
12701  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
12702
12703  SDValue Vec0 = Op.getOperand(0);
12704  SDValue Vec1 = Op.getOperand(1);
12705  SDLoc DL(Op);
12706  EVT VT = Op.getValueType();
12707
12708  if (InVT.isScalableVector()) {
12709    if (!isTypeLegal(VT))
12710      return SDValue();
12711
12712    // Break down insert_subvector into simpler parts.
12713    if (VT.getVectorElementType() == MVT::i1) {
12714      unsigned NumElts = VT.getVectorMinNumElements();
12715      EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12716
12717      SDValue Lo, Hi;
12718      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
12719                       DAG.getVectorIdxConstant(0, DL));
12720      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
12721                       DAG.getVectorIdxConstant(NumElts / 2, DL));
12722      if (Idx < (NumElts / 2)) {
12723        SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
12724                                    DAG.getVectorIdxConstant(Idx, DL));
12725        return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
12726      } else {
12727        SDValue NewHi =
12728            DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
12729                        DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
12730        return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
12731      }
12732    }
12733
12734    // Ensure the subvector is half the size of the main vector.
12735    if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
12736      return SDValue();
12737
    // Here narrow and wide refer to the vector element types. After "casting",
    // both vectors must have the same bit length, so because the subvector
    // has fewer elements, those elements need to be bigger.
12741    EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
12742    EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
12743
12744    // NOP cast operands to the largest legal vector of the same element count.
12745    if (VT.isFloatingPoint()) {
12746      Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
12747      Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
12748    } else {
12749      // Legal integer vectors are already their largest so Vec0 is fine as is.
12750      Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
12751    }
12752
12753    // To replace the top/bottom half of vector V with vector SubV we widen the
12754    // preserved half of V, concatenate this to SubV (the order depending on the
12755    // half being replaced) and then narrow the result.
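    // For example, when replacing the low half (Idx == 0), the preserved high
    // half of Vec0 is widened with UUNPKHI and UZP1 then packs the widened
    // subvector and that high half back into the narrow element type.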
12756    SDValue Narrow;
12757    if (Idx == 0) {
12758      SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
12759      Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
12760    } else {
12761      assert(Idx == InVT.getVectorMinNumElements() &&
12762             "Invalid subvector index!");
12763      SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
12764      Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
12765    }
12766
12767    return getSVESafeBitCast(VT, Narrow, DAG);
12768  }
12769
12770  if (Idx == 0 && isPackedVectorType(VT, DAG)) {
12771    // This will be matched by custom code during ISelDAGToDAG.
12772    if (Vec0.isUndef())
12773      return Op;
12774
12775    std::optional<unsigned> PredPattern =
12776        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
12777    auto PredTy = VT.changeVectorElementType(MVT::i1);
12778    SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
12779    SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
12780    return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
12781  }
12782
12783  return SDValue();
12784}
12785
12786static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
12787  if (Op.getOpcode() != AArch64ISD::DUP &&
12788      Op.getOpcode() != ISD::SPLAT_VECTOR &&
12789      Op.getOpcode() != ISD::BUILD_VECTOR)
12790    return false;
12791
12792  if (Op.getOpcode() == ISD::BUILD_VECTOR &&
12793      !isAllConstantBuildVector(Op, SplatVal))
12794    return false;
12795
12796  if (Op.getOpcode() != ISD::BUILD_VECTOR &&
12797      !isa<ConstantSDNode>(Op->getOperand(0)))
12798    return false;
12799
12800  SplatVal = Op->getConstantOperandVal(0);
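  // The constant was zero-extended above, so for elements narrower than i64
  // sign-extend it from 32 bits so that negative powers of two are recognised
  // below.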
12801  if (Op.getValueType().getVectorElementType() != MVT::i64)
12802    SplatVal = (int32_t)SplatVal;
12803
12804  Negated = false;
12805  if (isPowerOf2_64(SplatVal))
12806    return true;
12807
12808  Negated = true;
12809  if (isPowerOf2_64(-SplatVal)) {
12810    SplatVal = -SplatVal;
12811    return true;
12812  }
12813
12814  return false;
12815}
12816
12817SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
12818  EVT VT = Op.getValueType();
12819  SDLoc dl(Op);
12820
12821  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
12822    return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
12823
12824  assert(VT.isScalableVector() && "Expected a scalable vector.");
12825
12826  bool Signed = Op.getOpcode() == ISD::SDIV;
12827  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
12828
12829  bool Negated;
12830  uint64_t SplatVal;
12831  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
12832    SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
12833    SDValue Res =
12834        DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
12835                    DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
12836    if (Negated)
12837      Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
12838
12839    return Res;
12840  }
12841
12842  if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
12843    return LowerToPredicatedOp(Op, DAG, PredOpcode);
12844
12845  // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
12846  // operations, and truncate the result.
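  // The widened divide may itself be illegal (nxv16i8 widens to nxv8i16), in
  // which case this lowering fires again until the operation is performed on
  // nxv4i32.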
12847  EVT WidenedVT;
12848  if (VT == MVT::nxv16i8)
12849    WidenedVT = MVT::nxv8i16;
12850  else if (VT == MVT::nxv8i16)
12851    WidenedVT = MVT::nxv4i32;
12852  else
12853    llvm_unreachable("Unexpected Custom DIV operation");
12854
12855  unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
12856  unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
12857  SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
12858  SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
12859  SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
12860  SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
12861  SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
12862  SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
12863  return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
12864}
12865
12866bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
12867  // Currently no fixed length shuffles that require SVE are legal.
12868  if (useSVEForFixedLengthVectorVT(VT,
12869                                   Subtarget->forceStreamingCompatibleSVE()))
12870    return false;
12871
12872  if (VT.getVectorNumElements() == 4 &&
12873      (VT.is128BitVector() || VT.is64BitVector())) {
12874    unsigned Cost = getPerfectShuffleCost(M);
12875    if (Cost <= 1)
12876      return true;
12877  }
12878
12879  bool DummyBool;
12880  int DummyInt;
12881  unsigned DummyUnsigned;
12882
12883  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
12884          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
12885          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
12886          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
12887          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
12888          isZIPMask(M, VT, DummyUnsigned) ||
12889          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
12890          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
12891          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
12892          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
12893          isConcatMask(M, VT, VT.getSizeInBits() == 128));
12894}
12895
12896bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
12897                                                   EVT VT) const {
  // Just delegate to the generic legality check; clear masks aren't special.
12899  return isShuffleMaskLegal(M, VT);
12900}
12901
12902/// getVShiftImm - Check if this is a valid build_vector for the immediate
12903/// operand of a vector shift operation, where all the elements of the
12904/// build_vector must have the same constant integer value.
12905static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
12906  // Ignore bit_converts.
12907  while (Op.getOpcode() == ISD::BITCAST)
12908    Op = Op.getOperand(0);
12909  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
12910  APInt SplatBits, SplatUndef;
12911  unsigned SplatBitSize;
12912  bool HasAnyUndefs;
12913  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
12914                                    HasAnyUndefs, ElementBits) ||
12915      SplatBitSize > ElementBits)
12916    return false;
12917  Cnt = SplatBits.getSExtValue();
12918  return true;
12919}
12920
12921/// isVShiftLImm - Check if this is a valid build_vector for the immediate
12922/// operand of a vector shift left operation.  That value must be in the range:
12923///   0 <= Value < ElementBits for a left shift; or
12924///   0 <= Value <= ElementBits for a long left shift.
12925static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
12926  assert(VT.isVector() && "vector shift count is not a vector type");
12927  int64_t ElementBits = VT.getScalarSizeInBits();
12928  if (!getVShiftImm(Op, ElementBits, Cnt))
12929    return false;
12930  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
12931}
12932
12933/// isVShiftRImm - Check if this is a valid build_vector for the immediate
12934/// operand of a vector shift right operation. The value must be in the range:
///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits / 2 for a narrow right shift.
12936static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
12937  assert(VT.isVector() && "vector shift count is not a vector type");
12938  int64_t ElementBits = VT.getScalarSizeInBits();
12939  if (!getVShiftImm(Op, ElementBits, Cnt))
12940    return false;
12941  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
12942}
12943
12944SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
12945                                             SelectionDAG &DAG) const {
12946  EVT VT = Op.getValueType();
12947
12948  if (VT.getScalarType() == MVT::i1) {
12949    // Lower i1 truncate to `(x & 1) != 0`.
12950    SDLoc dl(Op);
12951    EVT OpVT = Op.getOperand(0).getValueType();
12952    SDValue Zero = DAG.getConstant(0, dl, OpVT);
12953    SDValue One = DAG.getConstant(1, dl, OpVT);
12954    SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
12955    return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
12956  }
12957
12958  if (!VT.isVector() || VT.isScalableVector())
12959    return SDValue();
12960
12961  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
12962                                   Subtarget->forceStreamingCompatibleSVE()))
12963    return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
12964
12965  return SDValue();
12966}
12967
12968SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
12969                                                      SelectionDAG &DAG) const {
12970  EVT VT = Op.getValueType();
12971  SDLoc DL(Op);
12972  int64_t Cnt;
12973
12974  if (!Op.getOperand(1).getValueType().isVector())
12975    return Op;
12976  unsigned EltSize = VT.getScalarSizeInBits();
12977
12978  switch (Op.getOpcode()) {
12979  case ISD::SHL:
12980    if (VT.isScalableVector() ||
12981        useSVEForFixedLengthVectorVT(VT,
12982                                     Subtarget->forceStreamingCompatibleSVE()))
12983      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
12984
12985    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
12986      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
12987                         DAG.getConstant(Cnt, DL, MVT::i32));
12988    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12989                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
12990                                       MVT::i32),
12991                       Op.getOperand(0), Op.getOperand(1));
12992  case ISD::SRA:
12993  case ISD::SRL:
12994    if (VT.isScalableVector() ||
12995        useSVEForFixedLengthVectorVT(
12996            VT, Subtarget->forceStreamingCompatibleSVE())) {
12997      unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
12998                                                : AArch64ISD::SRL_PRED;
12999      return LowerToPredicatedOp(Op, DAG, Opc);
13000    }
13001
13002    // Right shift immediate
13003    if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
13004      unsigned Opc =
13005          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
13006      return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
13007                         DAG.getConstant(Cnt, DL, MVT::i32));
13008    }
13009
    // Right shift by register.  Note that there is no shift-right-by-register
    // instruction, but the shift-left-by-register instruction takes a signed
    // value, where negative amounts specify a right shift.
13013    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
13014                                                : Intrinsic::aarch64_neon_ushl;
    // Negate the shift amount.
13016    SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
13017                                   Op.getOperand(1));
13018    SDValue NegShiftLeft =
13019        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13020                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
13021                    NegShift);
13022    return NegShiftLeft;
13023  }
13024
13025  llvm_unreachable("unexpected shift opcode");
13026}
13027
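/// Emit a vector comparison for the given AArch64 condition code, using the
/// compare-against-zero forms (e.g. FCMEQz/CMEQz) when the RHS is a zero
/// splat. Returns an empty SDValue if the comparison cannot be lowered
/// directly.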
13028static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
13029                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
13030                                    const SDLoc &dl, SelectionDAG &DAG) {
13031  EVT SrcVT = LHS.getValueType();
13032  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
13033         "function only supposed to emit natural comparisons");
13034
13035  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
13036  APInt CnstBits(VT.getSizeInBits(), 0);
13037  APInt UndefBits(VT.getSizeInBits(), 0);
13038  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
13039  bool IsZero = IsCnst && (CnstBits == 0);
13040
13041  if (SrcVT.getVectorElementType().isFloatingPoint()) {
13042    switch (CC) {
13043    default:
13044      return SDValue();
13045    case AArch64CC::NE: {
13046      SDValue Fcmeq;
13047      if (IsZero)
13048        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
13049      else
13050        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
13051      return DAG.getNOT(dl, Fcmeq, VT);
13052    }
13053    case AArch64CC::EQ:
13054      if (IsZero)
13055        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
13056      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
13057    case AArch64CC::GE:
13058      if (IsZero)
13059        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
13060      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
13061    case AArch64CC::GT:
13062      if (IsZero)
13063        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
13064      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
13065    case AArch64CC::LE:
13066      if (!NoNans)
13067        return SDValue();
      // If we ignore NaNs then we can use the LS implementation.
13069      [[fallthrough]];
13070    case AArch64CC::LS:
13071      if (IsZero)
13072        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
13073      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
13074    case AArch64CC::LT:
13075      if (!NoNans)
13076        return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
13078      [[fallthrough]];
13079    case AArch64CC::MI:
13080      if (IsZero)
13081        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
13082      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
13083    }
13084  }
13085
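  // Integer comparisons.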
13086  switch (CC) {
13087  default:
13088    return SDValue();
13089  case AArch64CC::NE: {
13090    SDValue Cmeq;
13091    if (IsZero)
13092      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
13093    else
13094      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
13095    return DAG.getNOT(dl, Cmeq, VT);
13096  }
13097  case AArch64CC::EQ:
13098    if (IsZero)
13099      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
13100    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
13101  case AArch64CC::GE:
13102    if (IsZero)
13103      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
13104    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
13105  case AArch64CC::GT:
13106    if (IsZero)
13107      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
13108    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
13109  case AArch64CC::LE:
13110    if (IsZero)
13111      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
13112    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
13113  case AArch64CC::LS:
13114    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
13115  case AArch64CC::LO:
13116    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
13117  case AArch64CC::LT:
13118    if (IsZero)
13119      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
13120    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
13121  case AArch64CC::HI:
13122    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
13123  case AArch64CC::HS:
13124    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
13125  }
13126}
13127
13128SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
13129                                           SelectionDAG &DAG) const {
13130  if (Op.getValueType().isScalableVector())
13131    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
13132
13133  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
13134                                   Subtarget->forceStreamingCompatibleSVE()))
13135    return LowerFixedLengthVectorSetccToSVE(Op, DAG);
13136
13137  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
13138  SDValue LHS = Op.getOperand(0);
13139  SDValue RHS = Op.getOperand(1);
13140  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
13141  SDLoc dl(Op);
13142
13143  if (LHS.getValueType().getVectorElementType().isInteger()) {
13144    assert(LHS.getValueType() == RHS.getValueType());
13145    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
13146    SDValue Cmp =
13147        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
13148    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
13149  }
13150
13151  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
13152
  // Make v4f16 (only) fcmp operations utilise vector instructions.
  // v8f16 support will be a little more complicated.
13155  if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
13156    if (LHS.getValueType().getVectorNumElements() == 4) {
13157      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
13158      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
13159      SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
13160      DAG.ReplaceAllUsesWith(Op, NewSetcc);
13161      CmpVT = MVT::v4i32;
13162    } else
13163      return SDValue();
13164  }
13165
13166  assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
13167          LHS.getValueType().getVectorElementType() != MVT::f128);
13168
13169  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
13170  // clean.  Some of them require two branches to implement.
13171  AArch64CC::CondCode CC1, CC2;
13172  bool ShouldInvert;
13173  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
13174
  bool NoNaNs =
      getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
13176  SDValue Cmp =
13177      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
13178  if (!Cmp.getNode())
13179    return SDValue();
13180
13181  if (CC2 != AArch64CC::AL) {
13182    SDValue Cmp2 =
13183        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
13184    if (!Cmp2.getNode())
13185      return SDValue();
13186
13187    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
13188  }
13189
13190  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
13191
13192  if (ShouldInvert)
13193    Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
13194
13195  return Cmp;
13196}
13197
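/// Emit a reduction of the vector operand of \p ScalarOp using the target
/// opcode \p Op, then extract lane 0 to produce the scalar result.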
13198static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
13199                                  SelectionDAG &DAG) {
13200  SDValue VecOp = ScalarOp.getOperand(0);
13201  auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
13202  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
13203                     DAG.getConstant(0, DL, MVT::i64));
13204}
13205
13206SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
13207                                              SelectionDAG &DAG) const {
13208  SDValue Src = Op.getOperand(0);
13209
13210  // Try to lower fixed length reductions to SVE.
13211  EVT SrcVT = Src.getValueType();
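  // Prefer the SVE lowering, when available, for reductions that NEON handles
  // poorly or not at all: logical reductions, FADD, non-ADD reductions with
  // i64 elements, and streaming-compatible functions.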
13212  bool OverrideNEON = Subtarget->forceStreamingCompatibleSVE() ||
13213                      Op.getOpcode() == ISD::VECREDUCE_AND ||
13214                      Op.getOpcode() == ISD::VECREDUCE_OR ||
13215                      Op.getOpcode() == ISD::VECREDUCE_XOR ||
13216                      Op.getOpcode() == ISD::VECREDUCE_FADD ||
13217                      (Op.getOpcode() != ISD::VECREDUCE_ADD &&
13218                       SrcVT.getVectorElementType() == MVT::i64);
13219  if (SrcVT.isScalableVector() ||
13220      useSVEForFixedLengthVectorVT(
13221          SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
13222
13223    if (SrcVT.getVectorElementType() == MVT::i1)
13224      return LowerPredReductionToSVE(Op, DAG);
13225
13226    switch (Op.getOpcode()) {
13227    case ISD::VECREDUCE_ADD:
13228      return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
13229    case ISD::VECREDUCE_AND:
13230      return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
13231    case ISD::VECREDUCE_OR:
13232      return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
13233    case ISD::VECREDUCE_SMAX:
13234      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
13235    case ISD::VECREDUCE_SMIN:
13236      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
13237    case ISD::VECREDUCE_UMAX:
13238      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
13239    case ISD::VECREDUCE_UMIN:
13240      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
13241    case ISD::VECREDUCE_XOR:
13242      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
13243    case ISD::VECREDUCE_FADD:
13244      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
13245    case ISD::VECREDUCE_FMAX:
13246      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
13247    case ISD::VECREDUCE_FMIN:
13248      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
13249    default:
13250      llvm_unreachable("Unhandled fixed length reduction");
13251    }
13252  }
13253
13254  // Lower NEON reductions.
13255  SDLoc dl(Op);
13256  switch (Op.getOpcode()) {
13257  case ISD::VECREDUCE_ADD:
13258    return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
13259  case ISD::VECREDUCE_SMAX:
13260    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
13261  case ISD::VECREDUCE_SMIN:
13262    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
13263  case ISD::VECREDUCE_UMAX:
13264    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
13265  case ISD::VECREDUCE_UMIN:
13266    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
13267  case ISD::VECREDUCE_FMAX: {
13268    return DAG.getNode(
13269        ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
13270        DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
13271        Src);
13272  }
13273  case ISD::VECREDUCE_FMIN: {
13274    return DAG.getNode(
13275        ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
13276        DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
13277        Src);
13278  }
13279  default:
13280    llvm_unreachable("Unhandled reduction");
13281  }
13282}
13283
13284SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
13285                                                    SelectionDAG &DAG) const {
13286  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13287  if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
13288    return SDValue();
13289
13290  // LSE has an atomic load-add instruction, but not a load-sub.
13291  SDLoc dl(Op);
13292  MVT VT = Op.getSimpleValueType();
13293  SDValue RHS = Op.getOperand(2);
13294  AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
13295  RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
13296  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
13297                       Op.getOperand(0), Op.getOperand(1), RHS,
13298                       AN->getMemOperand());
13299}
13300
13301SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
13302                                                    SelectionDAG &DAG) const {
13303  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13304  if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
13305    return SDValue();
13306
13307  // LSE has an atomic load-clear instruction, but not a load-and.
13308  SDLoc dl(Op);
13309  MVT VT = Op.getSimpleValueType();
13310  SDValue RHS = Op.getOperand(2);
13311  AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
13312  RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
13313  return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
13314                       Op.getOperand(0), Op.getOperand(1), RHS,
13315                       AN->getMemOperand());
13316}
13317
13318SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
13319    SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
13320  SDLoc dl(Op);
13321  EVT PtrVT = getPointerTy(DAG.getDataLayout());
13322  SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
13323                                               PtrVT, 0);
13324
13325  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
13326  const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
13327  if (Subtarget->hasCustomCallingConv())
13328    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
13329
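  // The stack probe callee expects the allocation size in X15 in units of 16
  // bytes; convert the byte count before the call and restore it afterwards.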
13330  Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
13331                     DAG.getConstant(4, dl, MVT::i64));
13332  Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
13333  Chain =
13334      DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
13335                  Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
13336                  DAG.getRegisterMask(Mask), Chain.getValue(1));
13337  // To match the actual intent better, we should read the output from X15 here
13338  // again (instead of potentially spilling it to the stack), but rereading Size
13339  // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
13340  // here.
13341
13342  Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
13343                     DAG.getConstant(4, dl, MVT::i64));
13344  return Chain;
13345}
13346
13347SDValue
13348AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
13349                                               SelectionDAG &DAG) const {
13350  assert(Subtarget->isTargetWindows() &&
13351         "Only Windows alloca probing supported");
13352  SDLoc dl(Op);
13353  // Get the inputs.
13354  SDNode *Node = Op.getNode();
13355  SDValue Chain = Op.getOperand(0);
13356  SDValue Size = Op.getOperand(1);
13357  MaybeAlign Align =
13358      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
13359  EVT VT = Node->getValueType(0);
13360
13361  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
13362          "no-stack-arg-probe")) {
13363    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
13364    Chain = SP.getValue(1);
13365    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
13366    if (Align)
13367      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
13368                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
13369    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
13370    SDValue Ops[2] = {SP, Chain};
13371    return DAG.getMergeValues(Ops, dl);
13372  }
13373
13374  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
13375
13376  Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
13377
13378  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
13379  Chain = SP.getValue(1);
13380  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
13381  if (Align)
13382    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
13383                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
13384  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
13385
13386  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
13387
13388  SDValue Ops[2] = {SP, Chain};
13389  return DAG.getMergeValues(Ops, dl);
13390}
13391
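// Lower an illegally-typed VSCALE by computing it as an i64 VSCALE and
// converting the result to the requested type.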
13392SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
13393                                           SelectionDAG &DAG) const {
13394  EVT VT = Op.getValueType();
13395  assert(VT != MVT::i64 && "Expected illegal VSCALE node");
13396
13397  SDLoc DL(Op);
13398  APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
13399  return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
13400                            VT);
13401}
13402
13403/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
13404template <unsigned NumVecs>
13405static bool
13406setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
13407              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
13408  Info.opc = ISD::INTRINSIC_VOID;
13409  // Retrieve EC from first vector argument.
13410  const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
13411  ElementCount EC = VT.getVectorElementCount();
13412#ifndef NDEBUG
13413  // Check the assumption that all input vectors are the same type.
13414  for (unsigned I = 0; I < NumVecs; ++I)
13415    assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
13416           "Invalid type.");
13417#endif
13418  // memVT is `NumVecs * VT`.
13419  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
13420                                EC * NumVecs);
13421  Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
13422  Info.offset = 0;
13423  Info.align.reset();
13424  Info.flags = MachineMemOperand::MOStore;
13425  return true;
13426}
13427
13428/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
13429/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
13430/// specified in the intrinsic calls.
13431bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13432                                               const CallInst &I,
13433                                               MachineFunction &MF,
13434                                               unsigned Intrinsic) const {
13435  auto &DL = I.getModule()->getDataLayout();
13436  switch (Intrinsic) {
13437  case Intrinsic::aarch64_sve_st2:
13438    return setInfoSVEStN<2>(*this, DL, Info, I);
13439  case Intrinsic::aarch64_sve_st3:
13440    return setInfoSVEStN<3>(*this, DL, Info, I);
13441  case Intrinsic::aarch64_sve_st4:
13442    return setInfoSVEStN<4>(*this, DL, Info, I);
13443  case Intrinsic::aarch64_neon_ld2:
13444  case Intrinsic::aarch64_neon_ld3:
13445  case Intrinsic::aarch64_neon_ld4:
13446  case Intrinsic::aarch64_neon_ld1x2:
13447  case Intrinsic::aarch64_neon_ld1x3:
13448  case Intrinsic::aarch64_neon_ld1x4:
13449  case Intrinsic::aarch64_neon_ld2lane:
13450  case Intrinsic::aarch64_neon_ld3lane:
13451  case Intrinsic::aarch64_neon_ld4lane:
13452  case Intrinsic::aarch64_neon_ld2r:
13453  case Intrinsic::aarch64_neon_ld3r:
13454  case Intrinsic::aarch64_neon_ld4r: {
13455    Info.opc = ISD::INTRINSIC_W_CHAIN;
13456    // Conservatively set memVT to the entire set of vectors loaded.
13457    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
13458    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
13459    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
13460    Info.offset = 0;
13461    Info.align.reset();
    // Volatile loads with NEON intrinsics are not supported.
13463    Info.flags = MachineMemOperand::MOLoad;
13464    return true;
13465  }
13466  case Intrinsic::aarch64_neon_st2:
13467  case Intrinsic::aarch64_neon_st3:
13468  case Intrinsic::aarch64_neon_st4:
13469  case Intrinsic::aarch64_neon_st1x2:
13470  case Intrinsic::aarch64_neon_st1x3:
13471  case Intrinsic::aarch64_neon_st1x4:
13472  case Intrinsic::aarch64_neon_st2lane:
13473  case Intrinsic::aarch64_neon_st3lane:
13474  case Intrinsic::aarch64_neon_st4lane: {
13475    Info.opc = ISD::INTRINSIC_VOID;
13476    // Conservatively set memVT to the entire set of vectors stored.
13477    unsigned NumElts = 0;
13478    for (const Value *Arg : I.args()) {
13479      Type *ArgTy = Arg->getType();
13480      if (!ArgTy->isVectorTy())
13481        break;
13482      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
13483    }
13484    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
13485    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
13486    Info.offset = 0;
13487    Info.align.reset();
    // Volatile stores with NEON intrinsics are not supported.
13489    Info.flags = MachineMemOperand::MOStore;
13490    return true;
13491  }
13492  case Intrinsic::aarch64_ldaxr:
13493  case Intrinsic::aarch64_ldxr: {
13494    Type *ValTy = I.getParamElementType(0);
13495    Info.opc = ISD::INTRINSIC_W_CHAIN;
13496    Info.memVT = MVT::getVT(ValTy);
13497    Info.ptrVal = I.getArgOperand(0);
13498    Info.offset = 0;
13499    Info.align = DL.getABITypeAlign(ValTy);
13500    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
13501    return true;
13502  }
13503  case Intrinsic::aarch64_stlxr:
13504  case Intrinsic::aarch64_stxr: {
13505    Type *ValTy = I.getParamElementType(1);
13506    Info.opc = ISD::INTRINSIC_W_CHAIN;
13507    Info.memVT = MVT::getVT(ValTy);
13508    Info.ptrVal = I.getArgOperand(1);
13509    Info.offset = 0;
13510    Info.align = DL.getABITypeAlign(ValTy);
13511    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
13512    return true;
13513  }
13514  case Intrinsic::aarch64_ldaxp:
13515  case Intrinsic::aarch64_ldxp:
13516    Info.opc = ISD::INTRINSIC_W_CHAIN;
13517    Info.memVT = MVT::i128;
13518    Info.ptrVal = I.getArgOperand(0);
13519    Info.offset = 0;
13520    Info.align = Align(16);
13521    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
13522    return true;
13523  case Intrinsic::aarch64_stlxp:
13524  case Intrinsic::aarch64_stxp:
13525    Info.opc = ISD::INTRINSIC_W_CHAIN;
13526    Info.memVT = MVT::i128;
13527    Info.ptrVal = I.getArgOperand(2);
13528    Info.offset = 0;
13529    Info.align = Align(16);
13530    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
13531    return true;
13532  case Intrinsic::aarch64_sve_ldnt1: {
13533    Type *ElTy = cast<VectorType>(I.getType())->getElementType();
13534    Info.opc = ISD::INTRINSIC_W_CHAIN;
13535    Info.memVT = MVT::getVT(I.getType());
13536    Info.ptrVal = I.getArgOperand(1);
13537    Info.offset = 0;
13538    Info.align = DL.getABITypeAlign(ElTy);
13539    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
13540    return true;
13541  }
13542  case Intrinsic::aarch64_sve_stnt1: {
13543    Type *ElTy =
13544        cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
13545    Info.opc = ISD::INTRINSIC_W_CHAIN;
13546    Info.memVT = MVT::getVT(I.getOperand(0)->getType());
13547    Info.ptrVal = I.getArgOperand(2);
13548    Info.offset = 0;
13549    Info.align = DL.getABITypeAlign(ElTy);
13550    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
13551    return true;
13552  }
13553  case Intrinsic::aarch64_mops_memset_tag: {
13554    Value *Dst = I.getArgOperand(0);
13555    Value *Val = I.getArgOperand(1);
13556    Info.opc = ISD::INTRINSIC_W_CHAIN;
13557    Info.memVT = MVT::getVT(Val->getType());
13558    Info.ptrVal = Dst;
13559    Info.offset = 0;
13560    Info.align = I.getParamAlign(0).valueOrOne();
13561    Info.flags = MachineMemOperand::MOStore;
13562    // The size of the memory being operated on is unknown at this point
13563    Info.size = MemoryLocation::UnknownSize;
13564    return true;
13565  }
13566  default:
13567    break;
13568  }
13569
13570  return false;
13571}
13572
13573bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
13574                                                  ISD::LoadExtType ExtTy,
13575                                                  EVT NewVT) const {
13576  // TODO: This may be worth removing. Check regression tests for diffs.
13577  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
13578    return false;
13579
13580  // If we're reducing the load width in order to avoid having to use an extra
13581  // instruction to do extension then it's probably a good idea.
13582  if (ExtTy != ISD::NON_EXTLOAD)
13583    return true;
13584  // Don't reduce load width if it would prevent us from combining a shift into
13585  // the offset.
13586  MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
13587  assert(Mem);
13588  const SDValue &Base = Mem->getBasePtr();
13589  if (Base.getOpcode() == ISD::ADD &&
13590      Base.getOperand(1).getOpcode() == ISD::SHL &&
13591      Base.getOperand(1).hasOneUse() &&
13592      Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
13593    // It's unknown whether a scalable vector has a power-of-2 bitwidth.
13594    if (Mem->getMemoryVT().isScalableVector())
13595      return false;
13596    // The shift can be combined if it matches the size of the value being
13597    // loaded (and so reducing the width would make it not match).
13598    uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
13599    uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
13600    if (ShiftAmount == Log2_32(LoadBytes))
13601      return false;
13602  }
13603  // We have no reason to disallow reducing the load width, so allow it.
13604  return true;
13605}
13606
// Truncations from 64-bit GPR to 32-bit GPR are free.
13608bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
13609  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13610    return false;
13611  uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
13612  uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
13613  return NumBits1 > NumBits2;
13614}
13615bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
13616  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
13617    return false;
13618  uint64_t NumBits1 = VT1.getFixedSizeInBits();
13619  uint64_t NumBits2 = VT2.getFixedSizeInBits();
13620  return NumBits1 > NumBits2;
13621}
13622
/// Check if it is profitable to hoist an instruction from then/else to if.
/// Not profitable if I and its user can form an FMA instruction
13625/// because we prefer FMSUB/FMADD.
13626bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
13627  if (I->getOpcode() != Instruction::FMul)
13628    return true;
13629
13630  if (!I->hasOneUse())
13631    return true;
13632
13633  Instruction *User = I->user_back();
13634
13635  if (!(User->getOpcode() == Instruction::FSub ||
13636        User->getOpcode() == Instruction::FAdd))
13637    return true;
13638
13639  const TargetOptions &Options = getTargetMachine().Options;
13640  const Function *F = I->getFunction();
13641  const DataLayout &DL = F->getParent()->getDataLayout();
13642  Type *Ty = User->getOperand(0)->getType();
13643
13644  return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
13645           isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
13646           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
13647            Options.UnsafeFPMath));
13648}
13649
13650// All 32-bit GPR operations implicitly zero the high-half of the corresponding
13651// 64-bit GPR.
13652bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
13653  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13654    return false;
13655  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
13656  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
13657  return NumBits1 == 32 && NumBits2 == 64;
13658}
13659bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
13660  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
13661    return false;
13662  unsigned NumBits1 = VT1.getSizeInBits();
13663  unsigned NumBits2 = VT2.getSizeInBits();
13664  return NumBits1 == 32 && NumBits2 == 64;
13665}
13666
13667bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
13668  EVT VT1 = Val.getValueType();
13669  if (isZExtFree(VT1, VT2)) {
13670    return true;
13671  }
13672
13673  if (Val.getOpcode() != ISD::LOAD)
13674    return false;
13675
13676  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
13677  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
13678          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
13679          VT1.getSizeInBits() <= 32);
13680}
13681
13682bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
13683  if (isa<FPExtInst>(Ext))
13684    return false;
13685
13686  // Vector types are not free.
13687  if (Ext->getType()->isVectorTy())
13688    return false;
13689
13690  for (const Use &U : Ext->uses()) {
13691    // The extension is free if we can fold it with a left shift in an
13692    // addressing mode or an arithmetic operation: add, sub, and cmp.
13693
13694    // Is there a shift?
13695    const Instruction *Instr = cast<Instruction>(U.getUser());
13696
13697    // Is this a constant shift?
13698    switch (Instr->getOpcode()) {
13699    case Instruction::Shl:
13700      if (!isa<ConstantInt>(Instr->getOperand(1)))
13701        return false;
13702      break;
13703    case Instruction::GetElementPtr: {
13704      gep_type_iterator GTI = gep_type_begin(Instr);
13705      auto &DL = Ext->getModule()->getDataLayout();
13706      std::advance(GTI, U.getOperandNo()-1);
13707      Type *IdxTy = GTI.getIndexedType();
13708      // This extension will end up with a shift because of the scaling factor.
13709      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
13710      // Get the shift amount based on the scaling factor:
13711      // log2(sizeof(IdxTy)) - log2(8).
13712      uint64_t ShiftAmt =
13713          countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
13714          3;
13715      // Is the constant foldable in the shift of the addressing mode?
13716      // I.e., shift amount is between 1 and 4 inclusive.
13717      if (ShiftAmt == 0 || ShiftAmt > 4)
13718        return false;
13719      break;
13720    }
13721    case Instruction::Trunc:
13722      // Check if this is a noop.
13723      // trunc(sext ty1 to ty2) to ty1.
13724      if (Instr->getType() == Ext->getOperand(0)->getType())
13725        continue;
13726      [[fallthrough]];
13727    default:
13728      return false;
13729    }
13730
13731    // At this point we can use the bfm family, so this extension is free
13732    // for that use.
13733  }
13734  return true;
13735}
13736
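/// Return true if \p V is a shufflevector whose mask selects the same source
/// lane for every result element.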
13737static bool isSplatShuffle(Value *V) {
13738  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
13739    return all_equal(Shuf->getShuffleMask());
13740  return false;
13741}
13742
13743/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
13744/// or upper half of the vector elements.
13745static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
13746                                     bool AllowSplat = false) {
13747  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
13748    auto *FullTy = FullV->getType();
13749    auto *HalfTy = HalfV->getType();
13750    return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
13751           2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
13752  };
13753
13754  auto extractHalf = [](Value *FullV, Value *HalfV) {
13755    auto *FullVT = cast<FixedVectorType>(FullV->getType());
13756    auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
13757    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
13758  };
13759
13760  ArrayRef<int> M1, M2;
13761  Value *S1Op1 = nullptr, *S2Op1 = nullptr;
13762  if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
13763      !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
13764    return false;
13765
  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
13767  // it is not checked as an extract below.
13768  if (AllowSplat && isSplatShuffle(Op1))
13769    S1Op1 = nullptr;
13770  if (AllowSplat && isSplatShuffle(Op2))
13771    S2Op1 = nullptr;
13772
13773  // Check that the operands are half as wide as the result and we extract
13774  // half of the elements of the input vectors.
13775  if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
13776      (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
13777    return false;
13778
13779  // Check the mask extracts either the lower or upper half of vector
13780  // elements.
13781  int M1Start = 0;
13782  int M2Start = 0;
13783  int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
13784  if ((S1Op1 &&
13785       !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
13786      (S2Op1 &&
13787       !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
13788    return false;
13789
13790  if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
13791      (M2Start != 0 && M2Start != (NumElements / 2)))
13792    return false;
13793  if (S1Op1 && S2Op1 && M1Start != M2Start)
13794    return false;
13795
13796  return true;
13797}
13798
13799/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
13800/// of the vector elements.
13801static bool areExtractExts(Value *Ext1, Value *Ext2) {
13802  auto areExtDoubled = [](Instruction *Ext) {
13803    return Ext->getType()->getScalarSizeInBits() ==
13804           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
13805  };
13806
13807  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
13808      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
13809      !areExtDoubled(cast<Instruction>(Ext1)) ||
13810      !areExtDoubled(cast<Instruction>(Ext2)))
13811    return false;
13812
13813  return true;
13814}
13815
13816/// Check if Op could be used with vmull_high_p64 intrinsic.
13817static bool isOperandOfVmullHighP64(Value *Op) {
13818  Value *VectorOperand = nullptr;
13819  ConstantInt *ElementIndex = nullptr;
13820  return match(Op, m_ExtractElt(m_Value(VectorOperand),
13821                                m_ConstantInt(ElementIndex))) &&
13822         ElementIndex->getValue() == 1 &&
13823         isa<FixedVectorType>(VectorOperand->getType()) &&
13824         cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
13825}
13826
13827/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
13828static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
13829  return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
13830}
13831
13832/// Check if sinking \p I's operands to I's basic block is profitable, because
13833/// the operands can be folded into a target instruction, e.g.
13834/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
13835bool AArch64TargetLowering::shouldSinkOperands(
13836    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
13837  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
13838    switch (II->getIntrinsicID()) {
13839    case Intrinsic::aarch64_neon_smull:
13840    case Intrinsic::aarch64_neon_umull:
13841      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
13842                                   /*AllowSplat=*/true)) {
13843        Ops.push_back(&II->getOperandUse(0));
13844        Ops.push_back(&II->getOperandUse(1));
13845        return true;
13846      }
13847      [[fallthrough]];
13848
13849    case Intrinsic::fma:
13850      if (isa<VectorType>(I->getType()) &&
13851          cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
13852          !Subtarget->hasFullFP16())
13853        return false;
13854      [[fallthrough]];
13855    case Intrinsic::aarch64_neon_sqdmull:
13856    case Intrinsic::aarch64_neon_sqdmulh:
13857    case Intrinsic::aarch64_neon_sqrdmulh:
13858      // Sink splats for index lane variants
13859      if (isSplatShuffle(II->getOperand(0)))
13860        Ops.push_back(&II->getOperandUse(0));
13861      if (isSplatShuffle(II->getOperand(1)))
13862        Ops.push_back(&II->getOperandUse(1));
13863      return !Ops.empty();
13864    case Intrinsic::aarch64_sve_ptest_first:
13865    case Intrinsic::aarch64_sve_ptest_last:
13866      if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
13867        if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
13868          Ops.push_back(&II->getOperandUse(0));
13869      return !Ops.empty();
13870    case Intrinsic::aarch64_sme_write_horiz:
13871    case Intrinsic::aarch64_sme_write_vert:
13872    case Intrinsic::aarch64_sme_writeq_horiz:
13873    case Intrinsic::aarch64_sme_writeq_vert: {
13874      auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
13875      if (!Idx || Idx->getOpcode() != Instruction::Add)
13876        return false;
13877      Ops.push_back(&II->getOperandUse(1));
13878      return true;
13879    }
13880    case Intrinsic::aarch64_sme_read_horiz:
13881    case Intrinsic::aarch64_sme_read_vert:
13882    case Intrinsic::aarch64_sme_readq_horiz:
13883    case Intrinsic::aarch64_sme_readq_vert:
13884    case Intrinsic::aarch64_sme_ld1b_vert:
13885    case Intrinsic::aarch64_sme_ld1h_vert:
13886    case Intrinsic::aarch64_sme_ld1w_vert:
13887    case Intrinsic::aarch64_sme_ld1d_vert:
13888    case Intrinsic::aarch64_sme_ld1q_vert:
13889    case Intrinsic::aarch64_sme_st1b_vert:
13890    case Intrinsic::aarch64_sme_st1h_vert:
13891    case Intrinsic::aarch64_sme_st1w_vert:
13892    case Intrinsic::aarch64_sme_st1d_vert:
13893    case Intrinsic::aarch64_sme_st1q_vert:
13894    case Intrinsic::aarch64_sme_ld1b_horiz:
13895    case Intrinsic::aarch64_sme_ld1h_horiz:
13896    case Intrinsic::aarch64_sme_ld1w_horiz:
13897    case Intrinsic::aarch64_sme_ld1d_horiz:
13898    case Intrinsic::aarch64_sme_ld1q_horiz:
13899    case Intrinsic::aarch64_sme_st1b_horiz:
13900    case Intrinsic::aarch64_sme_st1h_horiz:
13901    case Intrinsic::aarch64_sme_st1w_horiz:
13902    case Intrinsic::aarch64_sme_st1d_horiz:
13903    case Intrinsic::aarch64_sme_st1q_horiz: {
13904      auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
13905      if (!Idx || Idx->getOpcode() != Instruction::Add)
13906        return false;
13907      Ops.push_back(&II->getOperandUse(3));
13908      return true;
13909    }
13910    case Intrinsic::aarch64_neon_pmull:
13911      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
13912        return false;
13913      Ops.push_back(&II->getOperandUse(0));
13914      Ops.push_back(&II->getOperandUse(1));
13915      return true;
13916    case Intrinsic::aarch64_neon_pmull64:
13917      if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
13918                                     II->getArgOperand(1)))
13919        return false;
13920      Ops.push_back(&II->getArgOperandUse(0));
13921      Ops.push_back(&II->getArgOperandUse(1));
13922      return true;
13923    default:
13924      return false;
13925    }
13926  }
13927
13928  if (!I->getType()->isVectorTy())
13929    return false;
13930
13931  switch (I->getOpcode()) {
13932  case Instruction::Sub:
13933  case Instruction::Add: {
13934    if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
13935      return false;
13936
13937    // If the exts' operands extract either the lower or upper elements, we
13938    // can sink them too.
13939    auto Ext1 = cast<Instruction>(I->getOperand(0));
13940    auto Ext2 = cast<Instruction>(I->getOperand(1));
13941    if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
13942      Ops.push_back(&Ext1->getOperandUse(0));
13943      Ops.push_back(&Ext2->getOperandUse(0));
13944    }
13945
13946    Ops.push_back(&I->getOperandUse(0));
13947    Ops.push_back(&I->getOperandUse(1));
13948
13949    return true;
13950  }
13951  case Instruction::Mul: {
13952    int NumZExts = 0, NumSExts = 0;
13953    for (auto &Op : I->operands()) {
13954      // Make sure we are not already sinking this operand
13955      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
13956        continue;
13957
13958      if (match(&Op, m_SExt(m_Value()))) {
13959        NumSExts++;
13960        continue;
13961      } else if (match(&Op, m_ZExt(m_Value()))) {
13962        NumZExts++;
13963        continue;
13964      }
13965
13966      ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
13967
13968      // If the Shuffle is a splat and the operand is a zext/sext, sinking the
13969      // operand and the s/zext can help create indexed s/umull. This is
13970      // especially useful to prevent i64 mul being scalarized.
13971      if (Shuffle && isSplatShuffle(Shuffle) &&
13972          match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
13973        Ops.push_back(&Shuffle->getOperandUse(0));
13974        Ops.push_back(&Op);
13975        if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
13976          NumSExts++;
13977        else
13978          NumZExts++;
13979        continue;
13980      }
13981
13982      if (!Shuffle)
13983        continue;
13984
13985      Value *ShuffleOperand = Shuffle->getOperand(0);
13986      InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
13987      if (!Insert)
13988        continue;
13989
13990      Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
13991      if (!OperandInstr)
13992        continue;
13993
13994      ConstantInt *ElementConstant =
13995          dyn_cast<ConstantInt>(Insert->getOperand(2));
13996      // Check that the insertelement is inserting into element 0
13997      if (!ElementConstant || ElementConstant->getZExtValue() != 0)
13998        continue;
13999
14000      unsigned Opcode = OperandInstr->getOpcode();
14001      if (Opcode == Instruction::SExt)
14002        NumSExts++;
14003      else if (Opcode == Instruction::ZExt)
14004        NumZExts++;
14005      else {
14006        // If we find that the top bits are known 0, then we can sink and allow
14007        // the backend to generate a umull.
14008        unsigned Bitwidth = I->getType()->getScalarSizeInBits();
14009        APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
14010        const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
14011        if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
14012          continue;
14013        NumZExts++;
14014      }
14015
14016      Ops.push_back(&Shuffle->getOperandUse(0));
14017      Ops.push_back(&Op);
14018    }
14019
    // It is only profitable to sink if we found two extends of the same type.
14021    return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
14022  }
14023  default:
14024    return false;
14025  }
14026  return false;
14027}
14028
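/// Rewrite the vector zero-extend \p ZExt as a byte shuffle that interleaves
/// the source bytes with zero bytes, which can later be lowered to tbl
/// instructions.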
14029static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
14030  Value *Op = ZExt->getOperand(0);
14031  auto *SrcTy = cast<FixedVectorType>(Op->getType());
14032  auto *DstTy = cast<FixedVectorType>(ZExt->getType());
14033  auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
14034  auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
14035  assert(DstWidth % SrcWidth == 0 &&
14036         "TBL lowering is not supported for a ZExt instruction with this "
14037         "source & destination element type.");
14038  unsigned ZExtFactor = DstWidth / SrcWidth;
14039  unsigned NumElts = SrcTy->getNumElements();
14040  IRBuilder<> Builder(ZExt);
14041  SmallVector<int> Mask;
14042  // Create a mask that selects <0,...,Op[i]> for each lane of the destination
14043  // vector to replace the original ZExt. This can later be lowered to a set of
14044  // tbl instructions.
14045  for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
14046    if (IsLittleEndian) {
14047      if (i % ZExtFactor == 0)
14048        Mask.push_back(i / ZExtFactor);
14049      else
14050        Mask.push_back(NumElts);
14051    } else {
14052      if ((i + 1) % ZExtFactor == 0)
14053        Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
14054      else
14055        Mask.push_back(NumElts);
14056    }
14057  }
14058
14059  auto *FirstEltZero = Builder.CreateInsertElement(
14060      PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
14061  Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
14062  Result = Builder.CreateBitCast(Result, DstTy);
14063  ZExt->replaceAllUsesWith(Result);
14064  ZExt->eraseFromParent();
14065}
14066
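/// Lower the vector truncate \p TI to i8 elements using tbl instructions that
/// select the relevant byte of each source element.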
14067static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
14068  IRBuilder<> Builder(TI);
14069  SmallVector<Value *> Parts;
14070  int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
14071  auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
14072  auto *DstTy = cast<FixedVectorType>(TI->getType());
14073  assert(SrcTy->getElementType()->isIntegerTy() &&
14074         "Non-integer type source vector element is not supported");
14075  assert(DstTy->getElementType()->isIntegerTy(8) &&
14076         "Unsupported destination vector element type");
14077  unsigned SrcElemTySz =
14078      cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
14079  unsigned DstElemTySz =
14080      cast<IntegerType>(DstTy->getElementType())->getBitWidth();
14081  assert((SrcElemTySz % DstElemTySz == 0) &&
14082         "Cannot lower truncate to tbl instructions for a source element size "
14083         "that is not divisible by the destination element size");
14084  unsigned TruncFactor = SrcElemTySz / DstElemTySz;
14085  assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
14086         "Unsupported source vector element type size");
14087  Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
14088
14089  // Create a mask to choose every nth byte from the source vector table of
14090  // bytes to create the truncated destination vector, where 'n' is the truncate
14091  // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
14092  // 0,8,16,..Y*8th bytes for the little-endian format
14093  SmallVector<Constant *, 16> MaskConst;
14094  for (int Itr = 0; Itr < 16; Itr++) {
14095    if (Itr < NumElements)
14096      MaskConst.push_back(Builder.getInt8(
14097          IsLittleEndian ? Itr * TruncFactor
14098                         : Itr * TruncFactor + (TruncFactor - 1)));
14099    else
14100      MaskConst.push_back(Builder.getInt8(255));
14101  }
14102
14103  int MaxTblSz = 128 * 4;
14104  int MaxSrcSz = SrcElemTySz * NumElements;
14105  int ElemsPerTbl =
14106      (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
14107  assert(ElemsPerTbl <= 16 &&
14108         "Maximum elements selected using TBL instruction cannot exceed 16!");
14109
14110  int ShuffleCount = 128 / SrcElemTySz;
14111  SmallVector<int> ShuffleLanes;
14112  for (int i = 0; i < ShuffleCount; ++i)
14113    ShuffleLanes.push_back(i);
14114
14115  // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
14116  // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
14117  // call TBL & save the result in a vector of TBL results for combining later.
14118  SmallVector<Value *> Results;
14119  while (ShuffleLanes.back() < NumElements) {
14120    Parts.push_back(Builder.CreateBitCast(
14121        Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
14122
14123    if (Parts.size() == 4) {
14124      auto *F = Intrinsic::getDeclaration(TI->getModule(),
14125                                          Intrinsic::aarch64_neon_tbl4, VecTy);
14126      Parts.push_back(ConstantVector::get(MaskConst));
14127      Results.push_back(Builder.CreateCall(F, Parts));
14128      Parts.clear();
14129    }
14130
14131    for (int i = 0; i < ShuffleCount; ++i)
14132      ShuffleLanes[i] += ShuffleCount;
14133  }
14134
14135  assert((Parts.empty() || Results.empty()) &&
14136         "Lowering trunc for vectors requiring different TBL instructions is "
14137         "not supported!");
14138  // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
14139  // registers
14140  if (!Parts.empty()) {
14141    Intrinsic::ID TblID;
14142    switch (Parts.size()) {
14143    case 1:
14144      TblID = Intrinsic::aarch64_neon_tbl1;
14145      break;
14146    case 2:
14147      TblID = Intrinsic::aarch64_neon_tbl2;
14148      break;
14149    case 3:
14150      TblID = Intrinsic::aarch64_neon_tbl3;
14151      break;
14152    }
14153
14154    auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
14155    Parts.push_back(ConstantVector::get(MaskConst));
14156    Results.push_back(Builder.CreateCall(F, Parts));
14157  }
14158
14159  // Extract the destination vector from TBL result(s) after combining them
14160  // where applicable. Currently, at most two TBLs are supported.
14161  assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
14162                                "more than 2 tbl instructions!");
14163  Value *FinalResult = Results[0];
14164  if (Results.size() == 1) {
14165    if (ElemsPerTbl < 16) {
14166      SmallVector<int> FinalMask(ElemsPerTbl);
14167      std::iota(FinalMask.begin(), FinalMask.end(), 0);
14168      FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
14169    }
14170  } else {
14171    SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
14172    if (ElemsPerTbl < 16) {
14173      std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
14174      std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
14175    } else {
14176      std::iota(FinalMask.begin(), FinalMask.end(), 0);
14177    }
14178    FinalResult =
14179        Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
14180  }
14181
14182  TI->replaceAllUsesWith(FinalResult);
14183  TI->eraseFromParent();
14184}
14185
14186bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
14187                                                               Loop *L) const {
  // shuffle_vector instructions are serialized when targeting SVE
  // (see LowerSPLAT_VECTOR), so this peephole is not beneficial.
14190  if (Subtarget->useSVEForFixedLengthVectors())
14191    return false;
14192
14193  // Try to optimize conversions using tbl. This requires materializing constant
14194  // index vectors, which can increase code size and add loads. Skip the
14195  // transform unless the conversion is in a loop block guaranteed to execute
14196  // and we are not optimizing for size.
14197  Function *F = I->getParent()->getParent();
14198  if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
14199      F->hasOptSize())
14200    return false;
14201
14202  auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
14203  auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
14204  if (!SrcTy || !DstTy)
14205    return false;
14206
14207  // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
14208  // lowered to tbl instructions to insert the original i8 elements
14209  // into i8x lanes. This is enabled for cases where it is beneficial.
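  // For example, 'zext <8 x i8> %x to <8 x i32>' becomes a byte-level
  // shufflevector (lowered to tbl) that, on little endian, places each source
  // byte into the low byte of an otherwise zero i32 lane.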
14210  auto *ZExt = dyn_cast<ZExtInst>(I);
14211  if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
14212    auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
14213    if (DstWidth % 8 == 0 && DstWidth > 16 && DstWidth < 64) {
14214      createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
14215      return true;
14216    }
14217  }
14218
14219  auto *UIToFP = dyn_cast<UIToFPInst>(I);
14220  if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
14221      DstTy->getElementType()->isFloatTy()) {
14222    IRBuilder<> Builder(I);
14223    auto *ZExt = cast<ZExtInst>(
14224        Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
14225    auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
14226    I->replaceAllUsesWith(UI);
14227    I->eraseFromParent();
14228    createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
14229    return true;
14230  }
14231
14232  // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
14233  // followed by a truncate lowered to using tbl.4.
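  // For example (illustrative IR):
  //   %wide = fptoui <8 x float> %x to <8 x i32>
  //   %res  = trunc <8 x i32> %wide to <8 x i8>   ; lowered via tbl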
14234  auto *FPToUI = dyn_cast<FPToUIInst>(I);
14235  if (FPToUI &&
14236      (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
14237      SrcTy->getElementType()->isFloatTy() &&
14238      DstTy->getElementType()->isIntegerTy(8)) {
14239    IRBuilder<> Builder(I);
14240    auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
14241                                          VectorType::getInteger(SrcTy));
14242    auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
14243    I->replaceAllUsesWith(TruncI);
14244    I->eraseFromParent();
14245    createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
14246    return true;
14247  }
14248
  // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
  // tbl instruction selecting the lowest (little endian) or highest (big
  // endian) 8 bits per lane of the input, which is represented using 1, 2, 3
  // or 4 128-bit table registers.
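  // For example, <8 x i32> sources span two table registers (tbl2),
  // <16 x i32> sources span four (tbl4), and <16 x i64> sources span eight
  // and are split across two tbl4 calls.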
14253  auto *TI = dyn_cast<TruncInst>(I);
14254  if (TI && DstTy->getElementType()->isIntegerTy(8) &&
14255      ((SrcTy->getElementType()->isIntegerTy(32) ||
14256        SrcTy->getElementType()->isIntegerTy(64)) &&
14257       (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
14258    createTblForTrunc(TI, Subtarget->isLittleEndian());
14259    return true;
14260  }
14261
14262  return false;
14263}
14264
14265bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
14266                                          Align &RequiredAligment) const {
14267  if (!LoadedType.isSimple() ||
14268      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
14269    return false;
14270  // Cyclone supports unaligned accesses.
14271  RequiredAligment = Align(1);
14272  unsigned NumBits = LoadedType.getSizeInBits();
14273  return NumBits == 32 || NumBits == 64;
14274}
14275
14276/// A helper function for determining the number of interleaved accesses we
14277/// will generate when lowering accesses of the given type.
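/// For example, a <16 x i32> access (512 bits) lowered with 128-bit NEON
/// vectors needs (512 + 127) / 128 = 4 interleaved accesses, while a
/// <2 x i32> access (64 bits) needs just one.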
14278unsigned AArch64TargetLowering::getNumInterleavedAccesses(
14279    VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
14280  unsigned VecSize = 128;
14281  if (UseScalable)
14282    VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
14283  return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
14284}
14285
14286MachineMemOperand::Flags
14287AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
14288  if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
14289      I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
14290    return MOStridedAccess;
14291  return MachineMemOperand::MONone;
14292}
14293
14294bool AArch64TargetLowering::isLegalInterleavedAccessType(
14295    VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
14296
14297  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
14298  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
14299  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
14300
14301  UseScalable = false;
14302
14303  // Ensure that the predicate for this number of elements is available.
14304  if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(NumElements))
14305    return false;
14306
14307  // Ensure the number of vector elements is greater than 1.
14308  if (NumElements < 2)
14309    return false;
14310
14311  // Ensure the element type is legal.
14312  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
14313    return false;
14314
14315  if (Subtarget->forceStreamingCompatibleSVE() ||
14316      (Subtarget->useSVEForFixedLengthVectors() &&
14317       (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
14318        (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
14319         isPowerOf2_32(NumElements) && VecSize > 128)))) {
14320    UseScalable = true;
14321    return true;
14322  }
14323
14324  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
14325  // 128 will be split into multiple interleaved accesses.
14326  return VecSize == 64 || VecSize % 128 == 0;
14327}
14328
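// Map a fixed-length vector's element type to the SVE container type holding
// a full 128-bit granule of such elements, e.g. float -> <vscale x 4 x float>
// and i8 -> <vscale x 16 x i8>.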
14329static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
14330  if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
14331    return ScalableVectorType::get(VTy->getElementType(), 2);
14332
14333  if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
14334    return ScalableVectorType::get(VTy->getElementType(), 4);
14335
14336  if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
14337    return ScalableVectorType::get(VTy->getElementType(), 8);
14338
14339  if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
14340    return ScalableVectorType::get(VTy->getElementType(), 8);
14341
14342  if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
14343    return ScalableVectorType::get(VTy->getElementType(), 2);
14344
14345  if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
14346    return ScalableVectorType::get(VTy->getElementType(), 4);
14347
14348  if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
14349    return ScalableVectorType::get(VTy->getElementType(), 8);
14350
14351  if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
14352    return ScalableVectorType::get(VTy->getElementType(), 16);
14353
14354  llvm_unreachable("Cannot handle input vector type");
14355}
14356
14357/// Lower an interleaved load into a ldN intrinsic.
14358///
14359/// E.g. Lower an interleaved load (Factor = 2):
14360///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
14361///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
14362///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
14363///
14364///      Into:
14365///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
14366///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
14367///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
14368bool AArch64TargetLowering::lowerInterleavedLoad(
14369    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
14370    ArrayRef<unsigned> Indices, unsigned Factor) const {
14371  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
14372         "Invalid interleave factor");
14373  assert(!Shuffles.empty() && "Empty shufflevector input");
14374  assert(Shuffles.size() == Indices.size() &&
14375         "Unmatched number of shufflevectors and indices");
14376
14377  const DataLayout &DL = LI->getModule()->getDataLayout();
14378
14379  VectorType *VTy = Shuffles[0]->getType();
14380
  // Skip if we do not have NEON, and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long
  // as the vector size in bits is divisible by 128.
14384  bool UseScalable;
14385  if (!Subtarget->hasNEON() ||
14386      !isLegalInterleavedAccessType(VTy, DL, UseScalable))
14387    return false;
14388
14389  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
14390
14391  auto *FVTy = cast<FixedVectorType>(VTy);
14392
  // A pointer vector cannot be the return type of the ldN intrinsics, so load
  // integer vectors first and then convert them to pointer vectors.
14395  Type *EltTy = FVTy->getElementType();
14396  if (EltTy->isPointerTy())
14397    FVTy =
14398        FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
14399
14400  // If we're going to generate more than one load, reset the sub-vector type
14401  // to something legal.
14402  FVTy = FixedVectorType::get(FVTy->getElementType(),
14403                              FVTy->getNumElements() / NumLoads);
14404
14405  auto *LDVTy =
14406      UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
14407
14408  IRBuilder<> Builder(LI);
14409
14410  // The base address of the load.
14411  Value *BaseAddr = LI->getPointerOperand();
14412
14413  if (NumLoads > 1) {
14414    // We will compute the pointer operand of each load from the original base
14415    // address using GEPs. Cast the base address to a pointer to the scalar
14416    // element type.
14417    BaseAddr = Builder.CreateBitCast(
14418        BaseAddr,
14419        LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
14420  }
14421
14422  Type *PtrTy =
14423      UseScalable
14424          ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
14425          : LDVTy->getPointerTo(LI->getPointerAddressSpace());
14426  Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
14427                                 LDVTy->getElementCount());
14428
14429  static const Intrinsic::ID SVELoadIntrs[3] = {
14430      Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
14431      Intrinsic::aarch64_sve_ld4_sret};
14432  static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
14433                                                 Intrinsic::aarch64_neon_ld3,
14434                                                 Intrinsic::aarch64_neon_ld4};
14435  Function *LdNFunc;
14436  if (UseScalable)
14437    LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
14438                                        SVELoadIntrs[Factor - 2], {LDVTy});
14439  else
14440    LdNFunc = Intrinsic::getDeclaration(
14441        LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
14442
14443  // Holds sub-vectors extracted from the load intrinsic return values. The
14444  // sub-vectors are associated with the shufflevector instructions they will
14445  // replace.
14446  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
14447
14448  Value *PTrue = nullptr;
14449  if (UseScalable) {
14450    std::optional<unsigned> PgPattern =
14451        getSVEPredPatternFromNumElements(FVTy->getNumElements());
14452    if (Subtarget->getMinSVEVectorSizeInBits() ==
14453            Subtarget->getMaxSVEVectorSizeInBits() &&
14454        Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
14455      PgPattern = AArch64SVEPredPattern::all;
14456
14457    auto *PTruePat =
14458        ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
14459    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
14460                                    {PTruePat});
14461  }
14462
14463  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
14464
14465    // If we're generating more than one load, compute the base address of
14466    // subsequent loads as an offset from the previous.
14467    if (LoadCount > 0)
14468      BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
14469                                            FVTy->getNumElements() * Factor);
14470
14471    CallInst *LdN;
14472    if (UseScalable)
14473      LdN = Builder.CreateCall(
14474          LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN");
14475    else
14476      LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy),
14477                               "ldN");
14478
14479    // Extract and store the sub-vectors returned by the load intrinsic.
14480    for (unsigned i = 0; i < Shuffles.size(); i++) {
14481      ShuffleVectorInst *SVI = Shuffles[i];
14482      unsigned Index = Indices[i];
14483
14484      Value *SubVec = Builder.CreateExtractValue(LdN, Index);
14485
14486      if (UseScalable)
14487        SubVec = Builder.CreateExtractVector(
14488            FVTy, SubVec,
14489            ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
14490
      // Convert the integer vector to a pointer vector if the element type is
      // a pointer.
14492      if (EltTy->isPointerTy())
14493        SubVec = Builder.CreateIntToPtr(
14494            SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
14495                                         FVTy->getNumElements()));
14496
14497      SubVecs[SVI].push_back(SubVec);
14498    }
14499  }
14500
14501  // Replace uses of the shufflevector instructions with the sub-vectors
14502  // returned by the load intrinsic. If a shufflevector instruction is
14503  // associated with more than one sub-vector, those sub-vectors will be
14504  // concatenated into a single wide vector.
14505  for (ShuffleVectorInst *SVI : Shuffles) {
14506    auto &SubVec = SubVecs[SVI];
14507    auto *WideVec =
14508        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
14509    SVI->replaceAllUsesWith(WideVec);
14510  }
14511
14512  return true;
14513}
14514
14515/// Lower an interleaved store into a stN intrinsic.
14516///
14517/// E.g. Lower an interleaved store (Factor = 3):
14518///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
14519///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
14520///        store <12 x i32> %i.vec, <12 x i32>* %ptr
14521///
14522///      Into:
14523///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
14524///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
14525///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
14526///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
14527///
14528/// Note that the new shufflevectors will be removed and we'll only generate one
14529/// st3 instruction in CodeGen.
14530///
14531/// Example for a more general valid mask (Factor 3). Lower:
14532///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
14533///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
14534///        store <12 x i32> %i.vec, <12 x i32>* %ptr
14535///
14536///      Into:
14537///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
14538///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
14539///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
14540///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
14541bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
14542                                                  ShuffleVectorInst *SVI,
14543                                                  unsigned Factor) const {
14544  // Skip if streaming compatible SVE is enabled, because it generates invalid
14545  // code in streaming mode when SVE length is not specified.
14546  if (Subtarget->forceStreamingCompatibleSVE())
14547    return false;
14548
14549  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
14550         "Invalid interleave factor");
14551
14552  auto *VecTy = cast<FixedVectorType>(SVI->getType());
14553  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
14554
14555  unsigned LaneLen = VecTy->getNumElements() / Factor;
14556  Type *EltTy = VecTy->getElementType();
14557  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
14558
14559  const DataLayout &DL = SI->getModule()->getDataLayout();
14560  bool UseScalable;
14561
  // Skip if we do not have NEON, and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long
  // as the vector size in bits is divisible by 128.
14565  if (!Subtarget->hasNEON() ||
14566      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
14567    return false;
14568
14569  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
14570
14571  Value *Op0 = SVI->getOperand(0);
14572  Value *Op1 = SVI->getOperand(1);
14573  IRBuilder<> Builder(SI);
14574
14575  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
14576  // vectors to integer vectors.
14577  if (EltTy->isPointerTy()) {
14578    Type *IntTy = DL.getIntPtrType(EltTy);
14579    unsigned NumOpElts =
14580        cast<FixedVectorType>(Op0->getType())->getNumElements();
14581
14582    // Convert to the corresponding integer vector.
14583    auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
14584    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
14585    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
14586
14587    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
14588  }
14589
14590  // If we're going to generate more than one store, reset the lane length
14591  // and sub-vector type to something legal.
14592  LaneLen /= NumStores;
14593  SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
14594
14595  auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
14596                            : SubVecTy;
14597
14598  // The base address of the store.
14599  Value *BaseAddr = SI->getPointerOperand();
14600
14601  if (NumStores > 1) {
14602    // We will compute the pointer operand of each store from the original base
14603    // address using GEPs. Cast the base address to a pointer to the scalar
14604    // element type.
14605    BaseAddr = Builder.CreateBitCast(
14606        BaseAddr,
14607        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
14608  }
14609
14610  auto Mask = SVI->getShuffleMask();
14611
  // Bail out if none of the shuffle mask indices are in range. If the mask is
  // all `undef` or `poison`, `Mask` will be a vector of -1s, and using it
  // below would cause an out-of-bounds read.
14615  if (llvm::all_of(Mask, [](int Idx) { return Idx == UndefMaskElem; })) {
14616    return false;
14617  }
14618
14619  Type *PtrTy =
14620      UseScalable
14621          ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
14622          : STVTy->getPointerTo(SI->getPointerAddressSpace());
14623  Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
14624                                 STVTy->getElementCount());
14625
14626  static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
14627                                                 Intrinsic::aarch64_sve_st3,
14628                                                 Intrinsic::aarch64_sve_st4};
14629  static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
14630                                                  Intrinsic::aarch64_neon_st3,
14631                                                  Intrinsic::aarch64_neon_st4};
14632  Function *StNFunc;
14633  if (UseScalable)
14634    StNFunc = Intrinsic::getDeclaration(SI->getModule(),
14635                                        SVEStoreIntrs[Factor - 2], {STVTy});
14636  else
14637    StNFunc = Intrinsic::getDeclaration(
14638        SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
14639
14640  Value *PTrue = nullptr;
14641  if (UseScalable) {
14642    std::optional<unsigned> PgPattern =
14643        getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
14644    if (Subtarget->getMinSVEVectorSizeInBits() ==
14645            Subtarget->getMaxSVEVectorSizeInBits() &&
14646        Subtarget->getMinSVEVectorSizeInBits() ==
14647            DL.getTypeSizeInBits(SubVecTy))
14648      PgPattern = AArch64SVEPredPattern::all;
14649
14650    auto *PTruePat =
14651        ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
14652    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
14653                                    {PTruePat});
14654  }
14655
14656  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
14657
14658    SmallVector<Value *, 5> Ops;
14659
14660    // Split the shufflevector operands into sub vectors for the new stN call.
14661    for (unsigned i = 0; i < Factor; i++) {
14662      Value *Shuffle;
14663      unsigned IdxI = StoreCount * LaneLen * Factor + i;
14664      if (Mask[IdxI] >= 0) {
14665        Shuffle = Builder.CreateShuffleVector(
14666            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
14667      } else {
14668        unsigned StartMask = 0;
14669        for (unsigned j = 1; j < LaneLen; j++) {
14670          unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
14671          if (Mask[IdxJ] >= 0) {
14672            StartMask = Mask[IdxJ] - j;
14673            break;
14674          }
14675        }
        // Note: Filling undef gaps with arbitrary elements is OK, since those
        // elements were being written anyway (with undefs). If all lanes are
        // undef, we default to using elements starting at index 0.
        // Note: StartMask cannot be negative; this is checked in
        // isReInterleaveMask.
14681        Shuffle = Builder.CreateShuffleVector(
14682            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
14683      }
14684
14685      if (UseScalable)
14686        Shuffle = Builder.CreateInsertVector(
14687            STVTy, UndefValue::get(STVTy), Shuffle,
14688            ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
14689
14690      Ops.push_back(Shuffle);
14691    }
14692
14693    if (UseScalable)
14694      Ops.push_back(PTrue);
14695
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous one.
14698    if (StoreCount > 0)
14699      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
14700                                            BaseAddr, LaneLen * Factor);
14701
14702    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
14703    Builder.CreateCall(StNFunc, Ops);
14704  }
14705  return true;
14706}
14707
14708EVT AArch64TargetLowering::getOptimalMemOpType(
14709    const MemOp &Op, const AttributeList &FuncAttributes) const {
14710  bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
14711  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
14712  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
  // it would take one instruction to materialize the v2i64 zero plus one store
  // (with a restrictive addressing mode), so just use i64 stores.
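  // For example, a 64-byte memset with NEON available and acceptable alignment
  // uses MVT::v16i8 (q-register stores), while a 16-byte memset falls through
  // to MVT::i64.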
14716  bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
14717  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
14718    if (Op.isAligned(AlignCheck))
14719      return true;
14720    unsigned Fast;
14721    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
14722                                          MachineMemOperand::MONone, &Fast) &&
14723           Fast;
14724  };
14725
14726  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
14727      AlignmentIsAcceptable(MVT::v16i8, Align(16)))
14728    return MVT::v16i8;
14729  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
14730    return MVT::f128;
14731  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
14732    return MVT::i64;
14733  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
14734    return MVT::i32;
14735  return MVT::Other;
14736}
14737
14738LLT AArch64TargetLowering::getOptimalMemOpLLT(
14739    const MemOp &Op, const AttributeList &FuncAttributes) const {
14740  bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
14741  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
14742  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
  // it would take one instruction to materialize the v2i64 zero plus one store
  // (with a restrictive addressing mode), so just use i64 stores.
14746  bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
14747  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
14748    if (Op.isAligned(AlignCheck))
14749      return true;
14750    unsigned Fast;
14751    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
14752                                          MachineMemOperand::MONone, &Fast) &&
14753           Fast;
14754  };
14755
14756  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
14757      AlignmentIsAcceptable(MVT::v2i64, Align(16)))
14758    return LLT::fixed_vector(2, 64);
14759  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
14760    return LLT::scalar(128);
14761  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
14762    return LLT::scalar(64);
14763  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
14764    return LLT::scalar(32);
14765  return LLT();
14766}
14767
14768// 12-bit optionally shifted immediates are legal for adds.
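// For example, 0xfff and 0x555000 are legal (the latter as 0x555 shifted left
// by 12), while 0x1001 is not (it has bits set both above and below bit 12).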
14769bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
14770  if (Immed == std::numeric_limits<int64_t>::min()) {
14771    LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
14772                      << ": avoid UB for INT64_MIN\n");
14773    return false;
14774  }
14775  // Same encoding for add/sub, just flip the sign.
14776  Immed = std::abs(Immed);
14777  bool IsLegal = ((Immed >> 12) == 0 ||
14778                  ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
14779  LLVM_DEBUG(dbgs() << "Is " << Immed
14780                    << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
14781  return IsLegal;
14782}
14783
14784// Return false to prevent folding
14785// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
14786// if the folding leads to worse code.
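// For example, with c1 = 1 and c2 = 0x123456, c1 is a legal add immediate but
// c1*c2 = 0x123456 is not and needs MOVZ+MOVK to materialize, so the fold is
// rejected.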
14787bool AArch64TargetLowering::isMulAddWithConstProfitable(
14788    SDValue AddNode, SDValue ConstNode) const {
14789  // Let the DAGCombiner decide for vector types and large types.
14790  const EVT VT = AddNode.getValueType();
14791  if (VT.isVector() || VT.getScalarSizeInBits() > 64)
14792    return true;
14793
  // The fold is worse if c1 is a legal add immediate while c1*c2 is not and
  // has to be materialized with at least two instructions.
14796  const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
14797  const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
14798  const int64_t C1 = C1Node->getSExtValue();
14799  const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
14800  if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
14801    return true;
14802  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
14803  AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), VT.getSizeInBits(), Insn);
14804  if (Insn.size() > 1)
14805    return false;
14806
14807  // Default to true and let the DAGCombiner decide.
14808  return true;
14809}
14810
14811// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
14812// immediates is the same as for an add or a sub.
14813bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
14814  return isLegalAddImmediate(Immed);
14815}
14816
14817/// isLegalAddressingMode - Return true if the addressing mode represented
14818/// by AM is legal for this target, for a load/store of the specified type.
14819bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
14820                                                  const AddrMode &AM, Type *Ty,
14821                                                  unsigned AS, Instruction *I) const {
14822  // AArch64 has five basic addressing modes:
14823  //  reg
14824  //  reg + 9-bit signed offset
14825  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
14826  //  reg1 + reg2
14827  //  reg + SIZE_IN_BYTES * reg
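  //
  // For an i64 access, e.g. [reg], [reg, #8], [reg, #32760], [reg, reg] and
  // [reg, reg, lsl #3] are all legal, whereas [reg, #32768] (offset too large)
  // and a scale of 4 (not the access size) are not.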
14828
14829  // No global is ever allowed as a base.
14830  if (AM.BaseGV)
14831    return false;
14832
14833  // No reg+reg+imm addressing.
14834  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
14835    return false;
14836
14837  // FIXME: Update this method to support scalable addressing modes.
14838  if (isa<ScalableVectorType>(Ty)) {
14839    uint64_t VecElemNumBytes =
14840        DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
14841    return AM.HasBaseReg && !AM.BaseOffs &&
14842           (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
14843  }
14844
  // Check the reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12.
14847  uint64_t NumBytes = 0;
14848  if (Ty->isSized()) {
14849    uint64_t NumBits = DL.getTypeSizeInBits(Ty);
14850    NumBytes = NumBits / 8;
14851    if (!isPowerOf2_64(NumBits))
14852      NumBytes = 0;
14853  }
14854
14855  if (!AM.Scale) {
14856    int64_t Offset = AM.BaseOffs;
14857
14858    // 9-bit signed offset
14859    if (isInt<9>(Offset))
14860      return true;
14861
14862    // 12-bit unsigned offset
14863    unsigned shift = Log2_64(NumBytes);
14864    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
14865        // Must be a multiple of NumBytes (NumBytes is a power of 2)
14866        (Offset >> shift) << shift == Offset)
14867      return true;
14868    return false;
14869  }
14870
14871  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
14872
14873  return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
14874}
14875
14876bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
14877  // Consider splitting large offset of struct or array.
14878  return true;
14879}
14880
14881bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
14882    const MachineFunction &MF, EVT VT) const {
14883  VT = VT.getScalarType();
14884
14885  if (!VT.isSimple())
14886    return false;
14887
14888  switch (VT.getSimpleVT().SimpleTy) {
14889  case MVT::f16:
14890    return Subtarget->hasFullFP16();
14891  case MVT::f32:
14892  case MVT::f64:
14893    return true;
14894  default:
14895    break;
14896  }
14897
14898  return false;
14899}
14900
14901bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
14902                                                       Type *Ty) const {
14903  switch (Ty->getScalarType()->getTypeID()) {
14904  case Type::FloatTyID:
14905  case Type::DoubleTyID:
14906    return true;
14907  default:
14908    return false;
14909  }
14910}
14911
14912bool AArch64TargetLowering::generateFMAsInMachineCombiner(
14913    EVT VT, CodeGenOpt::Level OptLevel) const {
14914  return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector() &&
14915         !useSVEForFixedLengthVectorVT(VT);
14916}
14917
14918const MCPhysReg *
14919AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
14920  // LR is a callee-save register, but we must treat it as clobbered by any call
14921  // site. Hence we include LR in the scratch registers, which are in turn added
14922  // as implicit-defs for stackmaps and patchpoints.
14923  static const MCPhysReg ScratchRegs[] = {
14924    AArch64::X16, AArch64::X17, AArch64::LR, 0
14925  };
14926  return ScratchRegs;
14927}
14928
14929bool
14930AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
14931                                                     CombineLevel Level) const {
14932  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
14933          N->getOpcode() == ISD::SRL) &&
14934         "Expected shift op");
14935
14936  SDValue ShiftLHS = N->getOperand(0);
14937  EVT VT = N->getValueType(0);
14938
14939  // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
14940  // combine it with shift 'N' to let it be lowered to UBFX except:
14941  // ((x >> C) & mask) << C.
14942  if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
14943      isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
14944    uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
14945    if (isMask_64(TruncMask)) {
14946      SDValue AndLHS = ShiftLHS.getOperand(0);
14947      if (AndLHS.getOpcode() == ISD::SRL) {
14948        if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
14949          if (N->getOpcode() == ISD::SHL)
14950            if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
14951              return SRLC->getZExtValue() == SHLC->getZExtValue();
14952          return false;
14953        }
14954      }
14955    }
14956  }
14957  return true;
14958}
14959
14960bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
14961    const SDNode *N) const {
14962  assert(N->getOpcode() == ISD::XOR &&
14963         (N->getOperand(0).getOpcode() == ISD::SHL ||
14964          N->getOperand(0).getOpcode() == ISD::SRL) &&
14965         "Expected XOR(SHIFT) pattern");
14966
14967  // Only commute if the entire NOT mask is a hidden shifted mask.
14968  auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
14969  auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
14970  if (XorC && ShiftC) {
14971    unsigned MaskIdx, MaskLen;
14972    if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
14973      unsigned ShiftAmt = ShiftC->getZExtValue();
14974      unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
14975      if (N->getOperand(0).getOpcode() == ISD::SHL)
14976        return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
14977      return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
14978    }
14979  }
14980
14981  return false;
14982}
14983
14984bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
14985    const SDNode *N, CombineLevel Level) const {
14986  assert(((N->getOpcode() == ISD::SHL &&
14987           N->getOperand(0).getOpcode() == ISD::SRL) ||
14988          (N->getOpcode() == ISD::SRL &&
14989           N->getOperand(0).getOpcode() == ISD::SHL)) &&
14990         "Expected shift-shift mask");
14991  // Don't allow multiuse shift folding with the same shift amount.
14992  if (!N->getOperand(0)->hasOneUse())
14993    return false;
14994
14995  // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
14996  EVT VT = N->getValueType(0);
14997  if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
14998    auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
14999    auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
15000    return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
15001  }
15002
15003  return true;
15004}
15005
15006bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
15007                                                              Type *Ty) const {
15008  assert(Ty->isIntegerTy());
15009
15010  unsigned BitSize = Ty->getPrimitiveSizeInBits();
15011  if (BitSize == 0)
15012    return false;
15013
15014  int64_t Val = Imm.getSExtValue();
15015  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
15016    return true;
15017
15018  if ((int64_t)Val < 0)
15019    Val = ~Val;
15020  if (BitSize == 32)
15021    Val &= (1LL << 32) - 1;
15022
15023  unsigned LZ = countLeadingZeros((uint64_t)Val);
15024  unsigned Shift = (63 - LZ) / 16;
15025  // MOVZ is free so return true for one or fewer MOVK.
15026  return Shift < 3;
15027}
15028
15029bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
15030                                                    unsigned Index) const {
15031  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
15032    return false;
15033
15034  return (Index == 0 || Index == ResVT.getVectorMinNumElements());
15035}
15036
15037/// Turn vector tests of the signbit in the form of:
15038///   xor (sra X, elt_size(X)-1), -1
15039/// into:
15040///   cmge X, X, #0
15041static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
15042                                         const AArch64Subtarget *Subtarget) {
15043  EVT VT = N->getValueType(0);
15044  if (!Subtarget->hasNEON() || !VT.isVector())
15045    return SDValue();
15046
15047  // There must be a shift right algebraic before the xor, and the xor must be a
15048  // 'not' operation.
15049  SDValue Shift = N->getOperand(0);
15050  SDValue Ones = N->getOperand(1);
15051  if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
15052      !ISD::isBuildVectorAllOnes(Ones.getNode()))
15053    return SDValue();
15054
15055  // The shift should be smearing the sign bit across each vector element.
15056  auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
15057  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
15058  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
15059    return SDValue();
15060
15061  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
15062}
15063
15064// Given a vecreduce_add node, detect the below pattern and convert it to the
// node sequence with UABDL, [S|U]ABD and UADDLP.
15066//
15067// i32 vecreduce_add(
15068//  v16i32 abs(
15069//    v16i32 sub(
15070//     v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
15071// =================>
15072// i32 vecreduce_add(
15073//   v4i32 UADDLP(
15074//     v8i16 add(
15075//       v8i16 zext(
15076//         v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
15077//       v8i16 zext(
15078//         v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
15079static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
15080                                                    SelectionDAG &DAG) {
15081  // Assumed i32 vecreduce_add
15082  if (N->getValueType(0) != MVT::i32)
15083    return SDValue();
15084
15085  SDValue VecReduceOp0 = N->getOperand(0);
15086  unsigned Opcode = VecReduceOp0.getOpcode();
15087  // Assumed v16i32 abs
15088  if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
15089    return SDValue();
15090
15091  SDValue ABS = VecReduceOp0;
15092  // Assumed v16i32 sub
15093  if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
15094      ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
15095    return SDValue();
15096
15097  SDValue SUB = ABS->getOperand(0);
15098  unsigned Opcode0 = SUB->getOperand(0).getOpcode();
15099  unsigned Opcode1 = SUB->getOperand(1).getOpcode();
15100  // Assumed v16i32 type
15101  if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
15102      SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
15103    return SDValue();
15104
15105  // Assumed zext or sext
15106  bool IsZExt = false;
15107  if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
15108    IsZExt = true;
15109  } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
15110    IsZExt = false;
15111  } else
15112    return SDValue();
15113
15114  SDValue EXT0 = SUB->getOperand(0);
15115  SDValue EXT1 = SUB->getOperand(1);
15116  // Assumed zext's operand has v16i8 type
15117  if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
15118      EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
15119    return SDValue();
15120
  // The pattern has been detected. Convert it to a sequence of nodes.
15122  SDLoc DL(N);
15123
15124  // First, create the node pattern of UABD/SABD.
15125  SDValue UABDHigh8Op0 =
15126      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
15127                  DAG.getConstant(8, DL, MVT::i64));
15128  SDValue UABDHigh8Op1 =
15129      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
15130                  DAG.getConstant(8, DL, MVT::i64));
15131  SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
15132                                  UABDHigh8Op0, UABDHigh8Op1);
15133  SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
15134
15135  // Second, create the node pattern of UABAL.
15136  SDValue UABDLo8Op0 =
15137      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
15138                  DAG.getConstant(0, DL, MVT::i64));
15139  SDValue UABDLo8Op1 =
15140      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
15141                  DAG.getConstant(0, DL, MVT::i64));
15142  SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
15143                                UABDLo8Op0, UABDLo8Op1);
15144  SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
15145  SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
15146
15147  // Third, create the node of UADDLP.
15148  SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
15149
15150  // Fourth, create the node of VECREDUCE_ADD.
15151  return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
15152}
15153
15154// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
15155//   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
15156//   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
15157static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
15158                                          const AArch64Subtarget *ST) {
15159  if (!ST->hasDotProd())
15160    return performVecReduceAddCombineWithUADDLP(N, DAG);
15161
15162  SDValue Op0 = N->getOperand(0);
15163  if (N->getValueType(0) != MVT::i32 ||
15164      Op0.getValueType().getVectorElementType() != MVT::i32)
15165    return SDValue();
15166
15167  unsigned ExtOpcode = Op0.getOpcode();
15168  SDValue A = Op0;
15169  SDValue B;
15170  if (ExtOpcode == ISD::MUL) {
15171    A = Op0.getOperand(0);
15172    B = Op0.getOperand(1);
15173    if (A.getOpcode() != B.getOpcode() ||
15174        A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
15175      return SDValue();
15176    ExtOpcode = A.getOpcode();
15177  }
15178  if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
15179    return SDValue();
15180
15181  EVT Op0VT = A.getOperand(0).getValueType();
15182  if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
15183    return SDValue();
15184
15185  SDLoc DL(Op0);
  // For non-MLA reductions, B can be set to 1. For MLA we take the operand of
  // B's extend.
15188  if (!B)
15189    B = DAG.getConstant(1, DL, Op0VT);
15190  else
15191    B = B.getOperand(0);
15192
15193  SDValue Zeros =
15194      DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
15195  auto DotOpcode =
15196      (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
15197  SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
15198                            A.getOperand(0), B);
15199  return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
15200}
15201
15202// Given an (integer) vecreduce, we know the order of the inputs does not
15203// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
15204// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
15205// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
15206static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
15207  auto DetectAddExtract = [&](SDValue A) {
15208    // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
15209    // UADDLP(x) if found.
15210    if (A.getOpcode() != ISD::ADD)
15211      return SDValue();
15212    EVT VT = A.getValueType();
15213    SDValue Op0 = A.getOperand(0);
15214    SDValue Op1 = A.getOperand(1);
    if (Op0.getOpcode() != Op1.getOpcode() ||
15216        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
15217         Op0.getOpcode() != ISD::SIGN_EXTEND))
15218      return SDValue();
15219    SDValue Ext0 = Op0.getOperand(0);
15220    SDValue Ext1 = Op1.getOperand(0);
15221    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
15222        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
15223        Ext0.getOperand(0) != Ext1.getOperand(0))
15224      return SDValue();
15225    // Check that the type is twice the add types, and the extract are from
15226    // upper/lower parts of the same source.
15227    if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
15228        VT.getVectorNumElements() * 2)
15229      return SDValue();
15230    if ((Ext0.getConstantOperandVal(1) != 0 &&
15231         Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
15232        (Ext1.getConstantOperandVal(1) != 0 &&
15233         Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
15234      return SDValue();
15235    unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
15236                                                          : AArch64ISD::SADDLP;
15237    return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
15238  };
15239
15240  SDValue A = N->getOperand(0);
15241  if (SDValue R = DetectAddExtract(A))
15242    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
15243  if (A.getOpcode() == ISD::ADD) {
15244    if (SDValue R = DetectAddExtract(A.getOperand(0)))
15245      return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
15246                         DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
15247                                     A.getOperand(1)));
15248    if (SDValue R = DetectAddExtract(A.getOperand(1)))
15249      return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
15250                         DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
15251                                     A.getOperand(0)));
15252  }
15253  return SDValue();
15254}
15255
15256
15257static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
15258                                 TargetLowering::DAGCombinerInfo &DCI,
15259                                 const AArch64Subtarget *Subtarget) {
15260  if (DCI.isBeforeLegalizeOps())
15261    return SDValue();
15262
15263  return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
15264}
15265
15266SDValue
15267AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
15268                                     SelectionDAG &DAG,
15269                                     SmallVectorImpl<SDNode *> &Created) const {
15270  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
15271  if (isIntDivCheap(N->getValueType(0), Attr))
15272    return SDValue(N,0); // Lower SDIV as SDIV
15273
15274  EVT VT = N->getValueType(0);
15275
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
15278  if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
15279    return SDValue(N, 0);
15280
15281  // fold (sdiv X, pow2)
15282  if ((VT != MVT::i32 && VT != MVT::i64) ||
15283      !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
15284    return SDValue();
15285
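  // The emitted sequence for X sdiv 8 is roughly:
  //   cmp  x0, #0
  //   add  x8, x0, #7
  //   csel x8, x8, x0, lt
  //   asr  x0, x8, #3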
15286  SDLoc DL(N);
15287  SDValue N0 = N->getOperand(0);
15288  unsigned Lg2 = Divisor.countTrailingZeros();
15289  SDValue Zero = DAG.getConstant(0, DL, VT);
15290  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
15291
15292  // Add (N0 < 0) ? Pow2 - 1 : 0;
15293  SDValue CCVal;
15294  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
15295  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
15296  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
15297
15298  Created.push_back(Cmp.getNode());
15299  Created.push_back(Add.getNode());
15300  Created.push_back(CSel.getNode());
15301
15302  // Divide by pow2.
15303  SDValue SRA =
15304      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
15305
15306  // If we're dividing by a positive value, we're done.  Otherwise, we must
15307  // negate the result.
15308  if (Divisor.isNonNegative())
15309    return SRA;
15310
15311  Created.push_back(SRA.getNode());
15312  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
15313}
15314
15315SDValue
15316AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
15317                                     SelectionDAG &DAG,
15318                                     SmallVectorImpl<SDNode *> &Created) const {
15319  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
15320  if (isIntDivCheap(N->getValueType(0), Attr))
15321    return SDValue(N, 0); // Lower SREM as SREM
15322
15323  EVT VT = N->getValueType(0);
15324
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
15327  if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
15328    return SDValue(N, 0);
15329
15330  // fold (srem X, pow2)
15331  if ((VT != MVT::i32 && VT != MVT::i64) ||
15332      !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
15333    return SDValue();
15334
15335  unsigned Lg2 = Divisor.countTrailingZeros();
15336  if (Lg2 == 0)
15337    return SDValue();
15338
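  // The emitted sequence for X srem 8 is roughly:
  //   negs  x8, x0
  //   and   x9, x0, #0x7
  //   and   x8, x8, #0x7
  //   csneg x0, x9, x8, mi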
15339  SDLoc DL(N);
15340  SDValue N0 = N->getOperand(0);
15341  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
15342  SDValue Zero = DAG.getConstant(0, DL, VT);
15343  SDValue CCVal, CSNeg;
15344  if (Lg2 == 1) {
15345    SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
15346    SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
15347    CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
15348
15349    Created.push_back(Cmp.getNode());
15350    Created.push_back(And.getNode());
15351  } else {
15352    SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
15353    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
15354
15355    SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
15356    SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
15357    SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
15358    CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
15359                        Negs.getValue(1));
15360
15361    Created.push_back(Negs.getNode());
15362    Created.push_back(AndPos.getNode());
15363    Created.push_back(AndNeg.getNode());
15364  }
15365
15366  return CSNeg;
15367}
15368
15369static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
15370  switch(getIntrinsicID(S.getNode())) {
15371  default:
15372    break;
15373  case Intrinsic::aarch64_sve_cntb:
15374    return 8;
15375  case Intrinsic::aarch64_sve_cnth:
15376    return 16;
15377  case Intrinsic::aarch64_sve_cntw:
15378    return 32;
15379  case Intrinsic::aarch64_sve_cntd:
15380    return 64;
15381  }
15382  return {};
15383}
15384
15385/// Calculates what the pre-extend type is, based on the extension
15386/// operation node provided by \p Extend.
15387///
15388/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
15389/// pre-extend type is pulled directly from the operand, while other extend
15390/// operations need a bit more inspection to get this information.
15391///
15392/// \param Extend The SDNode from the DAG that represents the extend operation
15393///
15394/// \returns The type representing the \p Extend source type, or \p MVT::Other
15395/// if no valid type can be determined
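/// For example, (and x, 0xff) yields MVT::i8 and
/// (sign_extend_inreg x, ValueType:i16) yields MVT::i16.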
15396static EVT calculatePreExtendType(SDValue Extend) {
15397  switch (Extend.getOpcode()) {
15398  case ISD::SIGN_EXTEND:
15399  case ISD::ZERO_EXTEND:
15400    return Extend.getOperand(0).getValueType();
15401  case ISD::AssertSext:
15402  case ISD::AssertZext:
15403  case ISD::SIGN_EXTEND_INREG: {
15404    VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
15405    if (!TypeNode)
15406      return MVT::Other;
15407    return TypeNode->getVT();
15408  }
15409  case ISD::AND: {
15410    ConstantSDNode *Constant =
15411        dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
15412    if (!Constant)
15413      return MVT::Other;
15414
15415    uint32_t Mask = Constant->getZExtValue();
15416
15417    if (Mask == UCHAR_MAX)
15418      return MVT::i8;
15419    else if (Mask == USHRT_MAX)
15420      return MVT::i16;
15421    else if (Mask == UINT_MAX)
15422      return MVT::i32;
15423
15424    return MVT::Other;
15425  }
15426  default:
15427    return MVT::Other;
15428  }
15429}
15430
15431/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
15432/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
15433/// SExt/ZExt rather than the scalar SExt/ZExt
15434static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
15435  EVT VT = BV.getValueType();
15436  if (BV.getOpcode() != ISD::BUILD_VECTOR &&
15437      BV.getOpcode() != ISD::VECTOR_SHUFFLE)
15438    return SDValue();
15439
15440  // Use the first item in the buildvector/shuffle to get the size of the
15441  // extend, and make sure it looks valid.
15442  SDValue Extend = BV->getOperand(0);
15443  unsigned ExtendOpcode = Extend.getOpcode();
15444  bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
15445                ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
15446                ExtendOpcode == ISD::AssertSext;
15447  if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
15448      ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
15449    return SDValue();
15450  // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
15451  // calculatePreExtendType will work without issue.
15452  if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
15453      ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
15454    return SDValue();
15455
15456  // Restrict valid pre-extend data type
15457  EVT PreExtendType = calculatePreExtendType(Extend);
15458  if (PreExtendType == MVT::Other ||
15459      PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
15460    return SDValue();
15461
15462  // Make sure all other operands are equally extended
15463  for (SDValue Op : drop_begin(BV->ops())) {
15464    if (Op.isUndef())
15465      continue;
15466    unsigned Opc = Op.getOpcode();
15467    bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
15468                     Opc == ISD::AssertSext;
15469    if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
15470      return SDValue();
15471  }
15472
15473  SDValue NBV;
15474  SDLoc DL(BV);
15475  if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15476    EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
15477    EVT PreExtendLegalType =
15478        PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
15479    SmallVector<SDValue, 8> NewOps;
15480    for (SDValue Op : BV->ops())
15481      NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
15482                                    : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
15483                                                           PreExtendLegalType));
15484    NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
15485  } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
15486    EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
15487    NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
15488                               BV.getOperand(1).isUndef()
15489                                   ? DAG.getUNDEF(PreExtendVT)
15490                                   : BV.getOperand(1).getOperand(0),
15491                               cast<ShuffleVectorSDNode>(BV)->getMask());
15492  }
15493  return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
15494}
15495
15496/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
15497/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
15498static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
15499  // If the value type isn't a vector, none of the operands are going to be dups
15500  EVT VT = Mul->getValueType(0);
15501  if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
15502    return SDValue();
15503
15504  SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
15505  SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
15506
15507  // If neither operand has been changed, don't make any further changes.
15508  if (!Op0 && !Op1)
15509    return SDValue();
15510
15511  SDLoc DL(Mul);
15512  return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
15513                     Op1 ? Op1 : Mul->getOperand(1));
15514}
15515
15516// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
15517// Same for other types with equivalent constants.
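// For the v4i32 case above (sketch): each i32 lane holds two i16 halves.
// (X srl 15) moves the sign bit of each i16 half to bit 0 / bit 16, the AND
// with 0x10001 isolates those two bits, and the multiply by 0xffff expands
// each set bit into an all-ones i16 half -- exactly the result of CMLTz
// (compare signed less-than zero) on the same data viewed as v8i16.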
15518static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
15519  EVT VT = N->getValueType(0);
15520  if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
15521      VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
15522    return SDValue();
15523  if (N->getOperand(0).getOpcode() != ISD::AND ||
15524      N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
15525    return SDValue();
15526
15527  SDValue And = N->getOperand(0);
15528  SDValue Srl = And.getOperand(0);
15529
15530  APInt V1, V2, V3;
15531  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
15532      !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
15533      !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
15534    return SDValue();
15535
15536  unsigned HalfSize = VT.getScalarSizeInBits() / 2;
15537  if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
15538      V3 != (HalfSize - 1))
15539    return SDValue();
15540
15541  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
15542                                EVT::getIntegerVT(*DAG.getContext(), HalfSize),
15543                                VT.getVectorElementCount() * 2);
15544
15545  SDLoc DL(N);
15546  SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
15547  SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
15548  return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
15549}
15550
15551static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
15552                                 TargetLowering::DAGCombinerInfo &DCI,
15553                                 const AArch64Subtarget *Subtarget) {
15554
15555  if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
15556    return Ext;
15557  if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
15558    return Ext;
15559
15560  if (DCI.isBeforeLegalizeOps())
15561    return SDValue();
15562
15563  // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
15564  // and in MachineCombiner pass, add+mul will be combined into madd.
15565  // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
15566  SDLoc DL(N);
15567  EVT VT = N->getValueType(0);
15568  SDValue N0 = N->getOperand(0);
15569  SDValue N1 = N->getOperand(1);
15570  SDValue MulOper;
15571  unsigned AddSubOpc;
15572
15573  auto IsAddSubWith1 = [&](SDValue V) -> bool {
15574    AddSubOpc = V->getOpcode();
15575    if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
15576      SDValue Opnd = V->getOperand(1);
15577      MulOper = V->getOperand(0);
15578      if (AddSubOpc == ISD::SUB)
15579        std::swap(Opnd, MulOper);
15580      if (auto C = dyn_cast<ConstantSDNode>(Opnd))
15581        return C->isOne();
15582    }
15583    return false;
15584  };
15585
15586  if (IsAddSubWith1(N0)) {
15587    SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
15588    return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
15589  }
15590
15591  if (IsAddSubWith1(N1)) {
15592    SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
15593    return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
15594  }
15595
15596  // The below optimizations require a constant RHS.
15597  if (!isa<ConstantSDNode>(N1))
15598    return SDValue();
15599
15600  ConstantSDNode *C = cast<ConstantSDNode>(N1);
15601  const APInt &ConstValue = C->getAPIntValue();
15602
15603  // Allow the scaling to be folded into the `cnt` instruction by preventing
15604  // the scaling from being obscured here. This makes it easier to pattern match.
15605  if (IsSVECntIntrinsic(N0) ||
15606     (N0->getOpcode() == ISD::TRUNCATE &&
15607      (IsSVECntIntrinsic(N0->getOperand(0)))))
15608       if (ConstValue.sge(1) && ConstValue.sle(16))
15609         return SDValue();
15610
15611  // Multiplication of a power of two plus/minus one can be done more
15612  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
15613  // future CPUs have a cheaper MADD instruction, this may need to be
15614  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
15615  // 64-bit is 5 cycles, so this is always a win.
15616  // More aggressively, some multiplications N0 * C can be lowered to
15617  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
15618  // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
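  // For example, 45*x (45 = 5*9) can be formed as t = (x << 2) + x (5*x)
  // followed by (t << 3) + t (45*x): two shift+add operations instead of a
  // multiply.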
15619  // TODO: lower more cases.
15620
15621  // TrailingZeroes is used to test if the mul can be lowered to
15622  // shift+add+shift.
15623  unsigned TrailingZeroes = ConstValue.countTrailingZeros();
15624  if (TrailingZeroes) {
15625    // Conservatively do not lower to shift+add+shift if the mul might be
15626    // folded into smul or umul.
15627    if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
15628                            isZeroExtended(N0.getNode(), DAG)))
15629      return SDValue();
15630    // Conservatively do not lower to shift+add+shift if the mul might be
15631    // folded into madd or msub.
15632    if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
15633                           N->use_begin()->getOpcode() == ISD::SUB))
15634      return SDValue();
15635  }
15636  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
15637  // and shift+add+shift.
15638  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
15639  unsigned ShiftAmt;
15640
15641  auto Shl = [&](SDValue N0, unsigned N1) {
15642    SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
15643    return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
15644  };
15645  auto Add = [&](SDValue N0, SDValue N1) {
15646    return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
15647  };
15648  auto Sub = [&](SDValue N0, SDValue N1) {
15649    return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
15650  };
15651  auto Negate = [&](SDValue N) {
15652    SDValue Zero = DAG.getConstant(0, DL, VT);
15653    return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
15654  };
15655
15656  // Can the constant C be decomposed into (1+2^M)*(1+2^N)? E.g.,
15657  // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
15658  // the (2^N - 1) can't be executed via a single instruction.
15659  auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
15660    unsigned BitWidth = C.getBitWidth();
15661    for (unsigned i = 1; i < BitWidth / 2; i++) {
15662      APInt Rem;
15663      APInt X(BitWidth, (1 << i) + 1);
15664      APInt::sdivrem(C, X, N, Rem);
15665      APInt NVMinus1 = N - 1;
15666      if (Rem == 0 && NVMinus1.isPowerOf2()) {
15667        M = X;
15668        return true;
15669      }
15670    }
15671    return false;
15672  };
15673
15674  if (ConstValue.isNonNegative()) {
15675    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
15676    // (mul x, 2^N - 1) => (sub (shl x, N), x)
15677    // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
15678    // (mul x, (2^M + 1) * (2^N + 1))
15679    //     => MV = (add (shl x, M), x); (add (shl MV, N), MV)
15680    APInt SCVMinus1 = ShiftedConstValue - 1;
15681    APInt SCVPlus1 = ShiftedConstValue + 1;
15682    APInt CVPlus1 = ConstValue + 1;
15683    APInt CVM, CVN;
15684    if (SCVMinus1.isPowerOf2()) {
15685      ShiftAmt = SCVMinus1.logBase2();
15686      return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
15687    } else if (CVPlus1.isPowerOf2()) {
15688      ShiftAmt = CVPlus1.logBase2();
15689      return Sub(Shl(N0, ShiftAmt), N0);
15690    } else if (SCVPlus1.isPowerOf2()) {
15691      ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
15692      return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
15693    } else if (Subtarget->hasLSLFast() &&
15694               isPowPlusPlusConst(ConstValue, CVM, CVN)) {
15695      APInt CVMMinus1 = CVM - 1;
15696      APInt CVNMinus1 = CVN - 1;
15697      unsigned ShiftM1 = CVMMinus1.logBase2();
15698      unsigned ShiftN1 = CVNMinus1.logBase2();
15699      // LSLFast implies that shifts of up to 3 places are fast.
15700      if (ShiftM1 <= 3 && ShiftN1 <= 3) {
15701        SDValue MVal = Add(Shl(N0, ShiftM1), N0);
15702        return Add(Shl(MVal, ShiftN1), MVal);
15703      }
15704    }
15705  } else {
15706    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
15707    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
15708    // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
15709    APInt SCVPlus1 = -ShiftedConstValue + 1;
15710    APInt CVNegPlus1 = -ConstValue + 1;
15711    APInt CVNegMinus1 = -ConstValue - 1;
15712    if (CVNegPlus1.isPowerOf2()) {
15713      ShiftAmt = CVNegPlus1.logBase2();
15714      return Sub(N0, Shl(N0, ShiftAmt));
15715    } else if (CVNegMinus1.isPowerOf2()) {
15716      ShiftAmt = CVNegMinus1.logBase2();
15717      return Negate(Add(Shl(N0, ShiftAmt), N0));
15718    } else if (SCVPlus1.isPowerOf2()) {
15719      ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
15720      return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
15721    }
15722  }
15723
15724  return SDValue();
15725}
15726
15727static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
15728                                                         SelectionDAG &DAG) {
15729  // Take advantage of vector comparisons producing 0 or -1 in each lane to
15730  // optimize away the operation when it's from a constant.
15731  //
15732  // The general transformation is:
15733  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
15734  //       AND(VECTOR_CMP(x,y), constant2)
15735  //    constant2 = UNARYOP(constant)
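  //
  // For example (sketch), with UNARYOP = sint_to_fp:
  //    (v4f32 sint_to_fp (and (setcc x, y), (splat i32 1)))
  // becomes
  //    (v4f32 bitcast (and (setcc x, y), (v4i32 bitcast (splat f32 1.0))))
  // since each lane of the setcc is either 0 or all-ones.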
15736
15737  // Early exit if this isn't a vector operation, the operand of the
15738  // unary operation isn't a bitwise AND, or if the sizes of the operations
15739  // aren't the same.
15740  EVT VT = N->getValueType(0);
15741  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
15742      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
15743      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
15744    return SDValue();
15745
15746  // Now check that the other operand of the AND is a constant. We could
15747  // make the transformation for non-constant splats as well, but it's unclear
15748  // that would be a benefit as it would not eliminate any operations, just
15749  // perform one more step in scalar code before moving to the vector unit.
15750  if (BuildVectorSDNode *BV =
15751          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
15752    // Bail out if the vector isn't a constant.
15753    if (!BV->isConstant())
15754      return SDValue();
15755
15756    // Everything checks out. Build up the new and improved node.
15757    SDLoc DL(N);
15758    EVT IntVT = BV->getValueType(0);
15759    // Create a new constant of the appropriate type for the transformed
15760    // DAG.
15761    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
15762    // The AND node needs bitcasts to/from an integer vector type around it.
15763    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
15764    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
15765                                 N->getOperand(0)->getOperand(0), MaskConst);
15766    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
15767    return Res;
15768  }
15769
15770  return SDValue();
15771}
15772
15773static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
15774                                     const AArch64Subtarget *Subtarget) {
15775  // First try to optimize away the conversion when it's conditionally from
15776  // a constant. Vectors only.
15777  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
15778    return Res;
15779
15780  EVT VT = N->getValueType(0);
15781  if (VT != MVT::f32 && VT != MVT::f64)
15782    return SDValue();
15783
15784  // Only optimize when the source and destination types have the same width.
15785  if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
15786    return SDValue();
15787
15788  // If the result of an integer load is only used by an integer-to-float
15789  // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
15790  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
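  // For example (sketch): (f64 (sint_to_fp (i64 load [addr]))) becomes
  // (f64 (AArch64ISD::SITOF (f64 load [addr]))), keeping the value in the
  // FP/SIMD register file throughout.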
15791  SDValue N0 = N->getOperand(0);
15792  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
15793      // Do not change the width of a volatile load.
15794      !cast<LoadSDNode>(N0)->isVolatile()) {
15795    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15796    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
15797                               LN0->getPointerInfo(), LN0->getAlign(),
15798                               LN0->getMemOperand()->getFlags());
15799
15800    // Make sure successors of the original load stay after it by updating them
15801    // to use the new Chain.
15802    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
15803
15804    unsigned Opcode =
15805        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
15806    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
15807  }
15808
15809  return SDValue();
15810}
15811
15812/// Fold a floating-point multiply by power of two into floating-point to
15813/// fixed-point conversion.
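/// For example (roughly): (v2i64 fp_to_sint (fmul v2f64 X, splat 16.0)) can be
/// lowered as a fixed-point conversion with 4 fractional bits, i.e. the
/// aarch64_neon_vcvtfp2fxs intrinsic with C = 4.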
15814static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
15815                                     TargetLowering::DAGCombinerInfo &DCI,
15816                                     const AArch64Subtarget *Subtarget) {
15817  if (!Subtarget->hasNEON() || Subtarget->forceStreamingCompatibleSVE())
15818    return SDValue();
15819
15820  if (!N->getValueType(0).isSimple())
15821    return SDValue();
15822
15823  SDValue Op = N->getOperand(0);
15824  if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
15825    return SDValue();
15826
15827  if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
15828    return SDValue();
15829
15830  SDValue ConstVec = Op->getOperand(1);
15831  if (!isa<BuildVectorSDNode>(ConstVec))
15832    return SDValue();
15833
15834  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
15835  uint32_t FloatBits = FloatTy.getSizeInBits();
15836  if (FloatBits != 32 && FloatBits != 64 &&
15837      (FloatBits != 16 || !Subtarget->hasFullFP16()))
15838    return SDValue();
15839
15840  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
15841  uint32_t IntBits = IntTy.getSizeInBits();
15842  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
15843    return SDValue();
15844
15845  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
15846  if (IntBits > FloatBits)
15847    return SDValue();
15848
15849  BitVector UndefElements;
15850  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
15851  int32_t Bits = IntBits == 64 ? 64 : 32;
15852  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
15853  if (C == -1 || C == 0 || C > Bits)
15854    return SDValue();
15855
15856  EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
15857  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
15858    return SDValue();
15859
15860  if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
15861      N->getOpcode() == ISD::FP_TO_UINT_SAT) {
15862    EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
15863    if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
15864      return SDValue();
15865  }
15866
15867  SDLoc DL(N);
15868  bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
15869                   N->getOpcode() == ISD::FP_TO_SINT_SAT);
15870  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
15871                                      : Intrinsic::aarch64_neon_vcvtfp2fxu;
15872  SDValue FixConv =
15873      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
15874                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
15875                  Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
15876  // We can handle smaller integers by generating an extra trunc.
15877  if (IntBits < FloatBits)
15878    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
15879
15880  return FixConv;
15881}
15882
15883/// Fold a floating-point divide by power of two into fixed-point to
15884/// floating-point conversion.
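/// For example (roughly): (v2f64 fdiv (sint_to_fp v2i64 X), splat 16.0) can be
/// lowered as a fixed-point conversion with 4 fractional bits, i.e. the
/// aarch64_neon_vcvtfxs2fp intrinsic with C = 4.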
15885static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
15886                                  TargetLowering::DAGCombinerInfo &DCI,
15887                                  const AArch64Subtarget *Subtarget) {
15888  if (!Subtarget->hasNEON())
15889    return SDValue();
15890
15891  SDValue Op = N->getOperand(0);
15892  unsigned Opc = Op->getOpcode();
15893  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
15894      !Op.getOperand(0).getValueType().isSimple() ||
15895      (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
15896    return SDValue();
15897
15898  SDValue ConstVec = N->getOperand(1);
15899  if (!isa<BuildVectorSDNode>(ConstVec))
15900    return SDValue();
15901
15902  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
15903  int32_t IntBits = IntTy.getSizeInBits();
15904  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
15905    return SDValue();
15906
15907  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
15908  int32_t FloatBits = FloatTy.getSizeInBits();
15909  if (FloatBits != 32 && FloatBits != 64)
15910    return SDValue();
15911
15912  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
15913  if (IntBits > FloatBits)
15914    return SDValue();
15915
15916  BitVector UndefElements;
15917  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
15918  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
15919  if (C == -1 || C == 0 || C > FloatBits)
15920    return SDValue();
15921
15922  MVT ResTy;
15923  unsigned NumLanes = Op.getValueType().getVectorNumElements();
15924  switch (NumLanes) {
15925  default:
15926    return SDValue();
15927  case 2:
15928    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
15929    break;
15930  case 4:
15931    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
15932    break;
15933  }
15934
15935  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
15936    return SDValue();
15937
15938  SDLoc DL(N);
15939  SDValue ConvInput = Op.getOperand(0);
15940  bool IsSigned = Opc == ISD::SINT_TO_FP;
15941  if (IntBits < FloatBits)
15942    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
15943                            ResTy, ConvInput);
15944
15945  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
15946                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
15947  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
15948                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
15949                     DAG.getConstant(C, DL, MVT::i32));
15950}
15951
15952/// An EXTR instruction is made up of two shifts, ORed together. This helper
15953/// searches for and classifies those shifts.
15954static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
15955                         bool &FromHi) {
15956  if (N.getOpcode() == ISD::SHL)
15957    FromHi = false;
15958  else if (N.getOpcode() == ISD::SRL)
15959    FromHi = true;
15960  else
15961    return false;
15962
15963  if (!isa<ConstantSDNode>(N.getOperand(1)))
15964    return false;
15965
15966  ShiftAmount = N->getConstantOperandVal(1);
15967  Src = N->getOperand(0);
15968  return true;
15969}
15970
15971/// EXTR instruction extracts a contiguous chunk of bits from two existing
15972/// registers viewed as a high/low pair. This function looks for the pattern:
15973/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
15974/// with an EXTR. Can't quite be done in TableGen because the two immediates
15975/// aren't independent.
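/// For example, on i64: (or (shl A, 16), (srl B, 48)) becomes
/// (AArch64ISD::EXTR A, B, 48), which extracts bits [111:48] of the 128-bit
/// concatenation A:B.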
15976static SDValue tryCombineToEXTR(SDNode *N,
15977                                TargetLowering::DAGCombinerInfo &DCI) {
15978  SelectionDAG &DAG = DCI.DAG;
15979  SDLoc DL(N);
15980  EVT VT = N->getValueType(0);
15981
15982  assert(N->getOpcode() == ISD::OR && "Unexpected root");
15983
15984  if (VT != MVT::i32 && VT != MVT::i64)
15985    return SDValue();
15986
15987  SDValue LHS;
15988  uint32_t ShiftLHS = 0;
15989  bool LHSFromHi = false;
15990  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
15991    return SDValue();
15992
15993  SDValue RHS;
15994  uint32_t ShiftRHS = 0;
15995  bool RHSFromHi = false;
15996  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
15997    return SDValue();
15998
15999  // If they're both trying to come from the high part of the register, they're
16000  // not really an EXTR.
16001  if (LHSFromHi == RHSFromHi)
16002    return SDValue();
16003
16004  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
16005    return SDValue();
16006
16007  if (LHSFromHi) {
16008    std::swap(LHS, RHS);
16009    std::swap(ShiftLHS, ShiftRHS);
16010  }
16011
16012  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
16013                     DAG.getConstant(ShiftRHS, DL, MVT::i64));
16014}
16015
16016static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16017                               const AArch64TargetLowering &TLI) {
16018  EVT VT = N->getValueType(0);
16019  SelectionDAG &DAG = DCI.DAG;
16020  SDLoc DL(N);
16021
16022  if (!VT.isVector())
16023    return SDValue();
16024
16025  // The combining code currently only works for NEON vectors. In particular,
16026  // it does not work for SVE when dealing with vectors wider than 128 bits.
16027  // It also doesn't work in streaming mode because it would generate bsl
16028  // instructions that are invalid in streaming mode.
16029  if (TLI.useSVEForFixedLengthVectorVT(
16030          VT,
16031          DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE()))
16032    return SDValue();
16033
16034  SDValue N0 = N->getOperand(0);
16035  if (N0.getOpcode() != ISD::AND)
16036    return SDValue();
16037
16038  SDValue N1 = N->getOperand(1);
16039  if (N1.getOpcode() != ISD::AND)
16040    return SDValue();
16041
16042  // InstCombine does (not (neg a)) => (add a -1).
16043  // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
16044  // Loop over all combinations of AND operands.
16045  for (int i = 1; i >= 0; --i) {
16046    for (int j = 1; j >= 0; --j) {
16047      SDValue O0 = N0->getOperand(i);
16048      SDValue O1 = N1->getOperand(j);
16049      SDValue Sub, Add, SubSibling, AddSibling;
16050
16051      // Find a SUB and an ADD operand, one from each AND.
16052      if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
16053        Sub = O0;
16054        Add = O1;
16055        SubSibling = N0->getOperand(1 - i);
16056        AddSibling = N1->getOperand(1 - j);
16057      } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
16058        Add = O0;
16059        Sub = O1;
16060        AddSibling = N0->getOperand(1 - i);
16061        SubSibling = N1->getOperand(1 - j);
16062      } else
16063        continue;
16064
16065      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
16066        continue;
16067
16068      // The all-ones constant is always the right-hand operand of the Add.
16069      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
16070        continue;
16071
16072      if (Sub.getOperand(1) != Add.getOperand(0))
16073        continue;
16074
16075      return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
16076    }
16077  }
16078
16079  // (or (and a b) (and (not a) c)) => (bsl a b c)
16080  // We only have to look for constant vectors here since the general, variable
16081  // case can be handled in TableGen.
16082  unsigned Bits = VT.getScalarSizeInBits();
16083  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
16084  for (int i = 1; i >= 0; --i)
16085    for (int j = 1; j >= 0; --j) {
16086      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
16087      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
16088      if (!BVN0 || !BVN1)
16089        continue;
16090
16091      bool FoundMatch = true;
16092      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
16093        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
16094        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
16095        if (!CN0 || !CN1 ||
16096            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
16097          FoundMatch = false;
16098          break;
16099        }
16100      }
16101
16102      if (FoundMatch)
16103        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
16104                           N0->getOperand(1 - i), N1->getOperand(1 - j));
16105    }
16106
16107  return SDValue();
16108}
16109
16110// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
16111// convert to csel(ccmp(.., cc0)), depending on cc1:
16112
16113// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
16114// =>
16115// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
16116//
16117// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
16118// =>
16119// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
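// For example (sketch), for "(a == 0) & (b == c)" the second compare becomes a
// CCMP predicated on the first, and only a single CSET of the final condition
// remains.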
16120static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
16121  EVT VT = N->getValueType(0);
16122  SDValue CSel0 = N->getOperand(0);
16123  SDValue CSel1 = N->getOperand(1);
16124
16125  if (CSel0.getOpcode() != AArch64ISD::CSEL ||
16126      CSel1.getOpcode() != AArch64ISD::CSEL)
16127    return SDValue();
16128
16129  if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
16130    return SDValue();
16131
16132  if (!isNullConstant(CSel0.getOperand(0)) ||
16133      !isOneConstant(CSel0.getOperand(1)) ||
16134      !isNullConstant(CSel1.getOperand(0)) ||
16135      !isOneConstant(CSel1.getOperand(1)))
16136    return SDValue();
16137
16138  SDValue Cmp0 = CSel0.getOperand(3);
16139  SDValue Cmp1 = CSel1.getOperand(3);
16140  AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
16141  AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
16142  if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
16143    return SDValue();
16144  if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
16145      Cmp0.getOpcode() == AArch64ISD::SUBS) {
16146    std::swap(Cmp0, Cmp1);
16147    std::swap(CC0, CC1);
16148  }
16149
16150  if (Cmp1.getOpcode() != AArch64ISD::SUBS)
16151    return SDValue();
16152
16153  SDLoc DL(N);
16154  SDValue CCmp, Condition;
16155  unsigned NZCV;
16156
16157  if (N->getOpcode() == ISD::AND) {
16158    AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
16159    Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
16160    NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
16161  } else {
16162    AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
16163    Condition = DAG.getConstant(CC0, DL, MVT_CC);
16164    NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
16165  }
16166
16167  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
16168
16169  auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
16170  if (Op1 && Op1->getAPIntValue().isNegative() &&
16171      Op1->getAPIntValue().sgt(-32)) {
16172    // CCMP accepts a constant in the range [0, 31].
16173    // If Op1 is a constant in the range [-31, -1], we can
16174    // select CCMN instead to avoid the extra mov.
16175    SDValue AbsOp1 =
16176        DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
16177    CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
16178                       NZCVOp, Condition, Cmp0);
16179  } else {
16180    CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
16181                       Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
16182  }
16183  return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
16184                     CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
16185                     CCmp);
16186}
16187
16188static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16189                                const AArch64Subtarget *Subtarget,
16190                                const AArch64TargetLowering &TLI) {
16191  SelectionDAG &DAG = DCI.DAG;
16192  EVT VT = N->getValueType(0);
16193
16194  if (SDValue R = performANDORCSELCombine(N, DAG))
16195    return R;
16196
16197  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
16198    return SDValue();
16199
16200  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
16201  if (SDValue Res = tryCombineToEXTR(N, DCI))
16202    return Res;
16203
16204  if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
16205    return Res;
16206
16207  return SDValue();
16208}
16209
16210static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
16211  if (!MemVT.getVectorElementType().isSimple())
16212    return false;
16213
16214  uint64_t MaskForTy = 0ull;
16215  switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
16216  case MVT::i8:
16217    MaskForTy = 0xffull;
16218    break;
16219  case MVT::i16:
16220    MaskForTy = 0xffffull;
16221    break;
16222  case MVT::i32:
16223    MaskForTy = 0xffffffffull;
16224    break;
16225  default:
16226    return false;
16228  }
16229
16230  if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
16231    if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
16232      return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
16233
16234  return false;
16235}
16236
16237static bool isAllInactivePredicate(SDValue N) {
16238  // Look through cast.
16239  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
16240    N = N.getOperand(0);
16241
16242  return ISD::isConstantSplatVectorAllZeros(N.getNode());
16243}
16244
16245static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
16246  unsigned NumElts = N.getValueType().getVectorMinNumElements();
16247
16248  // Look through cast.
16249  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
16250    N = N.getOperand(0);
16251    // When reinterpreting from a type with fewer elements the "new" elements
16252    // are not active, so bail if they're likely to be used.
16253    if (N.getValueType().getVectorMinNumElements() < NumElts)
16254      return false;
16255  }
16256
16257  if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
16258    return true;
16259
16260  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
16261  // or smaller than the implicit element type represented by N.
16262  // NOTE: A larger element count implies a smaller element type.
16263  if (N.getOpcode() == AArch64ISD::PTRUE &&
16264      N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
16265    return N.getValueType().getVectorMinNumElements() >= NumElts;
16266
16267  // If we're compiling for a specific vector-length, we can check if the
16268  // pattern's VL equals that of the scalable vector at runtime.
16269  if (N.getOpcode() == AArch64ISD::PTRUE) {
16270    const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16271    unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
16272    unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
16273    if (MaxSVESize && MinSVESize == MaxSVESize) {
16274      unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
16275      unsigned PatNumElts =
16276          getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
16277      return PatNumElts == (NumElts * VScale);
16278    }
16279  }
16280
16281  return false;
16282}
16283
16284static SDValue performReinterpretCastCombine(SDNode *N) {
16285  SDValue LeafOp = SDValue(N, 0);
16286  SDValue Op = N->getOperand(0);
16287  while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
16288         LeafOp.getValueType() != Op.getValueType())
16289    Op = Op->getOperand(0);
16290  if (LeafOp.getValueType() == Op.getValueType())
16291    return Op;
16292  return SDValue();
16293}
16294
16295static SDValue performSVEAndCombine(SDNode *N,
16296                                    TargetLowering::DAGCombinerInfo &DCI) {
16297  if (DCI.isBeforeLegalizeOps())
16298    return SDValue();
16299
16300  SelectionDAG &DAG = DCI.DAG;
16301  SDValue Src = N->getOperand(0);
16302  unsigned Opc = Src->getOpcode();
16303
16304  // Zero/any extend of an unsigned unpack
16305  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
16306    SDValue UnpkOp = Src->getOperand(0);
16307    SDValue Dup = N->getOperand(1);
16308
16309    if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
16310      return SDValue();
16311
16312    SDLoc DL(N);
16313    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
16314    if (!C)
16315      return SDValue();
16316
16317    uint64_t ExtVal = C->getZExtValue();
16318
16319    // If the mask is fully covered by the unpack, we don't need to push
16320    // a new AND onto the operand
16321    EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
16322    if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
16323        (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
16324        (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
16325      return Src;
16326
16327    // Truncate to prevent a DUP with an over-wide constant.
16328    APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
16329
16330    // Otherwise, make sure we propagate the AND to the operand
16331    // of the unpack
16332    Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
16333                      DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
16334
16335    SDValue And = DAG.getNode(ISD::AND, DL,
16336                              UnpkOp->getValueType(0), UnpkOp, Dup);
16337
16338    return DAG.getNode(Opc, DL, N->getValueType(0), And);
16339  }
16340
16341  // If either operand is an all-active predicate, the AND is a no-op and we
16342  // can return the other operand unchanged.
16343  if (isAllActivePredicate(DAG, N->getOperand(0)))
16344    return N->getOperand(1);
16345  if (isAllActivePredicate(DAG, N->getOperand(1)))
16346    return N->getOperand(0);
16347
16348  if (!EnableCombineMGatherIntrinsics)
16349    return SDValue();
16350
16351  SDValue Mask = N->getOperand(1);
16352
16353  if (!Src.hasOneUse())
16354    return SDValue();
16355
16356  EVT MemVT;
16357
16358  // SVE load instructions perform an implicit zero-extend, which makes them
16359  // perfect candidates for combining.
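  // For example (sketch): an AND of an LD1_MERGE_ZERO whose memory type is i8
  // with a splat of 0xff is redundant, since the load already zero-extends
  // each i8 element; the load itself can be returned.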
16360  switch (Opc) {
16361  case AArch64ISD::LD1_MERGE_ZERO:
16362  case AArch64ISD::LDNF1_MERGE_ZERO:
16363  case AArch64ISD::LDFF1_MERGE_ZERO:
16364    MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
16365    break;
16366  case AArch64ISD::GLD1_MERGE_ZERO:
16367  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
16368  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
16369  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
16370  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
16371  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
16372  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
16373  case AArch64ISD::GLDFF1_MERGE_ZERO:
16374  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
16375  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
16376  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
16377  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
16378  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
16379  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
16380  case AArch64ISD::GLDNT1_MERGE_ZERO:
16381    MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
16382    break;
16383  default:
16384    return SDValue();
16385  }
16386
16387  if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
16388    return Src;
16389
16390  return SDValue();
16391}
16392
16393static SDValue performANDCombine(SDNode *N,
16394                                 TargetLowering::DAGCombinerInfo &DCI) {
16395  SelectionDAG &DAG = DCI.DAG;
16396  SDValue LHS = N->getOperand(0);
16397  SDValue RHS = N->getOperand(1);
16398  EVT VT = N->getValueType(0);
16399
16400  if (SDValue R = performANDORCSELCombine(N, DAG))
16401    return R;
16402
16403  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
16404    return SDValue();
16405
16406  if (VT.isScalableVector())
16407    return performSVEAndCombine(N, DCI);
16408
16409  // The combining code below works only for NEON vectors. In particular, it
16410  // does not work for SVE when dealing with vectors wider than 128 bits.
16411  if (!VT.is64BitVector() && !VT.is128BitVector())
16412    return SDValue();
16413
16414  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
16415  if (!BVN)
16416    return SDValue();
16417
16418  // AND does not accept an immediate, so check if we can use a BIC immediate
16419  // instruction instead. We do this here instead of using a (and x, (mvni imm))
16420  // pattern in isel, because some immediates may be lowered to the preferred
16421  // (and x, (movi imm)) form, even though an mvni representation also exists.
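  // For example (sketch): (and v4i32 X, (splat 0xffffff00)) can be selected as
  // a "bic" of the inverted immediate 0xff rather than materialising the
  // 0xffffff00 mask in a register.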
16422  APInt DefBits(VT.getSizeInBits(), 0);
16423  APInt UndefBits(VT.getSizeInBits(), 0);
16424  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
16425    SDValue NewOp;
16426
16427    DefBits = ~DefBits;
16428    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
16429                                    DefBits, &LHS)) ||
16430        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
16431                                    DefBits, &LHS)))
16432      return NewOp;
16433
16434    UndefBits = ~UndefBits;
16435    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
16436                                    UndefBits, &LHS)) ||
16437        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
16438                                    UndefBits, &LHS)))
16439      return NewOp;
16440  }
16441
16442  return SDValue();
16443}
16444
16445static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
16446  switch (Opcode) {
16447  case ISD::STRICT_FADD:
16448  case ISD::FADD:
16449    return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
16450  case ISD::ADD:
16451    return VT == MVT::i64;
16452  default:
16453    return false;
16454  }
16455}
16456
16457static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
16458                        AArch64CC::CondCode Cond);
16459
16460static bool isPredicateCCSettingOp(SDValue N) {
16461  if ((N.getOpcode() == ISD::SETCC) ||
16462      (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
16463       (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
16464        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
16465        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
16466        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
16467        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
16468        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
16469        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
16470        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
16471        // get_active_lane_mask is lowered to a whilelo instruction.
16472        N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
16473    return true;
16474
16475  return false;
16476}
16477
16478// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
16479// ... into: "ptrue p, all" + PTEST
16480static SDValue
16481performFirstTrueTestVectorCombine(SDNode *N,
16482                                  TargetLowering::DAGCombinerInfo &DCI,
16483                                  const AArch64Subtarget *Subtarget) {
16484  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
16485  // Only combine once types are legal; PTEST can't handle illegal types.
16486  if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
16487    return SDValue();
16488
16489  SDValue N0 = N->getOperand(0);
16490  EVT VT = N0.getValueType();
16491
16492  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
16493      !isNullConstant(N->getOperand(1)))
16494    return SDValue();
16495
16496  // Restrict the DAG combine to cases where we're extracting from a
16497  // flag-setting operation.
16498  if (!isPredicateCCSettingOp(N0))
16499    return SDValue();
16500
16501  // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
16502  SelectionDAG &DAG = DCI.DAG;
16503  SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
16504  return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
16505}
16506
16507// Materialize : Idx = (add (mul vscale, NumEls), -1)
16508//               i1 = extract_vector_elt t37, Constant:i64<Idx>
16509//     ... into: "ptrue p, all" + PTEST
16510static SDValue
16511performLastTrueTestVectorCombine(SDNode *N,
16512                                 TargetLowering::DAGCombinerInfo &DCI,
16513                                 const AArch64Subtarget *Subtarget) {
16514  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
16515  // Only combine once types are legal; PTEST can't handle illegal types.
16516  if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
16517    return SDValue();
16518
16519  SDValue N0 = N->getOperand(0);
16520  EVT OpVT = N0.getValueType();
16521
16522  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
16523    return SDValue();
16524
16525  // Idx == (add (mul vscale, NumEls), -1)
16526  SDValue Idx = N->getOperand(1);
16527  if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
16528    return SDValue();
16529
16530  SDValue VS = Idx.getOperand(0);
16531  if (VS.getOpcode() != ISD::VSCALE)
16532    return SDValue();
16533
16534  unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
16535  if (VS.getConstantOperandVal(0) != NumEls)
16536    return SDValue();
16537
16538  // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
16539  SelectionDAG &DAG = DCI.DAG;
16540  SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
16541  return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
16542}
16543
16544static SDValue
16545performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16546                               const AArch64Subtarget *Subtarget) {
16547  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
16548  if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
16549    return Res;
16550  if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
16551    return Res;
16552
16553  SelectionDAG &DAG = DCI.DAG;
16554  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
16555  ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
16556
16557  EVT VT = N->getValueType(0);
16558  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
16559  bool IsStrict = N0->isStrictFPOpcode();
16560
16561  // extract(dup x) -> x
16562  if (N0.getOpcode() == AArch64ISD::DUP)
16563    return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
16564
16565  // Rewrite for pairwise fadd pattern
16566  //   (f32 (extract_vector_elt
16567  //           (fadd (vXf32 Other)
16568  //                 (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
16569  // ->
16570  //   (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
16571  //              (extract_vector_elt (vXf32 Other) 1))
16572  // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
16573  // we can only do this when it's used only by the extract_vector_elt.
16574  if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
16575      hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
16576      (!IsStrict || N0.hasOneUse())) {
16577    SDLoc DL(N0);
16578    SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
16579    SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
16580
16581    ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
16582    SDValue Other = N00;
16583
16584    // And handle the commutative case.
16585    if (!Shuffle) {
16586      Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
16587      Other = N01;
16588    }
16589
16590    if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
16591        Other == Shuffle->getOperand(0)) {
16592      SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
16593                                     DAG.getConstant(0, DL, MVT::i64));
16594      SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
16595                                     DAG.getConstant(1, DL, MVT::i64));
16596      if (!IsStrict)
16597        return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
16598
16599      // For strict_fadd we need uses of the final extract_vector to be replaced
16600      // with the strict_fadd, but we also need uses of the chain output of the
16601      // original strict_fadd to use the chain output of the new strict_fadd as
16602      // otherwise it may not be deleted.
16603      SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
16604                                {VT, MVT::Other},
16605                                {N0->getOperand(0), Extract1, Extract2});
16606      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
16607      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
16608      return SDValue(N, 0);
16609    }
16610  }
16611
16612  return SDValue();
16613}
16614
16615static SDValue performConcatVectorsCombine(SDNode *N,
16616                                           TargetLowering::DAGCombinerInfo &DCI,
16617                                           SelectionDAG &DAG) {
16618  SDLoc dl(N);
16619  EVT VT = N->getValueType(0);
16620  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
16621  unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
16622
16623  if (VT.isScalableVector())
16624    return SDValue();
16625
16626  // Optimize concat_vectors of truncated vectors, where the intermediate
16627  // type is illegal, to avoid said illegality, e.g.,
16628  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
16629  //                          (v2i16 (truncate (v2i64)))))
16630  // ->
16631  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
16632  //                                    (v4i32 (bitcast (v2i64))),
16633  //                                    <0, 2, 4, 6>)))
16634  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
16635  // on both input and result type, so we might generate worse code.
16636  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
16637  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
16638      N1Opc == ISD::TRUNCATE) {
16639    SDValue N00 = N0->getOperand(0);
16640    SDValue N10 = N1->getOperand(0);
16641    EVT N00VT = N00.getValueType();
16642
16643    if (N00VT == N10.getValueType() &&
16644        (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
16645        N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
16646      MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
16647      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
16648      for (size_t i = 0; i < Mask.size(); ++i)
16649        Mask[i] = i * 2;
16650      return DAG.getNode(ISD::TRUNCATE, dl, VT,
16651                         DAG.getVectorShuffle(
16652                             MidVT, dl,
16653                             DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
16654                             DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
16655    }
16656  }
16657
16658  if (N->getOperand(0).getValueType() == MVT::v4i8) {
16659    // If we have a concat of v4i8 loads, convert them to a buildvector of f32
16660    // loads to prevent having to go through the v4i8 load legalization that
16661    // needs to extend each element into a larger type.
16662    if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
16663          if (V.getValueType() != MVT::v4i8)
16664            return false;
16665          if (V.isUndef())
16666            return true;
16667          LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
16668          return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
16669                 LD->getExtensionType() == ISD::NON_EXTLOAD;
16670        })) {
16671      EVT NVT =
16672          EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
16673      SmallVector<SDValue> Ops;
16674
16675      for (unsigned i = 0; i < N->getNumOperands(); i++) {
16676        SDValue V = N->getOperand(i);
16677        if (V.isUndef())
16678          Ops.push_back(DAG.getUNDEF(MVT::f32));
16679        else {
16680          LoadSDNode *LD = cast<LoadSDNode>(V);
16681          SDValue NewLoad =
16682              DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
16683                          LD->getMemOperand());
16684          DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
16685          Ops.push_back(NewLoad);
16686        }
16687      }
16688      return DAG.getBitcast(N->getValueType(0),
16689                            DAG.getBuildVector(NVT, dl, Ops));
16690    }
16691  }
16692
16693  // Canonicalise concat_vectors to replace concatenations of truncated nots
16694  // with nots of concatenated truncates. This in some cases allows for multiple
16695  // redundant negations to be eliminated.
16696  //  (concat_vectors (v4i16 (truncate (not (v4i32)))),
16697  //                  (v4i16 (truncate (not (v4i32)))))
16698  // ->
16699  //  (not (concat_vectors (v4i16 (truncate (v4i32))),
16700  //                       (v4i16 (truncate (v4i32)))))
16701  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
16702      N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
16703      N->isOnlyUserOf(N1.getNode())) {
16704    auto isBitwiseVectorNegate = [](SDValue V) {
16705      return V->getOpcode() == ISD::XOR &&
16706             ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
16707    };
16708    SDValue N00 = N0->getOperand(0);
16709    SDValue N10 = N1->getOperand(0);
16710    if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
16711        isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
16712      return DAG.getNOT(
16713          dl,
16714          DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16715                      DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
16716                                  N00->getOperand(0)),
16717                      DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
16718                                  N10->getOperand(0))),
16719          VT);
16720    }
16721  }
16722
16723  // Wait until after everything is legalized to try this. That way we have
16724  // legal vector types and such.
16725  if (DCI.isBeforeLegalizeOps())
16726    return SDValue();
16727
16728  // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
16729  // extracted subvectors from the same original vectors. Combine these into a
16730  // single avg that operates on the two original vectors.
16731  // avgceil is the target-independent name for rhadd; avgfloor is a hadd.
16732  // Example:
16733  //  (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
16734  //                                   extract_subvector (v16i8 OpB, <0>))),
16735  //                  (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
16736  //                                   extract_subvector (v16i8 OpB, <8>)))))
16737  // ->
16738  //  (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
16739  if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
16740      (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
16741       N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
16742    SDValue N00 = N0->getOperand(0);
16743    SDValue N01 = N0->getOperand(1);
16744    SDValue N10 = N1->getOperand(0);
16745    SDValue N11 = N1->getOperand(1);
16746
16747    EVT N00VT = N00.getValueType();
16748    EVT N10VT = N10.getValueType();
16749
16750    if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16751        N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16752        N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16753        N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
16754      SDValue N00Source = N00->getOperand(0);
16755      SDValue N01Source = N01->getOperand(0);
16756      SDValue N10Source = N10->getOperand(0);
16757      SDValue N11Source = N11->getOperand(0);
16758
16759      if (N00Source == N10Source && N01Source == N11Source &&
16760          N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
16761        assert(N0.getValueType() == N1.getValueType());
16762
16763        uint64_t N00Index = N00.getConstantOperandVal(1);
16764        uint64_t N01Index = N01.getConstantOperandVal(1);
16765        uint64_t N10Index = N10.getConstantOperandVal(1);
16766        uint64_t N11Index = N11.getConstantOperandVal(1);
16767
16768        if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
16769            N10Index == N00VT.getVectorNumElements())
16770          return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
16771      }
16772    }
16773  }
16774
16775  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
16776  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
16777  // canonicalise to that.
16778  if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
16779    assert(VT.getScalarSizeInBits() == 64);
16780    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
16781                       DAG.getConstant(0, dl, MVT::i64));
16782  }
16783
16784  // Canonicalise concat_vectors so that the right-hand vector has as few
16785  // bit-casts as possible before its real operation. The primary matching
16786  // destination for these operations will be the narrowing "2" instructions,
16787  // which depend on the operation being performed on this right-hand vector.
16788  // For example,
16789  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
16790  // becomes
16791  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
16792
16793  if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
16794    return SDValue();
16795  SDValue RHS = N1->getOperand(0);
16796  MVT RHSTy = RHS.getValueType().getSimpleVT();
16797  // If the RHS is not a vector, this is not the pattern we're looking for.
16798  if (!RHSTy.isVector())
16799    return SDValue();
16800
16801  LLVM_DEBUG(
16802      dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
16803
16804  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
16805                                  RHSTy.getVectorNumElements() * 2);
16806  return DAG.getNode(ISD::BITCAST, dl, VT,
16807                     DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
16808                                 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
16809                                 RHS));
16810}
16811
16812static SDValue
16813performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16814                               SelectionDAG &DAG) {
16815  if (DCI.isBeforeLegalizeOps())
16816    return SDValue();
16817
16818  EVT VT = N->getValueType(0);
16819  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
16820    return SDValue();
16821
16822  SDValue V = N->getOperand(0);
16823
16824  // NOTE: This combine exists in DAGCombiner, but that version's legality check
16825  // blocks this combine because the non-const case requires custom lowering.
16826  //
16827  // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
16828  if (V.getOpcode() == ISD::SPLAT_VECTOR)
16829    if (isa<ConstantSDNode>(V.getOperand(0)))
16830      return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
16831
16832  return SDValue();
16833}
16834
16835static SDValue
16836performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16837                              SelectionDAG &DAG) {
16838  SDLoc DL(N);
16839  SDValue Vec = N->getOperand(0);
16840  SDValue SubVec = N->getOperand(1);
16841  uint64_t IdxVal = N->getConstantOperandVal(2);
16842  EVT VecVT = Vec.getValueType();
16843  EVT SubVT = SubVec.getValueType();
16844
16845  // Only do this for legal fixed vector types.
16846  if (!VecVT.isFixedLengthVector() ||
16847      !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
16848      !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
16849    return SDValue();
16850
16851  // Ignore widening patterns.
16852  if (IdxVal == 0 && Vec.isUndef())
16853    return SDValue();
16854
16855  // Subvector must be half the width and an "aligned" insertion.
16856  unsigned NumSubElts = SubVT.getVectorNumElements();
16857  if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
16858      (IdxVal != 0 && IdxVal != NumSubElts))
16859    return SDValue();
16860
16861  // Fold insert_subvector -> concat_vectors
16862  // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
16863  // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
16864  SDValue Lo, Hi;
16865  if (IdxVal == 0) {
16866    Lo = SubVec;
16867    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
16868                     DAG.getVectorIdxConstant(NumSubElts, DL));
16869  } else {
16870    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
16871                     DAG.getVectorIdxConstant(0, DL));
16872    Hi = SubVec;
16873  }
16874  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
16875}
16876
16877static SDValue tryCombineFixedPointConvert(SDNode *N,
16878                                           TargetLowering::DAGCombinerInfo &DCI,
16879                                           SelectionDAG &DAG) {
16880  // Wait until after everything is legalized to try this. That way we have
16881  // legal vector types and such.
16882  if (DCI.isBeforeLegalizeOps())
16883    return SDValue();
16884  // Transform a scalar conversion of a value from a lane extract into a
16885  // lane extract of a vector conversion. E.g., from foo1 to foo2:
16886  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
16887  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
16888  //
16889  // The second form interacts better with instruction selection and the
16890  // register allocator to avoid cross-class register copies that aren't
16891  // coalescable due to a lane reference.
16892
16893  // Check the operand and see if it originates from a lane extract.
16894  SDValue Op1 = N->getOperand(1);
16895  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16896    return SDValue();
16897
16898  // Yep, no additional predication needed. Perform the transform.
16899  SDValue IID = N->getOperand(0);
16900  SDValue Shift = N->getOperand(2);
16901  SDValue Vec = Op1.getOperand(0);
16902  SDValue Lane = Op1.getOperand(1);
16903  EVT ResTy = N->getValueType(0);
16904  EVT VecResTy;
16905  SDLoc DL(N);
16906
16907  // The vector width should be 128 bits by the time we get here, even
16908  // if it started as 64 bits (the extract_vector handling will have
16909  // done so). Bail if it is not.
16910  if (Vec.getValueSizeInBits() != 128)
16911    return SDValue();
16912
16913  if (Vec.getValueType() == MVT::v4i32)
16914    VecResTy = MVT::v4f32;
16915  else if (Vec.getValueType() == MVT::v2i64)
16916    VecResTy = MVT::v2f64;
16917  else
16918    return SDValue();
16919
16920  SDValue Convert =
16921      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
16922  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
16923}
16924
16925// AArch64 high-vector "long" operations are formed by performing the non-high
16926// version on an extract_subvector of each operand which gets the high half:
16927//
16928//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
16929//
16930// However, there are cases which don't have an extract_high explicitly, but
16931// have another operation that can be made compatible with one for free. For
16932// example:
16933//
16934//  (dupv64 scalar) --> (extract_high (dup128 scalar))
16935//
16936// This routine does the actual conversion of such DUPs, once outer routines
16937// have determined that everything else is in order.
16938// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
16939// similarly here.
16940static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
16941  MVT VT = N.getSimpleValueType();
16942  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16943      N.getConstantOperandVal(1) == 0)
16944    N = N.getOperand(0);
16945
16946  switch (N.getOpcode()) {
16947  case AArch64ISD::DUP:
16948  case AArch64ISD::DUPLANE8:
16949  case AArch64ISD::DUPLANE16:
16950  case AArch64ISD::DUPLANE32:
16951  case AArch64ISD::DUPLANE64:
16952  case AArch64ISD::MOVI:
16953  case AArch64ISD::MOVIshift:
16954  case AArch64ISD::MOVIedit:
16955  case AArch64ISD::MOVImsl:
16956  case AArch64ISD::MVNIshift:
16957  case AArch64ISD::MVNImsl:
16958    break;
16959  default:
    // FMOV could be supported, but isn't very useful, as it would only occur
    // if you passed a bitcast'd floating-point immediate to an eligible long
    // integer op (addl, smull, ...).
16963    return SDValue();
16964  }
16965
16966  if (!VT.is64BitVector())
16967    return SDValue();
16968
16969  SDLoc DL(N);
16970  unsigned NumElems = VT.getVectorNumElements();
16971  if (N.getValueType().is64BitVector()) {
16972    MVT ElementTy = VT.getVectorElementType();
16973    MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
16974    N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
16975  }
16976
16977  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
16978                     DAG.getConstant(NumElems, DL, MVT::i64));
16979}
16980
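// Returns true if N is (possibly a bitcast of) an EXTRACT_SUBVECTOR that takes
// the high half of a fixed-width vector, i.e. its index equals half the source
// vector's element count.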
16981static bool isEssentiallyExtractHighSubvector(SDValue N) {
16982  if (N.getOpcode() == ISD::BITCAST)
16983    N = N.getOperand(0);
16984  if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
16985    return false;
16986  if (N.getOperand(0).getValueType().isScalableVector())
16987    return false;
16988  return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
16989         N.getOperand(0).getValueType().getVectorNumElements() / 2;
16990}
16991
16992/// Helper structure to keep track of ISD::SET_CC operands.
16993struct GenericSetCCInfo {
16994  const SDValue *Opnd0;
16995  const SDValue *Opnd1;
16996  ISD::CondCode CC;
16997};
16998
16999/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
17000struct AArch64SetCCInfo {
17001  const SDValue *Cmp;
17002  AArch64CC::CondCode CC;
17003};
17004
17005/// Helper structure to keep track of SetCC information.
17006union SetCCInfo {
17007  GenericSetCCInfo Generic;
17008  AArch64SetCCInfo AArch64;
17009};
17010
/// Helper structure to be able to read SetCC information. If the IsAArch64
/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
/// GenericSetCCInfo.
17014struct SetCCInfoAndKind {
17015  SetCCInfo Info;
17016  bool IsAArch64;
17017};
17018
/// Check whether or not \p Op is a SET_CC operation, either a generic or an
/// AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
17024/// \return True when Op is a kind of SET_CC operation.
17025static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
  // If this is a setcc, this is straightforward.
17027  if (Op.getOpcode() == ISD::SETCC) {
17028    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
17029    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
17030    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17031    SetCCInfo.IsAArch64 = false;
17032    return true;
17033  }
17034  // Otherwise, check if this is a matching csel instruction.
17035  // In other words:
17036  // - csel 1, 0, cc
17037  // - csel 0, 1, !cc
17038  if (Op.getOpcode() != AArch64ISD::CSEL)
17039    return false;
17040  // Set the information about the operands.
17041  // TODO: we want the operands of the Cmp not the csel
17042  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
17043  SetCCInfo.IsAArch64 = true;
17044  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
17045      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
17046
  // Check that the operands match the constraints:
17048  // (1) Both operands must be constants.
17049  // (2) One must be 1 and the other must be 0.
17050  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
17051  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
17052
17053  // Check (1).
17054  if (!TValue || !FValue)
17055    return false;
17056
17057  // Check (2).
17058  if (!TValue->isOne()) {
17059    // Update the comparison when we are interested in !cc.
17060    std::swap(TValue, FValue);
17061    SetCCInfo.Info.AArch64.CC =
17062        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
17063  }
17064  return TValue->isOne() && FValue->isZero();
17065}
17066
17067// Returns true if Op is setcc or zext of setcc.
17068static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
17069  if (isSetCC(Op, Info))
17070    return true;
17071  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
17072    isSetCC(Op->getOperand(0), Info));
17073}
17074
17075// The folding we want to perform is:
17076// (add x, [zext] (setcc cc ...) )
17077//   -->
17078// (csel x, (add x, 1), !cc ...)
17079//
17080// The latter will get matched to a CSINC instruction.
17081static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
17082  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
17083  SDValue LHS = Op->getOperand(0);
17084  SDValue RHS = Op->getOperand(1);
17085  SetCCInfoAndKind InfoAndKind;
17086
17087  // If both operands are a SET_CC, then we don't want to perform this
17088  // folding and create another csel as this results in more instructions
17089  // (and higher register usage).
17090  if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
17091      isSetCCOrZExtSetCC(RHS, InfoAndKind))
17092    return SDValue();
17093
17094  // If neither operand is a SET_CC, give up.
17095  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
17096    std::swap(LHS, RHS);
17097    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
17098      return SDValue();
17099  }
17100
  // FIXME: This could be generalized to work for FP comparisons.
17102  EVT CmpVT = InfoAndKind.IsAArch64
17103                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
17104                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
17105  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
17106    return SDValue();
17107
17108  SDValue CCVal;
17109  SDValue Cmp;
17110  SDLoc dl(Op);
17111  if (InfoAndKind.IsAArch64) {
17112    CCVal = DAG.getConstant(
17113        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
17114        MVT::i32);
17115    Cmp = *InfoAndKind.Info.AArch64.Cmp;
17116  } else
17117    Cmp = getAArch64Cmp(
17118        *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
17119        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
17120        dl);
17121
17122  EVT VT = Op->getValueType(0);
17123  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
17124  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
17125}
17126
17127// ADD(UADDV a, UADDV b) -->  UADDV(ADD a, b)
17128static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17129  EVT VT = N->getValueType(0);
17130  // Only scalar integer and vector types.
17131  if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
17132    return SDValue();
17133
17134  SDValue LHS = N->getOperand(0);
17135  SDValue RHS = N->getOperand(1);
17136  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17137      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
17138    return SDValue();
17139
17140  auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
17141  auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
17142  if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
17143    return SDValue();
17144
17145  SDValue Op1 = LHS->getOperand(0);
17146  SDValue Op2 = RHS->getOperand(0);
17147  EVT OpVT1 = Op1.getValueType();
17148  EVT OpVT2 = Op2.getValueType();
17149  if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
17150      Op2.getOpcode() != AArch64ISD::UADDV ||
17151      OpVT1.getVectorElementType() != VT)
17152    return SDValue();
17153
17154  SDValue Val1 = Op1.getOperand(0);
17155  SDValue Val2 = Op2.getOperand(0);
17156  EVT ValVT = Val1->getValueType(0);
17157  SDLoc DL(N);
17158  SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
17159  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17160                     DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
17161                     DAG.getConstant(0, DL, MVT::i64));
17162}
17163
17164/// Perform the scalar expression combine in the form of:
17165///   CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
17166///   CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
17167static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
17168  EVT VT = N->getValueType(0);
17169  if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
17170    return SDValue();
17171
17172  SDValue LHS = N->getOperand(0);
17173  SDValue RHS = N->getOperand(1);
17174
  // Handle commutativity.
17176  if (LHS.getOpcode() != AArch64ISD::CSEL &&
17177      LHS.getOpcode() != AArch64ISD::CSNEG) {
17178    std::swap(LHS, RHS);
17179    if (LHS.getOpcode() != AArch64ISD::CSEL &&
17180        LHS.getOpcode() != AArch64ISD::CSNEG) {
17181      return SDValue();
17182    }
17183  }
17184
17185  if (!LHS.hasOneUse())
17186    return SDValue();
17187
17188  AArch64CC::CondCode AArch64CC =
17189      static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
17190
  // The CSEL should include a constant one operand, and the CSNEG should
  // include a one or negative-one operand.
17193  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
17194  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
17195  if (!CTVal || !CFVal)
17196    return SDValue();
17197
17198  if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
17199        (CTVal->isOne() || CFVal->isOne())) &&
17200      !(LHS.getOpcode() == AArch64ISD::CSNEG &&
17201        (CTVal->isOne() || CFVal->isAllOnes())))
17202    return SDValue();
17203
17204  // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
17205  if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
17206      !CFVal->isOne()) {
17207    std::swap(CTVal, CFVal);
17208    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
17209  }
17210
17211  SDLoc DL(N);
17212  // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
17213  if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
17214      !CFVal->isAllOnes()) {
17215    APInt C = -1 * CFVal->getAPIntValue();
17216    CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
17217    CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
17218    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
17219  }
17220
  // The fold might be neutral for larger constants, as the immediate needs to
  // be materialized in a register.
17223  APInt ADDC = CTVal->getAPIntValue();
17224  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17225  if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
17226    return SDValue();
17227
17228  assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
17229          (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
17230         "Unexpected constant value");
17231
17232  SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
17233  SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
17234  SDValue Cmp = LHS.getOperand(3);
17235
17236  return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
17237}
17238
17239// ADD(UDOT(zero, x, y), A) -->  UDOT(A, x, y)
17240static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
17241  EVT VT = N->getValueType(0);
17242  if (N->getOpcode() != ISD::ADD)
17243    return SDValue();
17244
17245  SDValue Dot = N->getOperand(0);
17246  SDValue A = N->getOperand(1);
  // Handle commutativity.
17248  auto isZeroDot = [](SDValue Dot) {
17249    return (Dot.getOpcode() == AArch64ISD::UDOT ||
17250            Dot.getOpcode() == AArch64ISD::SDOT) &&
17251           isZerosVector(Dot.getOperand(0).getNode());
17252  };
17253  if (!isZeroDot(Dot))
17254    std::swap(Dot, A);
17255  if (!isZeroDot(Dot))
17256    return SDValue();
17257
17258  return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
17259                     Dot.getOperand(2));
17260}
17261
17262static bool isNegatedInteger(SDValue Op) {
17263  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
17264}
17265
17266static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
17267  SDLoc DL(Op);
17268  EVT VT = Op.getValueType();
17269  SDValue Zero = DAG.getConstant(0, DL, VT);
17270  return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
17271}
17272
17273// Try to fold
17274//
17275// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
17276//
17277// The folding helps csel to be matched with csneg without generating
17278// redundant neg instruction, which includes negation of the csel expansion
17279// of abs node lowered by lowerABS.
17280static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
17281  if (!isNegatedInteger(SDValue(N, 0)))
17282    return SDValue();
17283
17284  SDValue CSel = N->getOperand(1);
17285  if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
17286    return SDValue();
17287
17288  SDValue N0 = CSel.getOperand(0);
17289  SDValue N1 = CSel.getOperand(1);
17290
  // If neither of them is a negation, the fold is not worthwhile, as it would
  // introduce two additional negations while removing only one.
17293  if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
17294    return SDValue();
17295
17296  SDValue N0N = getNegatedInteger(N0, DAG);
17297  SDValue N1N = getNegatedInteger(N1, DAG);
17298
17299  SDLoc DL(N);
17300  EVT VT = CSel.getValueType();
17301  return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
17302                     CSel.getOperand(3));
17303}
17304
17305// The basic add/sub long vector instructions have variants with "2" on the end
17306// which act on the high-half of their inputs. They are normally matched by
17307// patterns like:
17308//
17309// (add (zeroext (extract_high LHS)),
17310//      (zeroext (extract_high RHS)))
17311// -> uaddl2 vD, vN, vM
17312//
17313// However, if one of the extracts is something like a duplicate, this
17314// instruction can still be used profitably. This function puts the DAG into a
17315// more appropriate form for those patterns to trigger.
17316static SDValue performAddSubLongCombine(SDNode *N,
17317                                        TargetLowering::DAGCombinerInfo &DCI,
17318                                        SelectionDAG &DAG) {
17319  if (DCI.isBeforeLegalizeOps())
17320    return SDValue();
17321
17322  MVT VT = N->getSimpleValueType(0);
17323  if (!VT.is128BitVector()) {
17324    if (N->getOpcode() == ISD::ADD)
17325      return performSetccAddFolding(N, DAG);
17326    return SDValue();
17327  }
17328
17329  // Make sure both branches are extended in the same way.
17330  SDValue LHS = N->getOperand(0);
17331  SDValue RHS = N->getOperand(1);
17332  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
17333       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
17334      LHS.getOpcode() != RHS.getOpcode())
17335    return SDValue();
17336
17337  unsigned ExtType = LHS.getOpcode();
17338
  // It's only worth doing this if at least one of the inputs is already an
  // extract, but we don't know which it'll be so we have to try both.
17341  if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
17342    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
17343    if (!RHS.getNode())
17344      return SDValue();
17345
17346    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
17347  } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
17348    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
17349    if (!LHS.getNode())
17350      return SDValue();
17351
17352    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
17353  }
17354
17355  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
17356}
17357
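// Returns true if Op is a SUBS whose value result (result 0) is unused, i.e.
// it is only consumed for its flags and therefore behaves as a compare.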
17358static bool isCMP(SDValue Op) {
17359  return Op.getOpcode() == AArch64ISD::SUBS &&
17360         !Op.getNode()->hasAnyUseOfValue(0);
17361}
17362
17363// (CSEL 1 0 CC Cond) => CC
17364// (CSEL 0 1 CC Cond) => !CC
17365static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
17366  if (Op.getOpcode() != AArch64ISD::CSEL)
17367    return std::nullopt;
17368  auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
17369  if (CC == AArch64CC::AL || CC == AArch64CC::NV)
17370    return std::nullopt;
17371  SDValue OpLHS = Op.getOperand(0);
17372  SDValue OpRHS = Op.getOperand(1);
17373  if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
17374    return CC;
17375  if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
17376    return getInvertedCondCode(CC);
17377
17378  return std::nullopt;
17379}
17380
17381// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
17382// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
17383static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
17384  SDValue CmpOp = Op->getOperand(2);
17385  if (!isCMP(CmpOp))
17386    return SDValue();
17387
17388  if (IsAdd) {
17389    if (!isOneConstant(CmpOp.getOperand(1)))
17390      return SDValue();
17391  } else {
17392    if (!isNullConstant(CmpOp.getOperand(0)))
17393      return SDValue();
17394  }
17395
17396  SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
17397  auto CC = getCSETCondCode(CsetOp);
17398  if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
17399    return SDValue();
17400
17401  return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
17402                     Op->getOperand(0), Op->getOperand(1),
17403                     CsetOp.getOperand(3));
17404}
17405
17406// (ADC x 0 cond) => (CINC x HS cond)
17407static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
17408  SDValue LHS = N->getOperand(0);
17409  SDValue RHS = N->getOperand(1);
17410  SDValue Cond = N->getOperand(2);
17411
17412  if (!isNullConstant(RHS))
17413    return SDValue();
17414
17415  EVT VT = N->getValueType(0);
17416  SDLoc DL(N);
17417
17418  // (CINC x cc cond) <=> (CSINC x x !cc cond)
17419  SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
17420  return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
17421}
17422
17423// Transform vector add(zext i8 to i32, zext i8 to i32)
17424//  into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
// extends.
17427static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
17428  EVT VT = N->getValueType(0);
17429  if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
17430      (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
17431       N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
17432      (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
17433       N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
17434      N->getOperand(0).getOperand(0).getValueType() !=
17435          N->getOperand(1).getOperand(0).getValueType())
17436    return SDValue();
17437
17438  SDValue N0 = N->getOperand(0).getOperand(0);
17439  SDValue N1 = N->getOperand(1).getOperand(0);
17440  EVT InVT = N0.getValueType();
17441
17442  EVT S1 = InVT.getScalarType();
17443  EVT S2 = VT.getScalarType();
17444  if ((S2 == MVT::i32 && S1 == MVT::i8) ||
17445      (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
17446    SDLoc DL(N);
17447    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17448                                  S2.getHalfSizedIntegerVT(*DAG.getContext()),
17449                                  VT.getVectorElementCount());
17450    SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
17451    SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
17452    SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
17453    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
17454  }
17455  return SDValue();
17456}
17457
17458static SDValue performBuildVectorCombine(SDNode *N,
17459                                         TargetLowering::DAGCombinerInfo &DCI,
17460                                         SelectionDAG &DAG) {
17461  SDLoc DL(N);
17462  EVT VT = N->getValueType(0);
17463
17464  // A build vector of two extracted elements is equivalent to an
17465  // extract subvector where the inner vector is any-extended to the
17466  // extract_vector_elt VT.
17467  //    (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
17468  //                  (extract_elt_iXX_to_i32 vec Idx+1))
17469  // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
17470
17471  // For now, only consider the v2i32 case, which arises as a result of
17472  // legalization.
17473  if (VT != MVT::v2i32)
17474    return SDValue();
17475
17476  SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
17477  // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
17478  if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
17479      Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
17480      // Constant index.
17481      isa<ConstantSDNode>(Elt0->getOperand(1)) &&
17482      isa<ConstantSDNode>(Elt1->getOperand(1)) &&
17483      // Both EXTRACT_VECTOR_ELT from same vector...
17484      Elt0->getOperand(0) == Elt1->getOperand(0) &&
17485      // ... and contiguous. First element's index +1 == second element's index.
17486      Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
17487      // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
17488      // ResultType's known minimum vector length.
17489      Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
17490    SDValue VecToExtend = Elt0->getOperand(0);
17491    EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
17492    if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
17493      return SDValue();
17494
17495    SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
17496
17497    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
17498    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
17499                       SubvectorIdx);
17500  }
17501
17502  return SDValue();
17503}
17504
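// Fold a truncate of a DUP into a DUP of the narrower vector type, truncating
// the scalar operand when needed, e.g.
//   (v2i32 (trunc (v2i64 (AArch64ISD::DUP (i64 X)))))
//     --> (v2i32 (AArch64ISD::DUP (i32 (trunc X))))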
17505static SDValue performTruncateCombine(SDNode *N,
17506                                      SelectionDAG &DAG) {
17507  EVT VT = N->getValueType(0);
17508  SDValue N0 = N->getOperand(0);
17509  if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
17510      N0.getOpcode() == AArch64ISD::DUP) {
17511    SDValue Op = N0.getOperand(0);
17512    if (VT.getScalarType() == MVT::i32 &&
17513        N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
17514      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
17515    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
17516  }
17517
17518  return SDValue();
17519}
17520
// Check whether a node is an extend or shift operand.
17522static bool isExtendOrShiftOperand(SDValue N) {
17523  unsigned Opcode = N.getOpcode();
17524  if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_INREG ||
17525      Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ANY_EXTEND) {
17526    EVT SrcVT;
17527    if (Opcode == ISD::SIGN_EXTEND_INREG)
17528      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
17529    else
17530      SrcVT = N.getOperand(0).getValueType();
17531
17532    return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
17533  } else if (Opcode == ISD::AND) {
17534    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
17535    if (!CSD)
17536      return false;
17537    uint64_t AndMask = CSD->getZExtValue();
17538    return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
17539  } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
17540    return isa<ConstantSDNode>(N.getOperand(1));
17541  }
17542
17543  return false;
17544}
17545
17546// (N - Y) + Z --> (Z - Y) + N
17547// when N is an extend or shift operand
17548static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
17549                                         SelectionDAG &DAG) {
17550  auto IsOneUseExtend = [](SDValue N) {
17551    return N.hasOneUse() && isExtendOrShiftOperand(N);
17552  };
17553
  // DAGCombiner will revert this combination when Z is a constant, causing an
  // infinite loop, so don't perform the combination when Z is a constant.
  // Similarly, if Z is a one-use extend or shift, the optimization would also
  // fall into an infinite loop, so bail out in that case too.
17558  if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
17559    return SDValue();
17560
17561  if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
17562    return SDValue();
17563
17564  SDValue Shift = SUB.getOperand(0);
17565  if (!IsOneUseExtend(Shift))
17566    return SDValue();
17567
17568  SDLoc DL(N);
17569  EVT VT = N->getValueType(0);
17570
17571  SDValue Y = SUB.getOperand(1);
17572  SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
17573  return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
17574}
17575
17576static SDValue performAddCombineForShiftedOperands(SDNode *N,
17577                                                   SelectionDAG &DAG) {
17578  // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
17579  // commutative.
17580  if (N->getOpcode() != ISD::ADD)
17581    return SDValue();
17582
17583  // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
17584  // shifted register is only available for i32 and i64.
17585  EVT VT = N->getValueType(0);
17586  if (VT != MVT::i32 && VT != MVT::i64)
17587    return SDValue();
17588
17589  SDLoc DL(N);
17590  SDValue LHS = N->getOperand(0);
17591  SDValue RHS = N->getOperand(1);
17592
17593  if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
17594    return Val;
17595  if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
17596    return Val;
17597
17598  uint64_t LHSImm = 0, RHSImm = 0;
  // If both operands are shifted by an immediate, and one operand's shift
  // amount is no greater than 4, swap LHS and RHS so that the operand with the
  // smaller shift amount ends up on the RHS.
  //
  // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
  // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
  // with LSL (shift > 4). For other processors, this change is a no-op for
  // both performance and correctness.
17607  if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
17608      isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
17609      RHSImm > 4 && LHS.hasOneUse())
17610    return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
17611
17612  return SDValue();
17613}
17614
17615static SDValue performAddSubCombine(SDNode *N,
17616                                    TargetLowering::DAGCombinerInfo &DCI,
17617                                    SelectionDAG &DAG) {
17618  // Try to change sum of two reductions.
17619  if (SDValue Val = performAddUADDVCombine(N, DAG))
17620    return Val;
17621  if (SDValue Val = performAddDotCombine(N, DAG))
17622    return Val;
17623  if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
17624    return Val;
17625  if (SDValue Val = performNegCSelCombine(N, DAG))
17626    return Val;
17627  if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
17628    return Val;
17629  if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
17630    return Val;
17631
17632  return performAddSubLongCombine(N, DCI, DAG);
17633}
17634
17635// Massage DAGs which we can use the high-half "long" operations on into
17636// something isel will recognize better. E.g.
17637//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
//   (aarch64_neon_umull (extract_high (v2i64 vec))
//                       (extract_high (v2i64 (dup128 scalar))))
17641//
17642static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
17643                                       TargetLowering::DAGCombinerInfo &DCI,
17644                                       SelectionDAG &DAG) {
17645  if (DCI.isBeforeLegalizeOps())
17646    return SDValue();
17647
17648  SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
17649  SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
17650  assert(LHS.getValueType().is64BitVector() &&
17651         RHS.getValueType().is64BitVector() &&
17652         "unexpected shape for long operation");
17653
17654  // Either node could be a DUP, but it's not worth doing both of them (you'd
17655  // just as well use the non-high version) so look for a corresponding extract
17656  // operation on the other "wing".
17657  if (isEssentiallyExtractHighSubvector(LHS)) {
17658    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
17659    if (!RHS.getNode())
17660      return SDValue();
17661  } else if (isEssentiallyExtractHighSubvector(RHS)) {
17662    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
17663    if (!LHS.getNode())
17664      return SDValue();
17665  }
17666
17667  if (IID == Intrinsic::not_intrinsic)
17668    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
17669
17670  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
17671                     N->getOperand(0), LHS, RHS);
17672}
17673
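// Try to convert a NEON shift intrinsic with a constant (splat) shift amount
// into the equivalent target-specific immediate-shift node, e.g.
//   (int_aarch64_neon_sqshl X, (splat C)) --> (AArch64ISD::SQSHL_I X, C)
// The rounding right shifts expect a negative amount (a right shift), while
// sshl/ushl with a non-negative amount become a plain VSHL.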
17674static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
17675  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
17676  unsigned ElemBits = ElemTy.getSizeInBits();
17677
17678  int64_t ShiftAmount;
17679  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
17680    APInt SplatValue, SplatUndef;
17681    unsigned SplatBitSize;
17682    bool HasAnyUndefs;
17683    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
17684                              HasAnyUndefs, ElemBits) ||
17685        SplatBitSize != ElemBits)
17686      return SDValue();
17687
17688    ShiftAmount = SplatValue.getSExtValue();
17689  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17690    ShiftAmount = CVN->getSExtValue();
17691  } else
17692    return SDValue();
17693
17694  unsigned Opcode;
17695  bool IsRightShift;
17696  switch (IID) {
17697  default:
17698    llvm_unreachable("Unknown shift intrinsic");
17699  case Intrinsic::aarch64_neon_sqshl:
17700    Opcode = AArch64ISD::SQSHL_I;
17701    IsRightShift = false;
17702    break;
17703  case Intrinsic::aarch64_neon_uqshl:
17704    Opcode = AArch64ISD::UQSHL_I;
17705    IsRightShift = false;
17706    break;
17707  case Intrinsic::aarch64_neon_srshl:
17708    Opcode = AArch64ISD::SRSHR_I;
17709    IsRightShift = true;
17710    break;
17711  case Intrinsic::aarch64_neon_urshl:
17712    Opcode = AArch64ISD::URSHR_I;
17713    IsRightShift = true;
17714    break;
17715  case Intrinsic::aarch64_neon_sqshlu:
17716    Opcode = AArch64ISD::SQSHLU_I;
17717    IsRightShift = false;
17718    break;
17719  case Intrinsic::aarch64_neon_sshl:
17720  case Intrinsic::aarch64_neon_ushl:
    // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
    // left shift for positive shift amounts. Below, we only replace the current
    // node with VSHL if this condition is met.
17724    Opcode = AArch64ISD::VSHL;
17725    IsRightShift = false;
17726    break;
17727  }
17728
17729  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
17730    SDLoc dl(N);
17731    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
17732                       DAG.getConstant(-ShiftAmount, dl, MVT::i32));
17733  } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
17734    SDLoc dl(N);
17735    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
17736                       DAG.getConstant(ShiftAmount, dl, MVT::i32));
17737  }
17738
17739  return SDValue();
17740}
17741
17742// The CRC32[BH] instructions ignore the high bits of their data operand. Since
17743// the intrinsics must be legal and take an i32, this means there's almost
17744// certainly going to be a zext in the DAG which we can eliminate.
17745static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
17746  SDValue AndN = N->getOperand(2);
17747  if (AndN.getOpcode() != ISD::AND)
17748    return SDValue();
17749
17750  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
17751  if (!CMask || CMask->getZExtValue() != Mask)
17752    return SDValue();
17753
17754  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
17755                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
17756}
17757
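// Lower an across-lanes reduction intrinsic to its target-specific node and
// extract lane 0 of the wide result, e.g.
//   (i32 (int_aarch64_neon_uaddv V)) --> (extract_elt (AArch64ISD::UADDV V), 0)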
17758static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
17759                                           SelectionDAG &DAG) {
17760  SDLoc dl(N);
17761  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
17762                     DAG.getNode(Opc, dl,
17763                                 N->getOperand(1).getSimpleValueType(),
17764                                 N->getOperand(1)),
17765                     DAG.getConstant(0, dl, MVT::i64));
17766}
17767
17768static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
17769  SDLoc DL(N);
17770  SDValue Op1 = N->getOperand(1);
17771  SDValue Op2 = N->getOperand(2);
17772  EVT ScalarTy = Op2.getValueType();
17773  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
17774    ScalarTy = MVT::i32;
17775
  // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
17777  SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
17778  SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
17779  SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
17780  SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
17781  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
17782}
17783
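// Lower aarch64_sve_dup(passthru, pred, scalar) to DUP_MERGE_PASSTHRU,
// any-extending i8/i16 scalars to i32 first.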
17784static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
17785  SDLoc dl(N);
17786  SDValue Scalar = N->getOperand(3);
17787  EVT ScalarTy = Scalar.getValueType();
17788
17789  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
17790    Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
17791
17792  SDValue Passthru = N->getOperand(1);
17793  SDValue Pred = N->getOperand(2);
17794  return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
17795                     Pred, Scalar, Passthru);
17796}
17797
17798static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
17799  SDLoc dl(N);
17800  LLVMContext &Ctx = *DAG.getContext();
17801  EVT VT = N->getValueType(0);
17802
17803  assert(VT.isScalableVector() && "Expected a scalable vector.");
17804
17805  // Current lowering only supports the SVE-ACLE types.
17806  if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
17807    return SDValue();
17808
17809  unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
17810  unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
17811  EVT ByteVT =
17812      EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
17813
  // Convert everything to the domain of EXT (i.e. bytes).
17815  SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
17816  SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
17817  SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
17818                            DAG.getConstant(ElemSize, dl, MVT::i32));
17819
17820  SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
17821  return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
17822}
17823
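// If the comparator of an SVE wide-compare intrinsic is a splatted constant
// that fits the instruction's immediate range (-16..15 for the signed forms,
// 0..127 for the unsigned forms), emit a SETCC_MERGE_ZERO against a splat of
// that immediate instead.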
17824static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
17825                                        TargetLowering::DAGCombinerInfo &DCI,
17826                                        SelectionDAG &DAG) {
17827  if (DCI.isBeforeLegalize())
17828    return SDValue();
17829
17830  SDValue Comparator = N->getOperand(3);
17831  if (Comparator.getOpcode() == AArch64ISD::DUP ||
17832      Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
17833    unsigned IID = getIntrinsicID(N);
17834    EVT VT = N->getValueType(0);
17835    EVT CmpVT = N->getOperand(2).getValueType();
17836    SDValue Pred = N->getOperand(1);
17837    SDValue Imm;
17838    SDLoc DL(N);
17839
17840    switch (IID) {
17841    default:
17842      llvm_unreachable("Called with wrong intrinsic!");
17843      break;
17844
17845    // Signed comparisons
17846    case Intrinsic::aarch64_sve_cmpeq_wide:
17847    case Intrinsic::aarch64_sve_cmpne_wide:
17848    case Intrinsic::aarch64_sve_cmpge_wide:
17849    case Intrinsic::aarch64_sve_cmpgt_wide:
17850    case Intrinsic::aarch64_sve_cmplt_wide:
17851    case Intrinsic::aarch64_sve_cmple_wide: {
17852      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
17853        int64_t ImmVal = CN->getSExtValue();
17854        if (ImmVal >= -16 && ImmVal <= 15)
17855          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
17856        else
17857          return SDValue();
17858      }
17859      break;
17860    }
17861    // Unsigned comparisons
17862    case Intrinsic::aarch64_sve_cmphs_wide:
17863    case Intrinsic::aarch64_sve_cmphi_wide:
17864    case Intrinsic::aarch64_sve_cmplo_wide:
17865    case Intrinsic::aarch64_sve_cmpls_wide:  {
17866      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
17867        uint64_t ImmVal = CN->getZExtValue();
17868        if (ImmVal <= 127)
17869          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
17870        else
17871          return SDValue();
17872      }
17873      break;
17874    }
17875    }
17876
17877    if (!Imm)
17878      return SDValue();
17879
17880    SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
17881    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
17882                       N->getOperand(2), Splat, DAG.getCondCode(CC));
17883  }
17884
17885  return SDValue();
17886}
17887
17888static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
17889                        AArch64CC::CondCode Cond) {
17890  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17891
17892  SDLoc DL(Op);
17893  assert(Op.getValueType().isScalableVector() &&
17894         TLI.isTypeLegal(Op.getValueType()) &&
17895         "Expected legal scalable vector type!");
17896  assert(Op.getValueType() == Pg.getValueType() &&
17897         "Expected same type for PTEST operands");
17898
17899  // Ensure target specific opcodes are using legal type.
17900  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
17901  SDValue TVal = DAG.getConstant(1, DL, OutVT);
17902  SDValue FVal = DAG.getConstant(0, DL, OutVT);
17903
17904  // Ensure operands have type nxv16i1.
17905  if (Op.getValueType() != MVT::nxv16i1) {
17906    if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
17907        isZeroingInactiveLanes(Op))
17908      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
17909    else
17910      Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
17911    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
17912  }
17913
17914  // Set condition code (CC) flags.
17915  SDValue Test = DAG.getNode(
17916      Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
17917      DL, MVT::Other, Pg, Op);
17918
17919  // Convert CC to integer based on requested condition.
17920  // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
17921  SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
17922  SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
17923  return DAG.getZExtOrTrunc(Res, DL, VT);
17924}
17925
17926static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
17927                                      SelectionDAG &DAG) {
17928  SDLoc DL(N);
17929
17930  SDValue Pred = N->getOperand(1);
17931  SDValue VecToReduce = N->getOperand(2);
17932
17933  // NOTE: The integer reduction's result type is not always linked to the
17934  // operand's element type so we construct it from the intrinsic's result type.
17935  EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
17936  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
17937
17938  // SVE reductions set the whole vector register with the first element
17939  // containing the reduction result, which we'll now extract.
17940  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17941  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
17942                     Zero);
17943}
17944
17945static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
17946                                     SelectionDAG &DAG) {
17947  SDLoc DL(N);
17948
17949  SDValue Pred = N->getOperand(1);
17950  SDValue VecToReduce = N->getOperand(2);
17951
17952  EVT ReduceVT = VecToReduce.getValueType();
17953  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
17954
17955  // SVE reductions set the whole vector register with the first element
17956  // containing the reduction result, which we'll now extract.
17957  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17958  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
17959                     Zero);
17960}
17961
17962static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
17963                                            SelectionDAG &DAG) {
17964  SDLoc DL(N);
17965
17966  SDValue Pred = N->getOperand(1);
17967  SDValue InitVal = N->getOperand(2);
17968  SDValue VecToReduce = N->getOperand(3);
17969  EVT ReduceVT = VecToReduce.getValueType();
17970
17971  // Ordered reductions use the first lane of the result vector as the
17972  // reduction's initial value.
17973  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17974  InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
17975                        DAG.getUNDEF(ReduceVT), InitVal, Zero);
17976
17977  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
17978
17979  // SVE reductions set the whole vector register with the first element
17980  // containing the reduction result, which we'll now extract.
17981  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
17982                     Zero);
17983}
17984
// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxed register reuse requirements.
17988static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
17989                                       SelectionDAG &DAG, bool UnpredOp = false,
17990                                       bool SwapOperands = false) {
17991  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
17992  assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
17993  SDValue Pg = N->getOperand(1);
17994  SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
17995  SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
17996
17997  // ISD way to specify an all active predicate.
17998  if (isAllActivePredicate(DAG, Pg)) {
17999    if (UnpredOp)
18000      return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
18001
18002    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
18003  }
18004
18005  // FUTURE: SplatVector(true)
18006  return SDValue();
18007}
18008
18009static SDValue performIntrinsicCombine(SDNode *N,
18010                                       TargetLowering::DAGCombinerInfo &DCI,
18011                                       const AArch64Subtarget *Subtarget) {
18012  SelectionDAG &DAG = DCI.DAG;
18013  unsigned IID = getIntrinsicID(N);
18014  switch (IID) {
18015  default:
18016    break;
18017  case Intrinsic::get_active_lane_mask: {
18018    SDValue Res = SDValue();
18019    EVT VT = N->getValueType(0);
18020    if (VT.isFixedLengthVector()) {
18021      // We can use the SVE whilelo instruction to lower this intrinsic by
18022      // creating the appropriate sequence of scalable vector operations and
18023      // then extracting a fixed-width subvector from the scalable vector.
18024
18025      SDLoc DL(N);
18026      SDValue ID =
18027          DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
18028
18029      EVT WhileVT = EVT::getVectorVT(
18030          *DAG.getContext(), MVT::i1,
18031          ElementCount::getScalable(VT.getVectorNumElements()));
18032
18033      // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
18034      EVT PromVT = getPromotedVTForPredicate(WhileVT);
18035
18036      // Get the fixed-width equivalent of PromVT for extraction.
18037      EVT ExtVT =
18038          EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
18039                           VT.getVectorElementCount());
18040
18041      Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
18042                        N->getOperand(1), N->getOperand(2));
18043      Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
18044      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
18045                        DAG.getConstant(0, DL, MVT::i64));
18046      Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
18047    }
18048    return Res;
18049  }
18050  case Intrinsic::aarch64_neon_vcvtfxs2fp:
18051  case Intrinsic::aarch64_neon_vcvtfxu2fp:
18052    return tryCombineFixedPointConvert(N, DCI, DAG);
18053  case Intrinsic::aarch64_neon_saddv:
18054    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
18055  case Intrinsic::aarch64_neon_uaddv:
18056    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
18057  case Intrinsic::aarch64_neon_sminv:
18058    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
18059  case Intrinsic::aarch64_neon_uminv:
18060    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
18061  case Intrinsic::aarch64_neon_smaxv:
18062    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
18063  case Intrinsic::aarch64_neon_umaxv:
18064    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
18065  case Intrinsic::aarch64_neon_fmax:
18066    return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
18067                       N->getOperand(1), N->getOperand(2));
18068  case Intrinsic::aarch64_neon_fmin:
18069    return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
18070                       N->getOperand(1), N->getOperand(2));
18071  case Intrinsic::aarch64_neon_fmaxnm:
18072    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
18073                       N->getOperand(1), N->getOperand(2));
18074  case Intrinsic::aarch64_neon_fminnm:
18075    return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
18076                       N->getOperand(1), N->getOperand(2));
18077  case Intrinsic::aarch64_neon_smull:
18078    return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
18079                       N->getOperand(1), N->getOperand(2));
18080  case Intrinsic::aarch64_neon_umull:
18081    return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
18082                       N->getOperand(1), N->getOperand(2));
18083  case Intrinsic::aarch64_neon_pmull:
18084    return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
18085                       N->getOperand(1), N->getOperand(2));
18086  case Intrinsic::aarch64_neon_sqdmull:
18087    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
18088  case Intrinsic::aarch64_neon_sqshl:
18089  case Intrinsic::aarch64_neon_uqshl:
18090  case Intrinsic::aarch64_neon_sqshlu:
18091  case Intrinsic::aarch64_neon_srshl:
18092  case Intrinsic::aarch64_neon_urshl:
18093  case Intrinsic::aarch64_neon_sshl:
18094  case Intrinsic::aarch64_neon_ushl:
18095    return tryCombineShiftImm(IID, N, DAG);
18096  case Intrinsic::aarch64_neon_rshrn: {
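    // Lower the rounding narrowing shift right as
    //   trunc((X + (1 << (C - 1))) >> C).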
18097    EVT VT = N->getOperand(1).getValueType();
18098    SDLoc DL(N);
18099    SDValue Imm =
18100        DAG.getConstant(1LLU << (N->getConstantOperandVal(2) - 1), DL, VT);
18101    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Imm);
18102    SDValue Sht =
18103        DAG.getNode(ISD::SRL, DL, VT, Add,
18104                    DAG.getConstant(N->getConstantOperandVal(2), DL, VT));
18105    return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Sht);
18106  }
18107  case Intrinsic::aarch64_crc32b:
18108  case Intrinsic::aarch64_crc32cb:
18109    return tryCombineCRC32(0xff, N, DAG);
18110  case Intrinsic::aarch64_crc32h:
18111  case Intrinsic::aarch64_crc32ch:
18112    return tryCombineCRC32(0xffff, N, DAG);
18113  case Intrinsic::aarch64_sve_saddv:
18114    // There is no i64 version of SADDV because the sign is irrelevant.
18115    if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
18116      return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
18117    else
18118      return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
18119  case Intrinsic::aarch64_sve_uaddv:
18120    return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
18121  case Intrinsic::aarch64_sve_smaxv:
18122    return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
18123  case Intrinsic::aarch64_sve_umaxv:
18124    return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
18125  case Intrinsic::aarch64_sve_sminv:
18126    return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
18127  case Intrinsic::aarch64_sve_uminv:
18128    return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
18129  case Intrinsic::aarch64_sve_orv:
18130    return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
18131  case Intrinsic::aarch64_sve_eorv:
18132    return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
18133  case Intrinsic::aarch64_sve_andv:
18134    return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
18135  case Intrinsic::aarch64_sve_index:
18136    return LowerSVEIntrinsicIndex(N, DAG);
18137  case Intrinsic::aarch64_sve_dup:
18138    return LowerSVEIntrinsicDUP(N, DAG);
18139  case Intrinsic::aarch64_sve_dup_x:
18140    return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
18141                       N->getOperand(1));
18142  case Intrinsic::aarch64_sve_ext:
18143    return LowerSVEIntrinsicEXT(N, DAG);
18144  case Intrinsic::aarch64_sve_mul:
18145    return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
18146  case Intrinsic::aarch64_sve_mul_u:
18147    return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
18148                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18149  case Intrinsic::aarch64_sve_smulh:
18150    return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
18151  case Intrinsic::aarch64_sve_smulh_u:
18152    return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
18153                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18154  case Intrinsic::aarch64_sve_umulh:
18155    return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
18156  case Intrinsic::aarch64_sve_umulh_u:
18157    return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
18158                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18159  case Intrinsic::aarch64_sve_smin:
18160    return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
18161  case Intrinsic::aarch64_sve_smin_u:
18162    return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
18163                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18164  case Intrinsic::aarch64_sve_umin:
18165    return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
18166  case Intrinsic::aarch64_sve_umin_u:
18167    return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
18168                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18169  case Intrinsic::aarch64_sve_smax:
18170    return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
18171  case Intrinsic::aarch64_sve_smax_u:
18172    return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
18173                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18174  case Intrinsic::aarch64_sve_umax:
18175    return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
18176  case Intrinsic::aarch64_sve_umax_u:
18177    return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
18178                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18179  case Intrinsic::aarch64_sve_lsl:
18180    return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
18181  case Intrinsic::aarch64_sve_lsl_u:
18182    return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
18183                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18184  case Intrinsic::aarch64_sve_lsr:
18185    return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
18186  case Intrinsic::aarch64_sve_lsr_u:
18187    return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
18188                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18189  case Intrinsic::aarch64_sve_asr:
18190    return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
18191  case Intrinsic::aarch64_sve_asr_u:
18192    return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
18193                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18194  case Intrinsic::aarch64_sve_fadd:
18195    return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
18196  case Intrinsic::aarch64_sve_fsub:
18197    return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
18198  case Intrinsic::aarch64_sve_fmul:
18199    return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
18200  case Intrinsic::aarch64_sve_add:
18201    return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
18202  case Intrinsic::aarch64_sve_add_u:
18203    return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
18204                       N->getOperand(3));
18205  case Intrinsic::aarch64_sve_sub:
18206    return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
18207  case Intrinsic::aarch64_sve_sub_u:
18208    return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
18209                       N->getOperand(3));
18210  case Intrinsic::aarch64_sve_subr:
18211    return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
18212  case Intrinsic::aarch64_sve_and:
18213    return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
18214  case Intrinsic::aarch64_sve_bic:
18215    return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
18216  case Intrinsic::aarch64_sve_eor:
18217    return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
18218  case Intrinsic::aarch64_sve_orr:
18219    return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
18220  case Intrinsic::aarch64_sve_sabd:
18221    return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true);
18222  case Intrinsic::aarch64_sve_sabd_u:
18223    return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
18224                       N->getOperand(2), N->getOperand(3));
18225  case Intrinsic::aarch64_sve_uabd:
18226    return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true);
18227  case Intrinsic::aarch64_sve_uabd_u:
18228    return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
18229                       N->getOperand(2), N->getOperand(3));
18230  case Intrinsic::aarch64_sve_sdiv_u:
18231    return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
18232                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18233  case Intrinsic::aarch64_sve_udiv_u:
18234    return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
18235                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18236  case Intrinsic::aarch64_sve_sqadd:
18237    return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
18238  case Intrinsic::aarch64_sve_sqsub:
18239    return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
18240  case Intrinsic::aarch64_sve_uqadd:
18241    return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
18242  case Intrinsic::aarch64_sve_uqsub:
18243    return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
18244  case Intrinsic::aarch64_sve_sqadd_x:
18245    return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
18246                       N->getOperand(1), N->getOperand(2));
18247  case Intrinsic::aarch64_sve_sqsub_x:
18248    return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
18249                       N->getOperand(1), N->getOperand(2));
18250  case Intrinsic::aarch64_sve_uqadd_x:
18251    return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
18252                       N->getOperand(1), N->getOperand(2));
18253  case Intrinsic::aarch64_sve_uqsub_x:
18254    return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
18255                       N->getOperand(1), N->getOperand(2));
18256  case Intrinsic::aarch64_sve_asrd:
18257    return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
18258                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18259  case Intrinsic::aarch64_sve_cmphs:
18260    if (!N->getOperand(2).getValueType().isFloatingPoint())
18261      return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
18262                         N->getValueType(0), N->getOperand(1), N->getOperand(2),
18263                         N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
18264    break;
18265  case Intrinsic::aarch64_sve_cmphi:
18266    if (!N->getOperand(2).getValueType().isFloatingPoint())
18267      return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
18268                         N->getValueType(0), N->getOperand(1), N->getOperand(2),
18269                         N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
18270    break;
18271  case Intrinsic::aarch64_sve_fcmpge:
18272  case Intrinsic::aarch64_sve_cmpge:
18273    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
18274                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
18275                       N->getOperand(3), DAG.getCondCode(ISD::SETGE));
18277  case Intrinsic::aarch64_sve_fcmpgt:
18278  case Intrinsic::aarch64_sve_cmpgt:
18279    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
18280                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
18281                       N->getOperand(3), DAG.getCondCode(ISD::SETGT));
18283  case Intrinsic::aarch64_sve_fcmpeq:
18284  case Intrinsic::aarch64_sve_cmpeq:
18285    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
18286                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
18287                       N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
18289  case Intrinsic::aarch64_sve_fcmpne:
18290  case Intrinsic::aarch64_sve_cmpne:
18291    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
18292                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
18293                       N->getOperand(3), DAG.getCondCode(ISD::SETNE));
18295  case Intrinsic::aarch64_sve_fcmpuo:
18296    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
18297                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
18298                       N->getOperand(3), DAG.getCondCode(ISD::SETUO));
18300  case Intrinsic::aarch64_sve_fadda:
18301    return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
18302  case Intrinsic::aarch64_sve_faddv:
18303    return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
18304  case Intrinsic::aarch64_sve_fmaxnmv:
18305    return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
18306  case Intrinsic::aarch64_sve_fmaxv:
18307    return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
18308  case Intrinsic::aarch64_sve_fminnmv:
18309    return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
18310  case Intrinsic::aarch64_sve_fminv:
18311    return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
18312  case Intrinsic::aarch64_sve_sel:
18313    return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
18314                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
18315  case Intrinsic::aarch64_sve_cmpeq_wide:
18316    return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
18317  case Intrinsic::aarch64_sve_cmpne_wide:
18318    return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
18319  case Intrinsic::aarch64_sve_cmpge_wide:
18320    return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
18321  case Intrinsic::aarch64_sve_cmpgt_wide:
18322    return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
18323  case Intrinsic::aarch64_sve_cmplt_wide:
18324    return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
18325  case Intrinsic::aarch64_sve_cmple_wide:
18326    return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
18327  case Intrinsic::aarch64_sve_cmphs_wide:
18328    return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
18329  case Intrinsic::aarch64_sve_cmphi_wide:
18330    return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
18331  case Intrinsic::aarch64_sve_cmplo_wide:
18332    return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
18333  case Intrinsic::aarch64_sve_cmpls_wide:
18334    return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
18335  case Intrinsic::aarch64_sve_ptest_any:
18336    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
18337                    AArch64CC::ANY_ACTIVE);
18338  case Intrinsic::aarch64_sve_ptest_first:
18339    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
18340                    AArch64CC::FIRST_ACTIVE);
18341  case Intrinsic::aarch64_sve_ptest_last:
18342    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
18343                    AArch64CC::LAST_ACTIVE);
18344  }
18345  return SDValue();
18346}
18347
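// Returns true when extending \p N is expected to be cheap: (masked) loads
// can be turned into extending loads, and an all-zeros splat is unchanged by
// extension.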
18348static bool isCheapToExtend(const SDValue &N) {
18349  unsigned OC = N->getOpcode();
18350  return OC == ISD::LOAD || OC == ISD::MLOAD ||
18351         ISD::isConstantSplatVectorAllZeros(N.getNode());
18352}
18353
18354static SDValue
18355performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18356                              SelectionDAG &DAG) {
18357  // If we have (sext (setcc A B)) and A and B are cheap to extend,
18358  // we can move the sext into the arguments and have the same result. For
18359  // example, if A and B are both loads, we can make those extending loads and
18360  // avoid an extra instruction. This pattern appears often in VLS code
18361  // generation where the inputs to the setcc have a different size to the
18362  // instruction that wants to use the result of the setcc.
18363  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
18364         N->getOperand(0)->getOpcode() == ISD::SETCC);
18365  const SDValue SetCC = N->getOperand(0);
18366
18367  const SDValue CCOp0 = SetCC.getOperand(0);
18368  const SDValue CCOp1 = SetCC.getOperand(1);
18369  if (!CCOp0->getValueType(0).isInteger() ||
18370      !CCOp1->getValueType(0).isInteger())
18371    return SDValue();
18372
18373  ISD::CondCode Code =
18374      cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
18375
18376  ISD::NodeType ExtType =
18377      isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18378
18379  if (isCheapToExtend(SetCC.getOperand(0)) &&
18380      isCheapToExtend(SetCC.getOperand(1))) {
18381    const SDValue Ext1 =
18382        DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
18383    const SDValue Ext2 =
18384        DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
18385
18386    return DAG.getSetCC(
18387        SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
18388        cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
18389  }
18390
18391  return SDValue();
18392}
18393
18394static SDValue performExtendCombine(SDNode *N,
18395                                    TargetLowering::DAGCombinerInfo &DCI,
18396                                    SelectionDAG &DAG) {
18397  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
18398  // we can convert that DUP into another extract_high (of a bigger DUP), which
18399  // helps the backend to decide that an sabdl2 would be useful, saving a real
18400  // extract_high operation.
18401  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
18402      (N->getOperand(0).getOpcode() == ISD::ABDU ||
18403       N->getOperand(0).getOpcode() == ISD::ABDS)) {
18404    SDNode *ABDNode = N->getOperand(0).getNode();
18405    SDValue NewABD =
18406        tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
18407    if (!NewABD.getNode())
18408      return SDValue();
18409
18410    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
18411  }
18412
18413  if (N->getValueType(0).isFixedLengthVector() &&
18414      N->getOpcode() == ISD::SIGN_EXTEND &&
18415      N->getOperand(0)->getOpcode() == ISD::SETCC)
18416    return performSignExtendSetCCCombine(N, DCI, DAG);
18417
18418  return SDValue();
18419}
18420
18421static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
18422                               SDValue SplatVal, unsigned NumVecElts) {
18423  assert(!St.isTruncatingStore() && "cannot split truncating vector store");
18424  Align OrigAlignment = St.getAlign();
18425  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
18426
18427  // Create scalar stores. This is at least as good as the code sequence for a
18428  // split unaligned store which is a dup.s, ext.b, and two stores.
18429  // Most of the time the three stores should be replaced by store pair
18430  // instructions (stp).
18431  SDLoc DL(&St);
18432  SDValue BasePtr = St.getBasePtr();
18433  uint64_t BaseOffset = 0;
18434
18435  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
18436  SDValue NewST1 =
18437      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
18438                   OrigAlignment, St.getMemOperand()->getFlags());
18439
  // As this is in ISel, we will not merge this add, which may degrade results.
18441  if (BasePtr->getOpcode() == ISD::ADD &&
18442      isa<ConstantSDNode>(BasePtr->getOperand(1))) {
18443    BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
18444    BasePtr = BasePtr->getOperand(0);
18445  }
18446
18447  unsigned Offset = EltOffset;
18448  while (--NumVecElts) {
18449    Align Alignment = commonAlignment(OrigAlignment, Offset);
18450    SDValue OffsetPtr =
18451        DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
18452                    DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
18453    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
18454                          PtrInfo.getWithOffset(Offset), Alignment,
18455                          St.getMemOperand()->getFlags());
18456    Offset += EltOffset;
18457  }
18458  return NewST1;
18459}
18460
18461// Returns an SVE type that ContentTy can be trivially sign or zero extended
18462// into.
18463static MVT getSVEContainerType(EVT ContentTy) {
18464  assert(ContentTy.isSimple() && "No SVE containers for extended types");
18465
18466  switch (ContentTy.getSimpleVT().SimpleTy) {
18467  default:
18468    llvm_unreachable("No known SVE container for this MVT type");
18469  case MVT::nxv2i8:
18470  case MVT::nxv2i16:
18471  case MVT::nxv2i32:
18472  case MVT::nxv2i64:
18473  case MVT::nxv2f32:
18474  case MVT::nxv2f64:
18475    return MVT::nxv2i64;
18476  case MVT::nxv4i8:
18477  case MVT::nxv4i16:
18478  case MVT::nxv4i32:
18479  case MVT::nxv4f32:
18480    return MVT::nxv4i32;
18481  case MVT::nxv8i8:
18482  case MVT::nxv8i16:
18483  case MVT::nxv8f16:
18484  case MVT::nxv8bf16:
18485    return MVT::nxv8i16;
18486  case MVT::nxv16i8:
18487    return MVT::nxv16i8;
18488  }
18489}
18490
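// Lower an SVE predicated load intrinsic to the target node \p Opc. The load
// is performed in a legal SVE container type and the result is truncated back
// to the requested element type when necessary.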
18491static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
18492  SDLoc DL(N);
18493  EVT VT = N->getValueType(0);
18494
18495  if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
18496    return SDValue();
18497
18498  EVT ContainerVT = VT;
18499  if (ContainerVT.isInteger())
18500    ContainerVT = getSVEContainerType(ContainerVT);
18501
18502  SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
18503  SDValue Ops[] = { N->getOperand(0), // Chain
18504                    N->getOperand(2), // Pg
18505                    N->getOperand(3), // Base
18506                    DAG.getValueType(VT) };
18507
18508  SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
18509  SDValue LoadChain = SDValue(Load.getNode(), 1);
18510
18511  if (ContainerVT.isInteger() && (VT != ContainerVT))
18512    Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
18513
18514  return DAG.getMergeValues({ Load, LoadChain }, DL);
18515}
18516
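// Lower an SVE non-temporal load intrinsic to a masked load with a zeroing
// passthru, bitcasting the result back to the original type for
// floating-point vectors.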
18517static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
18518  SDLoc DL(N);
18519  EVT VT = N->getValueType(0);
18520  EVT PtrTy = N->getOperand(3).getValueType();
18521
18522  EVT LoadVT = VT;
18523  if (VT.isFloatingPoint())
18524    LoadVT = VT.changeTypeToInteger();
18525
18526  auto *MINode = cast<MemIntrinsicSDNode>(N);
18527  SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
18528  SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
18529                                MINode->getOperand(3), DAG.getUNDEF(PtrTy),
18530                                MINode->getOperand(2), PassThru,
18531                                MINode->getMemoryVT(), MINode->getMemOperand(),
18532                                ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
18533
  if (VT.isFloatingPoint()) {
    SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
    return DAG.getMergeValues(Ops, DL);
  }
18538
18539  return L;
18540}
18541
18542template <unsigned Opcode>
18543static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
18544  static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
18545                    Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
18546                "Unsupported opcode.");
18547  SDLoc DL(N);
18548  EVT VT = N->getValueType(0);
18549
18550  EVT LoadVT = VT;
18551  if (VT.isFloatingPoint())
18552    LoadVT = VT.changeTypeToInteger();
18553
18554  SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
18555  SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
18556  SDValue LoadChain = SDValue(Load.getNode(), 1);
18557
18558  if (VT.isFloatingPoint())
18559    Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
18560
18561  return DAG.getMergeValues({Load, LoadChain}, DL);
18562}
18563
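// Lower an SVE ST1 intrinsic to AArch64ISD::ST1_PRED, bitcasting
// floating-point data or any-extending integer data into its SVE container
// type.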
18564static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
18565  SDLoc DL(N);
18566  SDValue Data = N->getOperand(2);
18567  EVT DataVT = Data.getValueType();
18568  EVT HwSrcVt = getSVEContainerType(DataVT);
18569  SDValue InputVT = DAG.getValueType(DataVT);
18570
18571  if (DataVT.isFloatingPoint())
18572    InputVT = DAG.getValueType(HwSrcVt);
18573
18574  SDValue SrcNew;
18575  if (Data.getValueType().isFloatingPoint())
18576    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
18577  else
18578    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
18579
18580  SDValue Ops[] = { N->getOperand(0), // Chain
18581                    SrcNew,
18582                    N->getOperand(4), // Base
18583                    N->getOperand(3), // Pg
18584                    InputVT
18585                  };
18586
18587  return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
18588}
18589
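// Lower an SVE non-temporal store intrinsic to a masked store, bitcasting
// floating-point data to the equivalent integer type first.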
18590static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
18591  SDLoc DL(N);
18592
18593  SDValue Data = N->getOperand(2);
18594  EVT DataVT = Data.getValueType();
18595  EVT PtrTy = N->getOperand(4).getValueType();
18596
18597  if (DataVT.isFloatingPoint())
18598    Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
18599
18600  auto *MINode = cast<MemIntrinsicSDNode>(N);
18601  return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
18602                            DAG.getUNDEF(PtrTy), MINode->getOperand(3),
18603                            MINode->getMemoryVT(), MINode->getMemOperand(),
18604                            ISD::UNINDEXED, false, false);
18605}
18606
/// Replace a store of a splat of zeros with scalar stores of WZR/XZR. The
/// load/store optimizer pass will merge them into store pair instructions.
/// This should be better than a movi to create the vector zero followed by a
/// vector store if the zero constant is not re-used, since one instruction
/// and one register live range will be removed.
18612///
18613/// For example, the final generated code should be:
18614///
18615///   stp xzr, xzr, [x0]
18616///
18617/// instead of:
18618///
18619///   movi v0.2d, #0
18620///   str q0, [x0]
18621///
18622static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
18623  SDValue StVal = St.getValue();
18624  EVT VT = StVal.getValueType();
18625
18626  // Avoid scalarizing zero splat stores for scalable vectors.
18627  if (VT.isScalableVector())
18628    return SDValue();
18629
18630  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
18631  // 2, 3 or 4 i32 elements.
18632  int NumVecElts = VT.getVectorNumElements();
18633  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
18634         VT.getVectorElementType().getSizeInBits() == 64) ||
18635        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
18636         VT.getVectorElementType().getSizeInBits() == 32)))
18637    return SDValue();
18638
18639  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
18640    return SDValue();
18641
18642  // If the zero constant has more than one use then the vector store could be
18643  // better since the constant mov will be amortized and stp q instructions
18644  // should be able to be formed.
18645  if (!StVal.hasOneUse())
18646    return SDValue();
18647
18648  // If the store is truncating then it's going down to i16 or smaller, which
18649  // means it can be implemented in a single store anyway.
18650  if (St.isTruncatingStore())
18651    return SDValue();
18652
18653  // If the immediate offset of the address operand is too large for the stp
18654  // instruction, then bail out.
18655  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
18656    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
18657    if (Offset < -512 || Offset > 504)
18658      return SDValue();
18659  }
18660
18661  for (int I = 0; I < NumVecElts; ++I) {
18662    SDValue EltVal = StVal.getOperand(I);
18663    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
18664      return SDValue();
18665  }
18666
18667  // Use a CopyFromReg WZR/XZR here to prevent
18668  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
18669  SDLoc DL(&St);
18670  unsigned ZeroReg;
18671  EVT ZeroVT;
18672  if (VT.getVectorElementType().getSizeInBits() == 32) {
18673    ZeroReg = AArch64::WZR;
18674    ZeroVT = MVT::i32;
18675  } else {
18676    ZeroReg = AArch64::XZR;
18677    ZeroVT = MVT::i64;
18678  }
18679  SDValue SplatVal =
18680      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
18681  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
18682}
18683
/// Replace a store of a splatted scalar with scalar stores of that scalar
/// value. The load/store optimizer pass will merge them into store pair
/// instructions. This has better performance than a splat of the scalar
/// followed by a split vector store. Even if the stores are not merged, this
/// is four stores vs. a dup followed by an ext.b and two stores.
18689static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
18690  SDValue StVal = St.getValue();
18691  EVT VT = StVal.getValueType();
18692
18693  // Don't replace floating point stores, they possibly won't be transformed to
18694  // stp because of the store pair suppress pass.
18695  if (VT.isFloatingPoint())
18696    return SDValue();
18697
18698  // We can express a splat as store pair(s) for 2 or 4 elements.
18699  unsigned NumVecElts = VT.getVectorNumElements();
18700  if (NumVecElts != 4 && NumVecElts != 2)
18701    return SDValue();
18702
18703  // If the store is truncating then it's going down to i16 or smaller, which
18704  // means it can be implemented in a single store anyway.
18705  if (St.isTruncatingStore())
18706    return SDValue();
18707
18708  // Check that this is a splat.
18709  // Make sure that each of the relevant vector element locations are inserted
18710  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
18711  std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
18712  SDValue SplatVal;
18713  for (unsigned I = 0; I < NumVecElts; ++I) {
18714    // Check for insert vector elements.
18715    if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
18716      return SDValue();
18717
18718    // Check that same value is inserted at each vector element.
18719    if (I == 0)
18720      SplatVal = StVal.getOperand(1);
18721    else if (StVal.getOperand(1) != SplatVal)
18722      return SDValue();
18723
18724    // Check insert element index.
18725    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
18726    if (!CIndex)
18727      return SDValue();
18728    uint64_t IndexVal = CIndex->getZExtValue();
18729    if (IndexVal >= NumVecElts)
18730      return SDValue();
18731    IndexNotInserted.reset(IndexVal);
18732
18733    StVal = StVal.getOperand(0);
18734  }
18735  // Check that all vector element locations were inserted to.
18736  if (IndexNotInserted.any())
    return SDValue();
18738
18739  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
18740}
18741
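// Combine fixed-length vector stores: scalarize stores of splatted zeros, and
// on subtargets where misaligned 128-bit stores are slow, either scalarize
// stores of splatted scalars or split the store into two 64-bit halves.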
18742static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18743                           SelectionDAG &DAG,
18744                           const AArch64Subtarget *Subtarget) {
18745
18746  StoreSDNode *S = cast<StoreSDNode>(N);
18747  if (S->isVolatile() || S->isIndexed())
18748    return SDValue();
18749
18750  SDValue StVal = S->getValue();
18751  EVT VT = StVal.getValueType();
18752
18753  if (!VT.isFixedLengthVector())
18754    return SDValue();
18755
18756  // If we get a splat of zeros, convert this vector store to a store of
18757  // scalars. They will be merged into store pairs of xzr thereby removing one
18758  // instruction and one register.
18759  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
18760    return ReplacedZeroSplat;
18761
18762  // FIXME: The logic for deciding if an unaligned store should be split should
18763  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
18764  // a call to that function here.
18765
18766  if (!Subtarget->isMisaligned128StoreSlow())
18767    return SDValue();
18768
18769  // Don't split at -Oz.
18770  if (DAG.getMachineFunction().getFunction().hasMinSize())
18771    return SDValue();
18772
18773  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
18774  // those up regresses performance on micro-benchmarks and olden/bh.
18775  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
18776    return SDValue();
18777
18778  // Split unaligned 16B stores. They are terrible for performance.
18779  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
18780  // extensions can use this to mark that it does not want splitting to happen
18781  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
18782  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
18783  if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
18784      S->getAlign() <= Align(2))
18785    return SDValue();
18786
18787  // If we get a splat of a scalar convert this vector store to a store of
18788  // scalars. They will be merged into store pairs thereby removing two
18789  // instructions.
18790  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
18791    return ReplacedSplat;
18792
18793  SDLoc DL(S);
18794
18795  // Split VT into two.
18796  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18797  unsigned NumElts = HalfVT.getVectorNumElements();
18798  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
18799                                   DAG.getConstant(0, DL, MVT::i64));
18800  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
18801                                   DAG.getConstant(NumElts, DL, MVT::i64));
18802  SDValue BasePtr = S->getBasePtr();
18803  SDValue NewST1 =
18804      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
18805                   S->getAlign(), S->getMemOperand()->getFlags());
18806  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
18807                                  DAG.getConstant(8, DL, MVT::i64));
18808  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
18809                      S->getPointerInfo(), S->getAlign(),
18810                      S->getMemOperand()->getFlags());
18811}
18812
18813static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
18815
18816  // splice(pg, op1, undef) -> op1
18817  if (N->getOperand(2).isUndef())
18818    return N->getOperand(1);
18819
18820  return SDValue();
18821}
18822
18823static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
18824                                    const AArch64Subtarget *Subtarget) {
18825  assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
18826          N->getOpcode() == AArch64ISD::UUNPKLO) &&
18827         "Unexpected Opcode!");
18828
18829  // uunpklo/hi undef -> undef
18830  if (N->getOperand(0).isUndef())
18831    return DAG.getUNDEF(N->getValueType(0));
18832
18833  // If this is a masked load followed by an UUNPKLO, fold this into a masked
18834  // extending load.  We can do this even if this is already a masked
18835  // {z,}extload.
18836  if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
18837      N->getOpcode() == AArch64ISD::UUNPKLO) {
18838    MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
18839    SDValue Mask = MLD->getMask();
18840    SDLoc DL(N);
18841
18842    if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
18843        SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
18844        (MLD->getPassThru()->isUndef() ||
18845         isZerosVector(MLD->getPassThru().getNode()))) {
18846      unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
18847      unsigned PgPattern = Mask->getConstantOperandVal(0);
18848      EVT VT = N->getValueType(0);
18849
18850      // Ensure we can double the size of the predicate pattern
18851      unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
18852      if (NumElts &&
18853          NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
18854        Mask =
18855            getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
18856        SDValue PassThru = DAG.getConstant(0, DL, VT);
18857        SDValue NewLoad = DAG.getMaskedLoad(
18858            VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
18859            PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
18860            MLD->getAddressingMode(), ISD::ZEXTLOAD);
18861
18862        DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
18863
18864        return NewLoad;
18865      }
18866    }
18867  }
18868
18869  return SDValue();
18870}
18871
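// Combine UZP1 nodes: fold away redundant unpack/zip pairs, and turn a UZP1
// of two truncated values into a truncate of a wider UZP1 (little-endian
// only); the individual patterns are documented inline below.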
18872static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
18873  SDLoc DL(N);
18874  SDValue Op0 = N->getOperand(0);
18875  SDValue Op1 = N->getOperand(1);
18876  EVT ResVT = N->getValueType(0);
18877
18878  // uzp1(x, undef) -> concat(truncate(x), undef)
18879  if (Op1.getOpcode() == ISD::UNDEF) {
18880    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
18881    switch (ResVT.getSimpleVT().SimpleTy) {
18882    default:
18883      break;
18884    case MVT::v16i8:
18885      BCVT = MVT::v8i16;
18886      HalfVT = MVT::v8i8;
18887      break;
18888    case MVT::v8i16:
18889      BCVT = MVT::v4i32;
18890      HalfVT = MVT::v4i16;
18891      break;
18892    case MVT::v4i32:
18893      BCVT = MVT::v2i64;
18894      HalfVT = MVT::v2i32;
18895      break;
18896    }
18897    if (BCVT != MVT::Other) {
18898      SDValue BC = DAG.getBitcast(BCVT, Op0);
18899      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
18900      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
18901                         DAG.getUNDEF(HalfVT));
18902    }
18903  }
18904
18905  // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
18906  if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
18907    if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
18908      SDValue X = Op0.getOperand(0).getOperand(0);
18909      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
18910    }
18911  }
18912
18913  // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
18914  if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
18915    if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
18916      SDValue Z = Op1.getOperand(0).getOperand(1);
18917      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
18918    }
18919  }
18920
  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
  // This optimization is only implemented for little-endian subtargets.
  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
  if (!IsLittleEndian)
18927    return SDValue();
18928
18929  if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
18930    return SDValue();
18931
18932  auto getSourceOp = [](SDValue Operand) -> SDValue {
18933    const unsigned Opcode = Operand.getOpcode();
18934    if (Opcode == ISD::TRUNCATE)
18935      return Operand->getOperand(0);
18936    if (Opcode == ISD::BITCAST &&
18937        Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
18938      return Operand->getOperand(0)->getOperand(0);
18939    return SDValue();
18940  };
18941
18942  SDValue SourceOp0 = getSourceOp(Op0);
18943  SDValue SourceOp1 = getSourceOp(Op1);
18944
18945  if (!SourceOp0 || !SourceOp1)
18946    return SDValue();
18947
18948  if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
18949      !SourceOp0.getValueType().isSimple())
18950    return SDValue();
18951
18952  EVT ResultTy;
18953
18954  switch (SourceOp0.getSimpleValueType().SimpleTy) {
18955  case MVT::v2i64:
18956    ResultTy = MVT::v4i32;
18957    break;
18958  case MVT::v4i32:
18959    ResultTy = MVT::v8i16;
18960    break;
18961  case MVT::v8i16:
18962    ResultTy = MVT::v16i8;
18963    break;
18964  default:
18965    return SDValue();
18966  }
18967
18968  SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
18969  SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
18970  SDValue UzpResult =
18971      DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
18972
18973  EVT BitcastResultTy;
18974
18975  switch (ResVT.getSimpleVT().SimpleTy) {
18976  case MVT::v2i32:
18977    BitcastResultTy = MVT::v2i64;
18978    break;
18979  case MVT::v4i16:
18980    BitcastResultTy = MVT::v4i32;
18981    break;
18982  case MVT::v8i8:
18983    BitcastResultTy = MVT::v8i16;
18984    break;
18985  default:
18986    llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
18987  }
18988
18989  return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
18990                     DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
18991}
18992
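// Fold a sign/zero extension of the offset operand into an SVE gather load
// when the extension is predicated by the same predicate as the load and the
// offset was extended from 32 bits.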
18993static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
18994  unsigned Opc = N->getOpcode();
18995
18996  assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
18997           Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
18998          (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
18999           Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
19000         "Invalid opcode.");
19001
19002  const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
19003                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
19004  const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
19005                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
19006  const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
19007                        Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
19008                        Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
19009                        Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
19010
19011  SDLoc DL(N);
19012  SDValue Chain = N->getOperand(0);
19013  SDValue Pg = N->getOperand(1);
19014  SDValue Base = N->getOperand(2);
19015  SDValue Offset = N->getOperand(3);
19016  SDValue Ty = N->getOperand(4);
19017
19018  EVT ResVT = N->getValueType(0);
19019
19020  const auto OffsetOpc = Offset.getOpcode();
19021  const bool OffsetIsZExt =
19022      OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
19023  const bool OffsetIsSExt =
19024      OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
19025
19026  // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
19027  if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
19028    SDValue ExtPg = Offset.getOperand(0);
19029    VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
19030    EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
19031
    // If the predicate for the sign- or zero-extended offset is the same as
    // the predicate used for this load and the sign-/zero-extension was from
    // 32 bits...
19035    if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
19036      SDValue UnextendedOffset = Offset.getOperand(1);
19037
19038      unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
19039      if (Signed)
19040        NewOpc = getSignExtendedGatherOpcode(NewOpc);
19041
19042      return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
19043                         {Chain, Pg, Base, UnextendedOffset, Ty});
19044    }
19045  }
19046
19047  return SDValue();
19048}
19049
19050/// Optimize a vector shift instruction and its operand if shifted out
19051/// bits are not used.
19052static SDValue performVectorShiftCombine(SDNode *N,
19053                                         const AArch64TargetLowering &TLI,
19054                                         TargetLowering::DAGCombinerInfo &DCI) {
19055  assert(N->getOpcode() == AArch64ISD::VASHR ||
19056         N->getOpcode() == AArch64ISD::VLSHR);
19057
19058  SDValue Op = N->getOperand(0);
19059  unsigned OpScalarSize = Op.getScalarValueSizeInBits();
19060
19061  unsigned ShiftImm = N->getConstantOperandVal(1);
19062  assert(OpScalarSize > ShiftImm && "Invalid shift imm");
19063
19064  APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
19065  APInt DemandedMask = ~ShiftedOutBits;
19066
19067  if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
19068    return SDValue(N, 0);
19069
19070  return SDValue();
19071}
19072
19073static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
19074  // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
  // This transform works in partnership with performSetCCPunpkCombine to
  // remove unnecessary transfers of predicates into standard registers and
  // back.
19077  if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
19078      N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
19079          MVT::i1) {
19080    SDValue CC = N->getOperand(0)->getOperand(0);
19081    auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
19082    SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
19083                               DAG.getVectorIdxConstant(0, SDLoc(N)));
19084    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
19085  }
19086
19087  return SDValue();
19088}
19089
19090/// Target-specific DAG combine function for post-increment LD1 (lane) and
19091/// post-increment LD1R.
19092static SDValue performPostLD1Combine(SDNode *N,
19093                                     TargetLowering::DAGCombinerInfo &DCI,
19094                                     bool IsLaneOp) {
19095  if (DCI.isBeforeLegalizeOps())
19096    return SDValue();
19097
19098  SelectionDAG &DAG = DCI.DAG;
19099  EVT VT = N->getValueType(0);
19100
19101  if (!VT.is128BitVector() && !VT.is64BitVector())
19102    return SDValue();
19103
19104  unsigned LoadIdx = IsLaneOp ? 1 : 0;
19105  SDNode *LD = N->getOperand(LoadIdx).getNode();
  // If it is not a LOAD, we cannot do this combine.
19107  if (LD->getOpcode() != ISD::LOAD)
19108    return SDValue();
19109
19110  // The vector lane must be a constant in the LD1LANE opcode.
19111  SDValue Lane;
19112  if (IsLaneOp) {
19113    Lane = N->getOperand(2);
19114    auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
19115    if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
19116      return SDValue();
19117  }
19118
19119  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
19120  EVT MemVT = LoadSDN->getMemoryVT();
19121  // Check if memory operand is the same type as the vector element.
19122  if (MemVT != VT.getVectorElementType())
19123    return SDValue();
19124
19125  // Check if there are other uses. If so, do not combine as it will introduce
19126  // an extra load.
19127  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
19128       ++UI) {
19129    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
19130      continue;
19131    if (*UI != N)
19132      return SDValue();
19133  }
19134
19135  SDValue Addr = LD->getOperand(1);
19136  SDValue Vector = N->getOperand(0);
19137  // Search for a use of the address operand that is an increment.
19138  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
19139       Addr.getNode()->use_end(); UI != UE; ++UI) {
19140    SDNode *User = *UI;
19141    if (User->getOpcode() != ISD::ADD
19142        || UI.getUse().getResNo() != Addr.getResNo())
19143      continue;
19144
19145    // If the increment is a constant, it must match the memory ref size.
19146    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
19147    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
19148      uint32_t IncVal = CInc->getZExtValue();
19149      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
19150      if (IncVal != NumBytes)
19151        continue;
19152      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
19153    }
19154
19155    // To avoid cycle construction make sure that neither the load nor the add
19156    // are predecessors to each other or the Vector.
19157    SmallPtrSet<const SDNode *, 32> Visited;
19158    SmallVector<const SDNode *, 16> Worklist;
19159    Visited.insert(Addr.getNode());
19160    Worklist.push_back(User);
19161    Worklist.push_back(LD);
19162    Worklist.push_back(Vector.getNode());
19163    if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
19164        SDNode::hasPredecessorHelper(User, Visited, Worklist))
19165      continue;
19166
19167    SmallVector<SDValue, 8> Ops;
19168    Ops.push_back(LD->getOperand(0));  // Chain
19169    if (IsLaneOp) {
19170      Ops.push_back(Vector);           // The vector to be inserted
19171      Ops.push_back(Lane);             // The lane to be inserted in the vector
19172    }
19173    Ops.push_back(Addr);
19174    Ops.push_back(Inc);
19175
19176    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
19177    SDVTList SDTys = DAG.getVTList(Tys);
19178    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
19179    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
19180                                           MemVT,
19181                                           LoadSDN->getMemOperand());
19182
19183    // Update the uses.
19184    SDValue NewResults[] = {
19185        SDValue(LD, 0),            // The result of load
19186        SDValue(UpdN.getNode(), 2) // Chain
19187    };
19188    DCI.CombineTo(LD, NewResults);
19189    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
19190    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
19191
19192    break;
19193  }
19194  return SDValue();
19195}
19196
/// Simplify ``Addr`` given that its top byte is ignored by the hardware
/// during address translation.
19199static bool performTBISimplification(SDValue Addr,
19200                                     TargetLowering::DAGCombinerInfo &DCI,
19201                                     SelectionDAG &DAG) {
19202  APInt DemandedMask = APInt::getLowBitsSet(64, 56);
19203  KnownBits Known;
19204  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
19205                                        !DCI.isBeforeLegalizeOps());
19206  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19207  if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
19208    DCI.CommitTargetLoweringOpt(TLO);
19209    return true;
19210  }
19211  return false;
19212}
19213
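// If a truncating store stores a zero/sign/any-extended value and the memory
// type matches the type of the value before extension, store the original
// narrow value directly instead.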
19214static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
19215  assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
19216         "Expected STORE dag node in input!");
19217
19218  if (auto Store = dyn_cast<StoreSDNode>(N)) {
19219    if (!Store->isTruncatingStore() || Store->isIndexed())
19220      return SDValue();
19221    SDValue Ext = Store->getValue();
19222    auto ExtOpCode = Ext.getOpcode();
19223    if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
19224        ExtOpCode != ISD::ANY_EXTEND)
19225      return SDValue();
19226    SDValue Orig = Ext->getOperand(0);
19227    if (Store->getMemoryVT() != Orig.getValueType())
19228      return SDValue();
19229    return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
19230                        Store->getBasePtr(), Store->getMemOperand());
19231  }
19232
19233  return SDValue();
19234}
19235
// Perform TBI simplification if supported by the target, and try to break up
// non-temporal loads larger than 256 bits with odd-sized types so LDNPQ
// 256-bit load instructions can be selected.
19239static SDValue performLOADCombine(SDNode *N,
19240                                  TargetLowering::DAGCombinerInfo &DCI,
19241                                  SelectionDAG &DAG,
19242                                  const AArch64Subtarget *Subtarget) {
19243  if (Subtarget->supportsAddressTopByteIgnored())
19244    performTBISimplification(N->getOperand(1), DCI, DAG);
19245
19246  LoadSDNode *LD = cast<LoadSDNode>(N);
19247  EVT MemVT = LD->getMemoryVT();
19248  if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
19249    return SDValue(N, 0);
19250
19251  if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
19252      MemVT.getSizeInBits() % 256 == 0 ||
19253      256 % MemVT.getScalarSizeInBits() != 0)
19254    return SDValue(N, 0);
19255
19256  SDLoc DL(LD);
19257  SDValue Chain = LD->getChain();
19258  SDValue BasePtr = LD->getBasePtr();
19259  SDNodeFlags Flags = LD->getFlags();
19260  SmallVector<SDValue, 4> LoadOps;
19261  SmallVector<SDValue, 4> LoadOpsChain;
  // Replace any non-temporal load over 256 bits with a series of 256-bit loads
  // and a scalar/vector load of less than 256 bits. This way we can utilize
  // 256-bit loads and reduce the number of load instructions generated.
19265  MVT NewVT =
19266      MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
19267                       256 / MemVT.getVectorElementType().getSizeInBits());
19268  unsigned Num256Loads = MemVT.getSizeInBits() / 256;
  // Create all 256-bit loads, starting at offset 0 and up to offset
  // (Num256Loads - 1) * 32.
19270  for (unsigned I = 0; I < Num256Loads; I++) {
19271    unsigned PtrOffset = I * 32;
19272    SDValue NewPtr = DAG.getMemBasePlusOffset(
19273        BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
19274    Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
19275    SDValue NewLoad = DAG.getLoad(
19276        NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
19277        NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
19278    LoadOps.push_back(NewLoad);
19279    LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
19280  }
19281
19282  // Process remaining bits of the load operation.
19283  // This is done by creating an UNDEF vector to match the size of the
19284  // 256-bit loads and inserting the remaining load to it. We extract the
19285  // original load type at the end using EXTRACT_SUBVECTOR instruction.
19286  unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
19287  unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
19288  MVT RemainingVT = MVT::getVectorVT(
19289      MemVT.getVectorElementType().getSimpleVT(),
19290      BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
19291  SDValue NewPtr =
19292      DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
19293  Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
19294  SDValue RemainingLoad =
19295      DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
19296                  LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
19297                  LD->getMemOperand()->getFlags(), LD->getAAInfo());
19298  SDValue UndefVector = DAG.getUNDEF(NewVT);
19299  SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
  SDValue ExtendedRemainingLoad =
      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
                  {UndefVector, RemainingLoad, InsertIdx});
  LoadOps.push_back(ExtendedRemainingLoad);
19304  LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
19305  EVT ConcatVT =
19306      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
19307                       LoadOps.size() * NewVT.getVectorNumElements());
19308  SDValue ConcatVectors =
19309      DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
19310  // Extract the original vector type size.
19311  SDValue ExtractSubVector =
19312      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
19313                  {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
19314  SDValue TokenFactor =
19315      DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
19316  return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
19317}
19318
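// Combine plain vector stores: fold FP_ROUND into a truncating store for SVE
// fixed-length vectors, split stores that are slow on this subtarget,
// simplify TBI-tagged addresses, and fold truncating stores of extended
// values.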
19319static SDValue performSTORECombine(SDNode *N,
19320                                   TargetLowering::DAGCombinerInfo &DCI,
19321                                   SelectionDAG &DAG,
19322                                   const AArch64Subtarget *Subtarget) {
19323  StoreSDNode *ST = cast<StoreSDNode>(N);
19324  SDValue Chain = ST->getChain();
19325  SDValue Value = ST->getValue();
19326  SDValue Ptr = ST->getBasePtr();
19327  EVT ValueVT = Value.getValueType();
19328
19329  auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
19330    EVT EltVT = VT.getVectorElementType();
19331    return EltVT == MVT::f32 || EltVT == MVT::f64;
19332  };
19333
19334  // If this is an FP_ROUND followed by a store, fold this into a truncating
19335  // store. We can do this even if this is already a truncstore.
19336  // We purposefully don't care about legality of the nodes here as we know
19337  // they can be split down into something legal.
19338  if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
19339      Value.getNode()->hasOneUse() && ST->isUnindexed() &&
19340      Subtarget->useSVEForFixedLengthVectors() &&
19341      ValueVT.isFixedLengthVector() &&
19342      ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
19343      hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
19344    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
19345                             ST->getMemoryVT(), ST->getMemOperand());
19346
19347  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
19348    return Split;
19349
19350  if (Subtarget->supportsAddressTopByteIgnored() &&
19351      performTBISimplification(N->getOperand(2), DCI, DAG))
19352    return SDValue(N, 0);
19353
19354  if (SDValue Store = foldTruncStoreOfExt(DAG, N))
19355    return Store;
19356
19357  return SDValue();
19358}
19359
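// Fold a masked store of a truncating UZP1 into a masked truncating store
// when the governing predicate pattern can be doubled in size.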
19360static SDValue performMSTORECombine(SDNode *N,
19361                                    TargetLowering::DAGCombinerInfo &DCI,
19362                                    SelectionDAG &DAG,
19363                                    const AArch64Subtarget *Subtarget) {
19364  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
19365  SDValue Value = MST->getValue();
19366  SDValue Mask = MST->getMask();
19367  SDLoc DL(N);
19368
19369  // If this is a UZP1 followed by a masked store, fold this into a masked
19370  // truncating store.  We can do this even if this is already a masked
19371  // truncstore.
19372  if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
19373      MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
19374      Value.getValueType().isInteger()) {
19375    Value = Value.getOperand(0);
19376    if (Value.getOpcode() == ISD::BITCAST) {
19377      EVT HalfVT =
19378          Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
19379      EVT InVT = Value.getOperand(0).getValueType();
19380
19381      if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
19382        unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
19383        unsigned PgPattern = Mask->getConstantOperandVal(0);
19384
19385        // Ensure we can double the size of the predicate pattern
19386        unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
19387        if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
19388                           MinSVESize) {
19389          Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
19390                          PgPattern);
19391          return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
19392                                    MST->getBasePtr(), MST->getOffset(), Mask,
19393                                    MST->getMemoryVT(), MST->getMemOperand(),
19394                                    MST->getAddressingMode(),
19395                                    /*IsTruncating=*/true);
19396        }
19397      }
19398    }
19399  }
19400
19401  return SDValue();
19402}
19403
19404/// \return true if part of the index was folded into the Base.
19405static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
19406                              SDLoc DL, SelectionDAG &DAG) {
19407  // This function assumes a vector of i64 indices.
19408  EVT IndexVT = Index.getValueType();
19409  if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
19410    return false;
19411
19412  // Simplify:
19413  //   BasePtr = Ptr
19414  //   Index = X + splat(Offset)
19415  // ->
19416  //   BasePtr = Ptr + Offset * scale.
19417  //   Index = X
19418  if (Index.getOpcode() == ISD::ADD) {
19419    if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
19420      Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
19421      BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
19422      Index = Index.getOperand(0);
19423      return true;
19424    }
19425  }
19426
19427  // Simplify:
19428  //   BasePtr = Ptr
19429  //   Index = (X + splat(Offset)) << splat(Shift)
19430  // ->
  //   BasePtr = Ptr + (Offset << Shift) * scale
  //   Index = X << splat(Shift)
19433  if (Index.getOpcode() == ISD::SHL &&
19434      Index.getOperand(0).getOpcode() == ISD::ADD) {
19435    SDValue Add = Index.getOperand(0);
19436    SDValue ShiftOp = Index.getOperand(1);
19437    SDValue OffsetOp = Add.getOperand(1);
19438    if (auto Shift = DAG.getSplatValue(ShiftOp))
19439      if (auto Offset = DAG.getSplatValue(OffsetOp)) {
19440        Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
19441        Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
19442        BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
19443        Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
19444                            Add.getOperand(0), ShiftOp);
19445        return true;
19446      }
19447  }
19448
19449  return false;
19450}
19451
// Analyse the specified address, returning true if a more optimal addressing
// mode is available. When returning true, all parameters are updated to
// reflect their recommended values.
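//
// For example (a sketch): a gather whose index is (add (step_vector 2),
// splat(C)) can instead fold C * Scale into the base pointer and keep
// (step_vector 2) as the index, which may then also be narrowed to i32 when
// the resulting offsets provably fit.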
19455static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
19456                                     SDValue &BasePtr, SDValue &Index,
19457                                     SelectionDAG &DAG) {
19458  // Try to iteratively fold parts of the index into the base pointer to
19459  // simplify the index as much as possible.
19460  bool Changed = false;
19461  while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
19462    Changed = true;
19463
  // Only consider element types that are pointer sized, as smaller types can
  // be easily promoted.
19466  EVT IndexVT = Index.getValueType();
19467  if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
19468    return Changed;
19469
19470  // Can indices be trivially shrunk?
19471  EVT DataVT = N->getOperand(1).getValueType();
  // Don't attempt to shrink the index for fixed vectors of 64-bit data, since
  // it will later be re-extended to 64 bits during legalization.
19474  if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
19475    return Changed;
19476  if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
19477    EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
19478    Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
19479    return true;
19480  }
19481
19482  // Match:
19483  //   Index = step(const)
19484  int64_t Stride = 0;
19485  if (Index.getOpcode() == ISD::STEP_VECTOR) {
19486    Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
19487  }
19488  // Match:
19489  //   Index = step(const) << shift(const)
19490  else if (Index.getOpcode() == ISD::SHL &&
19491           Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
19492    SDValue RHS = Index.getOperand(1);
19493    if (auto *Shift =
19494            dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
      int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(0);
19496      Stride = Step << Shift->getZExtValue();
19497    }
19498  }
19499
  // Return early if no supported pattern was found.
19501  if (Stride == 0)
19502    return Changed;
19503
19504  if (Stride < std::numeric_limits<int32_t>::min() ||
19505      Stride > std::numeric_limits<int32_t>::max())
19506    return Changed;
19507
19508  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
19509  unsigned MaxVScale =
19510      Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
19511  int64_t LastElementOffset =
19512      IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
19513
19514  if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
19515      LastElementOffset > std::numeric_limits<int32_t>::max())
19516    return Changed;
19517
19518  EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
  // Stride is deliberately not multiplied by 'Scale' here, because that
  // scaling is applied by the gather/scatter addressing mode.
19521  Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
19522  return true;
19523}
19524
19525static SDValue performMaskedGatherScatterCombine(
19526    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
19527  MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
19528  assert(MGS && "Can only combine gather load or scatter store nodes");
19529
19530  if (!DCI.isBeforeLegalize())
19531    return SDValue();
19532
19533  SDLoc DL(MGS);
19534  SDValue Chain = MGS->getChain();
19535  SDValue Scale = MGS->getScale();
19536  SDValue Index = MGS->getIndex();
19537  SDValue Mask = MGS->getMask();
19538  SDValue BasePtr = MGS->getBasePtr();
19539  ISD::MemIndexType IndexType = MGS->getIndexType();
19540
19541  if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
19542    return SDValue();
19543
  // A more optimal BasePtr/Index was found, so rebuild the gather/scatter with
  // the updated operands, which are more legalisation friendly.
19546  if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
19547    SDValue PassThru = MGT->getPassThru();
19548    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
19549    return DAG.getMaskedGather(
19550        DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
19551        Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
19552  }
19553  auto *MSC = cast<MaskedScatterSDNode>(MGS);
19554  SDValue Data = MSC->getValue();
19555  SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
19556  return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
19557                              Ops, MSC->getMemOperand(), IndexType,
19558                              MSC->isTruncatingStore());
19559}
19560
19561/// Target-specific DAG combine function for NEON load/store intrinsics
19562/// to merge base address updates.
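///
/// For example (a sketch): an @llvm.aarch64.neon.ld2 of two 128-bit vectors
/// whose address is also incremented by 32 bytes elsewhere in the DAG can be
/// merged into a single LD2post node that produces both the loaded values and
/// the post-incremented pointer.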
19563static SDValue performNEONPostLDSTCombine(SDNode *N,
19564                                          TargetLowering::DAGCombinerInfo &DCI,
19565                                          SelectionDAG &DAG) {
19566  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
19567    return SDValue();
19568
19569  unsigned AddrOpIdx = N->getNumOperands() - 1;
19570  SDValue Addr = N->getOperand(AddrOpIdx);
19571
19572  // Search for a use of the address operand that is an increment.
19573  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
19574       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
19575    SDNode *User = *UI;
19576    if (User->getOpcode() != ISD::ADD ||
19577        UI.getUse().getResNo() != Addr.getResNo())
19578      continue;
19579
19580    // Check that the add is independent of the load/store.  Otherwise, folding
19581    // it would create a cycle.
19582    SmallPtrSet<const SDNode *, 32> Visited;
19583    SmallVector<const SDNode *, 16> Worklist;
19584    Visited.insert(Addr.getNode());
19585    Worklist.push_back(N);
19586    Worklist.push_back(User);
19587    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
19588        SDNode::hasPredecessorHelper(User, Visited, Worklist))
19589      continue;
19590
19591    // Find the new opcode for the updating load/store.
19592    bool IsStore = false;
19593    bool IsLaneOp = false;
19594    bool IsDupOp = false;
19595    unsigned NewOpc = 0;
19596    unsigned NumVecs = 0;
19597    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19598    switch (IntNo) {
19599    default: llvm_unreachable("unexpected intrinsic for Neon base update");
19600    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
19601      NumVecs = 2; break;
19602    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
19603      NumVecs = 3; break;
19604    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
19605      NumVecs = 4; break;
19606    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
19607      NumVecs = 2; IsStore = true; break;
19608    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
19609      NumVecs = 3; IsStore = true; break;
19610    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
19611      NumVecs = 4; IsStore = true; break;
19612    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
19613      NumVecs = 2; break;
19614    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
19615      NumVecs = 3; break;
19616    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
19617      NumVecs = 4; break;
19618    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
19619      NumVecs = 2; IsStore = true; break;
19620    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
19621      NumVecs = 3; IsStore = true; break;
19622    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
19623      NumVecs = 4; IsStore = true; break;
19624    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
19625      NumVecs = 2; IsDupOp = true; break;
19626    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
19627      NumVecs = 3; IsDupOp = true; break;
19628    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
19629      NumVecs = 4; IsDupOp = true; break;
19630    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
19631      NumVecs = 2; IsLaneOp = true; break;
19632    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
19633      NumVecs = 3; IsLaneOp = true; break;
19634    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
19635      NumVecs = 4; IsLaneOp = true; break;
19636    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
19637      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
19638    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
19639      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
19640    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
19641      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
19642    }
19643
19644    EVT VecTy;
19645    if (IsStore)
19646      VecTy = N->getOperand(2).getValueType();
19647    else
19648      VecTy = N->getValueType(0);
19649
19650    // If the increment is a constant, it must match the memory ref size.
19651    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
19652    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
19653      uint32_t IncVal = CInc->getZExtValue();
19654      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
19655      if (IsLaneOp || IsDupOp)
19656        NumBytes /= VecTy.getVectorNumElements();
19657      if (IncVal != NumBytes)
19658        continue;
19659      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
19660    }
19661    SmallVector<SDValue, 8> Ops;
19662    Ops.push_back(N->getOperand(0)); // Incoming chain
    // Lane load and store operations take a vector list as input.
19664    if (IsLaneOp || IsStore)
19665      for (unsigned i = 2; i < AddrOpIdx; ++i)
19666        Ops.push_back(N->getOperand(i));
19667    Ops.push_back(Addr); // Base register
19668    Ops.push_back(Inc);
19669
19670    // Return Types.
19671    EVT Tys[6];
19672    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
19673    unsigned n;
19674    for (n = 0; n < NumResultVecs; ++n)
19675      Tys[n] = VecTy;
19676    Tys[n++] = MVT::i64;  // Type of write back register
19677    Tys[n] = MVT::Other;  // Type of the chain
19678    SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
19679
19680    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
19681    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
19682                                           MemInt->getMemoryVT(),
19683                                           MemInt->getMemOperand());
19684
19685    // Update the uses.
19686    std::vector<SDValue> NewResults;
19687    for (unsigned i = 0; i < NumResultVecs; ++i) {
19688      NewResults.push_back(SDValue(UpdN.getNode(), i));
19689    }
19690    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
19691    DCI.CombineTo(N, NewResults);
19692    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
19693
19694    break;
19695  }
19696  return SDValue();
19697}
19698
19699// Checks to see if the value is the prescribed width and returns information
19700// about its extension mode.
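// For example, an i32 value produced by a zero-extending i8 load satisfies
// checkValueWidth(V, 8, ExtType) with ExtType set to ISD::ZEXTLOAD, whereas a
// constant qualifies only if its magnitude fits below 2^(width-1).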
19701static
19702bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
19703  ExtType = ISD::NON_EXTLOAD;
19704  switch(V.getNode()->getOpcode()) {
19705  default:
19706    return false;
19707  case ISD::LOAD: {
19708    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
19709    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
19710       || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
19711      ExtType = LoadNode->getExtensionType();
19712      return true;
19713    }
19714    return false;
19715  }
19716  case ISD::AssertSext: {
19717    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
19718    if ((TypeNode->getVT() == MVT::i8 && width == 8)
19719       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
19720      ExtType = ISD::SEXTLOAD;
19721      return true;
19722    }
19723    return false;
19724  }
19725  case ISD::AssertZext: {
19726    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
19727    if ((TypeNode->getVT() == MVT::i8 && width == 8)
19728       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
19729      ExtType = ISD::ZEXTLOAD;
19730      return true;
19731    }
19732    return false;
19733  }
19734  case ISD::Constant:
19735  case ISD::TargetConstant: {
19736    return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
19737           1LL << (width - 1);
19738  }
19739  }
19740
19741  return true;
19742}
19743
19744// This function does a whole lot of voodoo to determine if the tests are
19745// equivalent without and with a mask. Essentially what happens is that given a
19746// DAG resembling:
19747//
19748//  +-------------+ +-------------+ +-------------+ +-------------+
19749//  |    Input    | | AddConstant | | CompConstant| |     CC      |
19750//  +-------------+ +-------------+ +-------------+ +-------------+
19751//           |           |           |               |
19752//           V           V           |    +----------+
19753//          +-------------+  +----+  |    |
19754//          |     ADD     |  |0xff|  |    |
19755//          +-------------+  +----+  |    |
19756//                  |           |    |    |
19757//                  V           V    |    |
19758//                 +-------------+   |    |
19759//                 |     AND     |   |    |
19760//                 +-------------+   |    |
19761//                      |            |    |
19762//                      +-----+      |    |
19763//                            |      |    |
19764//                            V      V    V
19765//                           +-------------+
19766//                           |     CMP     |
19767//                           +-------------+
19768//
19769// The AND node may be safely removed for some combinations of inputs. In
19770// particular we need to take into account the extension type of the Input,
19771// the exact values of AddConstant, CompConstant, and CC, along with the nominal
19772// width of the input (this can work for any width inputs, the above graph is
// specific to 8 bits).
19774//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct pair of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true
// for all 16 distinct bit patterns for the current extension type of the
// Input (w0).
19784//
19785//   sub      w8, w0, w1
19786//   and      w10, w8, #0x0f
19787//   cmp      w8, w2
19788//   cset     w9, AArch64CC
19789//   cmp      w10, w2
19790//   cset     w11, AArch64CC
19791//   cmp      w9, w11
19792//   cset     w0, eq
19793//   ret
19794//
// Since the above function shows when the outputs are equivalent, it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they give outputs equivalent to the above
// function for all inputs, so they can be used to determine whether the
// removal is legal instead.
//
// isEquivalentMaskless() is the code for testing whether the AND can be
// removed, factored out of the DAG recognition because the DAG can take
// several forms.
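//
// Illustrative sketch of the rewrite this enables (zero-extended i8 input,
// AddConstant = 0, CompConstant = C):
//
//   sub   w8, w0, #0
//   and   w9, w8, #0xff    <- candidate for removal
//   cmp   w9, #C
//   b.<cc> ...
//
// performCONDCombine below can then drop the AND (comparing w8 directly) when
// isEquivalentMaskless(<cc>, 8, ZEXTLOAD, 0, C) returns true.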
19804
19805static bool isEquivalentMaskless(unsigned CC, unsigned width,
19806                                 ISD::LoadExtType ExtType, int AddConstant,
19807                                 int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
19811  int MaxUInt = (1 << width);
19812
19813  // For the purposes of these comparisons sign extending the type is
19814  // equivalent to zero extending the add and displacing it by half the integer
19815  // width. Provided we are careful and make sure our equations are valid over
19816  // the whole range we can just adjust the input and avoid writing equations
19817  // for sign extended inputs.
19818  if (ExtType == ISD::SEXTLOAD)
19819    AddConstant -= (1 << (width-1));
19820
19821  switch(CC) {
19822  case AArch64CC::LE:
19823  case AArch64CC::GT:
19824    if ((AddConstant == 0) ||
19825        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
19826        (AddConstant >= 0 && CompConstant < 0) ||
19827        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
19828      return true;
19829    break;
19830  case AArch64CC::LT:
19831  case AArch64CC::GE:
19832    if ((AddConstant == 0) ||
19833        (AddConstant >= 0 && CompConstant <= 0) ||
19834        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
19835      return true;
19836    break;
19837  case AArch64CC::HI:
19838  case AArch64CC::LS:
    if ((AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant >= -1 &&
         CompConstant < AddConstant + MaxUInt))
      return true;
    break;
19844  case AArch64CC::PL:
19845  case AArch64CC::MI:
19846    if ((AddConstant == 0) ||
19847        (AddConstant > 0 && CompConstant <= 0) ||
19848        (AddConstant < 0 && CompConstant <= AddConstant))
19849      return true;
19850    break;
19851  case AArch64CC::LO:
19852  case AArch64CC::HS:
19853    if ((AddConstant >= 0 && CompConstant <= 0) ||
19854        (AddConstant <= 0 && CompConstant >= 0 &&
19855         CompConstant <= AddConstant + MaxUInt))
19856      return true;
19857    break;
19858  case AArch64CC::EQ:
19859  case AArch64CC::NE:
19860    if ((AddConstant > 0 && CompConstant < 0) ||
19861        (AddConstant < 0 && CompConstant >= 0 &&
19862         CompConstant < AddConstant + MaxUInt) ||
19863        (AddConstant >= 0 && CompConstant >= 0 &&
19864         CompConstant >= AddConstant) ||
19865        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
19866      return true;
19867    break;
19868  case AArch64CC::VS:
19869  case AArch64CC::VC:
19870  case AArch64CC::AL:
19871  case AArch64CC::NV:
19872    return true;
19873  case AArch64CC::Invalid:
19874    break;
19875  }
19876
19877  return false;
19878}
19879
// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
// (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
19882static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
19883                                        SDNode *AndNode, SelectionDAG &DAG,
19884                                        unsigned CCIndex, unsigned CmpIndex,
19885                                        unsigned CC) {
19886  ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
19887  if (!SubsC)
19888    return SDValue();
19889
19890  APInt SubsAP = SubsC->getAPIntValue();
19891  if (CC == AArch64CC::HI) {
19892    if (!SubsAP.isMask())
19893      return SDValue();
19894  } else if (CC == AArch64CC::LO) {
19895    if (!SubsAP.isPowerOf2())
19896      return SDValue();
19897  } else
19898    return SDValue();
19899
19900  ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
19901  if (!AndC)
19902    return SDValue();
19903
19904  APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
19905
19906  SDLoc DL(N);
19907  APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
19908  SDValue ANDS = DAG.getNode(
19909      AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
19910      DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
19911  SDValue AArch64_CC =
19912      DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
19913                      N->getOperand(CCIndex)->getValueType(0));
19914
  // For now, only performCSELCombine and performBRCONDCombine call this
  // function, and both of them pass 2 for CCIndex and 3 for CmpIndex on nodes
  // with 4 operands, so the operand list can be initialised directly to keep
  // the code simple. If a caller with different CCIndex/CmpIndex values is
  // added, this will need to be rewritten to build the operand list in a loop.
  // TODO: Should we also assert that the node has exactly 4 operands?
19921  assert((CCIndex == 2 && CmpIndex == 3) &&
19922         "Expected CCIndex to be 2 and CmpIndex to be 3.");
19923  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
19924                   ANDS.getValue(1)};
19925  return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
19926}
19927
19928static
19929SDValue performCONDCombine(SDNode *N,
19930                           TargetLowering::DAGCombinerInfo &DCI,
19931                           SelectionDAG &DAG, unsigned CCIndex,
19932                           unsigned CmpIndex) {
19933  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
19934  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
19935  unsigned CondOpcode = SubsNode->getOpcode();
19936
19937  if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
19938    return SDValue();
19939
19940  // There is a SUBS feeding this condition. Is it fed by a mask we can
19941  // use?
19942
19943  SDNode *AndNode = SubsNode->getOperand(0).getNode();
19944  unsigned MaskBits = 0;
19945
19946  if (AndNode->getOpcode() != ISD::AND)
19947    return SDValue();
19948
19949  if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
19950                                             CmpIndex, CC))
19951    return Val;
19952
19953  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
19954    uint32_t CNV = CN->getZExtValue();
19955    if (CNV == 255)
19956      MaskBits = 8;
19957    else if (CNV == 65535)
19958      MaskBits = 16;
19959  }
19960
19961  if (!MaskBits)
19962    return SDValue();
19963
19964  SDValue AddValue = AndNode->getOperand(0);
19965
19966  if (AddValue.getOpcode() != ISD::ADD)
19967    return SDValue();
19968
  // The basic DAG structure is correct; grab the inputs and validate them.
19970
19971  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
19972  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
19973  SDValue SubsInputValue = SubsNode->getOperand(1);
19974
  // The mask is present and all of the values originate from a smaller type,
  // so let's see if the mask is superfluous.
19977
19978  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
19979      !isa<ConstantSDNode>(SubsInputValue.getNode()))
19980    return SDValue();
19981
19982  ISD::LoadExtType ExtType;
19983
19984  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
19985      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
19986      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
19987    return SDValue();
19988
19989  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
19990                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
19991                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
19992    return SDValue();
19993
19994  // The AND is not necessary, remove it.
19995
19996  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
19997                               SubsNode->getValueType(1));
19998  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
19999
20000  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
20001  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
20002
20003  return SDValue(N, 0);
20004}
20005
20006// Optimize compare with zero and branch.
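// For example (a sketch):
//   cmp w0, #0
//   b.eq .LBB0_2
// becomes
//   cbz w0, .LBB0_2
// when the NZCV flags from the compare have no other users.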
20007static SDValue performBRCONDCombine(SDNode *N,
20008                                    TargetLowering::DAGCombinerInfo &DCI,
20009                                    SelectionDAG &DAG) {
20010  MachineFunction &MF = DAG.getMachineFunction();
20011  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
20012  // will not be produced, as they are conditional branch instructions that do
20013  // not set flags.
20014  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
20015    return SDValue();
20016
20017  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
20018    N = NV.getNode();
20019  SDValue Chain = N->getOperand(0);
20020  SDValue Dest = N->getOperand(1);
20021  SDValue CCVal = N->getOperand(2);
20022  SDValue Cmp = N->getOperand(3);
20023
20024  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
20025  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
20026  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
20027    return SDValue();
20028
20029  unsigned CmpOpc = Cmp.getOpcode();
20030  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
20031    return SDValue();
20032
20033  // Only attempt folding if there is only one use of the flag and no use of the
20034  // value.
20035  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
20036    return SDValue();
20037
20038  SDValue LHS = Cmp.getOperand(0);
20039  SDValue RHS = Cmp.getOperand(1);
20040
20041  assert(LHS.getValueType() == RHS.getValueType() &&
20042         "Expected the value type to be the same for both operands!");
20043  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
20044    return SDValue();
20045
20046  if (isNullConstant(LHS))
20047    std::swap(LHS, RHS);
20048
20049  if (!isNullConstant(RHS))
20050    return SDValue();
20051
20052  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
20053      LHS.getOpcode() == ISD::SRL)
20054    return SDValue();
20055
20056  // Fold the compare into the branch instruction.
20057  SDValue BR;
20058  if (CC == AArch64CC::EQ)
20059    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
20060  else
20061    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
20062
20063  // Do not add new nodes to DAG combiner worklist.
20064  DCI.CombineTo(N, BR, false);
20065
20066  return SDValue();
20067}
20068
20069static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
20070  unsigned CC = N->getConstantOperandVal(2);
20071  SDValue SUBS = N->getOperand(3);
20072  SDValue Zero, CTTZ;
20073
20074  if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
20075    Zero = N->getOperand(0);
20076    CTTZ = N->getOperand(1);
20077  } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
20078    Zero = N->getOperand(1);
20079    CTTZ = N->getOperand(0);
20080  } else
20081    return SDValue();
20082
20083  if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
20084      (CTTZ.getOpcode() == ISD::TRUNCATE &&
20085       CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
20086    return SDValue();
20087
20088  assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
20089         "Illegal type in CTTZ folding");
20090
20091  if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
20092    return SDValue();
20093
20094  SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
20095                  ? CTTZ.getOperand(0).getOperand(0)
20096                  : CTTZ.getOperand(0);
20097
20098  if (X != SUBS.getOperand(0))
20099    return SDValue();
20100
20101  unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
20102                          ? CTTZ.getOperand(0).getValueSizeInBits()
20103                          : CTTZ.getValueSizeInBits();
20104  SDValue BitWidthMinusOne =
20105      DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
20106  return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
20107                     BitWidthMinusOne);
20108}
20109
20110// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
20111// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
20112// Where x and y are constants and x != y
20113
20114// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
20115// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
20116// Where x and y are constants and x != y
20117static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
20118  SDValue L = Op->getOperand(0);
20119  SDValue R = Op->getOperand(1);
20120  AArch64CC::CondCode OpCC =
20121      static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
20122
20123  SDValue OpCmp = Op->getOperand(3);
20124  if (!isCMP(OpCmp))
20125    return SDValue();
20126
20127  SDValue CmpLHS = OpCmp.getOperand(0);
20128  SDValue CmpRHS = OpCmp.getOperand(1);
20129
20130  if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
20131    std::swap(CmpLHS, CmpRHS);
20132  else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
20133    return SDValue();
20134
20135  SDValue X = CmpLHS->getOperand(0);
20136  SDValue Y = CmpLHS->getOperand(1);
20137  if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
20138    return SDValue();
20139  }
20140
  // If one of the constants is an opaque constant, the X and Y SDNodes can
  // still differ even though the underlying values are the same, so compare
  // the APInt values here to make sure the code is correct.
20144  ConstantSDNode *CX = cast<ConstantSDNode>(X);
20145  ConstantSDNode *CY = cast<ConstantSDNode>(Y);
20146  if (CX->getAPIntValue() == CY->getAPIntValue())
20147    return SDValue();
20148
20149  AArch64CC::CondCode CC =
20150      static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
20151  SDValue Cond = CmpLHS->getOperand(3);
20152
20153  if (CmpRHS == Y)
20154    CC = AArch64CC::getInvertedCondCode(CC);
20155  else if (CmpRHS != X)
20156    return SDValue();
20157
20158  if (OpCC == AArch64CC::NE)
20159    CC = AArch64CC::getInvertedCondCode(CC);
20160  else if (OpCC != AArch64CC::EQ)
20161    return SDValue();
20162
20163  SDLoc DL(Op);
20164  EVT VT = Op->getValueType(0);
20165
20166  SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
20167  return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
20168}
20169
20170// Optimize CSEL instructions
20171static SDValue performCSELCombine(SDNode *N,
20172                                  TargetLowering::DAGCombinerInfo &DCI,
20173                                  SelectionDAG &DAG) {
20174  // CSEL x, x, cc -> x
20175  if (N->getOperand(0) == N->getOperand(1))
20176    return N->getOperand(0);
20177
20178  if (SDValue R = foldCSELOfCSEL(N, DAG))
20179    return R;
20180
20181  // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
20182  // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
20183  if (SDValue Folded = foldCSELofCTTZ(N, DAG))
    return Folded;
20185
20186  return performCONDCombine(N, DCI, DAG, 2, 3);
20187}
20188
// Try to re-use an already extended operand of a vector SetCC feeding an
// extended select. Doing so avoids requiring another full extension of the
20191// SET_CC result when lowering the select.
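//
// For example (a sketch): given
//   %c = setcc slt (v8i8 %a), splat(C)
//   %r = vselect %c, (v8i16 %x), (v8i16 %y)
// where a (sign_extend %a to v8i16) node already exists in the DAG, the setcc
// can be rebuilt on the extended v8i16 operands so that lowering the vselect
// does not need to re-extend the v8i8 compare result.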
20192static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
20193  EVT Op0MVT = Op->getOperand(0).getValueType();
20194  if (!Op0MVT.isVector() || Op->use_empty())
20195    return SDValue();
20196
  // Make sure that all uses of Op are VSELECTs with matching result types,
  // where the result type has a larger element type than the SetCC operand.
20199  SDNode *FirstUse = *Op->use_begin();
20200  if (FirstUse->getOpcode() != ISD::VSELECT)
20201    return SDValue();
20202  EVT UseMVT = FirstUse->getValueType(0);
20203  if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
20204    return SDValue();
20205  if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
20206        return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
20207      }))
20208    return SDValue();
20209
20210  APInt V;
20211  if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
20212    return SDValue();
20213
20214  SDLoc DL(Op);
20215  SDValue Op0ExtV;
20216  SDValue Op1ExtV;
20217  ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
20218  // Check if the first operand of the SET_CC is already extended. If it is,
20219  // split the SET_CC and re-use the extended version of the operand.
20220  SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
20221                                        Op->getOperand(0));
20222  SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
20223                                        Op->getOperand(0));
20224  if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
20225    Op0ExtV = SDValue(Op0SExt, 0);
20226    Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
20227  } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
20228    Op0ExtV = SDValue(Op0ZExt, 0);
20229    Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
20230  } else
20231    return SDValue();
20232
20233  return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
20234                     Op0ExtV, Op1ExtV, Op->getOperand(2));
20235}
20236
20237static SDValue performSETCCCombine(SDNode *N,
20238                                   TargetLowering::DAGCombinerInfo &DCI,
20239                                   SelectionDAG &DAG) {
20240  assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
20241  SDValue LHS = N->getOperand(0);
20242  SDValue RHS = N->getOperand(1);
20243  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
20244  SDLoc DL(N);
20245  EVT VT = N->getValueType(0);
20246
20247  if (SDValue V = tryToWidenSetCCOperands(N, DAG))
20248    return V;
20249
20250  // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
20251  if (Cond == ISD::SETNE && isOneConstant(RHS) &&
20252      LHS->getOpcode() == AArch64ISD::CSEL &&
20253      isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
20254      LHS->hasOneUse()) {
20255    // Invert CSEL's condition.
20256    auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
20257    auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
20258    auto NewCond = getInvertedCondCode(OldCond);
20259
20260    // csel 0, 1, !cond, X
20261    SDValue CSEL =
20262        DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
20263                    LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
20264                    LHS.getOperand(3));
20265    return DAG.getZExtOrTrunc(CSEL, DL, VT);
20266  }
20267
20268  // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
20269  if (Cond == ISD::SETNE && isNullConstant(RHS) &&
20270      LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
20271      LHS->hasOneUse()) {
20272    EVT TstVT = LHS->getValueType(0);
20273    if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
      // This pattern is optimized better by emitComparison.
20275      uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
20276      SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
20277                                DAG.getConstant(TstImm, DL, TstVT));
20278      return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
20279    }
20280  }
20281
20282  // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
20283  //   ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
20284  if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
20285      (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
20286      LHS->getOpcode() == ISD::BITCAST) {
20287    EVT ToVT = LHS->getValueType(0);
20288    EVT FromVT = LHS->getOperand(0).getValueType();
20289    if (FromVT.isFixedLengthVector() &&
20290        FromVT.getVectorElementType() == MVT::i1) {
20291      LHS = DAG.getNode(ISD::VECREDUCE_OR, DL, MVT::i1, LHS->getOperand(0));
20292      LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ToVT, LHS);
20293      return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
20294    }
20295  }
20296
20297  // Try to perform the memcmp when the result is tested for [in]equality with 0
20298  if (SDValue V = performOrXorChainCombine(N, DAG))
20299    return V;
20300
20301  return SDValue();
20302}
20303
// Replace a flag-setting operator (e.g. ANDS) with the generic version
// (e.g. AND) if the flag is unused.
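//
// For example (a sketch): an (ANDS x, y) whose flag result has no users is
// rebuilt as a plain (AND x, y), and if an identical (AND x, y) node already
// exists elsewhere, its uses are redirected to the value result of the
// flag-setting node instead.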
20306static SDValue performFlagSettingCombine(SDNode *N,
20307                                         TargetLowering::DAGCombinerInfo &DCI,
20308                                         unsigned GenericOpcode) {
20309  SDLoc DL(N);
20310  SDValue LHS = N->getOperand(0);
20311  SDValue RHS = N->getOperand(1);
20312  EVT VT = N->getValueType(0);
20313
20314  // If the flag result isn't used, convert back to a generic opcode.
20315  if (!N->hasAnyUseOfValue(1)) {
20316    SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
20317    return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
20318                                  DL);
20319  }
20320
20321  // Combine identical generic nodes into this node, re-using the result.
20322  if (SDNode *Generic = DCI.DAG.getNodeIfExists(
20323          GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
20324    DCI.CombineTo(Generic, SDValue(N, 0));
20325
20326  return SDValue();
20327}
20328
20329static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
20330  // setcc_merge_zero pred
20331  //   (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
20332  //   => extract_subvector (inner setcc_merge_zero)
20333  SDValue Pred = N->getOperand(0);
20334  SDValue LHS = N->getOperand(1);
20335  SDValue RHS = N->getOperand(2);
20336  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
20337
20338  if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
20339      LHS->getOpcode() != ISD::SIGN_EXTEND)
20340    return SDValue();
20341
20342  SDValue Extract = LHS->getOperand(0);
20343  if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
20344      Extract->getValueType(0) != N->getValueType(0) ||
20345      Extract->getConstantOperandVal(1) != 0)
20346    return SDValue();
20347
20348  SDValue InnerSetCC = Extract->getOperand(0);
20349  if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
20350    return SDValue();
20351
20352  // By this point we've effectively got
20353  // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
20354  // lanes are already zero then the trunc(sext()) sequence is redundant and we
20355  // can operate on A directly.
20356  SDValue InnerPred = InnerSetCC.getOperand(0);
20357  if (Pred.getOpcode() == AArch64ISD::PTRUE &&
20358      InnerPred.getOpcode() == AArch64ISD::PTRUE &&
20359      Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
20360      Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
20361      Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
20362    return Extract;
20363
20364  return SDValue();
20365}
20366
20367static SDValue
20368performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
20369  assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
20370         "Unexpected opcode!");
20371
20372  SelectionDAG &DAG = DCI.DAG;
20373  SDValue Pred = N->getOperand(0);
20374  SDValue LHS = N->getOperand(1);
20375  SDValue RHS = N->getOperand(2);
20376  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
20377
20378  if (SDValue V = performSetCCPunpkCombine(N, DAG))
20379    return V;
20380
20381  if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
20382      LHS->getOpcode() == ISD::SIGN_EXTEND &&
20383      LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
20384    //    setcc_merge_zero(
20385    //       pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
20386    // => setcc_merge_zero(pred, ...)
20387    if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
20388        LHS->getOperand(0)->getOperand(0) == Pred)
20389      return LHS->getOperand(0);
20390
20391    //    setcc_merge_zero(
20392    //        all_active, extend(nxvNi1 ...), != splat(0))
20393    // -> nxvNi1 ...
20394    if (isAllActivePredicate(DAG, Pred))
20395      return LHS->getOperand(0);
20396
20397    //    setcc_merge_zero(
20398    //        pred, extend(nxvNi1 ...), != splat(0))
20399    // -> nxvNi1 and(pred, ...)
20400    if (DCI.isAfterLegalizeDAG())
20401      // Do this after legalization to allow more folds on setcc_merge_zero
20402      // to be recognized.
20403      return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
20404                         LHS->getOperand(0), Pred);
20405  }
20406
20407  return SDValue();
20408}
20409
20410// Optimize some simple tbz/tbnz cases.  Returns the new operand and bit to test
20411// as well as whether the test should be inverted.  This code is required to
// catch these cases (as opposed to standard DAG combines) because
20413// AArch64ISD::TBZ is matched during legalization.
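//
// For example (a sketch): (tbz (and (srl x, 3), 1), 0, dest) tests bit 0 of
// the shifted-and-masked value, which is bit 3 of x, so it can be rewritten
// as (tbz x, 3, dest).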
20414static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
20415                                 SelectionDAG &DAG) {
20416
20417  if (!Op->hasOneUse())
20418    return Op;
20419
20420  // We don't handle undef/constant-fold cases below, as they should have
20421  // already been taken care of (e.g. and of 0, test of undefined shifted bits,
20422  // etc.)
20423
20424  // (tbz (trunc x), b) -> (tbz x, b)
20425  // This case is just here to enable more of the below cases to be caught.
20426  if (Op->getOpcode() == ISD::TRUNCATE &&
20427      Bit < Op->getValueType(0).getSizeInBits()) {
20428    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
20429  }
20430
20431  // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
20432  if (Op->getOpcode() == ISD::ANY_EXTEND &&
20433      Bit < Op->getOperand(0).getValueSizeInBits()) {
20434    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
20435  }
20436
20437  if (Op->getNumOperands() != 2)
20438    return Op;
20439
20440  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
20441  if (!C)
20442    return Op;
20443
20444  switch (Op->getOpcode()) {
20445  default:
20446    return Op;
20447
20448  // (tbz (and x, m), b) -> (tbz x, b)
20449  case ISD::AND:
20450    if ((C->getZExtValue() >> Bit) & 1)
20451      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
20452    return Op;
20453
20454  // (tbz (shl x, c), b) -> (tbz x, b-c)
20455  case ISD::SHL:
20456    if (C->getZExtValue() <= Bit &&
20457        (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
20458      Bit = Bit - C->getZExtValue();
20459      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
20460    }
20461    return Op;
20462
20463  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
20464  case ISD::SRA:
20465    Bit = Bit + C->getZExtValue();
20466    if (Bit >= Op->getValueType(0).getSizeInBits())
20467      Bit = Op->getValueType(0).getSizeInBits() - 1;
20468    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
20469
20470  // (tbz (srl x, c), b) -> (tbz x, b+c)
20471  case ISD::SRL:
20472    if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
20473      Bit = Bit + C->getZExtValue();
20474      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
20475    }
20476    return Op;
20477
20478  // (tbz (xor x, -1), b) -> (tbnz x, b)
20479  case ISD::XOR:
20480    if ((C->getZExtValue() >> Bit) & 1)
20481      Invert = !Invert;
20482    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
20483  }
20484}
20485
20486// Optimize test single bit zero/non-zero and branch.
20487static SDValue performTBZCombine(SDNode *N,
20488                                 TargetLowering::DAGCombinerInfo &DCI,
20489                                 SelectionDAG &DAG) {
20490  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
20491  bool Invert = false;
20492  SDValue TestSrc = N->getOperand(1);
20493  SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
20494
20495  if (TestSrc == NewTestSrc)
20496    return SDValue();
20497
20498  unsigned NewOpc = N->getOpcode();
20499  if (Invert) {
20500    if (NewOpc == AArch64ISD::TBZ)
20501      NewOpc = AArch64ISD::TBNZ;
20502    else {
20503      assert(NewOpc == AArch64ISD::TBNZ);
20504      NewOpc = AArch64ISD::TBZ;
20505    }
20506  }
20507
20508  SDLoc DL(N);
20509  return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
20510                     DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
20511}
20512
// Swap vselect operands where doing so may allow a predicated operation to
// implement the `sel`.
20515//
20516//     (vselect (setcc ( condcode) (_) (_)) (a)          (op (a) (b)))
20517//  => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
20518static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
20519  auto SelectA = N->getOperand(1);
20520  auto SelectB = N->getOperand(2);
20521  auto NTy = N->getValueType(0);
20522
20523  if (!NTy.isScalableVector())
20524    return SDValue();
20525  SDValue SetCC = N->getOperand(0);
20526  if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
20527    return SDValue();
20528
20529  switch (SelectB.getOpcode()) {
20530  default:
20531    return SDValue();
20532  case ISD::FMUL:
20533  case ISD::FSUB:
20534  case ISD::FADD:
20535    break;
20536  }
20537  if (SelectA != SelectB.getOperand(0))
20538    return SDValue();
20539
20540  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
20541  ISD::CondCode InverseCC =
20542      ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
20543  auto InverseSetCC =
20544      DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
20545                   SetCC.getOperand(1), InverseCC);
20546
20547  return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
20548                     {InverseSetCC, SelectB, SelectA});
20549}
20550
20551// vselect (v1i1 setcc) ->
20552//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it could legalize "VSELECT v1i1" correctly, there would be no
// need to combine such a VSELECT.
20556static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
20557  if (auto SwapResult = trySwapVSelectOperands(N, DAG))
20558    return SwapResult;
20559
20560  SDValue N0 = N->getOperand(0);
20561  EVT CCVT = N0.getValueType();
20562
20563  if (isAllActivePredicate(DAG, N0))
20564    return N->getOperand(1);
20565
20566  if (isAllInactivePredicate(N0))
20567    return N->getOperand(2);
20568
20569  // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
  // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
20571  // supported types.
20572  SDValue SetCC = N->getOperand(0);
20573  if (SetCC.getOpcode() == ISD::SETCC &&
20574      SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
20575    SDValue CmpLHS = SetCC.getOperand(0);
20576    EVT VT = CmpLHS.getValueType();
20577    SDNode *CmpRHS = SetCC.getOperand(1).getNode();
20578    SDNode *SplatLHS = N->getOperand(1).getNode();
20579    SDNode *SplatRHS = N->getOperand(2).getNode();
20580    APInt SplatLHSVal;
20581    if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
20582        VT.isSimple() &&
20583        is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
20584                               MVT::v2i32, MVT::v4i32, MVT::v2i64}),
20585                     VT.getSimpleVT().SimpleTy) &&
20586        ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
20587        SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
20588        ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
20589      unsigned NumElts = VT.getVectorNumElements();
20590      SmallVector<SDValue, 8> Ops(
20591          NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
20592                                   VT.getScalarType()));
20593      SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
20594
20595      auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
20596      auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
20597      return Or;
20598    }
20599  }
20600
20601  if (N0.getOpcode() != ISD::SETCC ||
20602      CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
20603      CCVT.getVectorElementType() != MVT::i1)
20604    return SDValue();
20605
20606  EVT ResVT = N->getValueType(0);
20607  EVT CmpVT = N0.getOperand(0).getValueType();
20608  // Only combine when the result type is of the same size as the compared
20609  // operands.
20610  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
20611    return SDValue();
20612
20613  SDValue IfTrue = N->getOperand(1);
20614  SDValue IfFalse = N->getOperand(2);
20615  SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
20616                       N0.getOperand(0), N0.getOperand(1),
20617                       cast<CondCodeSDNode>(N0.getOperand(2))->get());
20618  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
20619                     IfTrue, IfFalse);
20620}
20621
20622/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
20623/// the compare-mask instructions rather than going via NZCV, even if LHS and
20624/// RHS are really scalar. This replaces any scalar setcc in the above pattern
20625/// with a vector one followed by a DUP shuffle on the result.
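///
/// For example (a sketch): "select (setcc olt, f64 %a, f64 %b), v2f64 %x,
/// v2f64 %y" is rewritten so that %a and %b are compared as single-lane
/// vectors, the resulting mask lane is duplicated across a v2i64 mask, and
/// the final value is produced with a vector select rather than by extending
/// a scalar i1 result.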
20626static SDValue performSelectCombine(SDNode *N,
20627                                    TargetLowering::DAGCombinerInfo &DCI) {
20628  SelectionDAG &DAG = DCI.DAG;
20629  SDValue N0 = N->getOperand(0);
20630  EVT ResVT = N->getValueType(0);
20631
20632  if (N0.getOpcode() != ISD::SETCC)
20633    return SDValue();
20634
20635  if (ResVT.isScalableVector())
20636    return SDValue();
20637
20638  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
20639  // scalar SetCCResultType. We also don't expect vectors, because we assume
20640  // that selects fed by vector SETCCs are canonicalized to VSELECT.
20641  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
20642         "Scalar-SETCC feeding SELECT has unexpected result type!");
20643
  // If NumMaskElts == 0, the comparison is larger than the select result. The
20645  // largest real NEON comparison is 64-bits per lane, which means the result is
20646  // at most 32-bits and an illegal vector. Just bail out for now.
20647  EVT SrcVT = N0.getOperand(0).getValueType();
20648
20649  // Don't try to do this optimization when the setcc itself has i1 operands.
20650  // There are no legal vectors of i1, so this would be pointless.
20651  if (SrcVT == MVT::i1)
20652    return SDValue();
20653
20654  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
20655  if (!ResVT.isVector() || NumMaskElts == 0)
20656    return SDValue();
20657
20658  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
20659  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
20660
20661  // Also bail out if the vector CCVT isn't the same size as ResVT.
20662  // This can happen if the SETCC operand size doesn't divide the ResVT size
20663  // (e.g., f64 vs v3f32).
20664  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
20665    return SDValue();
20666
20667  // Make sure we didn't create illegal types, if we're not supposed to.
20668  assert(DCI.isBeforeLegalize() ||
20669         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
20670
20671  // First perform a vector comparison, where lane 0 is the one we're interested
20672  // in.
20673  SDLoc DL(N0);
20674  SDValue LHS =
20675      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
20676  SDValue RHS =
20677      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
20678  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
20679
20680  // Now duplicate the comparison mask we want across all other lanes.
20681  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
20682  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
20683  Mask = DAG.getNode(ISD::BITCAST, DL,
20684                     ResVT.changeVectorElementTypeToInteger(), Mask);
20685
20686  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
20687}
20688
20689static SDValue performDUPCombine(SDNode *N,
20690                                 TargetLowering::DAGCombinerInfo &DCI) {
20691  EVT VT = N->getValueType(0);
20692  // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
  // 128-bit vector version.
20694  if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
20695    EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20696    if (SDNode *LN = DCI.DAG.getNodeIfExists(
20697            N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
20698      SDLoc DL(N);
20699      return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
20700                             DCI.DAG.getConstant(0, DL, MVT::i64));
20701    }
20702  }
20703
20704  return performPostLD1Combine(N, DCI, false);
20705}
20706
20707/// Get rid of unnecessary NVCASTs (that don't change the type).
20708static SDValue performNVCASTCombine(SDNode *N) {
20709  if (N->getValueType(0) == N->getOperand(0).getValueType())
20710    return N->getOperand(0);
20711
20712  return SDValue();
20713}
20714
20715// If all users of the globaladdr are of the form (globaladdr + constant), find
20716// the smallest constant, fold it into the globaladdr's offset and rewrite the
20717// globaladdr as (globaladdr + constant) - constant.
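//
// For example (a sketch): if a global G is only used as (add G, 4) and
// (add G, 12), G is rewritten as (sub (globaladdr G + 4), 4); after further
// combining, the first use addresses G + 4 directly and the second becomes an
// add of 8 on top of it.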
20718static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
20719                                           const AArch64Subtarget *Subtarget,
20720                                           const TargetMachine &TM) {
20721  auto *GN = cast<GlobalAddressSDNode>(N);
20722  if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
20723      AArch64II::MO_NO_FLAG)
20724    return SDValue();
20725
20726  uint64_t MinOffset = -1ull;
20727  for (SDNode *N : GN->uses()) {
20728    if (N->getOpcode() != ISD::ADD)
20729      return SDValue();
20730    auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
20731    if (!C)
20732      C = dyn_cast<ConstantSDNode>(N->getOperand(1));
20733    if (!C)
20734      return SDValue();
20735    MinOffset = std::min(MinOffset, C->getZExtValue());
20736  }
20737  uint64_t Offset = MinOffset + GN->getOffset();
20738
20739  // Require that the new offset is larger than the existing one. Otherwise, we
20740  // can end up oscillating between two possible DAGs, for example,
20741  // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
20742  if (Offset <= uint64_t(GN->getOffset()))
20743    return SDValue();
20744
20745  // Check whether folding this offset is legal. It must not go out of bounds of
20746  // the referenced object to avoid violating the code model, and must be
20747  // smaller than 2^20 because this is the largest offset expressible in all
20748  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
20749  // stores an immediate signed 21-bit offset.)
20750  //
20751  // This check also prevents us from folding negative offsets, which will end
20752  // up being treated in the same way as large positive ones. They could also
20753  // cause code model violations, and aren't really common enough to matter.
20754  if (Offset >= (1 << 20))
20755    return SDValue();
20756
20757  const GlobalValue *GV = GN->getGlobal();
20758  Type *T = GV->getValueType();
20759  if (!T->isSized() ||
20760      Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
20761    return SDValue();
20762
20763  SDLoc DL(GN);
20764  SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
20765  return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
20766                     DAG.getConstant(MinOffset, DL, MVT::i64));
20767}
20768
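// When the target has CSSC, fold ctlz(bitreverse(x)) -> cttz(x) for scalar
// integers.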
20769static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
20770                                  const AArch64Subtarget *Subtarget) {
20771  SDValue BR = N->getOperand(0);
20772  if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
20773      !BR.getValueType().isScalarInteger())
20774    return SDValue();
20775
20776  SDLoc DL(N);
20777  return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
20778}
20779
20780// Turns the vector of indices into a vector of byte offsets by scaling Offset
20781// by (BitWidth / 8).
20782static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
20783                                          SDLoc DL, unsigned BitWidth) {
20784  assert(Offset.getValueType().isScalableVector() &&
20785         "This method is only for scalable vectors of offsets");
20786
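  // Shifting each index left by log2(BitWidth / 8) multiplies it by the
  // element size in bytes.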
20787  SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
20788  SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
20789
20790  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
20791}
20792
20793/// Check if the value of \p OffsetInBytes can be used as an immediate for
20794/// the gather load/prefetch and scatter store instructions with vector base and
20795/// immediate offset addressing mode:
20796///
20797///      [<Zn>.[S|D]{, #<imm>}]
20798///
20799/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
20800inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
20801                                                  unsigned ScalarSizeInBytes) {
20802  // The immediate is not a multiple of the scalar size.
20803  if (OffsetInBytes % ScalarSizeInBytes)
20804    return false;
20805
20806  // The immediate is out of range.
20807  if (OffsetInBytes / ScalarSizeInBytes > 31)
20808    return false;
20809
20810  return true;
20811}
20812
20813/// Check if the value of \p Offset represents a valid immediate for the SVE
20814/// gather load/prefetch and scatter store instructions with vector base and
20815/// immediate offset addressing mode:
20816///
20817///      [<Zn>.[S|D]{, #<imm>}]
20818///
20819/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
20820static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
20821                                           unsigned ScalarSizeInBytes) {
20822  ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
20823  return OffsetConst && isValidImmForSVEVecImmAddrMode(
20824                            OffsetConst->getZExtValue(), ScalarSizeInBytes);
20825}
20826
20827static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
20828                                          unsigned Opcode,
20829                                          bool OnlyPackedOffsets = true) {
20830  const SDValue Src = N->getOperand(2);
20831  const EVT SrcVT = Src->getValueType(0);
20832  assert(SrcVT.isScalableVector() &&
20833         "Scatter stores are only possible for SVE vectors");
20834
20835  SDLoc DL(N);
20836  MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
20837
20838  // Make sure that source data will fit into an SVE register
20839  if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
20840    return SDValue();
20841
20842  // For FPs, ACLE only supports _packed_ single and double precision types.
20843  if (SrcElVT.isFloatingPoint())
20844    if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
20845      return SDValue();
20846
20847  // Depending on the addressing mode, this is either a pointer or a vector of
20848  // pointers (that fits into one register)
20849  SDValue Base = N->getOperand(4);
20850  // Depending on the addressing mode, this is either a single offset or a
20851  // vector of offsets  (that fits into one register)
20852  SDValue Offset = N->getOperand(5);
20853
20854  // For "scalar + vector of indices", just scale the indices. This only
20855  // applies to non-temporal scatters because there's no instruction that takes
20856  // indices.
20857  if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
20858    Offset =
20859        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
20860    Opcode = AArch64ISD::SSTNT1_PRED;
20861  }
20862
20863  // In the case of non-temporal scatter stores there's only one SVE instruction
20864  // per data-size: "scalar + vector", i.e.
20865  //    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
20866  // Since we do have intrinsics that allow the arguments to be in a different
20867  // order, we may need to swap them to match the spec.
20868  if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
20869    std::swap(Base, Offset);
20870
20871  // SST1_IMM requires that the offset is an immediate that is:
20872  //    * a multiple of #SizeInBytes,
20873  //    * in the range [0, 31 x #SizeInBytes],
20874  // where #SizeInBytes is the size in bytes of the stored items. For
20875  // immediates outside that range and non-immediate scalar offsets use SST1 or
20876  // SST1_UXTW instead.
20877  if (Opcode == AArch64ISD::SST1_IMM_PRED) {
20878    if (!isValidImmForSVEVecImmAddrMode(Offset,
20879                                        SrcVT.getScalarSizeInBits() / 8)) {
20880      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
20881        Opcode = AArch64ISD::SST1_UXTW_PRED;
20882      else
20883        Opcode = AArch64ISD::SST1_PRED;
20884
20885      std::swap(Base, Offset);
20886    }
20887  }
20888
20889  auto &TLI = DAG.getTargetLoweringInfo();
20890  if (!TLI.isTypeLegal(Base.getValueType()))
20891    return SDValue();
20892
20893  // Some scatter store variants allow unpacked offsets, but only as nxv2i32
20894  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
20895  // nxv2i64. Legalize accordingly.
20896  if (!OnlyPackedOffsets &&
20897      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
20898    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
20899
20900  if (!TLI.isTypeLegal(Offset.getValueType()))
20901    return SDValue();
20902
20903  // Source value type that is representable in hardware
20904  EVT HwSrcVt = getSVEContainerType(SrcVT);
20905
20906  // Keep the original type of the input data to store - this is needed to be
20907  // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
20908  // FP values we want the integer equivalent, so just use HwSrcVt.
20909  SDValue InputVT = DAG.getValueType(SrcVT);
20910  if (SrcVT.isFloatingPoint())
20911    InputVT = DAG.getValueType(HwSrcVt);
20912
20913  SDVTList VTs = DAG.getVTList(MVT::Other);
20914  SDValue SrcNew;
20915
20916  if (Src.getValueType().isFloatingPoint())
20917    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
20918  else
20919    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
20920
20921  SDValue Ops[] = {N->getOperand(0), // Chain
20922                   SrcNew,
20923                   N->getOperand(3), // Pg
20924                   Base,
20925                   Offset,
20926                   InputVT};
20927
20928  return DAG.getNode(Opcode, DL, VTs, Ops);
20929}
20930
20931static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
20932                                        unsigned Opcode,
20933                                        bool OnlyPackedOffsets = true) {
20934  const EVT RetVT = N->getValueType(0);
20935  assert(RetVT.isScalableVector() &&
20936         "Gather loads are only possible for SVE vectors");
20937
20938  SDLoc DL(N);
20939
20940  // Make sure that the loaded data will fit into an SVE register
20941  if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
20942    return SDValue();
20943
20944  // Depending on the addressing mode, this is either a pointer or a vector of
20945  // pointers (that fits into one register)
20946  SDValue Base = N->getOperand(3);
20947  // Depending on the addressing mode, this is either a single offset or a
20948  // vector of offsets  (that fits into one register)
20949  SDValue Offset = N->getOperand(4);
20950
20951  // For "scalar + vector of indices", just scale the indices. This only
20952  // applies to non-temporal gathers because there's no instruction that takes
20953  // indices.
20954  if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
20955    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
20956                                        RetVT.getScalarSizeInBits());
20957    Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
20958  }
20959
20960  // In the case of non-temporal gather loads there's only one SVE instruction
20961  // per data-size: "scalar + vector", i.e.
20962  //    * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
20963  // Since we do have intrinsics that allow the arguments to be in a different
20964  // order, we may need to swap them to match the spec.
20965  if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
20966      Offset.getValueType().isVector())
20967    std::swap(Base, Offset);
20968
20969  // GLD{FF}1_IMM requires that the offset is an immediate that is:
20970  //    * a multiple of #SizeInBytes,
20971  //    * in the range [0, 31 x #SizeInBytes],
20972  // where #SizeInBytes is the size in bytes of the loaded items. For
20973  // immediates outside that range and non-immediate scalar offsets use
20974  // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
20975  if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
20976      Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
20977    if (!isValidImmForSVEVecImmAddrMode(Offset,
20978                                        RetVT.getScalarSizeInBits() / 8)) {
20979      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
20980        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
20981                     ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
20982                     : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
20983      else
20984        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
20985                     ? AArch64ISD::GLD1_MERGE_ZERO
20986                     : AArch64ISD::GLDFF1_MERGE_ZERO;
20987
20988      std::swap(Base, Offset);
20989    }
20990  }
20991
20992  auto &TLI = DAG.getTargetLoweringInfo();
20993  if (!TLI.isTypeLegal(Base.getValueType()))
20994    return SDValue();
20995
20996  // Some gather load variants allow unpacked offsets, but only as nxv2i32
20997  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
20998  // nxv2i64. Legalize accordingly.
20999  if (!OnlyPackedOffsets &&
21000      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
21001    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
21002
21003  // Return value type that is representable in hardware
21004  EVT HwRetVt = getSVEContainerType(RetVT);
21005
21006  // Keep the original output value type around - this is needed to be able to
21007  // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
21008  // values we want the integer equivalent, so just use HwRetVt.
21009  SDValue OutVT = DAG.getValueType(RetVT);
21010  if (RetVT.isFloatingPoint())
21011    OutVT = DAG.getValueType(HwRetVt);
21012
21013  SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
21014  SDValue Ops[] = {N->getOperand(0), // Chain
21015                   N->getOperand(2), // Pg
21016                   Base, Offset, OutVT};
21017
21018  SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
21019  SDValue LoadChain = SDValue(Load.getNode(), 1);
21020
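  // Narrow integer results back to the original element type; FP results are
  // handled by the bitcast below.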
21021  if (RetVT.isInteger() && (RetVT != HwRetVt))
21022    Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
21023
21024  // If the original return value was FP, bitcast accordingly. Doing it here
21025  // means that we can avoid adding TableGen patterns for FPs.
21026  if (RetVT.isFloatingPoint())
21027    Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
21028
21029  return DAG.getMergeValues({Load, LoadChain}, DL);
21030}
21031
21032static SDValue
21033performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21034                              SelectionDAG &DAG) {
21035  SDLoc DL(N);
21036  SDValue Src = N->getOperand(0);
21037  unsigned Opc = Src->getOpcode();
21038
21039  // Sign extend of an unsigned unpack -> signed unpack
21040  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
21041
21042    unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
21043                                               : AArch64ISD::SUNPKLO;
21044
21045    // Push the sign extend to the operand of the unpack
21046    // This is necessary where, for example, the operand of the unpack
21047    // is another unpack:
21048    // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
21049    // ->
21050    // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8))
21051    // ->
21052    // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
21053    SDValue ExtOp = Src->getOperand(0);
21054    auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
21055    EVT EltTy = VT.getVectorElementType();
21056    (void)EltTy;
21057
21058    assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
21059           "Sign extending from an invalid type");
21060
21061    EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
21062
21063    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
21064                              ExtOp, DAG.getValueType(ExtVT));
21065
21066    return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
21067  }
21068
21069  if (DCI.isBeforeLegalizeOps())
21070    return SDValue();
21071
21072  if (!EnableCombineMGatherIntrinsics)
21073    return SDValue();
21074
21075  // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
21076  // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
21077  unsigned NewOpc;
21078  unsigned MemVTOpNum = 4;
21079  switch (Opc) {
21080  case AArch64ISD::LD1_MERGE_ZERO:
21081    NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
21082    MemVTOpNum = 3;
21083    break;
21084  case AArch64ISD::LDNF1_MERGE_ZERO:
21085    NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
21086    MemVTOpNum = 3;
21087    break;
21088  case AArch64ISD::LDFF1_MERGE_ZERO:
21089    NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
21090    MemVTOpNum = 3;
21091    break;
21092  case AArch64ISD::GLD1_MERGE_ZERO:
21093    NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
21094    break;
21095  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
21096    NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21097    break;
21098  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
21099    NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
21100    break;
21101  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
21102    NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
21103    break;
21104  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
21105    NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
21106    break;
21107  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
21108    NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
21109    break;
21110  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
21111    NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
21112    break;
21113  case AArch64ISD::GLDFF1_MERGE_ZERO:
21114    NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
21115    break;
21116  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
21117    NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
21118    break;
21119  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
21120    NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
21121    break;
21122  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
21123    NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
21124    break;
21125  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
21126    NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
21127    break;
21128  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
21129    NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
21130    break;
21131  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
21132    NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
21133    break;
21134  case AArch64ISD::GLDNT1_MERGE_ZERO:
21135    NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
21136    break;
21137  default:
21138    return SDValue();
21139  }
21140
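  // The fold is only valid if the type we are sign extending from matches the
  // load's memory type and the load has no other users.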
21141  EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
21142  EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
21143
21144  if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
21145    return SDValue();
21146
21147  EVT DstVT = N->getValueType(0);
21148  SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
21149
21150  SmallVector<SDValue, 5> Ops;
21151  for (unsigned I = 0; I < Src->getNumOperands(); ++I)
21152    Ops.push_back(Src->getOperand(I));
21153
21154  SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
21155  DCI.CombineTo(N, ExtLoad);
21156  DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
21157
21158  // Return N so it doesn't get rechecked
21159  return SDValue(N, 0);
21160}
21161
21162/// Legalize the gather prefetch (scalar + vector addressing mode) when the
21163/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
21164/// != nxv2i32) do not need legalization.
21165static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
21166  const unsigned OffsetPos = 4;
21167  SDValue Offset = N->getOperand(OffsetPos);
21168
21169  // Not an unpacked vector, bail out.
21170  if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
21171    return SDValue();
21172
21173  // Extend the unpacked offset vector to 64-bit lanes.
21174  SDLoc DL(N);
21175  Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
21176  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
21177  // Replace the offset operand with the 64-bit one.
21178  Ops[OffsetPos] = Offset;
21179
21180  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
21181}
21182
21183/// Combines a node carrying the intrinsic
21184/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
21185/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
21186/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
21187/// SVE gather prefetch instruction with vector plus immediate addressing mode.
21188static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
21189                                               unsigned ScalarSizeInBytes) {
21190  const unsigned ImmPos = 4, OffsetPos = 3;
21191  // No need to combine the node if the immediate is valid...
21192  if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
21193    return SDValue();
21194
21195  // ...otherwise swap the offset base with the offset...
21196  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
21197  std::swap(Ops[ImmPos], Ops[OffsetPos]);
21198  // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
21199  // `aarch64_sve_prfb_gather_uxtw_index`.
21200  SDLoc DL(N);
21201  Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
21202                           MVT::i64);
21203
21204  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
21205}
21206
21207// Return true if the vector operation can guarantee that only the first lane
21208// of its result contains data, with all bits in the other lanes set to zero.
21209static bool isLanes1toNKnownZero(SDValue Op) {
21210  switch (Op.getOpcode()) {
21211  default:
21212    return false;
21213  case AArch64ISD::ANDV_PRED:
21214  case AArch64ISD::EORV_PRED:
21215  case AArch64ISD::FADDA_PRED:
21216  case AArch64ISD::FADDV_PRED:
21217  case AArch64ISD::FMAXNMV_PRED:
21218  case AArch64ISD::FMAXV_PRED:
21219  case AArch64ISD::FMINNMV_PRED:
21220  case AArch64ISD::FMINV_PRED:
21221  case AArch64ISD::ORV_PRED:
21222  case AArch64ISD::SADDV_PRED:
21223  case AArch64ISD::SMAXV_PRED:
21224  case AArch64ISD::SMINV_PRED:
21225  case AArch64ISD::UADDV_PRED:
21226  case AArch64ISD::UMAXV_PRED:
21227  case AArch64ISD::UMINV_PRED:
21228    return true;
21229  }
21230}
21231
21232static SDValue removeRedundantInsertVectorElt(SDNode *N) {
21233  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
21234  SDValue InsertVec = N->getOperand(0);
21235  SDValue InsertElt = N->getOperand(1);
21236  SDValue InsertIdx = N->getOperand(2);
21237
21238  // We only care about inserts into the first element...
21239  if (!isNullConstant(InsertIdx))
21240    return SDValue();
21241  // ...of a zero'd vector...
21242  if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
21243    return SDValue();
21244  // ...where the inserted data was previously extracted...
21245  if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
21246    return SDValue();
21247
21248  SDValue ExtractVec = InsertElt.getOperand(0);
21249  SDValue ExtractIdx = InsertElt.getOperand(1);
21250
21251  // ...from the first element of a vector.
21252  if (!isNullConstant(ExtractIdx))
21253    return SDValue();
21254
21255  // If we get here we are effectively trying to zero lanes 1-N of a vector.
21256
21257  // Ensure there's no type conversion going on.
21258  if (N->getValueType(0) != ExtractVec.getValueType())
21259    return SDValue();
21260
21261  if (!isLanes1toNKnownZero(ExtractVec))
21262    return SDValue();
21263
21264  // The explicit zeroing is redundant.
21265  return ExtractVec;
21266}
21267
21268static SDValue
21269performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
21270  if (SDValue Res = removeRedundantInsertVectorElt(N))
21271    return Res;
21272
21273  return performPostLD1Combine(N, DCI, true);
21274}
21275
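// Perform a floating-point VECTOR_SPLICE on the equivalent packed integer
// type and bitcast the result back to the original floating-point type.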
21276static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
21277  EVT Ty = N->getValueType(0);
21278  if (Ty.isInteger())
21279    return SDValue();
21280
21281  EVT IntTy = Ty.changeVectorElementTypeToInteger();
21282  EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
21283  if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
21284      IntTy.getVectorElementType().getScalarSizeInBits())
21285    return SDValue();
21286
21287  SDLoc DL(N);
21288  SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
21289                                     DL, ExtIntTy);
21290  SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
21291                                     DL, ExtIntTy);
21292  SDValue Idx = N->getOperand(2);
21293  SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
21294  SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
21295  return DAG.getBitcast(Ty, Trunc);
21296}
21297
21298static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
21299                                      TargetLowering::DAGCombinerInfo &DCI,
21300                                      const AArch64Subtarget *Subtarget) {
21301  SDValue N0 = N->getOperand(0);
21302  EVT VT = N->getValueType(0);
21303
21304  // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
21305  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
21306    return SDValue();
21307
21308  auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
21309    EVT EltVT = VT.getVectorElementType();
21310    return EltVT == MVT::f32 || EltVT == MVT::f64;
21311  };
21312
21313  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
21314  // We purposefully don't care about legality of the nodes here as we know
21315  // they can be split down into something legal.
21316  if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
21317      N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
21318      VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
21319      VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
21320    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
21321    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
21322                                     LN0->getChain(), LN0->getBasePtr(),
21323                                     N0.getValueType(), LN0->getMemOperand());
21324    DCI.CombineTo(N, ExtLoad);
21325    DCI.CombineTo(
21326        N0.getNode(),
21327        DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
21328                    DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
21329        ExtLoad.getValue(1));
21330    return SDValue(N, 0); // Return N so it doesn't get rechecked!
21331  }
21332
21333  return SDValue();
21334}
21335
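// Expand AArch64ISD::BSP (bitwise select) for plain SVE as
// (Mask & In1) | (~Mask & In2).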
21336static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
21337                                      const AArch64Subtarget *Subtarget) {
21338  EVT VT = N->getValueType(0);
21339
21340  // Don't expand for NEON, SVE2 or SME
21341  if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
21342    return SDValue();
21343
21344  SDLoc DL(N);
21345
21346  SDValue Mask = N->getOperand(0);
21347  SDValue In1 = N->getOperand(1);
21348  SDValue In2 = N->getOperand(2);
21349
21350  SDValue InvMask = DAG.getNOT(DL, Mask, VT);
21351  SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
21352  SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
21353  return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
21354}
21355
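// Push a DUPLANE128 through a bitcast of an inserted 128-bit subvector:
//   duplane128(insert_subvector(undef, bitcast(subvec), 0), 0)
//     -> bitcast(duplane128(insert_subvector(undef, subvec, 0), 0))
// so the duplication is performed on the subvector's original element type.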
21356static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
21357  EVT VT = N->getValueType(0);
21358
21359  SDValue Insert = N->getOperand(0);
21360  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
21361    return SDValue();
21362
21363  if (!Insert.getOperand(0).isUndef())
21364    return SDValue();
21365
21366  uint64_t IdxInsert = Insert.getConstantOperandVal(2);
21367  uint64_t IdxDupLane = N->getConstantOperandVal(1);
21368  if (IdxInsert != 0 || IdxDupLane != 0)
21369    return SDValue();
21370
21371  SDValue Bitcast = Insert.getOperand(1);
21372  if (Bitcast.getOpcode() != ISD::BITCAST)
21373    return SDValue();
21374
21375  SDValue Subvec = Bitcast.getOperand(0);
21376  EVT SubvecVT = Subvec.getValueType();
21377  if (!SubvecVT.is128BitVector())
21378    return SDValue();
21379  EVT NewSubvecVT =
21380      getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
21381
21382  SDLoc DL(N);
21383  SDValue NewInsert =
21384      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
21385                  DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
21386  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
21387                                      NewInsert, N->getOperand(1));
21388  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
21389}
21390
21391SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
21392                                                 DAGCombinerInfo &DCI) const {
21393  SelectionDAG &DAG = DCI.DAG;
21394  switch (N->getOpcode()) {
21395  default:
21396    LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
21397    break;
21398  case ISD::ADD:
21399  case ISD::SUB:
21400    return performAddSubCombine(N, DCI, DAG);
21401  case ISD::BUILD_VECTOR:
21402    return performBuildVectorCombine(N, DCI, DAG);
21403  case ISD::TRUNCATE:
21404    return performTruncateCombine(N, DAG);
21405  case AArch64ISD::ANDS:
21406    return performFlagSettingCombine(N, DCI, ISD::AND);
21407  case AArch64ISD::ADC:
21408    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
21409      return R;
21410    return foldADCToCINC(N, DAG);
21411  case AArch64ISD::SBC:
21412    return foldOverflowCheck(N, DAG, /* IsAdd */ false);
21413  case AArch64ISD::ADCS:
21414    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
21415      return R;
21416    return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
21417  case AArch64ISD::SBCS:
21418    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
21419      return R;
21420    return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
21421  case ISD::XOR:
21422    return performXorCombine(N, DAG, DCI, Subtarget);
21423  case ISD::MUL:
21424    return performMulCombine(N, DAG, DCI, Subtarget);
21425  case ISD::SINT_TO_FP:
21426  case ISD::UINT_TO_FP:
21427    return performIntToFpCombine(N, DAG, Subtarget);
21428  case ISD::FP_TO_SINT:
21429  case ISD::FP_TO_UINT:
21430  case ISD::FP_TO_SINT_SAT:
21431  case ISD::FP_TO_UINT_SAT:
21432    return performFpToIntCombine(N, DAG, DCI, Subtarget);
21433  case ISD::FDIV:
21434    return performFDivCombine(N, DAG, DCI, Subtarget);
21435  case ISD::OR:
21436    return performORCombine(N, DCI, Subtarget, *this);
21437  case ISD::AND:
21438    return performANDCombine(N, DCI);
21439  case ISD::INTRINSIC_WO_CHAIN:
21440    return performIntrinsicCombine(N, DCI, Subtarget);
21441  case ISD::ANY_EXTEND:
21442  case ISD::ZERO_EXTEND:
21443  case ISD::SIGN_EXTEND:
21444    return performExtendCombine(N, DCI, DAG);
21445  case ISD::SIGN_EXTEND_INREG:
21446    return performSignExtendInRegCombine(N, DCI, DAG);
21447  case ISD::CONCAT_VECTORS:
21448    return performConcatVectorsCombine(N, DCI, DAG);
21449  case ISD::EXTRACT_SUBVECTOR:
21450    return performExtractSubvectorCombine(N, DCI, DAG);
21451  case ISD::INSERT_SUBVECTOR:
21452    return performInsertSubvectorCombine(N, DCI, DAG);
21453  case ISD::SELECT:
21454    return performSelectCombine(N, DCI);
21455  case ISD::VSELECT:
21456    return performVSelectCombine(N, DCI.DAG);
21457  case ISD::SETCC:
21458    return performSETCCCombine(N, DCI, DAG);
21459  case ISD::LOAD:
21460    return performLOADCombine(N, DCI, DAG, Subtarget);
21461  case ISD::STORE:
21462    return performSTORECombine(N, DCI, DAG, Subtarget);
21463  case ISD::MSTORE:
21464    return performMSTORECombine(N, DCI, DAG, Subtarget);
21465  case ISD::MGATHER:
21466  case ISD::MSCATTER:
21467    return performMaskedGatherScatterCombine(N, DCI, DAG);
21468  case ISD::VECTOR_SPLICE:
21469    return performSVESpliceCombine(N, DAG);
21470  case ISD::FP_EXTEND:
21471    return performFPExtendCombine(N, DAG, DCI, Subtarget);
21472  case AArch64ISD::BRCOND:
21473    return performBRCONDCombine(N, DCI, DAG);
21474  case AArch64ISD::TBNZ:
21475  case AArch64ISD::TBZ:
21476    return performTBZCombine(N, DCI, DAG);
21477  case AArch64ISD::CSEL:
21478    return performCSELCombine(N, DCI, DAG);
21479  case AArch64ISD::DUP:
21480    return performDUPCombine(N, DCI);
21481  case AArch64ISD::DUPLANE128:
21482    return performDupLane128Combine(N, DAG);
21483  case AArch64ISD::NVCAST:
21484    return performNVCASTCombine(N);
21485  case AArch64ISD::SPLICE:
21486    return performSpliceCombine(N, DAG);
21487  case AArch64ISD::UUNPKLO:
21488  case AArch64ISD::UUNPKHI:
21489    return performUnpackCombine(N, DAG, Subtarget);
21490  case AArch64ISD::UZP1:
21491    return performUzpCombine(N, DAG);
21492  case AArch64ISD::SETCC_MERGE_ZERO:
21493    return performSetccMergeZeroCombine(N, DCI);
21494  case AArch64ISD::REINTERPRET_CAST:
21495    return performReinterpretCastCombine(N);
21496  case AArch64ISD::GLD1_MERGE_ZERO:
21497  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
21498  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
21499  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
21500  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
21501  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
21502  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
21503  case AArch64ISD::GLD1S_MERGE_ZERO:
21504  case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
21505  case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
21506  case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
21507  case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
21508  case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
21509  case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
21510    return performGLD1Combine(N, DAG);
21511  case AArch64ISD::VASHR:
21512  case AArch64ISD::VLSHR:
21513    return performVectorShiftCombine(N, *this, DCI);
21514  case AArch64ISD::SUNPKLO:
21515    return performSunpkloCombine(N, DAG);
21516  case AArch64ISD::BSP:
21517    return performBSPExpandForSVE(N, DAG, Subtarget);
21518  case ISD::INSERT_VECTOR_ELT:
21519    return performInsertVectorEltCombine(N, DCI);
21520  case ISD::EXTRACT_VECTOR_ELT:
21521    return performExtractVectorEltCombine(N, DCI, Subtarget);
21522  case ISD::VECREDUCE_ADD:
21523    return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
21524  case AArch64ISD::UADDV:
21525    return performUADDVCombine(N, DAG);
21526  case AArch64ISD::SMULL:
21527  case AArch64ISD::UMULL:
21528  case AArch64ISD::PMULL:
21529    return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
21530  case ISD::INTRINSIC_VOID:
21531  case ISD::INTRINSIC_W_CHAIN:
21532    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
21533    case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
21534      return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
21535    case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
21536      return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
21537    case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
21538      return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
21539    case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
21540      return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
21541    case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
21542    case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
21543    case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
21544    case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
21545    case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
21546    case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
21547    case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
21548    case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
21549      return legalizeSVEGatherPrefetchOffsVec(N, DAG);
21550    case Intrinsic::aarch64_neon_ld2:
21551    case Intrinsic::aarch64_neon_ld3:
21552    case Intrinsic::aarch64_neon_ld4:
21553    case Intrinsic::aarch64_neon_ld1x2:
21554    case Intrinsic::aarch64_neon_ld1x3:
21555    case Intrinsic::aarch64_neon_ld1x4:
21556    case Intrinsic::aarch64_neon_ld2lane:
21557    case Intrinsic::aarch64_neon_ld3lane:
21558    case Intrinsic::aarch64_neon_ld4lane:
21559    case Intrinsic::aarch64_neon_ld2r:
21560    case Intrinsic::aarch64_neon_ld3r:
21561    case Intrinsic::aarch64_neon_ld4r:
21562    case Intrinsic::aarch64_neon_st2:
21563    case Intrinsic::aarch64_neon_st3:
21564    case Intrinsic::aarch64_neon_st4:
21565    case Intrinsic::aarch64_neon_st1x2:
21566    case Intrinsic::aarch64_neon_st1x3:
21567    case Intrinsic::aarch64_neon_st1x4:
21568    case Intrinsic::aarch64_neon_st2lane:
21569    case Intrinsic::aarch64_neon_st3lane:
21570    case Intrinsic::aarch64_neon_st4lane:
21571      return performNEONPostLDSTCombine(N, DCI, DAG);
21572    case Intrinsic::aarch64_sve_ldnt1:
21573      return performLDNT1Combine(N, DAG);
21574    case Intrinsic::aarch64_sve_ld1rq:
21575      return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
21576    case Intrinsic::aarch64_sve_ld1ro:
21577      return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
21578    case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
21579      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
21580    case Intrinsic::aarch64_sve_ldnt1_gather:
21581      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
21582    case Intrinsic::aarch64_sve_ldnt1_gather_index:
21583      return performGatherLoadCombine(N, DAG,
21584                                      AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
21585    case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
21586      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
21587    case Intrinsic::aarch64_sve_ld1:
21588      return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
21589    case Intrinsic::aarch64_sve_ldnf1:
21590      return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
21591    case Intrinsic::aarch64_sve_ldff1:
21592      return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
21593    case Intrinsic::aarch64_sve_st1:
21594      return performST1Combine(N, DAG);
21595    case Intrinsic::aarch64_sve_stnt1:
21596      return performSTNT1Combine(N, DAG);
21597    case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
21598      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
21599    case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
21600      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
21601    case Intrinsic::aarch64_sve_stnt1_scatter:
21602      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
21603    case Intrinsic::aarch64_sve_stnt1_scatter_index:
21604      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
21605    case Intrinsic::aarch64_sve_ld1_gather:
21606      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
21607    case Intrinsic::aarch64_sve_ld1_gather_index:
21608      return performGatherLoadCombine(N, DAG,
21609                                      AArch64ISD::GLD1_SCALED_MERGE_ZERO);
21610    case Intrinsic::aarch64_sve_ld1_gather_sxtw:
21611      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
21612                                      /*OnlyPackedOffsets=*/false);
21613    case Intrinsic::aarch64_sve_ld1_gather_uxtw:
21614      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
21615                                      /*OnlyPackedOffsets=*/false);
21616    case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
21617      return performGatherLoadCombine(N, DAG,
21618                                      AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
21619                                      /*OnlyPackedOffsets=*/false);
21620    case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
21621      return performGatherLoadCombine(N, DAG,
21622                                      AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
21623                                      /*OnlyPackedOffsets=*/false);
21624    case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
21625      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
21626    case Intrinsic::aarch64_sve_ldff1_gather:
21627      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
21628    case Intrinsic::aarch64_sve_ldff1_gather_index:
21629      return performGatherLoadCombine(N, DAG,
21630                                      AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
21631    case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
21632      return performGatherLoadCombine(N, DAG,
21633                                      AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
21634                                      /*OnlyPackedOffsets=*/false);
21635    case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
21636      return performGatherLoadCombine(N, DAG,
21637                                      AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
21638                                      /*OnlyPackedOffsets=*/false);
21639    case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
21640      return performGatherLoadCombine(N, DAG,
21641                                      AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
21642                                      /*OnlyPackedOffsets=*/false);
21643    case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
21644      return performGatherLoadCombine(N, DAG,
21645                                      AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
21646                                      /*OnlyPackedOffsets=*/false);
21647    case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
21648      return performGatherLoadCombine(N, DAG,
21649                                      AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
21650    case Intrinsic::aarch64_sve_st1_scatter:
21651      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
21652    case Intrinsic::aarch64_sve_st1_scatter_index:
21653      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
21654    case Intrinsic::aarch64_sve_st1_scatter_sxtw:
21655      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
21656                                        /*OnlyPackedOffsets=*/false);
21657    case Intrinsic::aarch64_sve_st1_scatter_uxtw:
21658      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
21659                                        /*OnlyPackedOffsets=*/false);
21660    case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
21661      return performScatterStoreCombine(N, DAG,
21662                                        AArch64ISD::SST1_SXTW_SCALED_PRED,
21663                                        /*OnlyPackedOffsets=*/false);
21664    case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
21665      return performScatterStoreCombine(N, DAG,
21666                                        AArch64ISD::SST1_UXTW_SCALED_PRED,
21667                                        /*OnlyPackedOffsets=*/false);
21668    case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
21669      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
21670    case Intrinsic::aarch64_rndr:
21671    case Intrinsic::aarch64_rndrrs: {
21672      unsigned IntrinsicID =
21673          cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
21674      auto Register =
21675          (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
21676                                                  : AArch64SysReg::RNDRRS);
21677      SDLoc DL(N);
21678      SDValue A = DAG.getNode(
21679          AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
21680          N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
21681      SDValue B = DAG.getNode(
21682          AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
21683          DAG.getConstant(0, DL, MVT::i32),
21684          DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
21685      return DAG.getMergeValues(
21686          {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
21687    }
21688    default:
21689      break;
21690    }
21691    break;
21692  case ISD::GlobalAddress:
21693    return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
21694  case ISD::CTLZ:
21695    return performCTLZCombine(N, DAG, Subtarget);
21696  }
21697  return SDValue();
21698}
21699
21700// Check if the return value is used as only a return value, as otherwise
21701// we can't perform a tail-call. In particular, we need to check for
21702// target ISD nodes that are returns and any other "odd" constructs
21703// that the generic analysis code won't necessarily catch.
21704bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
21705                                               SDValue &Chain) const {
21706  if (N->getNumValues() != 1)
21707    return false;
21708  if (!N->hasNUsesOfValue(1, 0))
21709    return false;
21710
21711  SDValue TCChain = Chain;
21712  SDNode *Copy = *N->use_begin();
21713  if (Copy->getOpcode() == ISD::CopyToReg) {
21714    // If the copy has a glue operand, we conservatively assume it isn't safe to
21715    // perform a tail call.
21716    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
21717        MVT::Glue)
21718      return false;
21719    TCChain = Copy->getOperand(0);
21720  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
21721    return false;
21722
21723  bool HasRet = false;
21724  for (SDNode *Node : Copy->uses()) {
21725    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
21726      return false;
21727    HasRet = true;
21728  }
21729
21730  if (!HasRet)
21731    return false;
21732
21733  Chain = TCChain;
21734  return true;
21735}
21736
21737// Return whether an instruction can potentially be optimized to a tail
21738// call. This will cause the optimizers to attempt to move, or duplicate,
21739// return instructions to help enable tail call optimizations for this
21740// instruction.
21741bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
21742  return CI->isTailCall();
21743}
21744
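// Shared helper for the pre/post-indexed hooks below: match Op as
// (add/sub Base, signed 9-bit constant) and extract the Base/Offset parts.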
21745bool AArch64TargetLowering::getIndexedAddressParts(
21746    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
21747    ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const {
21748  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
21749    return false;
21750
21751  // Non-null if there is exactly one user of the loaded value (ignoring chain).
21752  SDNode *ValOnlyUser = nullptr;
21753  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
21754       ++UI) {
21755    if (UI.getUse().getResNo() == 1)
21756      continue; // Ignore chain.
21757    if (ValOnlyUser == nullptr)
21758      ValOnlyUser = *UI;
21759    else {
21760      ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
21761      break;
21762    }
21763  }
21764
21765  auto IsUndefOrZero = [](SDValue V) {
21766    return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
21767  };
21768
21769  // If the only user of the value is a scalable vector splat, it is
21770  // preferable to do a replicating load (ld1r*).
21771  if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
21772      (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
21773       (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
21774        IsUndefOrZero(ValOnlyUser->getOperand(2)))))
21775    return false;
21776
21777  Base = Op->getOperand(0);
21778  // All of the indexed addressing mode instructions take a signed
21779  // 9-bit immediate offset.
21780  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
21781    int64_t RHSC = RHS->getSExtValue();
21782    if (Op->getOpcode() == ISD::SUB)
21783      RHSC = -(uint64_t)RHSC;
21784    if (!isInt<9>(RHSC))
21785      return false;
21786    IsInc = (Op->getOpcode() == ISD::ADD);
21787    Offset = Op->getOperand(1);
21788    return true;
21789  }
21790  return false;
21791}
21792
21793bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
21794                                                      SDValue &Offset,
21795                                                      ISD::MemIndexedMode &AM,
21796                                                      SelectionDAG &DAG) const {
21797  EVT VT;
21798  SDValue Ptr;
21799  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
21800    VT = LD->getMemoryVT();
21801    Ptr = LD->getBasePtr();
21802  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
21803    VT = ST->getMemoryVT();
21804    Ptr = ST->getBasePtr();
21805  } else
21806    return false;
21807
21808  bool IsInc;
21809  if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
21810    return false;
21811  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
21812  return true;
21813}
21814
21815bool AArch64TargetLowering::getPostIndexedAddressParts(
21816    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
21817    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
21818  EVT VT;
21819  SDValue Ptr;
21820  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
21821    VT = LD->getMemoryVT();
21822    Ptr = LD->getBasePtr();
21823  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
21824    VT = ST->getMemoryVT();
21825    Ptr = ST->getBasePtr();
21826  } else
21827    return false;
21828
21829  bool IsInc;
21830  if (!getIndexedAddressParts(N, Op, Base, Offset, AM, IsInc, DAG))
21831    return false;
21832  // Post-indexing updates the base, so it's not a valid transform
21833  // if that's not the same as the load's pointer.
21834  if (Ptr != Base)
21835    return false;
21836  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
21837  return true;
21838}
21839
21840void AArch64TargetLowering::ReplaceBITCASTResults(
21841    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
21842  SDLoc DL(N);
21843  SDValue Op = N->getOperand(0);
21844  EVT VT = N->getValueType(0);
21845  EVT SrcVT = Op.getValueType();
21846
21847  if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
21848    assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
21849           "Expected fp->int bitcast!");
21850
21851    // Bitcasting between unpacked vector types of different element counts is
21852    // not a NOP because the live elements are laid out differently.
21853    //                01234567
21854    // e.g. nxv2i32 = XX??XX??
21855    //      nxv4f16 = X?X?X?X?
21856    if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
21857      return;
21858
21859    SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
21860    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
21861    return;
21862  }
21863
21864  if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
21865    return;
21866
21867  Op = SDValue(
21868      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
21869                         DAG.getUNDEF(MVT::i32), Op,
21870                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
21871      0);
21872  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
21873  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
21874}
21875
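// Try to replace a 256-bit (f)add of X and a <1,0,3,2,...> lane-swapped
// shuffle of X with an ADDP of the two 128-bit halves of X, shuffling each
// pairwise sum back into both lanes it was computed from.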
21876static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
21877                               SelectionDAG &DAG,
21878                               const AArch64Subtarget *Subtarget) {
21879  EVT VT = N->getValueType(0);
21880  if (!VT.is256BitVector() ||
21881      (VT.getScalarType().isFloatingPoint() &&
21882       !N->getFlags().hasAllowReassociation()) ||
21883      (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
21884    return;
21885
21886  SDValue X = N->getOperand(0);
21887  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
21888  if (!Shuf) {
21889    Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
21890    X = N->getOperand(1);
21891    if (!Shuf)
21892      return;
21893  }
21894
21895  if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
21896    return;
21897
21898  // Check the mask is 1,0,3,2,5,4,...
21899  ArrayRef<int> Mask = Shuf->getMask();
21900  for (int I = 0, E = Mask.size(); I < E; I++)
21901    if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
21902      return;
21903
21904  SDLoc DL(N);
21905  auto LoHi = DAG.SplitVector(X, DL);
21906  assert(LoHi.first.getValueType() == LoHi.second.getValueType());
21907  SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
21908                             LoHi.first, LoHi.second);
21909
21910  // Shuffle the elements back into order.
21911  SmallVector<int> NMask;
21912  for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
21913    NMask.push_back(I);
21914    NMask.push_back(I);
21915  }
21916  Results.push_back(
21917      DAG.getVectorShuffle(VT, DL,
21918                           DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
21919                                       DAG.getUNDEF(LoHi.first.getValueType())),
21920                           DAG.getUNDEF(VT), NMask));
21921}
21922
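// Split the reduction operand in half, combine the halves with InterOp, then
// reduce across the resulting vector with AcrossOp.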
21923static void ReplaceReductionResults(SDNode *N,
21924                                    SmallVectorImpl<SDValue> &Results,
21925                                    SelectionDAG &DAG, unsigned InterOp,
21926                                    unsigned AcrossOp) {
21927  EVT LoVT, HiVT;
21928  SDValue Lo, Hi;
21929  SDLoc dl(N);
21930  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
21931  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
21932  SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
21933  SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
21934  Results.push_back(SplitVal);
21935}
21936
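// Split an i128 value into its low and high 64-bit halves.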
21937static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
21938  SDLoc DL(N);
21939  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
21940  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
21941                           DAG.getNode(ISD::SRL, DL, MVT::i128, N,
21942                                       DAG.getConstant(64, DL, MVT::i64)));
21943  return std::make_pair(Lo, Hi);
21944}
21945
21946void AArch64TargetLowering::ReplaceExtractSubVectorResults(
21947    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
21948  SDValue In = N->getOperand(0);
21949  EVT InVT = In.getValueType();
21950
21951  // Common code will handle these just fine.
21952  if (!InVT.isScalableVector() || !InVT.isInteger())
21953    return;
21954
21955  SDLoc DL(N);
21956  EVT VT = N->getValueType(0);
21957
21958  // The following checks bail if this is not a halving operation.
21959
21960  ElementCount ResEC = VT.getVectorElementCount();
21961
21962  if (InVT.getVectorElementCount() != (ResEC * 2))
21963    return;
21964
21965  auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
21966  if (!CIndex)
21967    return;
21968
21969  unsigned Index = CIndex->getZExtValue();
21970  if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
21971    return;
21972
21973  unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
21974  EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
21975
21976  SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
21977  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
21978}
21979
21980// Create an even/odd pair of X registers holding integer value V.
21981static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
21982  SDLoc dl(V.getNode());
21983  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
21984  SDValue VHi = DAG.getAnyExtOrTrunc(
21985      DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
21986      dl, MVT::i64);
21987  if (DAG.getDataLayout().isBigEndian())
21988    std::swap(VLo, VHi);
21989  SDValue RegClass =
21990      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
21991  SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
21992  SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
21993  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
21994  return SDValue(
21995      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
21996}
21997
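// Expand a 128-bit CMP_SWAP: use the CASP family when LSE or outline atomics
// is available, otherwise fall back to a CMP_SWAP_128 pseudo instruction.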
21998static void ReplaceCMP_SWAP_128Results(SDNode *N,
21999                                       SmallVectorImpl<SDValue> &Results,
22000                                       SelectionDAG &DAG,
22001                                       const AArch64Subtarget *Subtarget) {
22002  assert(N->getValueType(0) == MVT::i128 &&
22003         "AtomicCmpSwap on types less than 128 should be legal");
22004
22005  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
22006  if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
22007    // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
22008    // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
22009    SDValue Ops[] = {
22010        createGPRPairNode(DAG, N->getOperand(2)), // Compare value
22011        createGPRPairNode(DAG, N->getOperand(3)), // Store value
22012        N->getOperand(1), // Ptr
22013        N->getOperand(0), // Chain in
22014    };
22015
22016    unsigned Opcode;
22017    switch (MemOp->getMergedOrdering()) {
22018    case AtomicOrdering::Monotonic:
22019      Opcode = AArch64::CASPX;
22020      break;
22021    case AtomicOrdering::Acquire:
22022      Opcode = AArch64::CASPAX;
22023      break;
22024    case AtomicOrdering::Release:
22025      Opcode = AArch64::CASPLX;
22026      break;
22027    case AtomicOrdering::AcquireRelease:
22028    case AtomicOrdering::SequentiallyConsistent:
22029      Opcode = AArch64::CASPALX;
22030      break;
22031    default:
22032      llvm_unreachable("Unexpected ordering!");
22033    }
22034
22035    MachineSDNode *CmpSwap = DAG.getMachineNode(
22036        Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
22037    DAG.setNodeMemRefs(CmpSwap, {MemOp});
22038
22039    unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
22040    if (DAG.getDataLayout().isBigEndian())
22041      std::swap(SubReg1, SubReg2);
22042    SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
22043                                            SDValue(CmpSwap, 0));
22044    SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
22045                                            SDValue(CmpSwap, 0));
22046    Results.push_back(
22047        DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
22048    Results.push_back(SDValue(CmpSwap, 1)); // Chain out
22049    return;
22050  }
22051
22052  unsigned Opcode;
22053  switch (MemOp->getMergedOrdering()) {
22054  case AtomicOrdering::Monotonic:
22055    Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
22056    break;
22057  case AtomicOrdering::Acquire:
22058    Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
22059    break;
22060  case AtomicOrdering::Release:
22061    Opcode = AArch64::CMP_SWAP_128_RELEASE;
22062    break;
22063  case AtomicOrdering::AcquireRelease:
22064  case AtomicOrdering::SequentiallyConsistent:
22065    Opcode = AArch64::CMP_SWAP_128;
22066    break;
22067  default:
22068    llvm_unreachable("Unexpected ordering!");
22069  }
22070
22071  auto Desired = splitInt128(N->getOperand(2), DAG);
22072  auto New = splitInt128(N->getOperand(3), DAG);
22073  SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
22074                   New.first,        New.second,    N->getOperand(0)};
22075  SDNode *CmpSwap = DAG.getMachineNode(
22076      Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
22077      Ops);
22078  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
22079
22080  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
22081                                SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
22082  Results.push_back(SDValue(CmpSwap, 3));
22083}
22084
22085void AArch64TargetLowering::ReplaceNodeResults(
22086    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
22087  switch (N->getOpcode()) {
22088  default:
22089    llvm_unreachable("Don't know how to custom expand this");
22090  case ISD::BITCAST:
22091    ReplaceBITCASTResults(N, Results, DAG);
22092    return;
22093  case ISD::VECREDUCE_ADD:
22094  case ISD::VECREDUCE_SMAX:
22095  case ISD::VECREDUCE_SMIN:
22096  case ISD::VECREDUCE_UMAX:
22097  case ISD::VECREDUCE_UMIN:
22098    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
22099    return;
22100  case ISD::ADD:
22101  case ISD::FADD:
22102    ReplaceAddWithADDP(N, Results, DAG, Subtarget);
22103    return;
22104
22105  case ISD::CTPOP:
22106  case ISD::PARITY:
22107    if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
22108      Results.push_back(Result);
22109    return;
22110  case AArch64ISD::SADDV:
22111    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
22112    return;
22113  case AArch64ISD::UADDV:
22114    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
22115    return;
22116  case AArch64ISD::SMINV:
22117    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
22118    return;
22119  case AArch64ISD::UMINV:
22120    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
22121    return;
22122  case AArch64ISD::SMAXV:
22123    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
22124    return;
22125  case AArch64ISD::UMAXV:
22126    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
22127    return;
22128  case ISD::FP_TO_UINT:
22129  case ISD::FP_TO_SINT:
22130  case ISD::STRICT_FP_TO_SINT:
22131  case ISD::STRICT_FP_TO_UINT:
22132    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
22133    // Let normal code take care of it by not adding anything to Results.
22134    return;
22135  case ISD::ATOMIC_CMP_SWAP:
22136    ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
22137    return;
22138  case ISD::ATOMIC_LOAD:
22139  case ISD::LOAD: {
22140    MemSDNode *LoadNode = cast<MemSDNode>(N);
22141    EVT MemVT = LoadNode->getMemoryVT();
22142    // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
22143    // targets.
22144    if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
22145        MemVT.getSizeInBits() == 256u &&
22146        (MemVT.getScalarSizeInBits() == 8u ||
22147         MemVT.getScalarSizeInBits() == 16u ||
22148         MemVT.getScalarSizeInBits() == 32u ||
22149         MemVT.getScalarSizeInBits() == 64u)) {
22150
22151      SDValue Result = DAG.getMemIntrinsicNode(
22152          AArch64ISD::LDNP, SDLoc(N),
22153          DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
22154                         MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
22155                         MVT::Other}),
22156          {LoadNode->getChain(), LoadNode->getBasePtr()},
22157          LoadNode->getMemoryVT(), LoadNode->getMemOperand());
22158
22159      SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
22160                                 Result.getValue(0), Result.getValue(1));
22161      Results.append({Pair, Result.getValue(2) /* Chain */});
22162      return;
22163    }
22164
22165    if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
22166        LoadNode->getMemoryVT() != MVT::i128) {
22167      // Loads that are neither volatile nor atomic, or whose memory type is not
22168      // i128, are handled by common code and AArch64's load/store optimizer.
22169      return;
22170    }
22171
22172    if (SDValue(N, 0).getValueType() == MVT::i128) {
22173      SDValue Result = DAG.getMemIntrinsicNode(
22174          AArch64ISD::LDP, SDLoc(N),
22175          DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
22176          {LoadNode->getChain(), LoadNode->getBasePtr()},
22177          LoadNode->getMemoryVT(), LoadNode->getMemOperand());
22178
22179      SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
22180                                 Result.getValue(0), Result.getValue(1));
22181      Results.append({Pair, Result.getValue(2) /* Chain */});
22182    }
22183    return;
22184  }
22185  case ISD::EXTRACT_SUBVECTOR:
22186    ReplaceExtractSubVectorResults(N, Results, DAG);
22187    return;
22188  case ISD::INSERT_SUBVECTOR:
22189  case ISD::CONCAT_VECTORS:
22190    // Custom lowering has been requested for INSERT_SUBVECTOR and
22191    // CONCAT_VECTORS -- but delegate to common code for result type
22192    // legalisation
22193    return;
22194  case ISD::INTRINSIC_WO_CHAIN: {
22195    EVT VT = N->getValueType(0);
22196    assert((VT == MVT::i8 || VT == MVT::i16) &&
22197           "custom lowering for unexpected type");
22198
22199    ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
22200    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
22201    switch (IntID) {
22202    default:
22203      return;
22204    case Intrinsic::aarch64_sve_clasta_n: {
22205      SDLoc DL(N);
22206      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
22207      auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
22208                           N->getOperand(1), Op2, N->getOperand(3));
22209      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
22210      return;
22211    }
22212    case Intrinsic::aarch64_sve_clastb_n: {
22213      SDLoc DL(N);
22214      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
22215      auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
22216                           N->getOperand(1), Op2, N->getOperand(3));
22217      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
22218      return;
22219    }
22220    case Intrinsic::aarch64_sve_lasta: {
22221      SDLoc DL(N);
22222      auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
22223                           N->getOperand(1), N->getOperand(2));
22224      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
22225      return;
22226    }
22227    case Intrinsic::aarch64_sve_lastb: {
22228      SDLoc DL(N);
22229      auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
22230                           N->getOperand(1), N->getOperand(2));
22231      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
22232      return;
22233    }
22234    }
22235  }
22236  case ISD::READ_REGISTER: {
22237    SDLoc DL(N);
22238    assert(N->getValueType(0) == MVT::i128 &&
22239           "READ_REGISTER custom lowering is only for 128-bit sysregs");
22240    SDValue Chain = N->getOperand(0);
22241    SDValue SysRegName = N->getOperand(1);
22242
22243    SDValue Result = DAG.getNode(
22244        AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
22245        Chain, SysRegName);
22246
22247    // Sysregs are not affected by endianness: Result.getValue(0) always
22248    // contains the lower half of the 128-bit System Register value.
22249    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
22250                               Result.getValue(0), Result.getValue(1));
22251    Results.push_back(Pair);
22252    Results.push_back(Result.getValue(2)); // Chain
22253    return;
22254  }
22255  }
22256}
22257
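// Use the generic LOAD_STACK_GUARD node except on Android, Fuchsia and
// OpenBSD, which defer to the default TargetLowering behaviour (Android and
// Fuchsia provide fixed TLS slots for the guard; see getIRStackGuard below).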
22258bool AArch64TargetLowering::useLoadStackGuardNode() const {
22259  if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia() ||
22260      Subtarget->isTargetOpenBSD())
22261    return TargetLowering::useLoadStackGuardNode();
22262  return true;
22263}
22264
22265unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
22266  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
22267  // reciprocal if there are three or more FDIVs.
22268  return 3;
22269}
22270
22271TargetLoweringBase::LegalizeTypeAction
22272AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
22273  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 and v1f32
22274  // to v8i8, v4i16, v2i32 and v2f32 rather than promote them.
22275  if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
22276      VT == MVT::v1f32)
22277    return TypeWidenVector;
22278
22279  return TargetLoweringBase::getPreferredVectorAction(VT);
22280}
22281
22282// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
22283// provided the address is 16-byte aligned.
22284bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
22285  if (!Subtarget->hasLSE2())
22286    return false;
22287
22288  if (auto LI = dyn_cast<LoadInst>(I))
22289    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
22290           LI->getAlign() >= Align(16);
22291
22292  if (auto SI = dyn_cast<StoreInst>(I))
22293    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
22294           SI->getAlign() >= Align(16);
22295
22296  return false;
22297}
22298
22299bool AArch64TargetLowering::shouldInsertFencesForAtomic(
22300    const Instruction *I) const {
22301  return isOpSuitableForLDPSTP(I);
22302}
22303
22304bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
22305    const Instruction *I) const {
22306  // Store-Release instructions only provide seq_cst guarantees when paired with
22307  // Load-Acquire instructions. MSVC CRT does not use these instructions to
22308  // implement seq_cst loads and stores, so we need additional explicit fences
22309  // after memory writes.
22310  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
22311    return false;
22312
22313  switch (I->getOpcode()) {
22314  default:
22315    return false;
22316  case Instruction::AtomicCmpXchg:
22317    return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
22318           AtomicOrdering::SequentiallyConsistent;
22319  case Instruction::AtomicRMW:
22320    return cast<AtomicRMWInst>(I)->getOrdering() ==
22321           AtomicOrdering::SequentiallyConsistent;
22322  case Instruction::Store:
22323    return cast<StoreInst>(I)->getOrdering() ==
22324           AtomicOrdering::SequentiallyConsistent;
22325  }
22326}
22327
22328// Loads and stores smaller than 128 bits are already atomic; ones above that
22329// are doomed anyway, so defer to the default libcall and blame the OS when
22330// things go wrong.
22331TargetLoweringBase::AtomicExpansionKind
22332AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22333  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
22334  if (Size != 128 || isOpSuitableForLDPSTP(SI))
22335    return AtomicExpansionKind::None;
22336  return AtomicExpansionKind::Expand;
22337}
22338
22339// Loads and stores smaller than 128 bits are already atomic; ones above that
22340// are doomed anyway, so defer to the default libcall and blame the OS when
22341// things go wrong.
22342TargetLowering::AtomicExpansionKind
22343AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22344  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
22345
22346  if (Size != 128 || isOpSuitableForLDPSTP(LI))
22347    return AtomicExpansionKind::None;
22348
22349  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
22350  // implement atomicrmw without spilling. If the target address is also on the
22351  // stack and close enough to the spill slot, this can lead to a situation
22352  // where the monitor always gets cleared and the atomic operation can never
22353  // succeed. So at -O0 lower this operation to a CAS loop.
22354  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
22355    return AtomicExpansionKind::CmpXChg;
22356
22357  // Using CAS for an atomic load has a better chance of succeeding under high
22358  // contention situations. So use it if available.
22359  return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
22360                             : AtomicExpansionKind::LLSC;
22361}
22362
22363// For the real atomic operations, we have ldxr/stxr up to 128 bits.
22364TargetLowering::AtomicExpansionKind
22365AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22366  if (AI->isFloatingPointOperation())
22367    return AtomicExpansionKind::CmpXChg;
22368
22369  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
22370  if (Size > 128) return AtomicExpansionKind::None;
22371
22372  // Nand is not supported in LSE.
22373  // Leave 128 bits to LLSC or CmpXChg.
22374  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
22375    if (Subtarget->hasLSE())
22376      return AtomicExpansionKind::None;
22377    if (Subtarget->outlineAtomics()) {
22378      // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
22379      // Don't outline them unless
22380      // (1) high level <atomic> support approved:
22381      //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
22382      // (2) low level libgcc and compiler-rt support implemented by:
22383      //   min/max outline atomics helpers
22384      if (AI->getOperation() != AtomicRMWInst::Min &&
22385          AI->getOperation() != AtomicRMWInst::Max &&
22386          AI->getOperation() != AtomicRMWInst::UMin &&
22387          AI->getOperation() != AtomicRMWInst::UMax) {
22388        return AtomicExpansionKind::None;
22389      }
22390    }
22391  }
22392
22393  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
22394  // implement atomicrmw without spilling. If the target address is also on the
22395  // stack and close enough to the spill slot, this can lead to a situation
22396  // where the monitor always gets cleared and the atomic operation can never
22397  // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
22398  // we have a single CAS instruction that can replace the loop.
22399  if (getTargetMachine().getOptLevel() == CodeGenOpt::None ||
22400      Subtarget->hasLSE())
22401    return AtomicExpansionKind::CmpXChg;
22402
22403  return AtomicExpansionKind::LLSC;
22404}
22405
22406TargetLowering::AtomicExpansionKind
22407AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
22408    AtomicCmpXchgInst *AI) const {
22409  // If subtarget has LSE, leave cmpxchg intact for codegen.
22410  if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
22411    return AtomicExpansionKind::None;
22412  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
22413  // implement cmpxchg without spilling. If the address being exchanged is also
22414  // on the stack and close enough to the spill slot, this can lead to a
22415  // situation where the monitor always gets cleared and the atomic operation
22416  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
22417  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
22418    return AtomicExpansionKind::None;
22419
22420  // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
22421  // it.
22422  unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
22423  if (Size > 64)
22424    return AtomicExpansionKind::None;
22425
22426  return AtomicExpansionKind::LLSC;
22427}
22428
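// Emit a load-linked for Addr: ldxr/ldaxr, or ldxp/ldaxp for 128-bit values.
// For the 128-bit case the generated IR looks roughly like the following
// (illustrative only; exact pointer types depend on the surrounding IR):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
//   %lo = extractvalue { i64, i64 } %lohi, 0
//   %hi = extractvalue { i64, i64 } %lohi, 1
// with the two halves then zero-extended, shifted and OR'd into an i128.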
22429Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
22430                                             Type *ValueTy, Value *Addr,
22431                                             AtomicOrdering Ord) const {
22432  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22433  bool IsAcquire = isAcquireOrStronger(Ord);
22434
22435  // Since i128 isn't legal and intrinsics don't get type-lowered, the ld(a)xp
22436  // intrinsic must return {i64, i64} and we have to recombine them into a
22437  // single i128 here.
22438  if (ValueTy->getPrimitiveSizeInBits() == 128) {
22439    Intrinsic::ID Int =
22440        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
22441    Function *Ldxr = Intrinsic::getDeclaration(M, Int);
22442
22443    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
22444    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
22445
22446    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
22447    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
22448    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
22449    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
22450    return Builder.CreateOr(
22451        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
22452  }
22453
22454  Type *Tys[] = { Addr->getType() };
22455  Intrinsic::ID Int =
22456      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
22457  Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
22458
22459  const DataLayout &DL = M->getDataLayout();
22460  IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
22461  CallInst *CI = Builder.CreateCall(Ldxr, Addr);
22462  CI->addParamAttr(
22463      0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
22464  Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
22465
22466  return Builder.CreateBitCast(Trunc, ValueTy);
22467}
22468
22469void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
22470    IRBuilderBase &Builder) const {
22471  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22472  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
22473}
22474
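// Emit a store-conditional: stxr/stlxr, or stxp/stlxp for 128-bit values. For
// the 128-bit case the value is split into two i64 halves, roughly
// (illustrative only):
//   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, i8* %addr)
// where a zero status indicates that the store succeeded.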
22475Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
22476                                                   Value *Val, Value *Addr,
22477                                                   AtomicOrdering Ord) const {
22478  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22479  bool IsRelease = isReleaseOrStronger(Ord);
22480
22481  // Since the intrinsics must have legal type, the i128 intrinsics take two
22482  // parameters: "i64, i64". We must marshal Val into the appropriate form
22483  // before the call.
22484  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
22485    Intrinsic::ID Int =
22486        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
22487    Function *Stxr = Intrinsic::getDeclaration(M, Int);
22488    Type *Int64Ty = Type::getInt64Ty(M->getContext());
22489
22490    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
22491    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
22492    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
22493    return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
22494  }
22495
22496  Intrinsic::ID Int =
22497      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
22498  Type *Tys[] = { Addr->getType() };
22499  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
22500
22501  const DataLayout &DL = M->getDataLayout();
22502  IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
22503  Val = Builder.CreateBitCast(Val, IntValTy);
22504
22505  CallInst *CI = Builder.CreateCall(
22506      Stxr, {Builder.CreateZExtOrBitCast(
22507                 Val, Stxr->getFunctionType()->getParamType(0)),
22508             Addr});
22509  CI->addParamAttr(1, Attribute::get(Builder.getContext(),
22510                                     Attribute::ElementType, Val->getType()));
22511  return CI;
22512}
22513
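// Arguments need consecutive registers when they are scalable types wider than
// a single SVE register, or aggregates whose leaf members all share the same
// type (e.g. HFA/HVA-style arrays, including arrays of SVE types).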
22514bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
22515    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
22516    const DataLayout &DL) const {
22517  if (!Ty->isArrayTy()) {
22518    const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
22519    return TySize.isScalable() && TySize.getKnownMinValue() > 128;
22520  }
22521
22522  // All non-aggregate members of the type must have the same type.
22523  SmallVector<EVT> ValueVTs;
22524  ComputeValueVTs(*this, DL, Ty, ValueVTs);
22525  return all_equal(ValueVTs);
22526}
22527
22528bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
22529                                                            EVT) const {
22530  return false;
22531}
22532
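// Return a pointer to the TLS slot at byte offset 'Offset' from the thread
// pointer, which is read via the llvm.thread_pointer intrinsic (TPIDR_EL0 on
// AArch64).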
22533static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
22534  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
22535  Function *ThreadPointerFunc =
22536      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
22537  return IRB.CreatePointerCast(
22538      IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
22539                             Offset),
22540      IRB.getInt8PtrTy()->getPointerTo(0));
22541}
22542
22543Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
22544  // Android provides a fixed TLS slot for the stack cookie. See the definition
22545  // of TLS_SLOT_STACK_GUARD in
22546  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
22547  if (Subtarget->isTargetAndroid())
22548    return UseTlsOffset(IRB, 0x28);
22549
22550  // Fuchsia is similar.
22551  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
22552  if (Subtarget->isTargetFuchsia())
22553    return UseTlsOffset(IRB, -0x10);
22554
22555  return TargetLowering::getIRStackGuard(IRB);
22556}
22557
22558void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
22559  // MSVC CRT provides functionality for stack protection.
22560  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
22561    // MSVC CRT has a global variable holding security cookie.
22562    M.getOrInsertGlobal("__security_cookie",
22563                        Type::getInt8PtrTy(M.getContext()));
22564
22565    // MSVC CRT has a function to validate security cookie.
22566    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
22567        Subtarget->getSecurityCheckCookieName(),
22568        Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext()));
22569    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
22570      F->setCallingConv(CallingConv::Win64);
22571      F->addParamAttr(0, Attribute::AttrKind::InReg);
22572    }
22573    return;
22574  }
22575  TargetLowering::insertSSPDeclarations(M);
22576}
22577
22578Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
22579  // MSVC CRT has a global variable holding security cookie.
22580  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
22581    return M.getGlobalVariable("__security_cookie");
22582  return TargetLowering::getSDagStackGuard(M);
22583}
22584
22585Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
22586  // MSVC CRT has a function to validate security cookie.
22587  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
22588    return M.getFunction(Subtarget->getSecurityCheckCookieName());
22589  return TargetLowering::getSSPStackGuardCheck(M);
22590}
22591
22592Value *
22593AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
22594  // Android provides a fixed TLS slot for the SafeStack pointer. See the
22595  // definition of TLS_SLOT_SAFESTACK in
22596  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
22597  if (Subtarget->isTargetAndroid())
22598    return UseTlsOffset(IRB, 0x48);
22599
22600  // Fuchsia is similar.
22601  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
22602  if (Subtarget->isTargetFuchsia())
22603    return UseTlsOffset(IRB, -0x8);
22604
22605  return TargetLowering::getSafeStackPointerLocation(IRB);
22606}
22607
22608bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
22609    const Instruction &AndI) const {
22610  // Only sink 'and' mask to cmp use block if it is masking a single bit, since
22611  // this is likely to fold the and/cmp/br into a single tbz instruction. It
22612  // may be beneficial to sink in other cases, but we would have to check that
22613  // the cmp would not get folded into the br to form a cbz for these to be
22614  // beneficial.
22615  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
22616  if (!Mask)
22617    return false;
22618  return Mask->getValue().isPowerOf2();
22619}
22620
22621bool AArch64TargetLowering::
22622    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
22623        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
22624        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
22625        SelectionDAG &DAG) const {
22626  // Does baseline recommend not to perform the fold by default?
22627  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
22628          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
22629    return false;
22630  // Else, if this is a vector shift, prefer 'shl'.
22631  return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
22632}
22633
22634TargetLowering::ShiftLegalizationStrategy
22635AArch64TargetLowering::preferredShiftLegalizationStrategy(
22636    SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
22637  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
22638      !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
22639    return ShiftLegalizationStrategy::LowerToLibcall;
22640  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
22641                                                            ExpansionFactor);
22642}
22643
22644void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
22645  // Update IsSplitCSR in AArch64FunctionInfo.
22646  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
22647  AFI->setIsSplitCSR(true);
22648}
22649
22650void AArch64TargetLowering::insertCopiesSplitCSR(
22651    MachineBasicBlock *Entry,
22652    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
22653  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
22654  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
22655  if (!IStart)
22656    return;
22657
22658  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
22659  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
22660  MachineBasicBlock::iterator MBBI = Entry->begin();
22661  for (const MCPhysReg *I = IStart; *I; ++I) {
22662    const TargetRegisterClass *RC = nullptr;
22663    if (AArch64::GPR64RegClass.contains(*I))
22664      RC = &AArch64::GPR64RegClass;
22665    else if (AArch64::FPR64RegClass.contains(*I))
22666      RC = &AArch64::FPR64RegClass;
22667    else
22668      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
22669
22670    Register NewVR = MRI->createVirtualRegister(RC);
22671    // Create copy from CSR to a virtual register.
22672    // FIXME: this currently does not emit CFI pseudo-instructions, it works
22673    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
22674    // nounwind. If we want to generalize this later, we may need to emit
22675    // CFI pseudo-instructions.
22676    assert(Entry->getParent()->getFunction().hasFnAttribute(
22677               Attribute::NoUnwind) &&
22678           "Function should be nounwind in insertCopiesSplitCSR!");
22679    Entry->addLiveIn(*I);
22680    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
22681        .addReg(*I);
22682
22683    // Insert the copy-back instructions right before the terminator.
22684    for (auto *Exit : Exits)
22685      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
22686              TII->get(TargetOpcode::COPY), *I)
22687          .addReg(NewVR);
22688  }
22689}
22690
22691bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
22692  // Integer division on AArch64 is expensive. However, when aggressively
22693  // optimizing for code size, we prefer to use a div instruction, as it is
22694  // usually smaller than the alternative sequence.
22695  // The exception to this is vector division. Since AArch64 doesn't have vector
22696  // integer division, leaving the division as-is is a loss even in terms of
22697  // size, because it will have to be scalarized, while the alternative code
22698  // sequence can be performed in vector form.
22699  bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
22700  return OptSize && !VT.isVector();
22701}
22702
22703bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
22704  // We want inc-of-add for scalars and sub-of-not for vectors.
22705  return VT.isScalarInteger();
22706}
22707
22708bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
22709                                                 EVT VT) const {
22710  // v8f16 without fp16 needs to be extended to v8f32, which is more difficult to
22711  // legalize.
22712  if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
22713    return false;
22714  return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
22715}
22716
22717bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
22718  return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
22719}
22720
22721unsigned
22722AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
22723  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
22724    return getPointerTy(DL).getSizeInBits();
22725
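  // Otherwise the AAPCS64 va_list is { void *__stack, void *__gr_top,
  // void *__vr_top, int __gr_offs, int __vr_offs }: three pointers plus two
  // 32-bit offsets.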
22726  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
22727}
22728
22729void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
22730  MachineFrameInfo &MFI = MF.getFrameInfo();
22731  // If we have any vulnerable SVE stack objects then the stack protector
22732  // needs to be placed at the top of the SVE stack area, as the SVE locals
22733  // are placed above the other locals, so we allocate it as if it were a
22734  // scalable vector.
22735  // FIXME: It may be worthwhile having a specific interface for this rather
22736  // than doing it here in finalizeLowering.
22737  if (MFI.hasStackProtectorIndex()) {
22738    for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
22739      if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
22740          MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
22741        MFI.setStackID(MFI.getStackProtectorIndex(),
22742                       TargetStackID::ScalableVector);
22743        MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
22744        break;
22745      }
22746    }
22747  }
22748  MFI.computeMaxCallFrameSize(MF);
22749  TargetLoweringBase::finalizeLowering(MF);
22750}
22751
22752// Unlike X86, we let frame lowering assign offsets to all catch objects.
22753bool AArch64TargetLowering::needsFixedCatchObjects() const {
22754  return false;
22755}
22756
22757bool AArch64TargetLowering::shouldLocalize(
22758    const MachineInstr &MI, const TargetTransformInfo *TTI) const {
22759  auto &MF = *MI.getMF();
22760  auto &MRI = MF.getRegInfo();
22761  auto maxUses = [](unsigned RematCost) {
22762    // A cost of 1 means remats are basically free.
22763    if (RematCost == 1)
22764      return std::numeric_limits<unsigned>::max();
22765    if (RematCost == 2)
22766      return 2U;
22767
22768    // Remat is too expensive, only sink if there's one user.
22769    if (RematCost > 2)
22770      return 1U;
22771    llvm_unreachable("Unexpected remat cost");
22772  };
22773
22774  switch (MI.getOpcode()) {
22775  case TargetOpcode::G_GLOBAL_VALUE: {
22776    // On Darwin, TLS global vars get selected into function calls, which
22777    // we don't want localized, as they can get moved into the middle of
22778    // another call sequence.
22779    const GlobalValue &GV = *MI.getOperand(1).getGlobal();
22780    if (GV.isThreadLocal() && Subtarget->isTargetMachO())
22781      return false;
22782    break;
22783  }
22784  case TargetOpcode::G_CONSTANT: {
22785    auto *CI = MI.getOperand(1).getCImm();
22786    APInt Imm = CI->getValue();
22787    InstructionCost Cost = TTI->getIntImmCost(
22788        Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
22789    assert(Cost.isValid() && "Expected a valid imm cost");
22790
22791    unsigned RematCost = *Cost.getValue();
22792    Register Reg = MI.getOperand(0).getReg();
22793    unsigned MaxUses = maxUses(RematCost);
22794    // Don't pass the UINT_MAX sentinel value to hasAtMostUserInstrs().
22795    if (MaxUses == std::numeric_limits<unsigned>::max())
22796      --MaxUses;
22797    return MRI.hasAtMostUserInstrs(Reg, MaxUses);
22798  }
22799  // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
22800  // localizable.
22801  case AArch64::ADRP:
22802  case AArch64::G_ADD_LOW:
22803    return true;
22804  default:
22805    break;
22806  }
22807  return TargetLoweringBase::shouldLocalize(MI, TTI);
22808}
22809
22810bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
22811  if (isa<ScalableVectorType>(Inst.getType()))
22812    return true;
22813
22814  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
22815    if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
22816      return true;
22817
22818  if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
22819    if (isa<ScalableVectorType>(AI->getAllocatedType()))
22820      return true;
22821  }
22822
22823  // Checks to allow the use of SME instructions
22824  if (auto *Base = dyn_cast<CallBase>(&Inst)) {
22825    auto CallerAttrs = SMEAttrs(*Inst.getFunction());
22826    auto CalleeAttrs = SMEAttrs(*Base);
22827    if (CallerAttrs.requiresSMChange(CalleeAttrs,
22828                                     /*BodyOverridesInterface=*/false) ||
22829        CallerAttrs.requiresLazySave(CalleeAttrs))
22830      return true;
22831  }
22832  return false;
22833}
22834
22835// Return the largest legal scalable vector type that matches VT's element type.
22836static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
22837  assert(VT.isFixedLengthVector() &&
22838         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22839         "Expected legal fixed length vector!");
22840  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
22841  default:
22842    llvm_unreachable("unexpected element type for SVE container");
22843  case MVT::i8:
22844    return EVT(MVT::nxv16i8);
22845  case MVT::i16:
22846    return EVT(MVT::nxv8i16);
22847  case MVT::i32:
22848    return EVT(MVT::nxv4i32);
22849  case MVT::i64:
22850    return EVT(MVT::nxv2i64);
22851  case MVT::f16:
22852    return EVT(MVT::nxv8f16);
22853  case MVT::f32:
22854    return EVT(MVT::nxv4f32);
22855  case MVT::f64:
22856    return EVT(MVT::nxv2f64);
22857  }
22858}
22859
22860// Return a PTRUE with active lanes corresponding to the extent of VT.
22861static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
22862                                                EVT VT) {
22863  assert(VT.isFixedLengthVector() &&
22864         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22865         "Expected legal fixed length vector!");
22866
22867  std::optional<unsigned> PgPattern =
22868      getSVEPredPatternFromNumElements(VT.getVectorNumElements());
22869  assert(PgPattern && "Unexpected element count for SVE predicate");
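  // For example, with 256-bit SVE registers a legal v8i32 fixed length vector
  // maps to the VL8 pattern, i.e. a "ptrue p<n>.s, vl8" style predicate.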
22870
22871  // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
22872  // AArch64SVEPredPattern::all, which can enable the use of unpredicated
22873  // variants of instructions when available.
22874  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22875  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
22876  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
22877  if (MaxSVESize && MinSVESize == MaxSVESize &&
22878      MaxSVESize == VT.getSizeInBits())
22879    PgPattern = AArch64SVEPredPattern::all;
22880
22881  MVT MaskVT;
22882  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
22883  default:
22884    llvm_unreachable("unexpected element type for SVE predicate");
22885  case MVT::i8:
22886    MaskVT = MVT::nxv16i1;
22887    break;
22888  case MVT::i16:
22889  case MVT::f16:
22890    MaskVT = MVT::nxv8i1;
22891    break;
22892  case MVT::i32:
22893  case MVT::f32:
22894    MaskVT = MVT::nxv4i1;
22895    break;
22896  case MVT::i64:
22897  case MVT::f64:
22898    MaskVT = MVT::nxv2i1;
22899    break;
22900  }
22901
22902  return getPTrue(DAG, DL, MaskVT, *PgPattern);
22903}
22904
22905static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
22906                                             EVT VT) {
22907  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22908         "Expected legal scalable vector!");
22909  auto PredTy = VT.changeVectorElementType(MVT::i1);
22910  return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
22911}
22912
22913static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
22914  if (VT.isFixedLengthVector())
22915    return getPredicateForFixedLengthVector(DAG, DL, VT);
22916
22917  return getPredicateForScalableVector(DAG, DL, VT);
22918}
22919
22920// Grow V to consume an entire SVE register.
22921static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
22922  assert(VT.isScalableVector() &&
22923         "Expected to convert into a scalable vector!");
22924  assert(V.getValueType().isFixedLengthVector() &&
22925         "Expected a fixed length vector operand!");
22926  SDLoc DL(V);
22927  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22928  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
22929}
22930
22931// Shrink V so it's just big enough to maintain a VT's worth of data.
22932static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
22933  assert(VT.isFixedLengthVector() &&
22934         "Expected to convert into a fixed length vector!");
22935  assert(V.getValueType().isScalableVector() &&
22936         "Expected a scalable vector operand!");
22937  SDLoc DL(V);
22938  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22939  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
22940}
22941
22942// Convert all fixed length vector loads larger than NEON to masked_loads.
22943SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
22944    SDValue Op, SelectionDAG &DAG) const {
22945  auto Load = cast<LoadSDNode>(Op);
22946
22947  SDLoc DL(Op);
22948  EVT VT = Op.getValueType();
22949  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22950  EVT LoadVT = ContainerVT;
22951  EVT MemVT = Load->getMemoryVT();
22952
22953  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
22954
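  // Floating point loads are performed as integer loads of the same width and
  // converted back afterwards: an extending FP load is widened with a
  // predicated FP extend, while a normal FP load is simply bitcast back.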
22955  if (VT.isFloatingPoint()) {
22956    LoadVT = ContainerVT.changeTypeToInteger();
22957    MemVT = MemVT.changeTypeToInteger();
22958  }
22959
22960  SDValue NewLoad = DAG.getMaskedLoad(
22961      LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
22962      DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
22963      Load->getAddressingMode(), Load->getExtensionType());
22964
22965  SDValue Result = NewLoad;
22966  if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
22967    EVT ExtendVT = ContainerVT.changeVectorElementType(
22968        Load->getMemoryVT().getVectorElementType());
22969
22970    Result = getSVESafeBitCast(ExtendVT, Result, DAG);
22971    Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
22972                         Pg, Result, DAG.getUNDEF(ContainerVT));
22973  } else if (VT.isFloatingPoint()) {
22974    Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
22975  }
22976
22977  Result = convertFromScalableVector(DAG, VT, Result);
22978  SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
22979  return DAG.getMergeValues(MergedValues, DL);
22980}
22981
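// Convert a fixed length vector mask into a scalable predicate: an all-ones
// mask simply becomes the PTRUE for the type, otherwise the mask is widened to
// a scalable vector and each lane is tested for being non-zero under that
// PTRUE.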
22982static SDValue convertFixedMaskToScalableVector(SDValue Mask,
22983                                                SelectionDAG &DAG) {
22984  SDLoc DL(Mask);
22985  EVT InVT = Mask.getValueType();
22986  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22987
22988  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
22989
22990  if (ISD::isBuildVectorAllOnes(Mask.getNode()))
22991    return Pg;
22992
22993  auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
22994  auto Op2 = DAG.getConstant(0, DL, ContainerVT);
22995
22996  return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
22997                     {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
22998}
22999
23000// Convert fixed length masked loads larger than NEON to SVE masked loads.
23001SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
23002    SDValue Op, SelectionDAG &DAG) const {
23003  auto Load = cast<MaskedLoadSDNode>(Op);
23004
23005  SDLoc DL(Op);
23006  EVT VT = Op.getValueType();
23007  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
23008
23009  SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);
23010
23011  SDValue PassThru;
23012  bool IsPassThruZeroOrUndef = false;
23013
23014  if (Load->getPassThru()->isUndef()) {
23015    PassThru = DAG.getUNDEF(ContainerVT);
23016    IsPassThruZeroOrUndef = true;
23017  } else {
23018    if (ContainerVT.isInteger())
23019      PassThru = DAG.getConstant(0, DL, ContainerVT);
23020    else
23021      PassThru = DAG.getConstantFP(0, DL, ContainerVT);
23022    if (isZerosVector(Load->getPassThru().getNode()))
23023      IsPassThruZeroOrUndef = true;
23024  }
23025
23026  SDValue NewLoad = DAG.getMaskedLoad(
23027      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
23028      Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
23029      Load->getAddressingMode(), Load->getExtensionType());
23030
23031  SDValue Result = NewLoad;
23032  if (!IsPassThruZeroOrUndef) {
23033    SDValue OldPassThru =
23034        convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
23035    Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
23036  }
23037
23038  Result = convertFromScalableVector(DAG, VT, Result);
23039  SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
23040  return DAG.getMergeValues(MergedValues, DL);
23041}
23042
23043// Convert all fixed length vector stores larger than NEON to masked_stores.
23044SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
23045    SDValue Op, SelectionDAG &DAG) const {
23046  auto Store = cast<StoreSDNode>(Op);
23047
23048  SDLoc DL(Op);
23049  EVT VT = Store->getValue().getValueType();
23050  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
23051  EVT MemVT = Store->getMemoryVT();
23052
23053  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
23054  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
23055
23056  if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
23057    EVT TruncVT = ContainerVT.changeVectorElementType(
23058        Store->getMemoryVT().getVectorElementType());
23059    MemVT = MemVT.changeTypeToInteger();
23060    NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
23061                           NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
23062                           DAG.getUNDEF(TruncVT));
23063    NewValue =
23064        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
23065  } else if (VT.isFloatingPoint()) {
23066    MemVT = MemVT.changeTypeToInteger();
23067    NewValue =
23068        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
23069  }
23070
23071  return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
23072                            Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
23073                            Store->getMemOperand(), Store->getAddressingMode(),
23074                            Store->isTruncatingStore());
23075}
23076
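// Convert fixed length masked stores larger than NEON to SVE masked stores.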
23077SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
23078    SDValue Op, SelectionDAG &DAG) const {
23079  auto *Store = cast<MaskedStoreSDNode>(Op);
23080
23081  SDLoc DL(Op);
23082  EVT VT = Store->getValue().getValueType();
23083  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
23084
23085  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
23086  SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
23087
23088  return DAG.getMaskedStore(
23089      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
23090      Mask, Store->getMemoryVT(), Store->getMemOperand(),
23091      Store->getAddressingMode(), Store->isTruncatingStore());
23092}
23093
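// Fixed length integer division is lowered via SVE's predicated divides, which
// only exist for 32/64-bit elements. Narrower element types are repeatedly
// widened (splitting the vector whenever the widened type is not legal) until
// a 32-bit divide can be used. Signed division by a power-of-two splat is
// instead lowered to a predicated rounding shift (AArch64ISD::SRAD_MERGE_OP1),
// negated when the splat value was negative.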
23094SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
23095    SDValue Op, SelectionDAG &DAG) const {
23096  SDLoc dl(Op);
23097  EVT VT = Op.getValueType();
23098  EVT EltVT = VT.getVectorElementType();
23099
23100  bool Signed = Op.getOpcode() == ISD::SDIV;
23101  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
23102
23103  bool Negated;
23104  uint64_t SplatVal;
23105  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
23106    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
23107    SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
23108    SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
23109
23110    SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
23111    SDValue Res =
23112        DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
23113    if (Negated)
23114      Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
23115                        DAG.getConstant(0, dl, ContainerVT), Res);
23116
23117    return convertFromScalableVector(DAG, VT, Res);
23118  }
23119
23120  // Scalable vector i32/i64 DIV is supported.
23121  if (EltVT == MVT::i32 || EltVT == MVT::i64)
23122    return LowerToPredicatedOp(Op, DAG, PredOpcode);
23123
23124  // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
23125  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
23126  EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
23127  unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23128
23129  // If the wider type is legal: extend, op, and truncate.
23130  EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
23131  if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
23132    SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
23133    SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
23134    SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
23135    return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
23136  }
23137
23138  auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
23139                               &ExtendOpcode](SDValue Op) {
23140    SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
23141    SDValue IdxHalf =
23142        DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
23143    SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
23144    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
23145    return std::pair<SDValue, SDValue>(
23146        {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
23147         DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
23148  };
23149
23150  // If wider type is not legal: split, extend, op, trunc and concat.
23151  auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
23152  auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
23153  SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
23154  SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
23155  SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
23156  SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
23157  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
23158}
23159
23160SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
23161    SDValue Op, SelectionDAG &DAG) const {
23162  EVT VT = Op.getValueType();
23163  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
23164
23165  SDLoc DL(Op);
23166  SDValue Val = Op.getOperand(0);
23167  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
23168  Val = convertToScalableVector(DAG, ContainerVT, Val);
23169
23170  bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
23171  unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
23172
23173  // Repeatedly unpack Val until the result is of the desired element type.
23174  switch (ContainerVT.getSimpleVT().SimpleTy) {
23175  default:
23176    llvm_unreachable("unimplemented container type");
23177  case MVT::nxv16i8:
23178    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
23179    if (VT.getVectorElementType() == MVT::i16)
23180      break;
23181    [[fallthrough]];
23182  case MVT::nxv8i16:
23183    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
23184    if (VT.getVectorElementType() == MVT::i32)
23185      break;
23186    [[fallthrough]];
23187  case MVT::nxv4i32:
23188    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
23189    assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
23190    break;
23191  }
23192
23193  return convertFromScalableVector(DAG, VT, Val);
23194}
23195
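// Truncation is performed by bitcasting to the next narrower element type and
// using UZP1 to keep the even-numbered sub-elements (the low halves of each
// wide element on a little-endian target), repeating until the requested
// element type is reached.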
23196SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
23197    SDValue Op, SelectionDAG &DAG) const {
23198  EVT VT = Op.getValueType();
23199  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
23200
23201  SDLoc DL(Op);
23202  SDValue Val = Op.getOperand(0);
23203  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
23204  Val = convertToScalableVector(DAG, ContainerVT, Val);
23205
23206  // Repeatedly truncate Val until the result is of the desired element type.
23207  switch (ContainerVT.getSimpleVT().SimpleTy) {
23208  default:
23209    llvm_unreachable("unimplemented container type");
23210  case MVT::nxv2i64:
23211    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
23212    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
23213    if (VT.getVectorElementType() == MVT::i32)
23214      break;
23215    [[fallthrough]];
23216  case MVT::nxv4i32:
23217    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
23218    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
23219    if (VT.getVectorElementType() == MVT::i16)
23220      break;
23221    [[fallthrough]];
23222  case MVT::nxv8i16:
23223    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
23224    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
23225    assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
23226    break;
23227  }
23228
23229  return convertFromScalableVector(DAG, VT, Val);
23230}
23231
23232SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
23233    SDValue Op, SelectionDAG &DAG) const {
23234  EVT VT = Op.getValueType();
23235  EVT InVT = Op.getOperand(0).getValueType();
23236  assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
23237
23238  SDLoc DL(Op);
23239  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
23240  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
23241
23242  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
23243}
23244
23245SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
23246    SDValue Op, SelectionDAG &DAG) const {
23247  EVT VT = Op.getValueType();
23248  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
23249
23250  SDLoc DL(Op);
23251  EVT InVT = Op.getOperand(0).getValueType();
23252  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
23253  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
23254
23255  auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
23256                                 Op.getOperand(1), Op.getOperand(2));
23257
23258  return convertFromScalableVector(DAG, VT, ScalableRes);
23259}
23260
23261// Convert vector operation 'Op' to an equivalent predicated operation whereby
23262// the original operation's type is used to construct a suitable predicate.
23263// NOTE: The results for inactive lanes are undefined.
23264SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
23265                                                   SelectionDAG &DAG,
23266                                                   unsigned NewOp) const {
23267  EVT VT = Op.getValueType();
23268  SDLoc DL(Op);
23269  auto Pg = getPredicateForVector(DAG, DL, VT);
23270
23271  if (VT.isFixedLengthVector()) {
23272    assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
23273    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
23274
23275    // Create list of operands by converting existing ones to scalable types.
23276    SmallVector<SDValue, 4> Operands = {Pg};
23277    for (const SDValue &V : Op->op_values()) {
23278      if (isa<CondCodeSDNode>(V)) {
23279        Operands.push_back(V);
23280        continue;
23281      }
23282
23283      if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
23284        EVT VTArg = VTNode->getVT().getVectorElementType();
23285        EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
23286        Operands.push_back(DAG.getValueType(NewVTArg));
23287        continue;
23288      }
23289
23290      assert(isTypeLegal(V.getValueType()) &&
23291             "Expected only legal fixed-width types");
23292      Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
23293    }
23294
23295    if (isMergePassthruOpcode(NewOp))
23296      Operands.push_back(DAG.getUNDEF(ContainerVT));
23297
23298    auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
23299    return convertFromScalableVector(DAG, VT, ScalableRes);
23300  }
23301
23302  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
23303
23304  SmallVector<SDValue, 4> Operands = {Pg};
23305  for (const SDValue &V : Op->op_values()) {
23306    assert((!V.getValueType().isVector() ||
23307            V.getValueType().isScalableVector()) &&
23308           "Only scalable vectors are supported!");
23309    Operands.push_back(V);
23310  }
23311
23312  if (isMergePassthruOpcode(NewOp))
23313    Operands.push_back(DAG.getUNDEF(VT));
23314
23315  return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
23316}
23317
23318// If a fixed length vector operation has no side effects when applied to
23319// undefined elements, we can safely use scalable vectors to perform the same
23320// operation without needing to worry about predication.
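// For example (illustrative only), a fixed-length (add v4i32:a, v4i32:b)
// roughly becomes
//   extract_subvector((add nxv4i32 insert(a), insert(b)), 0)
// where no predicate is needed because whatever the scalable ADD computes in
// the extra lanes is simply discarded by the final extract.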
23321SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
23322                                                 SelectionDAG &DAG) const {
23323  EVT VT = Op.getValueType();
23324  assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
23325         "Only expected to lower fixed length vector operation!");
23326  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
23327
23328  // Create list of operands by converting existing ones to scalable types.
23329  SmallVector<SDValue, 4> Ops;
23330  for (const SDValue &V : Op->op_values()) {
23331    assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
23332
23333    // Pass through non-vector operands.
23334    if (!V.getValueType().isVector()) {
23335      Ops.push_back(V);
23336      continue;
23337    }
23338
23339    // "cast" fixed length vector to a scalable vector.
23340    assert(V.getValueType().isFixedLengthVector() &&
23341           isTypeLegal(V.getValueType()) &&
23342           "Only fixed length vectors are supported!");
23343    Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
23344  }
23345
23346  auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
23347  return convertFromScalableVector(DAG, VT, ScalableRes);
23348}
23349
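// Lower a strictly-ordered floating-point add reduction (VECREDUCE_SEQ_FADD)
// by placing the scalar accumulator in lane 0 of a scalable vector and using
// FADDA_PRED, which folds each vector element into the accumulator in order.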
23350SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
23351    SelectionDAG &DAG) const {
23352  SDLoc DL(ScalarOp);
23353  SDValue AccOp = ScalarOp.getOperand(0);
23354  SDValue VecOp = ScalarOp.getOperand(1);
23355  EVT SrcVT = VecOp.getValueType();
23356  EVT ResVT = SrcVT.getVectorElementType();
23357
23358  EVT ContainerVT = SrcVT;
23359  if (SrcVT.isFixedLengthVector()) {
23360    ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
23361    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
23362  }
23363
23364  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
23365  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23366
  // Place the scalar accumulator in lane 0 of a scalable vector.
23368  AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
23369                      DAG.getUNDEF(ContainerVT), AccOp, Zero);
23370
23371  // Perform reduction.
23372  SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
23373                            Pg, AccOp, VecOp);
23374
23375  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
23376}
23377
23378SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
23379                                                       SelectionDAG &DAG) const {
23380  SDLoc DL(ReduceOp);
23381  SDValue Op = ReduceOp.getOperand(0);
23382  EVT OpVT = Op.getValueType();
23383  EVT VT = ReduceOp.getValueType();
23384
23385  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
23386    return SDValue();
23387
23388  SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
23389
23390  switch (ReduceOp.getOpcode()) {
23391  default:
23392    return SDValue();
23393  case ISD::VECREDUCE_OR:
23394    if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
23395      // The predicate can be 'Op' because
23396      // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
23397      return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
23398    else
23399      return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
23400  case ISD::VECREDUCE_AND: {
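    // VECREDUCE_AND is true iff no active lane is false. XORing with the
    // all-active predicate inverts the lanes, so a NONE_ACTIVE test on the
    // inverted value gives the desired result.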
23401    Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
23402    return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
23403  }
23404  case ISD::VECREDUCE_XOR: {
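    // VECREDUCE_XOR of an i1 vector is the parity of the number of active
    // lanes, i.e. the low bit of CNTP, which the any-extend/truncate below
    // preserves.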
23405    SDValue ID =
23406        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
23407    if (OpVT == MVT::nxv1i1) {
23408      // Emulate a CNTP on .Q using .D and a different governing predicate.
23409      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
23410      Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
23411    }
23412    SDValue Cntp =
23413        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
23414    return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
23415  }
23416  }
23417
23418  return SDValue();
23419}
23420
23421SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
23422                                                   SDValue ScalarOp,
23423                                                   SelectionDAG &DAG) const {
23424  SDLoc DL(ScalarOp);
23425  SDValue VecOp = ScalarOp.getOperand(0);
23426  EVT SrcVT = VecOp.getValueType();
23427
23428  if (useSVEForFixedLengthVectorVT(
23429          SrcVT,
23430          /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
23431    EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
23432    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
23433  }
23434
23435  // UADDV always returns an i64 result.
23436  EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
23437                                                   SrcVT.getVectorElementType();
23438  EVT RdxVT = SrcVT;
23439  if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
23440    RdxVT = getPackedSVEVectorVT(ResVT);
23441
23442  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
23443  SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
23444  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
23445                            Rdx, DAG.getConstant(0, DL, MVT::i64));
23446
  // The VEC_REDUCE nodes expect an element-sized result.
23448  if (ResVT != ScalarOp.getValueType())
23449    Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
23450
23451  return Res;
23452}
23453
23454SDValue
23455AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
23456    SelectionDAG &DAG) const {
23457  EVT VT = Op.getValueType();
23458  SDLoc DL(Op);
23459
23460  EVT InVT = Op.getOperand(1).getValueType();
23461  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
23462  SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
23463  SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
23464
  // Convert the mask to a predicate (NOTE: we don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
23467  EVT MaskVT = Op.getOperand(0).getValueType();
23468  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
23469  auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
23470  Mask = DAG.getNode(ISD::TRUNCATE, DL,
23471                     MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
23472
23473  auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
23474                                Mask, Op1, Op2);
23475
23476  return convertFromScalableVector(DAG, VT, ScalableRes);
23477}
23478
23479SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
23480    SDValue Op, SelectionDAG &DAG) const {
23481  SDLoc DL(Op);
23482  EVT InVT = Op.getOperand(0).getValueType();
23483  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
23484
23485  assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
23486         "Only expected to lower fixed length vector operation!");
23487  assert(Op.getValueType() == InVT.changeTypeToInteger() &&
23488         "Expected integer result of the same bit length as the inputs!");
23489
23490  auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
23491  auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
23492  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
23493
23494  EVT CmpVT = Pg.getValueType();
23495  auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
23496                         {Pg, Op1, Op2, Op.getOperand(2)});
23497
23498  EVT PromoteVT = ContainerVT.changeTypeToInteger();
23499  auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
23500  return convertFromScalableVector(DAG, Op.getValueType(), Promote);
23501}
23502
23503SDValue
23504AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
23505                                                    SelectionDAG &DAG) const {
23506  SDLoc DL(Op);
23507  auto SrcOp = Op.getOperand(0);
23508  EVT VT = Op.getValueType();
23509  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
23510  EVT ContainerSrcVT =
23511      getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
23512
23513  SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
23514  Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
23515  return convertFromScalableVector(DAG, VT, Op);
23516}
23517
23518SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
23519    SDValue Op, SelectionDAG &DAG) const {
23520  SDLoc DL(Op);
23521  unsigned NumOperands = Op->getNumOperands();
23522
23523  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
23524         "Unexpected number of operands in CONCAT_VECTORS");
23525
23526  auto SrcOp1 = Op.getOperand(0);
23527  auto SrcOp2 = Op.getOperand(1);
23528  EVT VT = Op.getValueType();
23529  EVT SrcVT = SrcOp1.getValueType();
23530
23531  if (NumOperands > 2) {
23532    SmallVector<SDValue, 4> Ops;
23533    EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23534    for (unsigned I = 0; I < NumOperands; I += 2)
23535      Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
23536                                Op->getOperand(I), Op->getOperand(I + 1)));
23537
23538    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
23539  }
23540
23541  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
23542
23543  SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
23544  SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
23545  SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
23546
23547  Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
23548
23549  return convertFromScalableVector(DAG, VT, Op);
23550}
23551
23552SDValue
23553AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
23554                                                     SelectionDAG &DAG) const {
23555  EVT VT = Op.getValueType();
23556  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
23557
23558  SDLoc DL(Op);
23559  SDValue Val = Op.getOperand(0);
23560  SDValue Pg = getPredicateForVector(DAG, DL, VT);
23561  EVT SrcVT = Val.getValueType();
23562  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
23563  EVT ExtendVT = ContainerVT.changeVectorElementType(
23564      SrcVT.getVectorElementType());
23565
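  // Widen each source element into a result-sized integer lane via an integer
  // any-extend (the extra bits don't matter), then reinterpret those lanes as
  // the unpacked source FP type so a single predicated FP_EXTEND produces the
  // result container.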
23566  Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
23567  Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
23568
23569  Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
23570  Val = getSVESafeBitCast(ExtendVT, Val, DAG);
23571  Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
23572                    Pg, Val, DAG.getUNDEF(ContainerVT));
23573
23574  return convertFromScalableVector(DAG, VT, Val);
23575}
23576
23577SDValue
23578AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
23579                                                    SelectionDAG &DAG) const {
23580  EVT VT = Op.getValueType();
23581  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
23582
23583  SDLoc DL(Op);
23584  SDValue Val = Op.getOperand(0);
23585  EVT SrcVT = Val.getValueType();
23586  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
23587  EVT RoundVT = ContainerSrcVT.changeVectorElementType(
23588      VT.getVectorElementType());
23589  SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
23590
23591  Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
23592  Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
23593                    Op.getOperand(1), DAG.getUNDEF(RoundVT));
23594  Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
23595  Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
23596
23597  Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
23598  return DAG.getNode(ISD::BITCAST, DL, VT, Val);
23599}
23600
23601SDValue
23602AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
23603                                                    SelectionDAG &DAG) const {
23604  EVT VT = Op.getValueType();
23605  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
23606
23607  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
23608  unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
23609                             : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
23610
23611  SDLoc DL(Op);
23612  SDValue Val = Op.getOperand(0);
23613  EVT SrcVT = Val.getValueType();
23614  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
23615  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
23616
23617  if (VT.bitsGE(SrcVT)) {
23618    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
23619
23620    Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
23621                      VT.changeTypeToInteger(), Val);
23622
    // Safe to use a wider-than-specified operand because promoting the value
    // changes nothing from an arithmetic point of view.
23625    Val =
23626        convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
23627    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
23628                      DAG.getUNDEF(ContainerDstVT));
23629    return convertFromScalableVector(DAG, VT, Val);
23630  } else {
23631    EVT CvtVT = ContainerSrcVT.changeVectorElementType(
23632        ContainerDstVT.getVectorElementType());
23633    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
23634
23635    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
23636    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
23637    Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
23638    Val = convertFromScalableVector(DAG, SrcVT, Val);
23639
23640    Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
23641    return DAG.getNode(ISD::BITCAST, DL, VT, Val);
23642  }
23643}
23644
23645SDValue
23646AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
23647                                                    SelectionDAG &DAG) const {
23648  EVT VT = Op.getValueType();
23649  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
23650
23651  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
23652  unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
23653                             : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
23654
23655  SDLoc DL(Op);
23656  SDValue Val = Op.getOperand(0);
23657  EVT SrcVT = Val.getValueType();
23658  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
23659  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
23660
23661  if (VT.bitsGT(SrcVT)) {
23662    EVT CvtVT = ContainerDstVT.changeVectorElementType(
23663      ContainerSrcVT.getVectorElementType());
23664    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
23665
23666    Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
23667    Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
23668
23669    Val = convertToScalableVector(DAG, ContainerDstVT, Val);
23670    Val = getSVESafeBitCast(CvtVT, Val, DAG);
23671    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
23672                      DAG.getUNDEF(ContainerDstVT));
23673    return convertFromScalableVector(DAG, VT, Val);
23674  } else {
23675    EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
23676    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
23677
    // Safe to use a wider-than-specified result since an fp_to_int whose
    // result doesn't fit into the destination is undefined.
23680    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
23681    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
23682    Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
23683
23684    return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
23685  }
23686}
23687
23688SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
23689    SDValue Op, SelectionDAG &DAG) const {
23690  EVT VT = Op.getValueType();
23691  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
23692
23693  auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
23694  auto ShuffleMask = SVN->getMask();
23695
23696  SDLoc DL(Op);
23697  SDValue Op1 = Op.getOperand(0);
23698  SDValue Op2 = Op.getOperand(1);
23699
23700  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
23701  Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
23702  Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
23703
23704  auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
23705    if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
23706      return MVT::i32;
23707    return ScalarTy;
23708  };
23709
23710  if (SVN->isSplat()) {
23711    unsigned Lane = std::max(0, SVN->getSplatIndex());
23712    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
23713    SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
23714                                  DAG.getConstant(Lane, DL, MVT::i64));
23715    Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
23716    return convertFromScalableVector(DAG, VT, Op);
23717  }
23718
23719  bool ReverseEXT = false;
23720  unsigned Imm;
23721  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
23722      Imm == VT.getVectorNumElements() - 1) {
23723    if (ReverseEXT)
23724      std::swap(Op1, Op2);
23725    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
23726    SDValue Scalar = DAG.getNode(
23727        ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
23728        DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
23729    Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
23730    return convertFromScalableVector(DAG, VT, Op);
23731  }
23732
23733  for (unsigned LaneSize : {64U, 32U, 16U}) {
23734    if (isREVMask(ShuffleMask, VT, LaneSize)) {
23735      EVT NewVT =
23736          getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
23737      unsigned RevOp;
23738      unsigned EltSz = VT.getScalarSizeInBits();
23739      if (EltSz == 8)
23740        RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
23741      else if (EltSz == 16)
23742        RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
23743      else
23744        RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
23745
23746      Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
23747      Op = LowerToPredicatedOp(Op, DAG, RevOp);
23748      Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
23749      return convertFromScalableVector(DAG, VT, Op);
23750    }
23751  }
23752
23753  if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
23754      isREVMask(ShuffleMask, VT, 128)) {
23755    if (!VT.isFloatingPoint())
23756      return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
23757
23758    EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
23759    Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
23760    Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
23761    Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
23762    return convertFromScalableVector(DAG, VT, Op);
23763  }
23764
23765  unsigned WhichResult;
23766  if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
23767    return convertFromScalableVector(
23768        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
23769
23770  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
23771    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
23772    return convertFromScalableVector(
23773        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
23774  }
23775
23776  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
23777    return convertFromScalableVector(
23778        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
23779
23780  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
23781    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
23782    return convertFromScalableVector(
23783        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
23784  }
23785
  // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
  // represents the same logical operation as performed by a ZIP instruction.
  // In isolation these functions do not mean the ISD::VECTOR_SHUFFLE is
  // exactly equivalent to an AArch64 instruction; there's the extra component
  // of ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these
  // functions only operated on 64/128-bit vector types that have a direct
  // mapping to a target register, so an exact mapping is implied.
  // However, when using SVE for fixed-length vectors, most legal vector types
  // are actually sub-vectors of a larger SVE register. When mapping
  // ISD::VECTOR_SHUFFLE to an SVE instruction, care must be taken to consider
  // how the mask's indices translate. Specifically, when the mapping requires
  // an exact meaning for a specific vector index (e.g. index X is the last
  // vector element in the register), such mappings are often only safe when
  // the exact SVE register size is known. The main exception to this is when
  // indices are logically relative to the first element of either
  // ISD::VECTOR_SHUFFLE operand, because these relative indices don't change
  // when converting from fixed-length to scalable vector types (i.e. the start
  // of a fixed-length vector is always the start of a scalable vector).
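  // For example (illustrative only), reversing a fixed-length vector requires
  // knowing which lane of the SVE register holds its last element, so the
  // VECTOR_REVERSE lowering below is only used when the SVE register size is
  // known to exactly match the size of the fixed-length type.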
23804  unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23805  unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
23806  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
23807    if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
23808      Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
23809      return convertFromScalableVector(DAG, VT, Op);
23810    }
23811
23812    if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
23813      return convertFromScalableVector(
23814          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
23815
23816    if (isUZPMask(ShuffleMask, VT, WhichResult)) {
23817      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
23818      return convertFromScalableVector(
23819          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
23820    }
23821
23822    if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
23823      return convertFromScalableVector(
23824          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
23825
23826    if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
23827      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
23828      return convertFromScalableVector(
23829          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
23830    }
23831  }
23832
23833  return SDValue();
23834}
23835
23836SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
23837                                                 SelectionDAG &DAG) const {
23838  SDLoc DL(Op);
23839  EVT InVT = Op.getValueType();
23840
23841  assert(VT.isScalableVector() && isTypeLegal(VT) &&
23842         InVT.isScalableVector() && isTypeLegal(InVT) &&
23843         "Only expect to cast between legal scalable vector types!");
23844  assert(VT.getVectorElementType() != MVT::i1 &&
23845         InVT.getVectorElementType() != MVT::i1 &&
23846         "For predicate bitcasts, use getSVEPredicateBitCast");
23847
23848  if (InVT == VT)
23849    return Op;
23850
23851  EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
23852  EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
23853
  // Safe bitcasting between unpacked vector types of different element counts
  // is currently unsupported because the code below lacks the logic needed to
  // ensure the result's elements live where they're supposed to within an SVE
  // register.
23858  //                01234567
23859  // e.g. nxv2i32 = XX??XX??
23860  //      nxv4f16 = X?X?X?X?
23861  assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
23862          VT == PackedVT || InVT == PackedInVT) &&
23863         "Unexpected bitcast!");
23864
23865  // Pack input if required.
23866  if (InVT != PackedInVT)
23867    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
23868
23869  Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
23870
23871  // Unpack result if required.
23872  if (VT != PackedVT)
23873    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
23874
23875  return Op;
23876}
23877
23878bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
23879                                                 SDValue N) const {
23880  return ::isAllActivePredicate(DAG, N);
23881}
23882
23883EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
23884  return ::getPromotedVTForPredicate(VT);
23885}
23886
23887bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
23888    SDValue Op, const APInt &OriginalDemandedBits,
23889    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
23890    unsigned Depth) const {
23891
23892  unsigned Opc = Op.getOpcode();
23893  switch (Opc) {
23894  case AArch64ISD::VSHL: {
23895    // Match (VSHL (VLSHR Val X) X)
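    // For example (illustrative only), with i32 elements and X == 8 the shift
    // pair clears the low 8 bits of each lane; if none of those bits are
    // demanded, the shifts are redundant and Val can be used directly.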
23896    SDValue ShiftL = Op;
23897    SDValue ShiftR = Op->getOperand(0);
23898    if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
23899      return false;
23900
23901    if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
23902      return false;
23903
23904    unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
23905    unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
23906
    // Other cases can be handled as well, but this is not implemented.
23909    if (ShiftRBits != ShiftLBits)
23910      return false;
23911
23912    unsigned ScalarSize = Op.getScalarValueSizeInBits();
23913    assert(ScalarSize > ShiftLBits && "Invalid shift imm");
23914
23915    APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
23916    APInt UnusedBits = ~OriginalDemandedBits;
23917
23918    if ((ZeroBits & UnusedBits) != ZeroBits)
23919      return false;
23920
23921    // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
23922    // used - simplify to just Val.
23923    return TLO.CombineTo(Op, ShiftR->getOperand(0));
23924  }
23925  case ISD::INTRINSIC_WO_CHAIN: {
23926    if (auto ElementSize = IsSVECntIntrinsic(Op)) {
23927      unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
23928      if (!MaxSVEVectorSizeInBits)
23929        MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
23930      unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
23931      // The SVE count intrinsics don't support the multiplier immediate so we
23932      // don't have to account for that here. The value returned may be slightly
23933      // over the true required bits, as this is based on the "ALL" pattern. The
23934      // other patterns are also exposed by these intrinsics, but they all
23935      // return a value that's strictly less than "ALL".
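      // For example (assuming a 2048-bit maximum vector length), CNTB can
      // return at most 256, so only the low 9 bits of its result can ever be
      // set and all higher bits are known to be zero.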
23936      unsigned RequiredBits = llvm::bit_width(MaxElements);
23937      unsigned BitWidth = Known.Zero.getBitWidth();
23938      if (RequiredBits < BitWidth)
23939        Known.Zero.setHighBits(BitWidth - RequiredBits);
23940      return false;
23941    }
23942  }
23943  }
23944
23945  return TargetLowering::SimplifyDemandedBitsForTargetNode(
23946      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
23947}
23948
23949bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
23950  return Op.getOpcode() == AArch64ISD::DUP ||
23951         Op.getOpcode() == AArch64ISD::MOVI ||
23952         (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23953          Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
23954         TargetLowering::isTargetCanonicalConstantNode(Op);
23955}
23956
23957bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
23958    unsigned Opc, LLT Ty1, LLT Ty2) const {
23959  return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
23960}
23961
23962bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
23963  return Subtarget->hasComplxNum();
23964}
23965
23966bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
23967    ComplexDeinterleavingOperation Operation, Type *Ty) const {
23968  auto *VTy = dyn_cast<FixedVectorType>(Ty);
23969  if (!VTy)
23970    return false;
23971
23972  auto *ScalarTy = VTy->getScalarType();
23973  unsigned NumElements = VTy->getNumElements();
23974
23975  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
23976  if ((VTyWidth < 128 && VTyWidth != 64) || !llvm::isPowerOf2_32(VTyWidth))
23977    return false;
23978
23979  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
23980         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
23981}
23982
23983Value *AArch64TargetLowering::createComplexDeinterleavingIR(
23984    Instruction *I, ComplexDeinterleavingOperation OperationType,
23985    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
23986    Value *Accumulator) const {
23987  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
23988
23989  IRBuilder<> B(I);
23990
23991  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
23992
23993  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
23994         "Vector type must be either 64 or a power of 2 that is at least 128");
23995
23996  if (TyWidth > 128) {
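    // Illustrative example: a 256-bit <8 x float> operation is split into two
    // <4 x float> halves using shuffle masks, each half is handled recursively
    // (eventually mapping onto a 128-bit operation), and the two results are
    // concatenated back together.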
23997    int Stride = Ty->getNumElements() / 2;
23998    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
23999    auto SplitSeqVec = llvm::to_vector(SplitSeq);
24000    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
24001    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
24002
24003    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
24004    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
24005    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
24006    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
24007    Value *LowerSplitAcc = nullptr;
24008    Value *UpperSplitAcc = nullptr;
24009
24010    if (Accumulator) {
24011      LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
24012      UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
24013    }
24014
24015    auto *LowerSplitInt = createComplexDeinterleavingIR(
24016        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
24017    auto *UpperSplitInt = createComplexDeinterleavingIR(
24018        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
24019
24020    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
24021    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
24022  }
24023
24024  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
24025    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
24026                              Intrinsic::aarch64_neon_vcmla_rot90,
24027                              Intrinsic::aarch64_neon_vcmla_rot180,
24028                              Intrinsic::aarch64_neon_vcmla_rot270};
24029
24030    if (Accumulator == nullptr)
24031      Accumulator = ConstantFP::get(Ty, 0);
24032
24033    return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
24034                             {Accumulator, InputB, InputA});
24035  }
24036
24037  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
24038    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
24039    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
24040      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
24041    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
24042      IntId = Intrinsic::aarch64_neon_vcadd_rot270;
24043
24044    if (IntId == Intrinsic::not_intrinsic)
24045      return nullptr;
24046
24047    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
24048  }
24049
24050  return nullptr;
24051}
24052