//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
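  // For example, a value such as 0x123456789ABCDEF0, whose four 16-bit chunks
  // are all distinct and non-trivial, typically expands to one MOVZ plus three
  // MOVKs, giving a cost of 4.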
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
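  // For example, an i128 constant whose low and high 64-bit halves are both
  // encodable logical immediates contributes 0 per chunk and is then clamped
  // to an overall cost of 1 below.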
  int Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1, Cost);
}

int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                      const APInt &Imm, Type *Ty,
                                      TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                        const APInt &Imm, Type *Ty,
                                        TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {

  // A helper that returns a vector type from the given type. The number of
  // elements is taken from DstTy, and the element type from ArgTy.
  auto toVectorTy = [&](Type *ArgTy) {
    return FixedVectorType::get(ArgTy->getScalarType(),
                                cast<FixedVectorType>(DstTy)->getNumElements());
  };

  // Exit early if DstTy is not a vector type whose elements are at least
  // 16 bits wide.
  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
    return false;

  // Determine if the operation has a widening variant. We consider both the
  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
  // instructions.
  //
  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
  //       verify that their extending operands are eliminated during code
  //       generation.
  switch (Opcode) {
  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
    break;
  default:
    return false;
  }

  // To be a widening instruction (either the "wide" or "long" version), the
  // second operand must be a sign- or zero-extend with a single user. We only
  // consider extends having a single user because they may otherwise not be
  // eliminated.
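  // For example (the "wide" form), in IR resembling:
  //   %e = zext <8 x i8> %b to <8 x i16>
  //   %r = add <8 x i16> %a, %e
  // the zext folds into a uaddw and has no cost of its own.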
  if (Args.size() != 2 ||
      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
      !Args[1]->hasOneUse())
    return false;
  auto *Extend = cast<CastInst>(Args[1]);

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  auto *SrcTy = toVectorTy(Extend->getSrcTy());
  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
  unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
  unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();

  // Return true if the legalized types have the same number of vector elements
  // and the destination element type size is twice that of the source type.
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
}

int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                     TTI::TargetCostKind CostKind,
                                     const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // If the cast is observable, and it is used by a widening instruction (e.g.,
  // uaddl, saddw, etc.), it may be free.
  if (I && I->hasOneUse()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
      // If the cast is the second operand, it is free. We will generate either
      // a "wide" or "long" version of the widening instruction.
      if (I == SingleUser->getOperand(1))
        return 0;
      // If the cast is not the second operand, it will be free if it looks the
      // same as the second operand. In this case, we will generate a "long"
      // version of the widening instruction.
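      // For example, with both operands extended from the same source type:
      //   %x = sext <8 x i8> %a to <8 x i16>
      //   %y = sext <8 x i8> %b to <8 x i16>
      //   %r = add <8 x i16> %x, %y
      // both extends fold into a single saddl.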
      if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
        if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
            cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
          return 0;
    }
  }

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](int Cost) {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));

  static const TypeConversionCostTblEntry
  ConversionTbl[] = {
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },

    // The number of shll instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v8f32
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },

    // Complex: to v16f32
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },


    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },

    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },

    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
  };

  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
                                                 DstTy.getSimpleVT(),
                                                 SrcTy.getSimpleVT()))
    return AdjustCost(Entry->Cost);

  return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}

int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                             VectorType *VecTy,
                                             unsigned Index) {

  // Make sure we were given a valid extend opcode.
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
         "Invalid opcode");

  // We are extending an element we extract from a vector, so the source type
  // of the extend is the element type of the vector.
  auto *Src = VecTy->getElementType();

  // Sign- and zero-extends are for integer types only.
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  // Get the cost for the extract. We compute the cost (if any) for the extend
  // below.
  auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);

  // Legalize the types.
  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
  auto DstVT = TLI->getValueType(DL, Dst);
  auto SrcVT = TLI->getValueType(DL, Src);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If the resulting type is still a vector and the destination type is legal,
  // we may get the extension for free. If not, get the default cost for the
  // extend.
  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
    return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);

  // The destination type should be larger than the element type. If not, get
  // the default cost for the extend.
  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
    return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);

  switch (Opcode) {
  default:
    llvm_unreachable("Opcode should be either SExt or ZExt");

  // For sign-extends, we only need a smov, which performs the extension
  // automatically.
  case Instruction::SExt:
    return Cost;

  // For zero-extends, the extend is performed automatically by a umov unless
  // the destination type is i64 and the element type is i8 or i16.
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
      return Cost;
  }

  // If we are unable to perform the extend for free, get the default cost.
  return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
}

unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
                                        TTI::TargetCostKind CostKind) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
  // Branches are assumed to be predicted.
  return 0;
}

int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                       unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other insert/extracts cost this much.
  return ST->getVectorInsertExtractBaseCost();
}

int AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
  // add in the widening overhead specified by the sub-target. Since the
  // extends feeding widening instructions are performed automatically, they
  // aren't present in the generated code and have a zero cost. By adding a
  // widening overhead here, we attach the total cost of the combined operation
  // to the widening instruction.
  int Cost = 0;
  if (isWideningInstruction(Ty, Opcode, Args))
    Cost += ST->getWideningBaseCost();

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                                Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  case ISD::SDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
      // On AArch64, scalar signed division by a power-of-two constant is
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties may not be the same as those of the
      // previous operation; conservatively assume OP_None.
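      // For example, "sdiv w0, w0, #8" typically becomes something like:
      //   add  w8, w0, #7
      //   cmp  w0, #0
      //   csel w8, w8, w0, lt
      //   asr  w0, w8, #3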
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      return Cost;
    }
    LLVM_FALLTHROUGH;
  case ISD::UDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
      auto VT = TLI->getValueType(DL, Ty);
      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by a constant is expanded to the sequence
        // MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division to
        // MULHU + SUB + SRL + ADD + SRL.
        int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                             Opd1Info, Opd2Info,
                                             TargetTransformInfo::OP_None,
                                             TargetTransformInfo::OP_None);
        int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                             Opd1Info, Opd2Info,
                                             TargetTransformInfo::OP_None,
                                             TargetTransformInfo::OP_None);
        int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                             Opd1Info, Opd2Info,
                                             TargetTransformInfo::OP_None,
                                             TargetTransformInfo::OP_None);
        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
      }
    }

    Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                          Opd2Info,
                                          Opd1PropInfo, Opd2PropInfo);
    if (Ty->isVectorTy()) {
      // On AArch64, vector divisions are not supported natively and are
      // expanded into scalar divisions of each pair of elements.
      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);
      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);
      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;

  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return (Cost + 1) * LT.first;

  case ISD::FADD:
    // These nodes are marked as 'custom' just to lower them to SVE.
    // We know said lowering will incur no additional cost.
    if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty())
      return (Cost + 2) * LT.first;

    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                                Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  }
}

int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                              const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (Ty->isVectorTy() && SE &&
      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}

int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                       Type *CondTy,
                                       TTI::TargetCostKind CostKind,
                                       const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower vector selects that are wider than the register width
  // particularly well.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization happening.
    const int AmortizationCost = 20;
    static const TypeConversionCostTblEntry
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
}

AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  if (ST->requiresStrictAlign()) {
    // TODO: Add cost modeling for strict align. Misaligned loads expand to
    // a bunch of instructions when strict align is enabled.
    return Options;
  }
  Options.AllowOverlappingLoads = true;
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;
  // TODO: Though vector loads usually perform well on AArch64, in some targets
  // they may wake up the FP unit, which raises the power consumption. Perhaps
  // they could be used with no holds barred (-O3).
  Options.LoadSizes = {8, 4, 2, 1};
  return Options;
}

int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                    MaybeAlign Alignment, unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind,
                                    const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Ty, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);

  auto LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because of the negative impact that has been
    // observed in practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
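    // For example, a misaligned store of <4 x i32> (LT.first == 1) is costed
    // at 1 * 2 * 6 = 12 below.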
    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Ty->isVectorTy() &&
      cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
    unsigned ProfitableNumElements;
    if (Opcode == Instruction::Store)
      // We use a custom trunc store lowering so v.4b should be profitable.
      ProfitableNumElements = 4;
    else
      // We scalarize the loads because there is no v.4b register and we have
      // to promote the elements to a wider type.
      ProfitableNumElements = 8;

    if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) {
      unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
      // We generate 2 instructions per vector element.
      return NumVectorizableInstsToAmortize * NumVecElts * 2;
    }
  }

  return LT.first;
}

int AArch64TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  auto *VecVTy = cast<FixedVectorType>(VecTy);

  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned NumElts = VecVTy->getNumElements();
    auto *SubVecTy =
        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // ldN/stN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one ldN/stN instruction.
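    // For example, a de-interleaving <8 x i32> load with Factor == 2 uses a
    // <4 x i32> sub-vector type and maps to a single ld2, giving a cost of
    // Factor * 1 == 2.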
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  int Cost = 0;
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  for (auto *I : Tys) {
    if (!I->isVectorTy())
      continue;
    if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
        128)
      Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
              getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
  }
  return Cost;
}

unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return ST->getMaxInterleaveFactor();
}

// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources.  We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
// many strided loads.
static void
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                              TargetTransformInfo::UnrollingPreferences &UP) {
  enum { MaxStridedLoads = 7 };
  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
    int StridedLoads = 0;
    // FIXME? We could make this more precise by looking at the CFG and
    // e.g. not counting loads in each side of an if-then-else diamond.
    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(&I);
        if (!LMemI)
          continue;

        Value *PtrValue = LMemI->getPointerOperand();
        if (L->isLoopInvariant(PtrValue))
          continue;

        const SCEV *LSCEV = SE.getSCEV(PtrValue);
        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
          continue;

        // FIXME? We could take pairing of unrolled load copies into account
        // by looking at the AddRec, but we would probably have to limit this
        // to loops with no stores or other memory optimization barriers.
        ++StridedLoads;
        // We've seen enough strided loads that seeing more won't make a
        // difference.
        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;
      }
    }
    return StridedLoads;
  };

  int StridedLoads = countStridedLoads(L, SE);
  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
                    << " strided loads\n");
  // Pick the largest power of 2 unroll count that won't result in too many
  // strided loads.
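  // For example, with 3 strided loads detected, MaxCount becomes
  // 1 << Log2_32(7 / 3) == 2.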
  if (StridedLoads) {
    UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
                      << UP.MaxCount << '\n');
  }
}

void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  // Enable partial unrolling and runtime unrolling.
  BaseT::getUnrollingPreferences(L, SE, UP);

  // Inner loops are more likely to be hot, and a runtime check can often be
  // hoisted out by LICM, so the overhead is lower; use a larger threshold to
  // unroll more loops.
  if (L->getLoopDepth() > 1)
    UP.PartialThreshold *= 2;

  // Disable partial & runtime unrolling on -Os.
  UP.PartialOptSizeThreshold = 0;

  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
      EnableFalkorHWPFUnrollFix)
    getFalkorUnrollingPreferences(L, SE, UP);
}

void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {
  switch (Inst->getIntrinsicID()) {
  default:
    return nullptr;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // The expected result type must be a struct with matching element types.
    StructType *ST = dyn_cast<StructType>(ExpectedType);
    if (!ST)
      return nullptr;
    unsigned NumElts = Inst->getNumArgOperands() - 1;
    if (ST->getNumElements() != NumElts)
      return nullptr;
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
        return nullptr;
    }
    Value *Res = UndefValue::get(ExpectedType);
    IRBuilder<> Builder(Inst);
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      Value *L = Inst->getArgOperand(i);
      Res = Builder.CreateInsertValue(Res, L, i);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
      return Inst;
    return nullptr;
  }
}

bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  default:
    break;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.PtrVal = Inst->getArgOperand(0);
    break;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
    break;
  }

  switch (Inst->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  }
  return true;
}

/// See if \p I should be considered for address type promotion. We check if
/// \p I is a sext with the right type and used in memory accesses. If it is
/// used in a "complex" getelementptr, we allow it to be promoted without
/// finding other sext instructions that sign-extended the same initial value.
/// A getelementptr is considered "complex" if it has more than 2 operands.
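/// For example, a sext result feeding
///   getelementptr inbounds [16 x i32], [16 x i32]* %base, i64 0, i64 %idx
/// (a GEP with more than 2 operands) may be promoted without requiring a
/// common header.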
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))
    return false;
  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;
  // See if the sext is the one with the right type and used in at least one
  // GetElementPtrInst.
  for (const User *U : I.users()) {
    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
      Considerable = true;
      // A getelementptr is considered "complex" if it has more than 2
      // operands. We will promote a SExt used in such a complex GEP, as we
      // expect some of the computation to be merged if it is done on 64 bits.
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
        break;
      }
    }
  }
  return Considerable;
}

bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                           TTI::ReductionFlags Flags) const {
  auto *VTy = cast<VectorType>(Ty);
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  switch (Opcode) {
  case Instruction::FAdd:
  case Instruction::FMul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
    return false;
  case Instruction::Add:
    return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
  case Instruction::ICmp:
    return (ScalarBits < 64) &&
           (ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128);
  case Instruction::FCmp:
    return Flags.NoNaN;
  default:
    llvm_unreachable("Unhandled reduction opcode");
  }
  return false;
}

int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode,
                                               VectorType *ValTy,
                                               bool IsPairwiseForm,
                                               TTI::TargetCostKind CostKind) {

  if (IsPairwiseForm)
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
                                             CostKind);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as normal vector adds. This is the only arithmetic vector
  // reduction operation for which we have an instruction.
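  // For example, a full <4 x i32> add reduction lowers to a single
  // "addv s0, v0.4s" and is costed as 1 in the table below.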
  static const CostTblEntry CostTblNoPairwise[]{
      {ISD::ADD, MVT::v8i8,  1},
      {ISD::ADD, MVT::v16i8, 1},
      {ISD::ADD, MVT::v4i16, 1},
      {ISD::ADD, MVT::v8i16, 1},
      {ISD::ADD, MVT::v4i32, 1},
  };

  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
    return LT.first * Entry->Cost;

  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
                                           CostKind);
}

int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                   int Index, VectorType *SubTp) {
  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
    static const CostTblEntry ShuffleTbl[] = {
      // Broadcast shuffle kinds can be performed with 'dup'.
      { TTI::SK_Broadcast, MVT::v8i8,  1 },
      { TTI::SK_Broadcast, MVT::v16i8, 1 },
      { TTI::SK_Broadcast, MVT::v4i16, 1 },
      { TTI::SK_Broadcast, MVT::v8i16, 1 },
      { TTI::SK_Broadcast, MVT::v2i32, 1 },
      { TTI::SK_Broadcast, MVT::v4i32, 1 },
      { TTI::SK_Broadcast, MVT::v2i64, 1 },
      { TTI::SK_Broadcast, MVT::v2f32, 1 },
      { TTI::SK_Broadcast, MVT::v4f32, 1 },
      { TTI::SK_Broadcast, MVT::v2f64, 1 },
      // Transpose shuffle kinds can be performed with 'trn1/trn2' and
      // 'zip1/zip2' instructions.
      { TTI::SK_Transpose, MVT::v8i8,  1 },
      { TTI::SK_Transpose, MVT::v16i8, 1 },
      { TTI::SK_Transpose, MVT::v4i16, 1 },
      { TTI::SK_Transpose, MVT::v8i16, 1 },
      { TTI::SK_Transpose, MVT::v2i32, 1 },
      { TTI::SK_Transpose, MVT::v4i32, 1 },
      { TTI::SK_Transpose, MVT::v2i64, 1 },
      { TTI::SK_Transpose, MVT::v2f32, 1 },
      { TTI::SK_Transpose, MVT::v4f32, 1 },
      { TTI::SK_Transpose, MVT::v2f64, 1 },
      // Select shuffle kinds.
      // TODO: handle vXi8/vXi16.
      { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
      { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
      // PermuteSingleSrc shuffle kinds.
      // TODO: handle vXi8/vXi16.
      { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
    };
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}