1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/IR/BasicBlock.h"
19#include "llvm/IR/DataLayout.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/Instruction.h"
22#include "llvm/IR/Instructions.h"
23#include "llvm/IR/IntrinsicInst.h"
24#include "llvm/IR/Intrinsics.h"
25#include "llvm/IR/IntrinsicsARM.h"
26#include "llvm/IR/PatternMatch.h"
27#include "llvm/IR/Type.h"
28#include "llvm/MC/SubtargetFeature.h"
29#include "llvm/Support/Casting.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Support/MachineValueType.h"
32#include "llvm/Target/TargetMachine.h"
33#include "llvm/Transforms/InstCombine/InstCombiner.h"
34#include "llvm/Transforms/Utils/Local.h"
35#include "llvm/Transforms/Utils/LoopUtils.h"
36#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
47static cl::opt<bool> EnableMaskedLoadStores(
48  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49  cl::desc("Enable the generation of masked loads and stores"));
50
51static cl::opt<bool> DisableLowOverheadLoops(
52  "disable-arm-loloops", cl::Hidden, cl::init(false),
53  cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57                  cl::desc("Enable the generation of WLS loops"));
58
59extern cl::opt<TailPredication::Mode> EnableTailPredication;
60
61extern cl::opt<bool> EnableMaskedGatherScatters;
62
63extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
64
65/// Convert a vector load intrinsic into a simple llvm load instruction.
66/// This is beneficial when the underlying object being addressed comes
67/// from a constant, since we get constant-folding for free.
68static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
69                               InstCombiner::BuilderTy &Builder) {
70  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
71
72  if (!IntrAlign)
73    return nullptr;
74
75  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
76                           ? MemAlign
77                           : IntrAlign->getLimitedValue();
78
79  if (!isPowerOf2_32(Alignment))
80    return nullptr;
81
82  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
83                                          PointerType::get(II.getType(), 0));
84  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
85}
86
87bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
88                                     const Function *Callee) const {
89  const TargetMachine &TM = getTLI()->getTargetMachine();
90  const FeatureBitset &CallerBits =
91      TM.getSubtargetImpl(*Caller)->getFeatureBits();
92  const FeatureBitset &CalleeBits =
93      TM.getSubtargetImpl(*Callee)->getFeatureBits();
94
95  // To inline a callee, all features not in the allowed list must match exactly.
96  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
97                    (CalleeBits & ~InlineFeaturesAllowed);
98  // For features in the allowed list, the callee's features must be a subset of
99  // the callers'.
100  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
101                     (CalleeBits & InlineFeaturesAllowed);
102  return MatchExact && MatchSubset;
103}
104
105TTI::AddressingModeKind
106ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
107                                       ScalarEvolution *SE) const {
108  if (ST->hasMVEIntegerOps())
109    return TTI::AMK_PostIndexed;
110
111  if (L->getHeader()->getParent()->hasOptSize())
112    return TTI::AMK_None;
113
114  if (ST->isMClass() && ST->isThumb2() &&
115      L->getNumBlocks() == 1)
116    return TTI::AMK_PreIndexed;
117
118  return TTI::AMK_None;
119}
120
121std::optional<Instruction *>
122ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
123  using namespace PatternMatch;
124  Intrinsic::ID IID = II.getIntrinsicID();
125  switch (IID) {
126  default:
127    break;
128  case Intrinsic::arm_neon_vld1: {
129    Align MemAlign =
130        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
131                          &IC.getAssumptionCache(), &IC.getDominatorTree());
132    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
133      return IC.replaceInstUsesWith(II, V);
134    }
135    break;
136  }
137
138  case Intrinsic::arm_neon_vld2:
139  case Intrinsic::arm_neon_vld3:
140  case Intrinsic::arm_neon_vld4:
141  case Intrinsic::arm_neon_vld2lane:
142  case Intrinsic::arm_neon_vld3lane:
143  case Intrinsic::arm_neon_vld4lane:
144  case Intrinsic::arm_neon_vst1:
145  case Intrinsic::arm_neon_vst2:
146  case Intrinsic::arm_neon_vst3:
147  case Intrinsic::arm_neon_vst4:
148  case Intrinsic::arm_neon_vst2lane:
149  case Intrinsic::arm_neon_vst3lane:
150  case Intrinsic::arm_neon_vst4lane: {
151    Align MemAlign =
152        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
153                          &IC.getAssumptionCache(), &IC.getDominatorTree());
154    unsigned AlignArg = II.arg_size() - 1;
155    Value *AlignArgOp = II.getArgOperand(AlignArg);
156    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
157    if (Align && *Align < MemAlign) {
158      return IC.replaceOperand(
159          II, AlignArg,
160          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
161                           false));
162    }
163    break;
164  }
165
166  case Intrinsic::arm_mve_pred_i2v: {
167    Value *Arg = II.getArgOperand(0);
168    Value *ArgArg;
169    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
170                       PatternMatch::m_Value(ArgArg))) &&
171        II.getType() == ArgArg->getType()) {
172      return IC.replaceInstUsesWith(II, ArgArg);
173    }
174    Constant *XorMask;
175    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
176                             PatternMatch::m_Value(ArgArg)),
177                         PatternMatch::m_Constant(XorMask))) &&
178        II.getType() == ArgArg->getType()) {
179      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
180        if (CI->getValue().trunc(16).isAllOnes()) {
181          auto TrueVector = IC.Builder.CreateVectorSplat(
182              cast<FixedVectorType>(II.getType())->getNumElements(),
183              IC.Builder.getTrue());
184          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
185        }
186      }
187    }
188    KnownBits ScalarKnown(32);
189    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
190                                ScalarKnown, 0)) {
191      return &II;
192    }
193    break;
194  }
195  case Intrinsic::arm_mve_pred_v2i: {
196    Value *Arg = II.getArgOperand(0);
197    Value *ArgArg;
198    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
199                       PatternMatch::m_Value(ArgArg)))) {
200      return IC.replaceInstUsesWith(II, ArgArg);
201    }
202    if (!II.getMetadata(LLVMContext::MD_range)) {
203      Type *IntTy32 = Type::getInt32Ty(II.getContext());
204      Metadata *M[] = {
205          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
206          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
207      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
208      return &II;
209    }
210    break;
211  }
212  case Intrinsic::arm_mve_vadc:
213  case Intrinsic::arm_mve_vadc_predicated: {
214    unsigned CarryOp =
215        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
216    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
217           "Bad type for intrinsic!");
218
219    KnownBits CarryKnown(32);
220    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
221                                CarryKnown)) {
222      return &II;
223    }
224    break;
225  }
226  case Intrinsic::arm_mve_vmldava: {
227    Instruction *I = cast<Instruction>(&II);
228    if (I->hasOneUse()) {
229      auto *User = cast<Instruction>(*I->user_begin());
230      Value *OpZ;
231      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
232          match(I->getOperand(3), m_Zero())) {
233        Value *OpX = I->getOperand(4);
234        Value *OpY = I->getOperand(5);
235        Type *OpTy = OpX->getType();
236
237        IC.Builder.SetInsertPoint(User);
238        Value *V =
239            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
240                                       {I->getOperand(0), I->getOperand(1),
241                                        I->getOperand(2), OpZ, OpX, OpY});
242
243        IC.replaceInstUsesWith(*User, V);
244        return IC.eraseInstFromFunction(*User);
245      }
246    }
247    return std::nullopt;
248  }
249  }
250  return std::nullopt;
251}
252
253std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
254    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
255    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
256    std::function<void(Instruction *, unsigned, APInt, APInt &)>
257        SimplifyAndSetOp) const {
258
259  // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
260  // opcode specifying a Top/Bottom instruction, which can change between
261  // instructions.
262  auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
263    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
264    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
265
266    // The only odd/even lanes of operand 0 will only be demanded depending
267    // on whether this is a top/bottom instruction.
268    APInt DemandedElts =
269        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
270                                       : APInt::getHighBitsSet(2, 1));
271    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
272    // The other lanes will be defined from the inserted elements.
273    UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
274                                                 : APInt::getHighBitsSet(2, 1));
275    return std::nullopt;
276  };
277
278  switch (II.getIntrinsicID()) {
279  default:
280    break;
281  case Intrinsic::arm_mve_vcvt_narrow:
282    SimplifyNarrowInstrTopBottom(2);
283    break;
284  case Intrinsic::arm_mve_vqmovn:
285    SimplifyNarrowInstrTopBottom(4);
286    break;
287  case Intrinsic::arm_mve_vshrn:
288    SimplifyNarrowInstrTopBottom(7);
289    break;
290  }
291
292  return std::nullopt;
293}
294
295InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
296                                          TTI::TargetCostKind CostKind) {
297  assert(Ty->isIntegerTy());
298
299 unsigned Bits = Ty->getPrimitiveSizeInBits();
300 if (Bits == 0 || Imm.getActiveBits() >= 64)
301   return 4;
302
303  int64_t SImmVal = Imm.getSExtValue();
304  uint64_t ZImmVal = Imm.getZExtValue();
305  if (!ST->isThumb()) {
306    if ((SImmVal >= 0 && SImmVal < 65536) ||
307        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
308        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
309      return 1;
310    return ST->hasV6T2Ops() ? 2 : 3;
311  }
312  if (ST->isThumb2()) {
313    if ((SImmVal >= 0 && SImmVal < 65536) ||
314        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
315        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
316      return 1;
317    return ST->hasV6T2Ops() ? 2 : 3;
318  }
319  // Thumb1, any i8 imm cost 1.
320  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
321    return 1;
322  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
323    return 2;
324  // Load from constantpool.
325  return 3;
326}
327
328// Constants smaller than 256 fit in the immediate field of
329// Thumb1 instructions so we return a zero cost and 1 otherwise.
330InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
331                                                  const APInt &Imm, Type *Ty) {
332  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
333    return 0;
334
335  return 1;
336}
337
338// Checks whether Inst is part of a min(max()) or max(min()) pattern
339// that will match to an SSAT instruction. Returns the instruction being
340// saturated, or null if no saturation pattern was found.
341static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
342  Value *LHS, *RHS;
343  ConstantInt *C;
344  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
345
346  if (InstSPF == SPF_SMAX &&
347      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
348      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
349
350    auto isSSatMin = [&](Value *MinInst) {
351      if (isa<SelectInst>(MinInst)) {
352        Value *MinLHS, *MinRHS;
353        ConstantInt *MinC;
354        SelectPatternFlavor MinSPF =
355            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
356        if (MinSPF == SPF_SMIN &&
357            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
358            MinC->getValue() == ((-Imm) - 1))
359          return true;
360      }
361      return false;
362    };
363
364    if (isSSatMin(Inst->getOperand(1)))
365      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
366    if (Inst->hasNUses(2) &&
367        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
368      return Inst->getOperand(1);
369  }
370  return nullptr;
371}
372
373// Look for a FP Saturation pattern, where the instruction can be simplified to
374// a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
375static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
376  if (Imm.getBitWidth() != 64 ||
377      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
378    return false;
379  Value *FP = isSSATMinMaxPattern(Inst, Imm);
380  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
381    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
382  if (!FP)
383    return false;
384  return isa<FPToSIInst>(FP);
385}
386
387InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
388                                              const APInt &Imm, Type *Ty,
389                                              TTI::TargetCostKind CostKind,
390                                              Instruction *Inst) {
391  // Division by a constant can be turned into multiplication, but only if we
392  // know it's constant. So it's not so much that the immediate is cheap (it's
393  // not), but that the alternative is worse.
394  // FIXME: this is probably unneeded with GlobalISel.
395  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
396       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
397      Idx == 1)
398    return 0;
399
400  // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
401  // splitting any large offsets.
402  if (Opcode == Instruction::GetElementPtr && Idx != 0)
403    return 0;
404
405  if (Opcode == Instruction::And) {
406    // UXTB/UXTH
407    if (Imm == 255 || Imm == 65535)
408      return 0;
409    // Conversion to BIC is free, and means we can use ~Imm instead.
410    return std::min(getIntImmCost(Imm, Ty, CostKind),
411                    getIntImmCost(~Imm, Ty, CostKind));
412  }
413
414  if (Opcode == Instruction::Add)
415    // Conversion to SUB is free, and means we can use -Imm instead.
416    return std::min(getIntImmCost(Imm, Ty, CostKind),
417                    getIntImmCost(-Imm, Ty, CostKind));
418
419  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
420      Ty->getIntegerBitWidth() == 32) {
421    int64_t NegImm = -Imm.getSExtValue();
422    if (ST->isThumb2() && NegImm < 1<<12)
423      // icmp X, #-C -> cmn X, #C
424      return 0;
425    if (ST->isThumb() && NegImm < 1<<8)
426      // icmp X, #-C -> adds X, #C
427      return 0;
428  }
429
430  // xor a, -1 can always be folded to MVN
431  if (Opcode == Instruction::Xor && Imm.isAllOnes())
432    return 0;
433
434  // Ensures negative constant of min(max()) or max(min()) patterns that
435  // match to SSAT instructions don't get hoisted
436  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
437      Ty->getIntegerBitWidth() <= 32) {
438    if (isSSATMinMaxPattern(Inst, Imm) ||
439        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
440         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
441      return 0;
442  }
443
444  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
445    return 0;
446
447  // We can convert <= -1 to < 0, which is generally quite cheap.
448  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
449    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
450    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
451      return std::min(getIntImmCost(Imm, Ty, CostKind),
452                      getIntImmCost(Imm + 1, Ty, CostKind));
453  }
454
455  return getIntImmCost(Imm, Ty, CostKind);
456}
457
458InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
459                                           TTI::TargetCostKind CostKind,
460                                           const Instruction *I) {
461  if (CostKind == TTI::TCK_RecipThroughput &&
462      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
463    // FIXME: The vectorizer is highly sensistive to the cost of these
464    // instructions, which suggests that it may be using the costs incorrectly.
465    // But, for now, just make them free to avoid performance regressions for
466    // vector targets.
467    return 0;
468  }
469  return BaseT::getCFInstrCost(Opcode, CostKind, I);
470}
471
472InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
473                                             Type *Src,
474                                             TTI::CastContextHint CCH,
475                                             TTI::TargetCostKind CostKind,
476                                             const Instruction *I) {
477  int ISD = TLI->InstructionOpcodeToISD(Opcode);
478  assert(ISD && "Invalid opcode");
479
480  // TODO: Allow non-throughput costs that aren't binary.
481  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
482    if (CostKind != TTI::TCK_RecipThroughput)
483      return Cost == 0 ? 0 : 1;
484    return Cost;
485  };
486  auto IsLegalFPType = [this](EVT VT) {
487    EVT EltVT = VT.getScalarType();
488    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
489            (EltVT == MVT::f64 && ST->hasFP64()) ||
490            (EltVT == MVT::f16 && ST->hasFullFP16());
491  };
492
493  EVT SrcTy = TLI->getValueType(DL, Src);
494  EVT DstTy = TLI->getValueType(DL, Dst);
495
496  if (!SrcTy.isSimple() || !DstTy.isSimple())
497    return AdjustCost(
498        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
499
500  // Extending masked load/Truncating masked stores is expensive because we
501  // currently don't split them. This means that we'll likely end up
502  // loading/storing each element individually (hence the high cost).
503  if ((ST->hasMVEIntegerOps() &&
504       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
505        Opcode == Instruction::SExt)) ||
506      (ST->hasMVEFloatOps() &&
507       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
508       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
509    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
510      return 2 * DstTy.getVectorNumElements() *
511             ST->getMVEVectorCostFactor(CostKind);
512
513  // The extend of other kinds of load is free
514  if (CCH == TTI::CastContextHint::Normal ||
515      CCH == TTI::CastContextHint::Masked) {
516    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
517        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
518        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
519        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
520        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
521        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
522        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
523        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
524        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
525        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
526        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
527        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
528        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
529    };
530    if (const auto *Entry = ConvertCostTableLookup(
531            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
532      return AdjustCost(Entry->Cost);
533
534    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
535        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
536        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
537        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
538        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
539        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
540        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
541        // The following extend from a legal type to an illegal type, so need to
542        // split the load. This introduced an extra load operation, but the
543        // extend is still "free".
544        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
545        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
546        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
547        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
548        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
549        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
550    };
551    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
552      if (const auto *Entry =
553              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
554                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
555        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
556    }
557
558    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
559        // FPExtends are similar but also require the VCVT instructions.
560        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
561        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
562    };
563    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
564      if (const auto *Entry =
565              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
566                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
567        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
568    }
569
570    // The truncate of a store is free. This is the mirror of extends above.
571    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
572        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
573        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
574        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
575        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
576        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
577        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
578        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
579    };
580    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
581      if (const auto *Entry =
582              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
583                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
584        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
585    }
586
587    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
588        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
589        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
590    };
591    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
592      if (const auto *Entry =
593              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
594                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
595        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
596    }
597  }
598
599  // NEON vector operations that can extend their inputs.
600  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
601      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
602    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
603      // vaddl
604      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
605      { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
606      // vsubl
607      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
608      { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
609      // vmull
610      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
611      { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
612      // vshll
613      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
614      { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
615    };
616
617    auto *User = cast<Instruction>(*I->user_begin());
618    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
619    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
620                                             DstTy.getSimpleVT(),
621                                             SrcTy.getSimpleVT())) {
622      return AdjustCost(Entry->Cost);
623    }
624  }
625
626  // Single to/from double precision conversions.
627  if (Src->isVectorTy() && ST->hasNEON() &&
628      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
629        DstTy.getScalarType() == MVT::f32) ||
630       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
631        DstTy.getScalarType() == MVT::f64))) {
632    static const CostTblEntry NEONFltDblTbl[] = {
633        // Vector fptrunc/fpext conversions.
634        {ISD::FP_ROUND, MVT::v2f64, 2},
635        {ISD::FP_EXTEND, MVT::v2f32, 2},
636        {ISD::FP_EXTEND, MVT::v4f32, 4}};
637
638    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
639    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
640      return AdjustCost(LT.first * Entry->Cost);
641  }
642
643  // Some arithmetic, load and store operations have specific instructions
644  // to cast up/down their types automatically at no extra cost.
645  // TODO: Get these tables to know at least what the related operations are.
646  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
647    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
648    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
649    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
650    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
651    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
652    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
653
654    // The number of vmovl instructions for the extension.
655    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
656    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
657    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
658    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
659    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
660    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
661    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
662    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
663    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
664    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
665    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
666    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
667    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
668    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
669    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
670    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
671    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
672    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
673
674    // Operations that we legalize using splitting.
675    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
676    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
677
678    // Vector float <-> i32 conversions.
679    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
680    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
681
682    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
683    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
684    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
685    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
686    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
687    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
688    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
689    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
690    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
691    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
692    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
693    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
694    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
695    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
696    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
697    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
698    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
699    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
700    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
701    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
702
703    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
704    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
705    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
706    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
707    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
708    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
709
710    // Vector double <-> i32 conversions.
711    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
712    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
713
714    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
715    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
716    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
717    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
718    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
719    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
720
721    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
722    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
723    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
724    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
725    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
726    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
727  };
728
729  if (SrcTy.isVector() && ST->hasNEON()) {
730    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
731                                                   DstTy.getSimpleVT(),
732                                                   SrcTy.getSimpleVT()))
733      return AdjustCost(Entry->Cost);
734  }
735
736  // Scalar float to integer conversions.
737  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
738    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
739    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
740    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
741    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
742    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
743    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
744    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
745    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
746    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
747    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
748    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
749    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
750    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
751    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
752    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
753    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
754    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
755    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
756    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
757    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
758  };
759  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
760    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
761                                                   DstTy.getSimpleVT(),
762                                                   SrcTy.getSimpleVT()))
763      return AdjustCost(Entry->Cost);
764  }
765
766  // Scalar integer to float conversions.
767  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
768    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
769    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
770    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
771    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
772    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
773    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
774    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
775    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
776    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
777    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
778    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
779    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
780    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
781    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
782    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
783    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
784    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
785    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
786    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
787    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
788  };
789
790  if (SrcTy.isInteger() && ST->hasNEON()) {
791    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
792                                                   ISD, DstTy.getSimpleVT(),
793                                                   SrcTy.getSimpleVT()))
794      return AdjustCost(Entry->Cost);
795  }
796
797  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
798  // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
799  // are linearised so take more.
800  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
801    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
802    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
803    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
804    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
805    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
806    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
807    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
808    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
809    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
810    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
811    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
812    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
813  };
814
815  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
816    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
817                                                   ISD, DstTy.getSimpleVT(),
818                                                   SrcTy.getSimpleVT()))
819      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
820  }
821
822  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
823    // As general rule, fp converts that were not matched above are scalarized
824    // and cost 1 vcvt for each lane, so long as the instruction is available.
825    // If not it will become a series of function calls.
826    const InstructionCost CallCost =
827        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
828    int Lanes = 1;
829    if (SrcTy.isFixedLengthVector())
830      Lanes = SrcTy.getVectorNumElements();
831
832    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
833      return Lanes;
834    else
835      return Lanes * CallCost;
836  }
837
838  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
839      SrcTy.isFixedLengthVector()) {
840    // Treat a truncate with larger than legal source (128bits for MVE) as
841    // expensive, 2 instructions per lane.
842    if ((SrcTy.getScalarType() == MVT::i8 ||
843         SrcTy.getScalarType() == MVT::i16 ||
844         SrcTy.getScalarType() == MVT::i32) &&
845        SrcTy.getSizeInBits() > 128 &&
846        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
847      return SrcTy.getVectorNumElements() * 2;
848  }
849
850  // Scalar integer conversion costs.
851  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
852    // i16 -> i64 requires two dependent operations.
853    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
854
855    // Truncates on i64 are assumed to be free.
856    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
857    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
858    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
859    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
860  };
861
862  if (SrcTy.isInteger()) {
863    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
864                                                   DstTy.getSimpleVT(),
865                                                   SrcTy.getSimpleVT()))
866      return AdjustCost(Entry->Cost);
867  }
868
869  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
870                     ? ST->getMVEVectorCostFactor(CostKind)
871                     : 1;
872  return AdjustCost(
873      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
874}
875
876InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
877                                               TTI::TargetCostKind CostKind,
878                                               unsigned Index, Value *Op0,
879                                               Value *Op1) {
880  // Penalize inserting into an D-subregister. We end up with a three times
881  // lower estimated throughput on swift.
882  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
883      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
884    return 3;
885
886  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
887                        Opcode == Instruction::ExtractElement)) {
888    // Cross-class copies are expensive on many microarchitectures,
889    // so assume they are expensive by default.
890    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
891      return 3;
892
893    // Even if it's not a cross class copy, this likely leads to mixing
894    // of NEON and VFP code and should be therefore penalized.
895    if (ValTy->isVectorTy() &&
896        ValTy->getScalarSizeInBits() <= 32)
897      return std::max<InstructionCost>(
898          BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
899          2U);
900  }
901
902  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
903                                 Opcode == Instruction::ExtractElement)) {
904    // Integer cross-lane moves are more expensive than float, which can
905    // sometimes just be vmovs. Integer involve being passes to GPR registers,
906    // causing more of a delay.
907    std::pair<InstructionCost, MVT> LT =
908        getTypeLegalizationCost(ValTy->getScalarType());
909    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
910  }
911
912  return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
913}
914
915InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
916                                               Type *CondTy,
917                                               CmpInst::Predicate VecPred,
918                                               TTI::TargetCostKind CostKind,
919                                               const Instruction *I) {
920  int ISD = TLI->InstructionOpcodeToISD(Opcode);
921
922  // Thumb scalar code size cost for select.
923  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
924      ST->isThumb() && !ValTy->isVectorTy()) {
925    // Assume expensive structs.
926    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
927      return TTI::TCC_Expensive;
928
929    // Select costs can vary because they:
930    // - may require one or more conditional mov (including an IT),
931    // - can't operate directly on immediates,
932    // - require live flags, which we can't copy around easily.
933    InstructionCost Cost = getTypeLegalizationCost(ValTy).first;
934
935    // Possible IT instruction for Thumb2, or more for Thumb1.
936    ++Cost;
937
938    // i1 values may need rematerialising by using mov immediates and/or
939    // flag setting instructions.
940    if (ValTy->isIntegerTy(1))
941      ++Cost;
942
943    return Cost;
944  }
945
946  // If this is a vector min/max/abs, use the cost of that intrinsic directly
947  // instead. Hopefully when min/max intrinsics are more prevalent this code
948  // will not be needed.
949  const Instruction *Sel = I;
950  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
951      Sel->hasOneUse())
952    Sel = cast<Instruction>(Sel->user_back());
953  if (Sel && ValTy->isVectorTy() &&
954      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
955    const Value *LHS, *RHS;
956    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
957    unsigned IID = 0;
958    switch (SPF) {
959    case SPF_ABS:
960      IID = Intrinsic::abs;
961      break;
962    case SPF_SMIN:
963      IID = Intrinsic::smin;
964      break;
965    case SPF_SMAX:
966      IID = Intrinsic::smax;
967      break;
968    case SPF_UMIN:
969      IID = Intrinsic::umin;
970      break;
971    case SPF_UMAX:
972      IID = Intrinsic::umax;
973      break;
974    case SPF_FMINNUM:
975      IID = Intrinsic::minnum;
976      break;
977    case SPF_FMAXNUM:
978      IID = Intrinsic::maxnum;
979      break;
980    default:
981      break;
982    }
983    if (IID) {
984      // The ICmp is free, the select gets the cost of the min/max/etc
985      if (Sel != I)
986        return 0;
987      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
988      return getIntrinsicInstrCost(CostAttrs, CostKind);
989    }
990  }
991
992  // On NEON a vector select gets lowered to vbsl.
993  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
994    // Lowering of some vector selects is currently far from perfect.
995    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
996      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
997      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
998      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
999    };
1000
1001    EVT SelCondTy = TLI->getValueType(DL, CondTy);
1002    EVT SelValTy = TLI->getValueType(DL, ValTy);
1003    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1004      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1005                                                     SelCondTy.getSimpleVT(),
1006                                                     SelValTy.getSimpleVT()))
1007        return Entry->Cost;
1008    }
1009
1010    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1011    return LT.first;
1012  }
1013
1014  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1015      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1016      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1017    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1018    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1019    if (!VecCondTy)
1020      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1021
1022    // If we don't have mve.fp any fp operations will need to be scalarized.
1023    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1024      // One scalaization insert, one scalarization extract and the cost of the
1025      // fcmps.
1026      return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1027                                             /*Extract*/ true, CostKind) +
1028             BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1029                                             /*Extract*/ false, CostKind) +
1030             VecValTy->getNumElements() *
1031                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1032                                    VecCondTy->getScalarType(), VecPred,
1033                                    CostKind, I);
1034    }
1035
1036    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1037    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1038    // There are two types - the input that specifies the type of the compare
1039    // and the output vXi1 type. Because we don't know how the output will be
1040    // split, we may need an expensive shuffle to get two in sync. This has the
1041    // effect of making larger than legal compares (v8i32 for example)
1042    // expensive.
1043    if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1044      if (LT.first > 1)
1045        return LT.first * BaseCost +
1046               BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1047                                               /*Extract*/ false, CostKind);
1048      return BaseCost;
1049    }
1050  }
1051
1052  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1053  // for "multiple beats" potentially needed by MVE instructions.
1054  int BaseCost = 1;
1055  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1056    BaseCost = ST->getMVEVectorCostFactor(CostKind);
1057
1058  return BaseCost *
1059         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1060}
1061
1062InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1063                                                      ScalarEvolution *SE,
1064                                                      const SCEV *Ptr) {
1065  // Address computations in vectorized code with non-consecutive addresses will
1066  // likely result in more instructions compared to scalar code where the
1067  // computation can more often be merged into the index mode. The resulting
1068  // extra micro-ops can significantly decrease throughput.
1069  unsigned NumVectorInstToHideOverhead = 10;
1070  int MaxMergeDistance = 64;
1071
1072  if (ST->hasNEON()) {
1073    if (Ty->isVectorTy() && SE &&
1074        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1075      return NumVectorInstToHideOverhead;
1076
1077    // In many cases the address computation is not merged into the instruction
1078    // addressing mode.
1079    return 1;
1080  }
1081  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1082}
1083
1084bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1085  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1086    // If a VCTP is part of a chain, it's already profitable and shouldn't be
1087    // optimized, else LSR may block tail-predication.
1088    switch (II->getIntrinsicID()) {
1089    case Intrinsic::arm_mve_vctp8:
1090    case Intrinsic::arm_mve_vctp16:
1091    case Intrinsic::arm_mve_vctp32:
1092    case Intrinsic::arm_mve_vctp64:
1093      return true;
1094    default:
1095      break;
1096    }
1097  }
1098  return false;
1099}
1100
1101bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1102  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1103    return false;
1104
1105  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1106    // Don't support v2i1 yet.
1107    if (VecTy->getNumElements() == 2)
1108      return false;
1109
1110    // We don't support extending fp types.
1111     unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1112    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1113      return false;
1114  }
1115
1116  unsigned EltWidth = DataTy->getScalarSizeInBits();
1117  return (EltWidth == 32 && Alignment >= 4) ||
1118         (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1119}
1120
1121bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1122  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1123    return false;
1124
1125  unsigned EltWidth = Ty->getScalarSizeInBits();
1126  return ((EltWidth == 32 && Alignment >= 4) ||
1127          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1128}
1129
1130/// Given a memcpy/memset/memmove instruction, return the number of memory
1131/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1132/// call is used.
1133int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1134  MemOp MOp;
1135  unsigned DstAddrSpace = ~0u;
1136  unsigned SrcAddrSpace = ~0u;
1137  const Function *F = I->getParent()->getParent();
1138
1139  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1140    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1141    // If 'size' is not a constant, a library call will be generated.
1142    if (!C)
1143      return -1;
1144
1145    const unsigned Size = C->getValue().getZExtValue();
1146    const Align DstAlign = *MC->getDestAlign();
1147    const Align SrcAlign = *MC->getSourceAlign();
1148
1149    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1150                      /*IsVolatile*/ false);
1151    DstAddrSpace = MC->getDestAddressSpace();
1152    SrcAddrSpace = MC->getSourceAddressSpace();
1153  }
1154  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1155    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1156    // If 'size' is not a constant, a library call will be generated.
1157    if (!C)
1158      return -1;
1159
1160    const unsigned Size = C->getValue().getZExtValue();
1161    const Align DstAlign = *MS->getDestAlign();
1162
1163    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1164                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1165    DstAddrSpace = MS->getDestAddressSpace();
1166  }
1167  else
1168    llvm_unreachable("Expected a memcpy/move or memset!");
1169
1170  unsigned Limit, Factor = 2;
1171  switch(I->getIntrinsicID()) {
1172    case Intrinsic::memcpy:
1173      Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1174      break;
1175    case Intrinsic::memmove:
1176      Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1177      break;
1178    case Intrinsic::memset:
1179      Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1180      Factor = 1;
1181      break;
1182    default:
1183      llvm_unreachable("Expected a memcpy/move or memset!");
1184  }
1185
1186  // MemOps will be poplulated with a list of data types that needs to be
1187  // loaded and stored. That's why we multiply the number of elements by 2 to
1188  // get the cost for this memcpy.
1189  std::vector<EVT> MemOps;
1190  if (getTLI()->findOptimalMemOpLowering(
1191          MemOps, Limit, MOp, DstAddrSpace,
1192          SrcAddrSpace, F->getAttributes()))
1193    return MemOps.size() * Factor;
1194
1195  // If we can't find an optimal memop lowering, return the default cost
1196  return -1;
1197}
1198
1199InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1200  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1201
1202  // To model the cost of a library call, we assume 1 for the call, and
1203  // 3 for the argument setup.
1204  if (NumOps == -1)
1205    return 4;
1206  return NumOps;
1207}
1208
1209InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1210                                           VectorType *Tp, ArrayRef<int> Mask,
1211                                           TTI::TargetCostKind CostKind,
1212                                           int Index, VectorType *SubTp,
1213                                           ArrayRef<const Value *> Args) {
1214  Kind = improveShuffleKindFromMask(Kind, Mask);
1215  if (ST->hasNEON()) {
1216    if (Kind == TTI::SK_Broadcast) {
1217      static const CostTblEntry NEONDupTbl[] = {
1218          // VDUP handles these cases.
1219          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1220          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1221          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1222          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1223          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1224          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1225
1226          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1227          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1228          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1229          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1230
1231      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1232      if (const auto *Entry =
1233              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1234        return LT.first * Entry->Cost;
1235    }
1236    if (Kind == TTI::SK_Reverse) {
1237      static const CostTblEntry NEONShuffleTbl[] = {
1238          // Reverse shuffle cost one instruction if we are shuffling within a
1239          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1240          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1241          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1242          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1243          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1244          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1245          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1246
1247          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1248          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1249          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1250          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1251
1252      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1253      if (const auto *Entry =
1254              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1255        return LT.first * Entry->Cost;
1256    }
1257    if (Kind == TTI::SK_Select) {
1258      static const CostTblEntry NEONSelShuffleTbl[] = {
1259          // Select shuffle cost table for ARM. Cost is the number of
1260          // instructions
1261          // required to create the shuffled vector.
1262
1263          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1264          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1265          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1266          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1267
1268          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1269          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1270          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1271
1272          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1273
1274          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1275
1276      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1277      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1278                                              ISD::VECTOR_SHUFFLE, LT.second))
1279        return LT.first * Entry->Cost;
1280    }
1281  }
1282  if (ST->hasMVEIntegerOps()) {
1283    if (Kind == TTI::SK_Broadcast) {
1284      static const CostTblEntry MVEDupTbl[] = {
1285          // VDUP handles these cases.
1286          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1287          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1288          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1289          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1290          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1291
1292      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1293      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1294                                              LT.second))
1295        return LT.first * Entry->Cost *
1296               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1297    }
1298
1299    if (!Mask.empty()) {
1300      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1301      if (LT.second.isVector() &&
1302          Mask.size() <= LT.second.getVectorNumElements() &&
1303          (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1304           isVREVMask(Mask, LT.second, 64)))
1305        return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1306    }
1307  }
1308
1309  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1310                     ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1311                     : 1;
1312  return BaseCost *
1313         BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1314}
1315
1316InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1317    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1318    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1319    ArrayRef<const Value *> Args,
1320    const Instruction *CxtI) {
1321  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1322  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1323    // Make operations on i1 relatively expensive as this often involves
1324    // combining predicates. AND and XOR should be easier to handle with IT
1325    // blocks.
1326    switch (ISDOpcode) {
1327    default:
1328      break;
1329    case ISD::AND:
1330    case ISD::XOR:
1331      return 2;
1332    case ISD::OR:
1333      return 3;
1334    }
1335  }
1336
1337  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1338
1339  if (ST->hasNEON()) {
1340    const unsigned FunctionCallDivCost = 20;
1341    const unsigned ReciprocalDivCost = 10;
1342    static const CostTblEntry CostTbl[] = {
1343      // Division.
      // These costs are somewhat arbitrary. A cost of 20 indicates that
      // vectorizing a division (which becomes a function call) is going to be
      // very expensive.
      // Double register types.
1347      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1348      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1349      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1350      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1351      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1352      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1353      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1354      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1355      { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
1356      { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
1357      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1358      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1359      { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
1360      { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
1361      { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
1362      { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
1363      // Quad register types.
1364      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1365      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1366      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1367      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1368      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1369      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1370      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1371      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1372      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1373      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1374      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1375      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1376      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1377      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1378      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1379      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1380      // Multiplication.
1381    };
1382
1383    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1384      return LT.first * Entry->Cost;
1385
1386    InstructionCost Cost = BaseT::getArithmeticInstrCost(
1387        Opcode, Ty, CostKind, Op1Info, Op2Info);
1388
1389    // This is somewhat of a hack. The problem that we are facing is that SROA
1390    // creates a sequence of shift, and, or instructions to construct values.
1391    // These sequences are recognized by the ISel and have zero-cost. Not so for
1392    // the vectorized code. Because we have support for v2i64 but not i64 those
1393    // sequences look particularly beneficial to vectorize.
1394    // To work around this we increase the cost of v2i64 operations to make them
1395    // seem less beneficial.
1396    if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1397      Cost += 4;
1398
1399    return Cost;
1400  }
1401
1402  // If this operation is a shift on arm/thumb2, it might well be folded into
1403  // the following instruction, hence having a cost of 0.
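  // For example, "add r0, r1, r2, lsl #2" performs the shift as part of the
  // add, so a single-use shl feeding one of the instructions below is treated
  // as free.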
1404  auto LooksLikeAFreeShift = [&]() {
1405    if (ST->isThumb1Only() || Ty->isVectorTy())
1406      return false;
1407
1408    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1409      return false;
1410    if (!Op2Info.isUniform() || !Op2Info.isConstant())
1411      return false;
1412
    // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
1414    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1415    case Instruction::Add:
1416    case Instruction::Sub:
1417    case Instruction::And:
1418    case Instruction::Xor:
1419    case Instruction::Or:
1420    case Instruction::ICmp:
1421      return true;
1422    default:
1423      return false;
1424    }
1425  };
1426  if (LooksLikeAFreeShift())
1427    return 0;
1428
1429  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1430  // for "multiple beats" potentially needed by MVE instructions.
1431  int BaseCost = 1;
1432  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1433    BaseCost = ST->getMVEVectorCostFactor(CostKind);
1434
  // The rest of this mostly follows what is done in
  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
  // than scalars or increasing the costs for custom operations. The results
  // are also multiplied by the MVEVectorCostFactor where appropriate.
1439  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1440    return LT.first * BaseCost;
1441
1442  // Else this is expand, assume that we need to scalarize this op.
1443  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1444    unsigned Num = VTy->getNumElements();
1445    InstructionCost Cost =
1446        getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
    // Return the cost of multiple scalar invocations plus the cost of
    // inserting and extracting the values.
1449    SmallVector<Type *> Tys(Args.size(), Ty);
1450    return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1451           Num * Cost;
1452  }
1453
1454  return BaseCost;
1455}
1456
1457InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1458                                            MaybeAlign Alignment,
1459                                            unsigned AddressSpace,
1460                                            TTI::TargetCostKind CostKind,
1461                                            TTI::OperandValueInfo OpInfo,
1462                                            const Instruction *I) {
1463  // TODO: Handle other cost kinds.
1464  if (CostKind != TTI::TCK_RecipThroughput)
1465    return 1;
1466
1467  // Type legalization can't handle structs
1468  if (TLI->getValueType(DL, Src, true) == MVT::Other)
1469    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1470                                  CostKind);
1471
1472  if (ST->hasNEON() && Src->isVectorTy() &&
1473      (Alignment && *Alignment != Align(16)) &&
1474      cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vld1/vst1 vs one uop for vldr/vstr.
1477    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1478    return LT.first * 4;
1479  }
1480
1481  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1482  // Same for stores.
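  // For example, a load of <4 x half> whose only use is an fpext to
  // <4 x float> can be selected as a single widening load, so it is costed
  // below as one MVE operation rather than a load plus a separate convert.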
1483  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1484      ((Opcode == Instruction::Load && I->hasOneUse() &&
1485        isa<FPExtInst>(*I->user_begin())) ||
1486       (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1487    FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1488    Type *DstTy =
1489        Opcode == Instruction::Load
1490            ? (*I->user_begin())->getType()
1491            : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1492    if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1493        DstTy->getScalarType()->isFloatTy())
1494      return ST->getMVEVectorCostFactor(CostKind);
1495  }
1496
1497  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1498                     ? ST->getMVEVectorCostFactor(CostKind)
1499                     : 1;
1500  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1501                                           CostKind, OpInfo, I);
1502}
1503
1504InstructionCost
1505ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1506                                  unsigned AddressSpace,
1507                                  TTI::TargetCostKind CostKind) {
1508  if (ST->hasMVEIntegerOps()) {
1509    if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1510      return ST->getMVEVectorCostFactor(CostKind);
1511    if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1512      return ST->getMVEVectorCostFactor(CostKind);
1513  }
1514  if (!isa<FixedVectorType>(Src))
1515    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1516                                        CostKind);
  // Scalar cost, which is currently very high due to the inefficiency of the
  // generated code.
1519  return cast<FixedVectorType>(Src)->getNumElements() * 8;
1520}
1521
1522InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1523    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1524    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1525    bool UseMaskForCond, bool UseMaskForGaps) {
1526  assert(Factor >= 2 && "Invalid interleave factor");
1527  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1528
  // vldN/vstN don't support vector types with i64/f64 elements.
1530  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1531
1532  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1533      !UseMaskForCond && !UseMaskForGaps) {
1534    unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1535    auto *SubVecTy =
1536        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1537
1538    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1539    // Accesses having vector types that are a multiple of 128 bits can be
1540    // matched to more than one vldN/vstN instruction.
1541    int BaseCost =
1542        ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1543    if (NumElts % Factor == 0 &&
1544        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1545      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1546
1547    // Some smaller than legal interleaved patterns are cheap as we can make
1548    // use of the vmovn or vrev patterns to interleave a standard load. This is
1549    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1550    // promoted differently). The cost of 2 here is then a load and vrev or
1551    // vmovn.
1552    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1553        VecTy->isIntOrIntVectorTy() &&
1554        DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1555      return 2 * BaseCost;
1556  }
1557
1558  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1559                                           Alignment, AddressSpace, CostKind,
1560                                           UseMaskForCond, UseMaskForGaps);
1561}
1562
1563InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1564    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1565    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1566  using namespace PatternMatch;
1567  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1568    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1569                                         Alignment, CostKind, I);
1570
1571  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1572  auto *VTy = cast<FixedVectorType>(DataTy);
1573
1574  // TODO: Splitting, once we do that.
1575
1576  unsigned NumElems = VTy->getNumElements();
1577  unsigned EltSize = VTy->getScalarSizeInBits();
1578  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1579
1580  // For now, it is assumed that for the MVE gather instructions the loads are
1581  // all effectively serialised. This means the cost is the scalar cost
1582  // multiplied by the number of elements being loaded. This is possibly very
1583  // conservative, but even so we still end up vectorising loops because the
1584  // cost per iteration for many loops is lower than for scalar loops.
1585  InstructionCost VectorCost =
1586      NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1587  // The scalarization cost should be a lot higher. We use the number of vector
1588  // elements plus the scalarization overhead.
1589  InstructionCost ScalarCost =
1590      NumElems * LT.first +
1591      BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1592                                      CostKind) +
1593      BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1594                                      CostKind);
1595
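  // Sub-byte elements, or accesses with less than element alignment, are
  // costed as if they were scalarised.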
1596  if (EltSize < 8 || Alignment < EltSize / 8)
1597    return ScalarCost;
1598
1599  unsigned ExtSize = EltSize;
1600  // Check whether there's a single user that asks for an extended type
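  // For example, a v4i8 gather whose only use is a zext to v4i32 can use a
  // single extending gather (e.g. a vldrb.u32 with a vector of offsets), so
  // the extended element size is what matters for the legality checks below.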
1601  if (I != nullptr) {
    // Depending on the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic.
1605    if ((I->getOpcode() == Instruction::Load ||
1606         match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1607        I->hasOneUse()) {
1608      const User *Us = *I->users().begin();
1609      if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
        // Only allow valid type combinations
1611        unsigned TypeSize =
1612            cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1613        if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1614             (TypeSize == 16 && EltSize == 8)) &&
1615            TypeSize * NumElems == 128) {
1616          ExtSize = TypeSize;
1617        }
1618      }
1619    }
1620    // Check whether the input data needs to be truncated
1621    TruncInst *T;
1622    if ((I->getOpcode() == Instruction::Store ||
1623         match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1624        (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1625      // Only allow valid type combinations
1626      unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1627      if (((EltSize == 16 && TypeSize == 32) ||
1628           (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1629          TypeSize * NumElems == 128)
1630        ExtSize = TypeSize;
1631    }
1632  }
1633
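  // The MVE gather/scatter forms operate on a full 128-bit vector of at least
  // four elements; anything else is costed as scalarised.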
1634  if (ExtSize * NumElems != 128 || NumElems < 4)
1635    return ScalarCost;
1636
1637  // Any (aligned) i32 gather will not need to be scalarised.
1638  if (ExtSize == 32)
1639    return VectorCost;
1640  // For smaller types, we need to ensure that the gep's inputs are correctly
1641  // extended from a small enough value. Other sizes (including i64) are
1642  // scalarized for now.
1643  if (ExtSize != 8 && ExtSize != 16)
1644    return ScalarCost;
1645
1646  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1647    Ptr = BC->getOperand(0);
1648  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1649    if (GEP->getNumOperands() != 2)
1650      return ScalarCost;
1651    unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1652    // Scale needs to be correct (which is only relevant for i16s).
1653    if (Scale != 1 && Scale * 8 != ExtSize)
1654      return ScalarCost;
1655    // And we need to zext (not sext) the indexes from a small enough type.
1656    if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1657      if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1658        return VectorCost;
1659    }
1660    return ScalarCost;
1661  }
1662  return ScalarCost;
1663}
1664
1665InstructionCost
1666ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1667                                       std::optional<FastMathFlags> FMF,
1668                                       TTI::TargetCostKind CostKind) {
1669  if (TTI::requiresOrderedReduction(FMF))
1670    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1671
1672  EVT ValVT = TLI->getValueType(DL, ValTy);
1673  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1674  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1675    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1676
1677  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1678
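  // An integer add reduction of a legal 128-bit vector maps onto a single
  // VADDV, so these entries cost one cost-factor-weighted MVE instruction.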
1679  static const CostTblEntry CostTblAdd[]{
1680      {ISD::ADD, MVT::v16i8, 1},
1681      {ISD::ADD, MVT::v8i16, 1},
1682      {ISD::ADD, MVT::v4i32, 1},
1683  };
1684  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1685    return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1686
1687  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1688}
1689
1690InstructionCost ARMTTIImpl::getExtendedReductionCost(
1691    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1692    std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
1693  EVT ValVT = TLI->getValueType(DL, ValTy);
1694  EVT ResVT = TLI->getValueType(DL, ResTy);
1695
1696  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1697
1698  switch (ISD) {
1699  case ISD::ADD:
1700    if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1701      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1702
1703      // The legal cases are:
1704      //   VADDV u/s 8/16/32
1705      //   VADDLV u/s 32
1706      // Codegen currently cannot always handle larger than legal vectors very
1707      // well, especially for predicated reductions where the mask needs to be
1708      // split, so restrict to 128bit or smaller input types.
1709      unsigned RevVTSize = ResVT.getSizeInBits();
1710      if (ValVT.getSizeInBits() <= 128 &&
1711          ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1712           (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1713           (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1714        return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1715    }
1716    break;
1717  default:
1718    break;
1719  }
1720  return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1721                                         CostKind);
1722}
1723
1724InstructionCost
1725ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1726                                   VectorType *ValTy,
1727                                   TTI::TargetCostKind CostKind) {
1728  EVT ValVT = TLI->getValueType(DL, ValTy);
1729  EVT ResVT = TLI->getValueType(DL, ResTy);
1730
1731  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1732    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1733
1734    // The legal cases are:
1735    //   VMLAV u/s 8/16/32
1736    //   VMLALV u/s 16/32
1737    // Codegen currently cannot always handle larger than legal vectors very
1738    // well, especially for predicated reductions where the mask needs to be
1739    // split, so restrict to 128bit or smaller input types.
1740    unsigned RevVTSize = ResVT.getSizeInBits();
1741    if (ValVT.getSizeInBits() <= 128 &&
1742        ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1743         (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1744         (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1745      return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1746  }
1747
1748  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1749}
1750
1751InstructionCost
1752ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1753                                  TTI::TargetCostKind CostKind) {
1754  switch (ICA.getID()) {
1755  case Intrinsic::get_active_lane_mask:
    // Currently we make a somewhat optimistic assumption that
    // active_lane_mask intrinsics are always free. In reality one may be
    // freely folded into a tail predicated loop, expanded into a VCTP or
    // expanded into a lot of add/icmp code. We may need to improve this in
    // the future, but being able to detect whether it is free or not
    // involves looking at a lot of other code. We currently assume that the
    // vectorizer inserted these, and knew what it was doing in adding one.
1763    if (ST->hasMVEIntegerOps())
1764      return 0;
1765    break;
1766  case Intrinsic::sadd_sat:
1767  case Intrinsic::ssub_sat:
1768  case Intrinsic::uadd_sat:
1769  case Intrinsic::usub_sat: {
1770    if (!ST->hasMVEIntegerOps())
1771      break;
1772    Type *VT = ICA.getReturnType();
1773
1774    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1775    if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1776        LT.second == MVT::v16i8) {
      // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
      // need to extend the type, as it uses shr(qadd(shl, shl)).
1779      unsigned Instrs =
1780          LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1781      return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1782    }
1783    break;
1784  }
1785  case Intrinsic::abs:
1786  case Intrinsic::smin:
1787  case Intrinsic::smax:
1788  case Intrinsic::umin:
1789  case Intrinsic::umax: {
1790    if (!ST->hasMVEIntegerOps())
1791      break;
1792    Type *VT = ICA.getReturnType();
1793
1794    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1795    if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1796        LT.second == MVT::v16i8)
1797      return LT.first * ST->getMVEVectorCostFactor(CostKind);
1798    break;
1799  }
1800  case Intrinsic::minnum:
1801  case Intrinsic::maxnum: {
1802    if (!ST->hasMVEFloatOps())
1803      break;
1804    Type *VT = ICA.getReturnType();
1805    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1806    if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1807      return LT.first * ST->getMVEVectorCostFactor(CostKind);
1808    break;
1809  }
1810  case Intrinsic::fptosi_sat:
1811  case Intrinsic::fptoui_sat: {
1812    if (ICA.getArgTypes().empty())
1813      break;
1814    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1815    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1816    EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
    // Check for the legal types, with the correct subtarget features.
1818    if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1819        (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1820        (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1821      return LT.first;
1822
1823    // Equally for MVE vector types
1824    if (ST->hasMVEFloatOps() &&
1825        (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1826        LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1827      return LT.first * ST->getMVEVectorCostFactor(CostKind);
1828
1829    // Otherwise we use a legal convert followed by a min+max
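    // For example, an f32 -> i16 fptosi.sat is costed as an f32 -> i32
    // convert plus an smin and an smax that clamp the result to the i16
    // range.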
1830    if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1831         (ST->hasFP64() && LT.second == MVT::f64) ||
1832         (ST->hasFullFP16() && LT.second == MVT::f16) ||
1833         (ST->hasMVEFloatOps() &&
1834          (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1835        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1836      Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1837                                      LT.second.getScalarSizeInBits());
1838      InstructionCost Cost =
1839          LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1840      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1841                                              : Intrinsic::umin,
1842                                     LegalTy, {LegalTy, LegalTy});
1843      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
1844      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1845                                              : Intrinsic::umax,
1846                                     LegalTy, {LegalTy, LegalTy});
1847      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
1848      return LT.first * Cost;
1849    }
1850    break;
1851  }
1852  }
1853
1854  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1855}
1856
1857bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1858  if (!F->isIntrinsic())
1859    return BaseT::isLoweredToCall(F);
1860
1861  // Assume all Arm-specific intrinsics map to an instruction.
1862  if (F->getName().startswith("llvm.arm"))
1863    return false;
1864
1865  switch (F->getIntrinsicID()) {
1866  default: break;
1867  case Intrinsic::powi:
1868  case Intrinsic::sin:
1869  case Intrinsic::cos:
1870  case Intrinsic::pow:
1871  case Intrinsic::log:
1872  case Intrinsic::log10:
1873  case Intrinsic::log2:
1874  case Intrinsic::exp:
1875  case Intrinsic::exp2:
1876    return true;
1877  case Intrinsic::sqrt:
1878  case Intrinsic::fabs:
1879  case Intrinsic::copysign:
1880  case Intrinsic::floor:
1881  case Intrinsic::ceil:
1882  case Intrinsic::trunc:
1883  case Intrinsic::rint:
1884  case Intrinsic::nearbyint:
1885  case Intrinsic::round:
1886  case Intrinsic::canonicalize:
1887  case Intrinsic::lround:
1888  case Intrinsic::llround:
1889  case Intrinsic::lrint:
1890  case Intrinsic::llrint:
1891    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1892      return true;
1893    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1894      return true;
1895    // Some operations can be handled by vector instructions and assume
1896    // unsupported vectors will be expanded into supported scalar ones.
1897    // TODO Handle scalar operations properly.
1898    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1899  case Intrinsic::masked_store:
1900  case Intrinsic::masked_load:
1901  case Intrinsic::masked_gather:
1902  case Intrinsic::masked_scatter:
1903    return !ST->hasMVEIntegerOps();
1904  case Intrinsic::sadd_with_overflow:
1905  case Intrinsic::uadd_with_overflow:
1906  case Intrinsic::ssub_with_overflow:
1907  case Intrinsic::usub_with_overflow:
1908  case Intrinsic::sadd_sat:
1909  case Intrinsic::uadd_sat:
1910  case Intrinsic::ssub_sat:
1911  case Intrinsic::usub_sat:
1912    return false;
1913  }
1914
1915  return BaseT::isLoweredToCall(F);
1916}
1917
1918bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1919  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1920  EVT VT = TLI->getValueType(DL, I.getType(), true);
1921  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1922    return true;
1923
1924  // Check if an intrinsic will be lowered to a call and assume that any
1925  // other CallInst will generate a bl.
1926  if (auto *Call = dyn_cast<CallInst>(&I)) {
1927    if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1928      switch(II->getIntrinsicID()) {
1929        case Intrinsic::memcpy:
1930        case Intrinsic::memset:
1931        case Intrinsic::memmove:
1932          return getNumMemOps(II) == -1;
1933        default:
1934          if (const Function *F = Call->getCalledFunction())
1935            return isLoweredToCall(F);
1936      }
1937    }
1938    return true;
1939  }
1940
1941  // FPv5 provides conversions between integer, double-precision,
1942  // single-precision, and half-precision formats.
1943  switch (I.getOpcode()) {
1944  default:
1945    break;
1946  case Instruction::FPToSI:
1947  case Instruction::FPToUI:
1948  case Instruction::SIToFP:
1949  case Instruction::UIToFP:
1950  case Instruction::FPTrunc:
1951  case Instruction::FPExt:
1952    return !ST->hasFPARMv8Base();
1953  }
1954
1955  // FIXME: Unfortunately the approach of checking the Operation Action does
1956  // not catch all cases of Legalization that use library calls. Our
1957  // Legalization step categorizes some transformations into library calls as
1958  // Custom, Expand or even Legal when doing type legalization. So for now
1959  // we have to special case for instance the SDIV of 64bit integers and the
1960  // use of floating point emulation.
1961  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1962    switch (ISD) {
1963    default:
1964      break;
1965    case ISD::SDIV:
1966    case ISD::UDIV:
1967    case ISD::SREM:
1968    case ISD::UREM:
1969    case ISD::SDIVREM:
1970    case ISD::UDIVREM:
1971      return true;
1972    }
1973  }
1974
1975  // Assume all other non-float operations are supported.
1976  if (!VT.isFloatingPoint())
1977    return false;
1978
1979  // We'll need a library call to handle most floats when using soft.
1980  if (TLI->useSoftFloat()) {
1981    switch (I.getOpcode()) {
1982    default:
1983      return true;
1984    case Instruction::Alloca:
1985    case Instruction::Load:
1986    case Instruction::Store:
1987    case Instruction::Select:
1988    case Instruction::PHI:
1989      return false;
1990    }
1991  }
1992
1993  // We'll need a libcall to perform double precision operations on a single
1994  // precision only FPU.
1995  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1996    return true;
1997
1998  // Likewise for half precision arithmetic.
1999  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2000    return true;
2001
2002  return false;
2003}
2004
2005bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2006                                          AssumptionCache &AC,
2007                                          TargetLibraryInfo *LibInfo,
2008                                          HardwareLoopInfo &HWLoopInfo) {
2009  // Low-overhead branches are only supported in the 'low-overhead branch'
2010  // extension of v8.1-m.
2011  if (!ST->hasLOB() || DisableLowOverheadLoops) {
2012    LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2013    return false;
2014  }
2015
2016  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2017    LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2018    return false;
2019  }
2020
2021  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2022  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2023    LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2024    return false;
2025  }
2026
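  // The trip count is the backedge-taken count plus one.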
2027  const SCEV *TripCountSCEV =
2028    SE.getAddExpr(BackedgeTakenCount,
2029                  SE.getOne(BackedgeTakenCount->getType()));
2030
2031  // We need to store the trip count in LR, a 32-bit register.
2032  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2033    LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2034    return false;
2035  }
2036
2037  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2038  // point in generating a hardware loop if that's going to happen.
2039
2040  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2041    if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2042      switch (Call->getIntrinsicID()) {
2043      default:
2044        break;
2045      case Intrinsic::start_loop_iterations:
2046      case Intrinsic::test_start_loop_iterations:
2047      case Intrinsic::loop_decrement:
2048      case Intrinsic::loop_decrement_reg:
2049        return true;
2050      }
2051    }
2052    return false;
2053  };
2054
2055  // Scan the instructions to see if there's any that we know will turn into a
2056  // call or if this loop is already a low-overhead loop or will become a tail
2057  // predicated loop.
2058  bool IsTailPredLoop = false;
2059  auto ScanLoop = [&](Loop *L) {
2060    for (auto *BB : L->getBlocks()) {
2061      for (auto &I : *BB) {
2062        if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2063            isa<InlineAsm>(I)) {
2064          LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2065          return false;
2066        }
2067        if (auto *II = dyn_cast<IntrinsicInst>(&I))
2068          IsTailPredLoop |=
2069              II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2070              II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2071              II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2072              II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2073              II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2074      }
2075    }
2076    return true;
2077  };
2078
2079  // Visit inner loops.
2080  for (auto *Inner : *L)
2081    if (!ScanLoop(Inner))
2082      return false;
2083
2084  if (!ScanLoop(L))
2085    return false;
2086
2087  // TODO: Check whether the trip count calculation is expensive. If L is the
2088  // inner loop but we know it has a low trip count, calculating that trip
2089  // count (in the parent loop) may be detrimental.
2090
2091  LLVMContext &C = L->getHeader()->getContext();
2092  HWLoopInfo.CounterInReg = true;
2093  HWLoopInfo.IsNestingLegal = false;
2094  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2095  HWLoopInfo.CountType = Type::getInt32Ty(C);
2096  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2097  return true;
2098}
2099
2100static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2101  // We don't allow icmp's, and because we only look at single block loops,
2102  // we simply count the icmps, i.e. there should only be 1 for the backedge.
2103  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2104    return false;
  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics
  // are not currently canonical, but soon will be. Code without them uses
  // icmp, and so is not tail predicated as per the condition above. In order
  // to get the same performance we treat min and max the same as an icmp for
  // tailpred purposes for the moment (we often rely on non-tailpred and
  // higher VFs to pick more optimal instructions like VQDMULH; they need to
  // be recognized directly by the vectorizer).
2112  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2113    if ((II->getIntrinsicID() == Intrinsic::smin ||
2114         II->getIntrinsicID() == Intrinsic::smax ||
2115         II->getIntrinsicID() == Intrinsic::umin ||
2116         II->getIntrinsicID() == Intrinsic::umax) &&
2117        ++ICmpCount > 1)
2118      return false;
2119
2120  if (isa<FCmpInst>(&I))
2121    return false;
2122
2123  // We could allow extending/narrowing FP loads/stores, but codegen is
2124  // too inefficient so reject this for now.
2125  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2126    return false;
2127
2128  // Extends have to be extending-loads
2129  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2130    if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2131      return false;
2132
2133  // Truncs have to be narrowing-stores
2134  if (isa<TruncInst>(&I) )
2135    if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2136      return false;
2137
2138  return true;
2139}
2140
2141// To set up a tail-predicated loop, we need to know the total number of
2142// elements processed by that loop. Thus, we need to determine the element
2143// size and:
2144// 1) it should be uniform for all operations in the vector loop, so we
2145//    e.g. don't want any widening/narrowing operations.
2146// 2) it should be smaller than i64s because we don't have vector operations
2147//    that work on i64s.
2148// 3) we don't want elements to be reversed or shuffled, to make sure the
2149//    tail-predication masks/predicates the right lanes.
2150//
2151static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2152                                 const DataLayout &DL,
2153                                 const LoopAccessInfo *LAI) {
2154  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2155
2156  // If there are live-out values, it is probably a reduction. We can predicate
2157  // most reduction operations freely under MVE using a combination of
2158  // prefer-predicated-reduction-select and inloop reductions. We limit this to
2159  // floating point and integer reductions, but don't check for operators
2160  // specifically here. If the value ends up not being a reduction (and so the
2161  // vectorizer cannot tailfold the loop), we should fall back to standard
2162  // vectorization automatically.
2163  SmallVector< Instruction *, 8 > LiveOuts;
2164  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2165  bool ReductionsDisabled =
2166      EnableTailPredication == TailPredication::EnabledNoReductions ||
2167      EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2168
2169  for (auto *I : LiveOuts) {
2170    if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2171        !I->getType()->isHalfTy()) {
2172      LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2173                           "live-out value\n");
2174      return false;
2175    }
2176    if (ReductionsDisabled) {
2177      LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2178      return false;
2179    }
2180  }
2181
2182  // Next, check that all instructions can be tail-predicated.
2183  PredicatedScalarEvolution PSE = LAI->getPSE();
2184  SmallVector<Instruction *, 16> LoadStores;
2185  int ICmpCount = 0;
2186
2187  for (BasicBlock *BB : L->blocks()) {
2188    for (Instruction &I : BB->instructionsWithoutDebug()) {
2189      if (isa<PHINode>(&I))
2190        continue;
2191      if (!canTailPredicateInstruction(I, ICmpCount)) {
2192        LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2193        return false;
2194      }
2195
2196      Type *T  = I.getType();
2197      if (T->getScalarSizeInBits() > 32) {
2198        LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2199        return false;
2200      }
2201      if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2202        Value *Ptr = getLoadStorePointerOperand(&I);
2203        Type *AccessTy = getLoadStoreType(&I);
2204        int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2205        if (NextStride == 1) {
2206          // TODO: for now only allow consecutive strides of 1. We could support
2207          // other strides as long as it is uniform, but let's keep it simple
2208          // for now.
2209          continue;
2210        } else if (NextStride == -1 ||
2211                   (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2212                   (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
          LLVM_DEBUG(dbgs()
                     << "Consecutive strides of 2 found, vld2/vst2 can't "
                        "be tail-predicated.\n");
2216          return false;
2217          // TODO: don't tail predicate if there is a reversed load?
2218        } else if (EnableMaskedGatherScatters) {
2219          // Gather/scatters do allow loading from arbitrary strides, at
2220          // least if they are loop invariant.
2221          // TODO: Loop variant strides should in theory work, too, but
2222          // this requires further testing.
2223          const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2224          if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2225            const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2226            if (PSE.getSE()->isLoopInvariant(Step, L))
2227              continue;
2228          }
2229        }
2230        LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2231                             "tail-predicate\n.");
2232        return false;
2233      }
2234    }
2235  }
2236
2237  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2238  return true;
2239}
2240
2241bool ARMTTIImpl::preferPredicateOverEpilogue(
2242    Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
2243    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
2244    InterleavedAccessInfo *IAI) {
2245  if (!EnableTailPredication) {
2246    LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2247    return false;
2248  }
2249
2250  // Creating a predicated vector loop is the first step for generating a
2251  // tail-predicated hardware loop, for which we need the MVE masked
2252  // load/stores instructions:
2253  if (!ST->hasMVEIntegerOps())
2254    return false;
2255
2256  // For now, restrict this to single block loops.
2257  if (L->getNumBlocks() > 1) {
2258    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2259                         "loop.\n");
2260    return false;
2261  }
2262
2263  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2264
2265  HardwareLoopInfo HWLoopInfo(L);
2266  if (!HWLoopInfo.canAnalyze(*LI)) {
2267    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2268                         "analyzable.\n");
2269    return false;
2270  }
2271
2272  // This checks if we have the low-overhead branch architecture
2273  // extension, and if we will create a hardware-loop:
2274  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2275    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2276                         "profitable.\n");
2277    return false;
2278  }
2279
2280  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2281    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2282                         "a candidate.\n");
2283    return false;
2284  }
2285
2286  return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
2287}
2288
2289PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
2290  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2291    return PredicationStyle::None;
2292
2293  // Intrinsic @llvm.get.active.lane.mask is supported.
2294  // It is used in the MVETailPredication pass, which requires the number of
2295  // elements processed by this vector loop to setup the tail-predicated
2296  // loop.
2297  return PredicationStyle::Data;
2298}
2299void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2300                                         TTI::UnrollingPreferences &UP,
2301                                         OptimizationRemarkEmitter *ORE) {
  // Enable upper bound unrolling universally, not dependent upon the
  // conditions below.
2304  UP.UpperBound = true;
2305
2306  // Only currently enable these preferences for M-Class cores.
2307  if (!ST->isMClass())
2308    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2309
2310  // Disable loop unrolling for Oz and Os.
2311  UP.OptSizeThreshold = 0;
2312  UP.PartialOptSizeThreshold = 0;
2313  if (L->getHeader()->getParent()->hasOptSize())
2314    return;
2315
2316  SmallVector<BasicBlock*, 4> ExitingBlocks;
2317  L->getExitingBlocks(ExitingBlocks);
2318  LLVM_DEBUG(dbgs() << "Loop has:\n"
2319                    << "Blocks: " << L->getNumBlocks() << "\n"
2320                    << "Exit blocks: " << ExitingBlocks.size() << "\n");
2321
  // Allow at most one exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
2324  if (ExitingBlocks.size() > 2)
2325    return;
2326
2327  // Limit the CFG of the loop body for targets with a branch predictor.
2328  // Allowing 4 blocks permits if-then-else diamonds in the body.
2329  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2330    return;
2331
2332  // Don't unroll vectorized loops, including the remainder loop
2333  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2334    return;
2335
2336  // Scan the loop: don't unroll loops with calls as this could prevent
2337  // inlining.
2338  InstructionCost Cost = 0;
2339  for (auto *BB : L->getBlocks()) {
2340    for (auto &I : *BB) {
      // Don't unroll vectorised loops. MVE does not benefit from unrolling as
      // much as scalar code does.
2343      if (I.getType()->isVectorTy())
2344        return;
2345
2346      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2347        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2348          if (!isLoweredToCall(F))
2349            continue;
2350        }
2351        return;
2352      }
2353
2354      SmallVector<const Value*, 4> Operands(I.operand_values());
2355      Cost += getInstructionCost(&I, Operands,
2356                                 TargetTransformInfo::TCK_SizeAndLatency);
2357    }
2358  }
2359
  // On v6m cores, there are very few registers available. We can easily end up
  // spilling and reloading more registers in an unrolled loop. Look at the
  // number of LCSSA phis as a rough measure of how many registers will need to
  // be live out of the loop, reducing the default unroll count if more than 1
  // value is needed. In the long run, all of this should be learnt by a
  // machine.
2366  unsigned UnrollCount = 4;
2367  if (ST->isThumb1Only()) {
2368    unsigned ExitingValues = 0;
2369    SmallVector<BasicBlock *, 4> ExitBlocks;
2370    L->getExitBlocks(ExitBlocks);
2371    for (auto *Exit : ExitBlocks) {
      // Count the number of LCSSA phis. Exclude values coming from GEPs, as
      // only the last is expected to be needed for address operands.
2374      unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2375        return PH.getNumOperands() != 1 ||
2376               !isa<GetElementPtrInst>(PH.getOperand(0));
2377      });
2378      ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2379    }
2380    if (ExitingValues)
2381      UnrollCount /= ExitingValues;
2382    if (UnrollCount <= 1)
2383      return;
2384  }
2385
2386  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2387  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2388
2389  UP.Partial = true;
2390  UP.Runtime = true;
2391  UP.UnrollRemainder = true;
2392  UP.DefaultUnrollRuntimeCount = UnrollCount;
2393  UP.UnrollAndJam = true;
2394  UP.UnrollAndJamInnerLoopThreshold = 60;
2395
  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
2398  if (Cost < 12)
2399    UP.Force = true;
2400}
2401
2402void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2403                                       TTI::PeelingPreferences &PP) {
2404  BaseT::getPeelingPreferences(L, SE, PP);
2405}
2406
2407bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2408                                       TTI::ReductionFlags Flags) const {
2409  if (!ST->hasMVEIntegerOps())
2410    return false;
2411
2412  unsigned ScalarBits = Ty->getScalarSizeInBits();
2413  switch (Opcode) {
2414  case Instruction::Add:
2415    return ScalarBits <= 64;
2416  default:
2417    return false;
2418  }
2419}
2420
2421bool ARMTTIImpl::preferPredicatedReductionSelect(
2422    unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2423  if (!ST->hasMVEIntegerOps())
2424    return false;
2425  return true;
2426}
2427
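// Return the cost of the scaling factor used in an addressing mode of the
// form BaseGV + BaseOffset + HasBaseReg*Reg + Scale*Reg for accesses of type
// Ty. A legal mode is considered free (cost 0, or 1 for a negative scale on
// cores with FPAO, where positive offsets execute faster); an illegal mode
// returns -1.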
2428InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2429                                                 int64_t BaseOffset,
2430                                                 bool HasBaseReg, int64_t Scale,
2431                                                 unsigned AddrSpace) const {
2432  TargetLoweringBase::AddrMode AM;
2433  AM.BaseGV = BaseGV;
2434  AM.BaseOffs = BaseOffset;
2435  AM.HasBaseReg = HasBaseReg;
2436  AM.Scale = Scale;
2437  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2438    if (ST->hasFPAO())
2439      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2440    return 0;
2441  }
2442  return -1;
2443}
2444