1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/TargetTransformInfo.h"
26#include "llvm/Analysis/TargetTransformInfoImpl.h"
27#include "llvm/CodeGen/ISDOpcodes.h"
28#include "llvm/CodeGen/TargetLowering.h"
29#include "llvm/CodeGen/TargetSubtargetInfo.h"
30#include "llvm/CodeGen/ValueTypes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/DerivedTypes.h"
36#include "llvm/IR/InstrTypes.h"
37#include "llvm/IR/Instruction.h"
38#include "llvm/IR/Instructions.h"
39#include "llvm/IR/Intrinsics.h"
40#include "llvm/IR/Operator.h"
41#include "llvm/IR/Type.h"
42#include "llvm/IR/Value.h"
43#include "llvm/Support/Casting.h"
44#include "llvm/Support/CommandLine.h"
45#include "llvm/Support/ErrorHandling.h"
46#include "llvm/Support/MachineValueType.h"
47#include "llvm/Support/MathExtras.h"
48#include "llvm/Target/TargetMachine.h"
49#include <algorithm>
50#include <cassert>
51#include <cstdint>
52#include <limits>
53#include <utility>
54
55namespace llvm {
56
57class Function;
58class GlobalValue;
59class LLVMContext;
60class ScalarEvolution;
61class SCEV;
62class TargetMachine;
63
64extern cl::opt<unsigned> PartialUnrollingThreshold;
65
66/// Base class which can be used to help build a TTI implementation.
67///
68/// This class provides as much implementation of the TTI interface as is
69/// possible using the target independent parts of the code generator.
70///
71/// In order to subclass it, your class must implement a getST() method to
72/// return the subtarget, and a getTLI() method to return the target lowering.
73/// We need these methods implemented in the derived class so that this class
74/// doesn't have to duplicate storage for them.
75template <typename T>
76class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
77private:
78  using BaseT = TargetTransformInfoImplCRTPBase<T>;
79  using TTI = TargetTransformInfo;
80
81  /// Helper function to access this as a T.
82  T *thisT() { return static_cast<T *>(this); }
83
84  /// Estimate a cost of Broadcast as an extract and sequence of insert
85  /// operations.
86  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
87    InstructionCost Cost = 0;
88    // Broadcast cost is equal to the cost of extracting the zero'th element
89    // plus the cost of inserting it into every element of the result vector.
90    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
91
92    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
93      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
94    }
95    return Cost;
96  }
97
98  /// Estimate a cost of shuffle as a sequence of extract and insert
99  /// operations.
100  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
101    InstructionCost Cost = 0;
102    // Shuffle cost is equal to the cost of extracting element from its argument
103    // plus the cost of inserting them onto the result vector.
104
105    // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
106    // index 0 of first vector, index 1 of second vector,index 2 of first
107    // vector and finally index 3 of second vector and insert them at index
108    // <0,1,2,3> of result vector.
109    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
110      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
111      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
112    }
113    return Cost;
114  }
115
116  /// Estimate a cost of subvector extraction as a sequence of extract and
117  /// insert operations.
118  InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
119                                       FixedVectorType *SubVTy) {
120    assert(VTy && SubVTy &&
121           "Can only extract subvectors from vectors");
122    int NumSubElts = SubVTy->getNumElements();
123    assert((!isa<FixedVectorType>(VTy) ||
124            (Index + NumSubElts) <=
125                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
126           "SK_ExtractSubvector index out of range");
127
128    InstructionCost Cost = 0;
129    // Subvector extraction cost is equal to the cost of extracting element from
130    // the source type plus the cost of inserting them into the result vector
131    // type.
132    for (int i = 0; i != NumSubElts; ++i) {
133      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
134                                          i + Index);
135      Cost +=
136          thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
137    }
138    return Cost;
139  }
140
141  /// Estimate a cost of subvector insertion as a sequence of extract and
142  /// insert operations.
143  InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
144                                      FixedVectorType *SubVTy) {
145    assert(VTy && SubVTy &&
146           "Can only insert subvectors into vectors");
147    int NumSubElts = SubVTy->getNumElements();
148    assert((!isa<FixedVectorType>(VTy) ||
149            (Index + NumSubElts) <=
150                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
151           "SK_InsertSubvector index out of range");
152
153    InstructionCost Cost = 0;
154    // Subvector insertion cost is equal to the cost of extracting element from
155    // the source type plus the cost of inserting them into the result vector
156    // type.
157    for (int i = 0; i != NumSubElts; ++i) {
158      Cost +=
159          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
160      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
161                                          i + Index);
162    }
163    return Cost;
164  }
165
166  /// Local query method delegates up to T which *must* implement this!
167  const TargetSubtargetInfo *getST() const {
168    return static_cast<const T *>(this)->getST();
169  }
170
171  /// Local query method delegates up to T which *must* implement this!
172  const TargetLoweringBase *getTLI() const {
173    return static_cast<const T *>(this)->getTLI();
174  }
175
176  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
177    switch (M) {
178      case TTI::MIM_Unindexed:
179        return ISD::UNINDEXED;
180      case TTI::MIM_PreInc:
181        return ISD::PRE_INC;
182      case TTI::MIM_PreDec:
183        return ISD::PRE_DEC;
184      case TTI::MIM_PostInc:
185        return ISD::POST_INC;
186      case TTI::MIM_PostDec:
187        return ISD::POST_DEC;
188    }
189    llvm_unreachable("Unexpected MemIndexedMode");
190  }
191
192  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
193                                              Align Alignment,
194                                              bool VariableMask,
195                                              bool IsGatherScatter,
196                                              TTI::TargetCostKind CostKind) {
197    auto *VT = cast<FixedVectorType>(DataTy);
198    // Assume the target does not have support for gather/scatter operations
199    // and provide a rough estimate.
200    //
201    // First, compute the cost of the individual memory operations.
202    InstructionCost AddrExtractCost =
203        IsGatherScatter
204            ? getVectorInstrCost(Instruction::ExtractElement,
205                                 FixedVectorType::get(
206                                     PointerType::get(VT->getElementType(), 0),
207                                     VT->getNumElements()),
208                                 -1)
209            : 0;
210    InstructionCost LoadCost =
211        VT->getNumElements() *
212        (AddrExtractCost +
213         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
214
215    // Next, compute the cost of packing the result in a vector.
216    InstructionCost PackingCost = getScalarizationOverhead(
217        VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
218
219    InstructionCost ConditionalCost = 0;
220    if (VariableMask) {
221      // Compute the cost of conditionally executing the memory operations with
222      // variable masks. This includes extracting the individual conditions, a
223      // branches and PHIs to combine the results.
224      // NOTE: Estimating the cost of conditionally executing the memory
225      // operations accurately is quite difficult and the current solution
226      // provides a very rough estimate only.
227      ConditionalCost =
228          VT->getNumElements() *
229          (getVectorInstrCost(
230               Instruction::ExtractElement,
231               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
232                                    VT->getNumElements()),
233               -1) +
234           getCFInstrCost(Instruction::Br, CostKind) +
235           getCFInstrCost(Instruction::PHI, CostKind));
236    }
237
238    return LoadCost + PackingCost + ConditionalCost;
239  }
240
241protected:
242  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
243      : BaseT(DL) {}
244  virtual ~BasicTTIImplBase() = default;
245
246  using TargetTransformInfoImplBase::DL;
247
248public:
249  /// \name Scalar TTI Implementations
250  /// @{
251  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
252                                      unsigned AddressSpace, Align Alignment,
253                                      bool *Fast) const {
254    EVT E = EVT::getIntegerVT(Context, BitWidth);
255    return getTLI()->allowsMisalignedMemoryAccesses(
256        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
257  }
258
259  bool hasBranchDivergence() { return false; }
260
261  bool useGPUDivergenceAnalysis() { return false; }
262
263  bool isSourceOfDivergence(const Value *V) { return false; }
264
265  bool isAlwaysUniform(const Value *V) { return false; }
266
267  unsigned getFlatAddressSpace() {
268    // Return an invalid address space.
269    return -1;
270  }
271
272  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
273                                  Intrinsic::ID IID) const {
274    return false;
275  }
276
277  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
278    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
279  }
280
281  unsigned getAssumedAddrSpace(const Value *V) const {
282    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
283  }
284
285  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
286                                          Value *NewV) const {
287    return nullptr;
288  }
289
290  bool isLegalAddImmediate(int64_t imm) {
291    return getTLI()->isLegalAddImmediate(imm);
292  }
293
294  bool isLegalICmpImmediate(int64_t imm) {
295    return getTLI()->isLegalICmpImmediate(imm);
296  }
297
298  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
299                             bool HasBaseReg, int64_t Scale,
300                             unsigned AddrSpace, Instruction *I = nullptr) {
301    TargetLoweringBase::AddrMode AM;
302    AM.BaseGV = BaseGV;
303    AM.BaseOffs = BaseOffset;
304    AM.HasBaseReg = HasBaseReg;
305    AM.Scale = Scale;
306    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
307  }
308
309  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
310                          const DataLayout &DL) const {
311    EVT VT = getTLI()->getValueType(DL, Ty);
312    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
313  }
314
315  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
316                           const DataLayout &DL) const {
317    EVT VT = getTLI()->getValueType(DL, Ty);
318    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
319  }
320
321  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
322    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
323  }
324
325  bool isNumRegsMajorCostOfLSR() {
326    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
327  }
328
329  bool isProfitableLSRChainElement(Instruction *I) {
330    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
331  }
332
333  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
334                                       int64_t BaseOffset, bool HasBaseReg,
335                                       int64_t Scale, unsigned AddrSpace) {
336    TargetLoweringBase::AddrMode AM;
337    AM.BaseGV = BaseGV;
338    AM.BaseOffs = BaseOffset;
339    AM.HasBaseReg = HasBaseReg;
340    AM.Scale = Scale;
341    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
342  }
343
344  bool isTruncateFree(Type *Ty1, Type *Ty2) {
345    return getTLI()->isTruncateFree(Ty1, Ty2);
346  }
347
348  bool isProfitableToHoist(Instruction *I) {
349    return getTLI()->isProfitableToHoist(I);
350  }
351
352  bool useAA() const { return getST()->useAA(); }
353
354  bool isTypeLegal(Type *Ty) {
355    EVT VT = getTLI()->getValueType(DL, Ty);
356    return getTLI()->isTypeLegal(VT);
357  }
358
359  InstructionCost getRegUsageForType(Type *Ty) {
360    InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
361    assert(Val >= 0 && "Negative cost!");
362    return Val;
363  }
364
365  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
366                             ArrayRef<const Value *> Operands) {
367    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
368  }
369
370  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
371                                            unsigned &JumpTableSize,
372                                            ProfileSummaryInfo *PSI,
373                                            BlockFrequencyInfo *BFI) {
374    /// Try to find the estimated number of clusters. Note that the number of
375    /// clusters identified in this function could be different from the actual
376    /// numbers found in lowering. This function ignore switches that are
377    /// lowered with a mix of jump table / bit test / BTree. This function was
378    /// initially intended to be used when estimating the cost of switch in
379    /// inline cost heuristic, but it's a generic cost model to be used in other
380    /// places (e.g., in loop unrolling).
381    unsigned N = SI.getNumCases();
382    const TargetLoweringBase *TLI = getTLI();
383    const DataLayout &DL = this->getDataLayout();
384
385    JumpTableSize = 0;
386    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
387
388    // Early exit if both a jump table and bit test are not allowed.
389    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
390      return N;
391
392    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
393    APInt MinCaseVal = MaxCaseVal;
394    for (auto CI : SI.cases()) {
395      const APInt &CaseVal = CI.getCaseValue()->getValue();
396      if (CaseVal.sgt(MaxCaseVal))
397        MaxCaseVal = CaseVal;
398      if (CaseVal.slt(MinCaseVal))
399        MinCaseVal = CaseVal;
400    }
401
402    // Check if suitable for a bit test
403    if (N <= DL.getIndexSizeInBits(0u)) {
404      SmallPtrSet<const BasicBlock *, 4> Dests;
405      for (auto I : SI.cases())
406        Dests.insert(I.getCaseSuccessor());
407
408      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
409                                     DL))
410        return 1;
411    }
412
413    // Check if suitable for a jump table.
414    if (IsJTAllowed) {
415      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
416        return N;
417      uint64_t Range =
418          (MaxCaseVal - MinCaseVal)
419              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
420      // Check whether a range of clusters is dense enough for a jump table
421      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
422        JumpTableSize = Range;
423        return 1;
424      }
425    }
426    return N;
427  }
428
429  bool shouldBuildLookupTables() {
430    const TargetLoweringBase *TLI = getTLI();
431    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
432           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
433  }
434
435  bool shouldBuildRelLookupTables() const {
436    const TargetMachine &TM = getTLI()->getTargetMachine();
437    // If non-PIC mode, do not generate a relative lookup table.
438    if (!TM.isPositionIndependent())
439      return false;
440
441    /// Relative lookup table entries consist of 32-bit offsets.
442    /// Do not generate relative lookup tables for large code models
443    /// in 64-bit achitectures where 32-bit offsets might not be enough.
444    if (TM.getCodeModel() == CodeModel::Medium ||
445        TM.getCodeModel() == CodeModel::Large)
446      return false;
447
448    Triple TargetTriple = TM.getTargetTriple();
449    if (!TargetTriple.isArch64Bit())
450      return false;
451
452    // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
453    // there.
454    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
455      return false;
456
457    return true;
458  }
459
460  bool haveFastSqrt(Type *Ty) {
461    const TargetLoweringBase *TLI = getTLI();
462    EVT VT = TLI->getValueType(DL, Ty);
463    return TLI->isTypeLegal(VT) &&
464           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
465  }
466
467  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
468    return true;
469  }
470
471  InstructionCost getFPOpCost(Type *Ty) {
472    // Check whether FADD is available, as a proxy for floating-point in
473    // general.
474    const TargetLoweringBase *TLI = getTLI();
475    EVT VT = TLI->getValueType(DL, Ty);
476    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
477      return TargetTransformInfo::TCC_Basic;
478    return TargetTransformInfo::TCC_Expensive;
479  }
480
481  unsigned getInliningThresholdMultiplier() { return 1; }
482  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
483
484  int getInlinerVectorBonusPercent() { return 150; }
485
486  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
487                               TTI::UnrollingPreferences &UP) {
488    // This unrolling functionality is target independent, but to provide some
489    // motivation for its intended use, for x86:
490
491    // According to the Intel 64 and IA-32 Architectures Optimization Reference
492    // Manual, Intel Core models and later have a loop stream detector (and
493    // associated uop queue) that can benefit from partial unrolling.
494    // The relevant requirements are:
495    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
496    //    taken, and none of them may be calls.
497    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
498
499    // According to the Software Optimization Guide for AMD Family 15h
500    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
501    // and loop buffer which can benefit from partial unrolling.
502    // The relevant requirements are:
503    //  - The loop must have fewer than 16 branches
504    //  - The loop must have less than 40 uops in all executed loop branches
505
506    // The number of taken branches in a loop is hard to estimate here, and
507    // benchmarking has revealed that it is better not to be conservative when
508    // estimating the branch count. As a result, we'll ignore the branch limits
509    // until someone finds a case where it matters in practice.
510
511    unsigned MaxOps;
512    const TargetSubtargetInfo *ST = getST();
513    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
514      MaxOps = PartialUnrollingThreshold;
515    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
516      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
517    else
518      return;
519
520    // Scan the loop: don't unroll loops with calls.
521    for (BasicBlock *BB : L->blocks()) {
522      for (Instruction &I : *BB) {
523        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
524          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
525            if (!thisT()->isLoweredToCall(F))
526              continue;
527          }
528
529          return;
530        }
531      }
532    }
533
534    // Enable runtime and partial unrolling up to the specified size.
535    // Enable using trip count upper bound to unroll loops.
536    UP.Partial = UP.Runtime = UP.UpperBound = true;
537    UP.PartialThreshold = MaxOps;
538
539    // Avoid unrolling when optimizing for size.
540    UP.OptSizeThreshold = 0;
541    UP.PartialOptSizeThreshold = 0;
542
543    // Set number of instructions optimized when "back edge"
544    // becomes "fall through" to default value of 2.
545    UP.BEInsns = 2;
546  }
547
548  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
549                             TTI::PeelingPreferences &PP) {
550    PP.PeelCount = 0;
551    PP.AllowPeeling = true;
552    PP.AllowLoopNestsPeeling = false;
553    PP.PeelProfiledIterations = true;
554  }
555
556  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
557                                AssumptionCache &AC,
558                                TargetLibraryInfo *LibInfo,
559                                HardwareLoopInfo &HWLoopInfo) {
560    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
561  }
562
563  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
564                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
565                                   DominatorTree *DT,
566                                   const LoopAccessInfo *LAI) {
567    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
568  }
569
570  bool emitGetActiveLaneMask() {
571    return BaseT::emitGetActiveLaneMask();
572  }
573
574  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
575                                               IntrinsicInst &II) {
576    return BaseT::instCombineIntrinsic(IC, II);
577  }
578
579  Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
580                                                     IntrinsicInst &II,
581                                                     APInt DemandedMask,
582                                                     KnownBits &Known,
583                                                     bool &KnownBitsComputed) {
584    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
585                                                   KnownBitsComputed);
586  }
587
588  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
589      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
590      APInt &UndefElts2, APInt &UndefElts3,
591      std::function<void(Instruction *, unsigned, APInt, APInt &)>
592          SimplifyAndSetOp) {
593    return BaseT::simplifyDemandedVectorEltsIntrinsic(
594        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
595        SimplifyAndSetOp);
596  }
597
598  InstructionCost getInstructionLatency(const Instruction *I) {
599    if (isa<LoadInst>(I))
600      return getST()->getSchedModel().DefaultLoadLatency;
601
602    return BaseT::getInstructionLatency(I);
603  }
604
605  virtual Optional<unsigned>
606  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
607    return Optional<unsigned>(
608      getST()->getCacheSize(static_cast<unsigned>(Level)));
609  }
610
611  virtual Optional<unsigned>
612  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
613    Optional<unsigned> TargetResult =
614        getST()->getCacheAssociativity(static_cast<unsigned>(Level));
615
616    if (TargetResult)
617      return TargetResult;
618
619    return BaseT::getCacheAssociativity(Level);
620  }
621
622  virtual unsigned getCacheLineSize() const {
623    return getST()->getCacheLineSize();
624  }
625
626  virtual unsigned getPrefetchDistance() const {
627    return getST()->getPrefetchDistance();
628  }
629
630  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
631                                        unsigned NumStridedMemAccesses,
632                                        unsigned NumPrefetches,
633                                        bool HasCall) const {
634    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
635                                         NumPrefetches, HasCall);
636  }
637
638  virtual unsigned getMaxPrefetchIterationsAhead() const {
639    return getST()->getMaxPrefetchIterationsAhead();
640  }
641
642  virtual bool enableWritePrefetching() const {
643    return getST()->enableWritePrefetching();
644  }
645
646  /// @}
647
648  /// \name Vector TTI Implementations
649  /// @{
650
651  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
652    return TypeSize::getFixed(32);
653  }
654
655  Optional<unsigned> getMaxVScale() const { return None; }
656
657  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
658  /// are set if the demanded result elements need to be inserted and/or
659  /// extracted from vectors.
660  InstructionCost getScalarizationOverhead(VectorType *InTy,
661                                           const APInt &DemandedElts,
662                                           bool Insert, bool Extract) {
663    /// FIXME: a bitfield is not a reasonable abstraction for talking about
664    /// which elements are needed from a scalable vector
665    auto *Ty = cast<FixedVectorType>(InTy);
666
667    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
668           "Vector size mismatch");
669
670    InstructionCost Cost = 0;
671
672    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
673      if (!DemandedElts[i])
674        continue;
675      if (Insert)
676        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
677      if (Extract)
678        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
679    }
680
681    return Cost;
682  }
683
684  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
685  InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
686                                           bool Extract) {
687    auto *Ty = cast<FixedVectorType>(InTy);
688
689    APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
690    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
691  }
692
693  /// Estimate the overhead of scalarizing an instructions unique
694  /// non-constant operands. The (potentially vector) types to use for each of
695  /// argument are passes via Tys.
696  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
697                                                   ArrayRef<Type *> Tys) {
698    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
699
700    InstructionCost Cost = 0;
701    SmallPtrSet<const Value*, 4> UniqueOperands;
702    for (int I = 0, E = Args.size(); I != E; I++) {
703      // Disregard things like metadata arguments.
704      const Value *A = Args[I];
705      Type *Ty = Tys[I];
706      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
707          !Ty->isPtrOrPtrVectorTy())
708        continue;
709
710      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
711        if (auto *VecTy = dyn_cast<VectorType>(Ty))
712          Cost += getScalarizationOverhead(VecTy, false, true);
713      }
714    }
715
716    return Cost;
717  }
718
719  /// Estimate the overhead of scalarizing the inputs and outputs of an
720  /// instruction, with return type RetTy and arguments Args of type Tys. If
721  /// Args are unknown (empty), then the cost associated with one argument is
722  /// added as a heuristic.
723  InstructionCost getScalarizationOverhead(VectorType *RetTy,
724                                           ArrayRef<const Value *> Args,
725                                           ArrayRef<Type *> Tys) {
726    InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
727    if (!Args.empty())
728      Cost += getOperandsScalarizationOverhead(Args, Tys);
729    else
730      // When no information on arguments is provided, we add the cost
731      // associated with one argument as a heuristic.
732      Cost += getScalarizationOverhead(RetTy, false, true);
733
734    return Cost;
735  }
736
737  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
738
739  InstructionCost getArithmeticInstrCost(
740      unsigned Opcode, Type *Ty,
741      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
742      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
743      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
744      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
745      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
746      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
747      const Instruction *CxtI = nullptr) {
748    // Check if any of the operands are vector operands.
749    const TargetLoweringBase *TLI = getTLI();
750    int ISD = TLI->InstructionOpcodeToISD(Opcode);
751    assert(ISD && "Invalid opcode");
752
753    // TODO: Handle more cost kinds.
754    if (CostKind != TTI::TCK_RecipThroughput)
755      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
756                                           Opd1Info, Opd2Info,
757                                           Opd1PropInfo, Opd2PropInfo,
758                                           Args, CxtI);
759
760    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
761
762    bool IsFloat = Ty->isFPOrFPVectorTy();
763    // Assume that floating point arithmetic operations cost twice as much as
764    // integer operations.
765    InstructionCost OpCost = (IsFloat ? 2 : 1);
766
767    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
768      // The operation is legal. Assume it costs 1.
769      // TODO: Once we have extract/insert subvector cost we need to use them.
770      return LT.first * OpCost;
771    }
772
773    if (!TLI->isOperationExpand(ISD, LT.second)) {
774      // If the operation is custom lowered, then assume that the code is twice
775      // as expensive.
776      return LT.first * 2 * OpCost;
777    }
778
779    // Else, assume that we need to scalarize this op.
780    // TODO: If one of the types get legalized by splitting, handle this
781    // similarly to what getCastInstrCost() does.
782    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
783      unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
784      InstructionCost Cost = thisT()->getArithmeticInstrCost(
785          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
786          Opd1PropInfo, Opd2PropInfo, Args, CxtI);
787      // Return the cost of multiple scalar invocation plus the cost of
788      // inserting and extracting the values.
789      SmallVector<Type *> Tys(Args.size(), Ty);
790      return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
791    }
792
793    // We don't know anything about this scalar instruction.
794    return OpCost;
795  }
796
797  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
798                                              ArrayRef<int> Mask) const {
799    int Limit = Mask.size() * 2;
800    if (Mask.empty() ||
801        // Extra check required by isSingleSourceMaskImpl function (called by
802        // ShuffleVectorInst::isSingleSourceMask).
803        any_of(Mask, [Limit](int I) { return I >= Limit; }))
804      return Kind;
805    switch (Kind) {
806    case TTI::SK_PermuteSingleSrc:
807      if (ShuffleVectorInst::isReverseMask(Mask))
808        return TTI::SK_Reverse;
809      if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
810        return TTI::SK_Broadcast;
811      break;
812    case TTI::SK_PermuteTwoSrc:
813      if (ShuffleVectorInst::isSelectMask(Mask))
814        return TTI::SK_Select;
815      if (ShuffleVectorInst::isTransposeMask(Mask))
816        return TTI::SK_Transpose;
817      break;
818    case TTI::SK_Select:
819    case TTI::SK_Reverse:
820    case TTI::SK_Broadcast:
821    case TTI::SK_Transpose:
822    case TTI::SK_InsertSubvector:
823    case TTI::SK_ExtractSubvector:
824      break;
825    }
826    return Kind;
827  }
828
829  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
830                                 ArrayRef<int> Mask, int Index,
831                                 VectorType *SubTp) {
832
833    switch (improveShuffleKindFromMask(Kind, Mask)) {
834    case TTI::SK_Broadcast:
835      return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
836    case TTI::SK_Select:
837    case TTI::SK_Reverse:
838    case TTI::SK_Transpose:
839    case TTI::SK_PermuteSingleSrc:
840    case TTI::SK_PermuteTwoSrc:
841      return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
842    case TTI::SK_ExtractSubvector:
843      return getExtractSubvectorOverhead(Tp, Index,
844                                         cast<FixedVectorType>(SubTp));
845    case TTI::SK_InsertSubvector:
846      return getInsertSubvectorOverhead(Tp, Index,
847                                        cast<FixedVectorType>(SubTp));
848    }
849    llvm_unreachable("Unknown TTI::ShuffleKind");
850  }
851
852  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
853                                   TTI::CastContextHint CCH,
854                                   TTI::TargetCostKind CostKind,
855                                   const Instruction *I = nullptr) {
856    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
857      return 0;
858
859    const TargetLoweringBase *TLI = getTLI();
860    int ISD = TLI->InstructionOpcodeToISD(Opcode);
861    assert(ISD && "Invalid opcode");
862    std::pair<InstructionCost, MVT> SrcLT =
863        TLI->getTypeLegalizationCost(DL, Src);
864    std::pair<InstructionCost, MVT> DstLT =
865        TLI->getTypeLegalizationCost(DL, Dst);
866
867    TypeSize SrcSize = SrcLT.second.getSizeInBits();
868    TypeSize DstSize = DstLT.second.getSizeInBits();
869    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
870    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
871
872    switch (Opcode) {
873    default:
874      break;
875    case Instruction::Trunc:
876      // Check for NOOP conversions.
877      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
878        return 0;
879      LLVM_FALLTHROUGH;
880    case Instruction::BitCast:
881      // Bitcast between types that are legalized to the same type are free and
882      // assume int to/from ptr of the same size is also free.
883      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
884          SrcSize == DstSize)
885        return 0;
886      break;
887    case Instruction::FPExt:
888      if (I && getTLI()->isExtFree(I))
889        return 0;
890      break;
891    case Instruction::ZExt:
892      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
893        return 0;
894      LLVM_FALLTHROUGH;
895    case Instruction::SExt:
896      if (I && getTLI()->isExtFree(I))
897        return 0;
898
899      // If this is a zext/sext of a load, return 0 if the corresponding
900      // extending load exists on target and the result type is legal.
901      if (CCH == TTI::CastContextHint::Normal) {
902        EVT ExtVT = EVT::getEVT(Dst);
903        EVT LoadVT = EVT::getEVT(Src);
904        unsigned LType =
905          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
906        if (DstLT.first == SrcLT.first &&
907            TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
908          return 0;
909      }
910      break;
911    case Instruction::AddrSpaceCast:
912      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
913                                   Dst->getPointerAddressSpace()))
914        return 0;
915      break;
916    }
917
918    auto *SrcVTy = dyn_cast<VectorType>(Src);
919    auto *DstVTy = dyn_cast<VectorType>(Dst);
920
921    // If the cast is marked as legal (or promote) then assume low cost.
922    if (SrcLT.first == DstLT.first &&
923        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
924      return SrcLT.first;
925
926    // Handle scalar conversions.
927    if (!SrcVTy && !DstVTy) {
928      // Just check the op cost. If the operation is legal then assume it costs
929      // 1.
930      if (!TLI->isOperationExpand(ISD, DstLT.second))
931        return 1;
932
933      // Assume that illegal scalar instruction are expensive.
934      return 4;
935    }
936
937    // Check vector-to-vector casts.
938    if (DstVTy && SrcVTy) {
939      // If the cast is between same-sized registers, then the check is simple.
940      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
941
942        // Assume that Zext is done using AND.
943        if (Opcode == Instruction::ZExt)
944          return SrcLT.first;
945
946        // Assume that sext is done using SHL and SRA.
947        if (Opcode == Instruction::SExt)
948          return SrcLT.first * 2;
949
950        // Just check the op cost. If the operation is legal then assume it
951        // costs
952        // 1 and multiply by the type-legalization overhead.
953        if (!TLI->isOperationExpand(ISD, DstLT.second))
954          return SrcLT.first * 1;
955      }
956
957      // If we are legalizing by splitting, query the concrete TTI for the cost
958      // of casting the original vector twice. We also need to factor in the
959      // cost of the split itself. Count that as 1, to be consistent with
960      // TLI->getTypeLegalizationCost().
961      bool SplitSrc =
962          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
963          TargetLowering::TypeSplitVector;
964      bool SplitDst =
965          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
966          TargetLowering::TypeSplitVector;
967      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
968          DstVTy->getElementCount().isVector()) {
969        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
970        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
971        T *TTI = static_cast<T *>(this);
972        // If both types need to be split then the split is free.
973        InstructionCost SplitCost =
974            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
975        return SplitCost +
976               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
977                                          CostKind, I));
978      }
979
980      // In other cases where the source or destination are illegal, assume
981      // the operation will get scalarized.
982      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
983      InstructionCost Cost = thisT()->getCastInstrCost(
984          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
985
986      // Return the cost of multiple scalar invocation plus the cost of
987      // inserting and extracting the values.
988      return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
989    }
990
991    // We already handled vector-to-vector and scalar-to-scalar conversions.
992    // This
993    // is where we handle bitcast between vectors and scalars. We need to assume
994    //  that the conversion is scalarized in one way or another.
995    if (Opcode == Instruction::BitCast) {
996      // Illegal bitcasts are done by storing and loading from a stack slot.
997      return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
998             (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
999    }
1000
1001    llvm_unreachable("Unhandled cast");
1002  }
1003
1004  InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1005                                           VectorType *VecTy, unsigned Index) {
1006    return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1007                                       Index) +
1008           thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1009                                     TTI::CastContextHint::None,
1010                                     TTI::TCK_RecipThroughput);
1011  }
1012
1013  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1014                                 const Instruction *I = nullptr) {
1015    return BaseT::getCFInstrCost(Opcode, CostKind, I);
1016  }
1017
1018  InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1019                                     CmpInst::Predicate VecPred,
1020                                     TTI::TargetCostKind CostKind,
1021                                     const Instruction *I = nullptr) {
1022    const TargetLoweringBase *TLI = getTLI();
1023    int ISD = TLI->InstructionOpcodeToISD(Opcode);
1024    assert(ISD && "Invalid opcode");
1025
1026    // TODO: Handle other cost kinds.
1027    if (CostKind != TTI::TCK_RecipThroughput)
1028      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1029                                       I);
1030
1031    // Selects on vectors are actually vector selects.
1032    if (ISD == ISD::SELECT) {
1033      assert(CondTy && "CondTy must exist");
1034      if (CondTy->isVectorTy())
1035        ISD = ISD::VSELECT;
1036    }
1037    std::pair<InstructionCost, MVT> LT =
1038        TLI->getTypeLegalizationCost(DL, ValTy);
1039
1040    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1041        !TLI->isOperationExpand(ISD, LT.second)) {
1042      // The operation is legal. Assume it costs 1. Multiply
1043      // by the type-legalization overhead.
1044      return LT.first * 1;
1045    }
1046
1047    // Otherwise, assume that the cast is scalarized.
1048    // TODO: If one of the types get legalized by splitting, handle this
1049    // similarly to what getCastInstrCost() does.
1050    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1051      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1052      if (CondTy)
1053        CondTy = CondTy->getScalarType();
1054      InstructionCost Cost = thisT()->getCmpSelInstrCost(
1055          Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1056
1057      // Return the cost of multiple scalar invocation plus the cost of
1058      // inserting and extracting the values.
1059      return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1060    }
1061
1062    // Unknown scalar opcode.
1063    return 1;
1064  }
1065
1066  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1067                                     unsigned Index) {
1068    std::pair<InstructionCost, MVT> LT =
1069        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1070
1071    return LT.first;
1072  }
1073
1074  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1075                                  MaybeAlign Alignment, unsigned AddressSpace,
1076                                  TTI::TargetCostKind CostKind,
1077                                  const Instruction *I = nullptr) {
1078    assert(!Src->isVoidTy() && "Invalid type");
1079    // Assume types, such as structs, are expensive.
1080    if (getTLI()->getValueType(DL, Src,  true) == MVT::Other)
1081      return 4;
1082    std::pair<InstructionCost, MVT> LT =
1083        getTLI()->getTypeLegalizationCost(DL, Src);
1084
1085    // Assuming that all loads of legal types cost 1.
1086    InstructionCost Cost = LT.first;
1087    if (CostKind != TTI::TCK_RecipThroughput)
1088      return Cost;
1089
1090    if (Src->isVectorTy() &&
1091        // In practice it's not currently possible to have a change in lane
1092        // length for extending loads or truncating stores so both types should
1093        // have the same scalable property.
1094        TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
1095                            LT.second.getSizeInBits())) {
1096      // This is a vector load that legalizes to a larger type than the vector
1097      // itself. Unless the corresponding extending load or truncating store is
1098      // legal, then this will scalarize.
1099      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1100      EVT MemVT = getTLI()->getValueType(DL, Src);
1101      if (Opcode == Instruction::Store)
1102        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1103      else
1104        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1105
1106      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1107        // This is a vector load/store for some illegal type that is scalarized.
1108        // We must account for the cost of building or decomposing the vector.
1109        Cost += getScalarizationOverhead(cast<VectorType>(Src),
1110                                         Opcode != Instruction::Store,
1111                                         Opcode == Instruction::Store);
1112      }
1113    }
1114
1115    return Cost;
1116  }
1117
1118  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1119                                        Align Alignment, unsigned AddressSpace,
1120                                        TTI::TargetCostKind CostKind) {
1121    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1122                                       CostKind);
1123  }
1124
1125  InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1126                                         const Value *Ptr, bool VariableMask,
1127                                         Align Alignment,
1128                                         TTI::TargetCostKind CostKind,
1129                                         const Instruction *I = nullptr) {
1130    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1131                                       true, CostKind);
1132  }
1133
1134  InstructionCost getInterleavedMemoryOpCost(
1135      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1136      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1137      bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1138    auto *VT = cast<FixedVectorType>(VecTy);
1139
1140    unsigned NumElts = VT->getNumElements();
1141    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1142
1143    unsigned NumSubElts = NumElts / Factor;
1144    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1145
1146    // Firstly, the cost of load/store operation.
1147    InstructionCost Cost;
1148    if (UseMaskForCond || UseMaskForGaps)
1149      Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1150                                            AddressSpace, CostKind);
1151    else
1152      Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1153                                      CostKind);
1154
1155    // Legalize the vector type, and get the legalized and unlegalized type
1156    // sizes.
1157    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1158    unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1159    unsigned VecTyLTSize = VecTyLT.getStoreSize();
1160
1161    // Scale the cost of the memory operation by the fraction of legalized
1162    // instructions that will actually be used. We shouldn't account for the
1163    // cost of dead instructions since they will be removed.
1164    //
1165    // E.g., An interleaved load of factor 8:
1166    //       %vec = load <16 x i64>, <16 x i64>* %ptr
1167    //       %v0 = shufflevector %vec, undef, <0, 8>
1168    //
1169    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1170    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1171    // type). The other loads are unused.
1172    //
1173    // We only scale the cost of loads since interleaved store groups aren't
1174    // allowed to have gaps.
1175    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
1176      // The number of loads of a legal type it will take to represent a load
1177      // of the unlegalized vector type.
1178      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1179
1180      // The number of elements of the unlegalized type that correspond to a
1181      // single legal instruction.
1182      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1183
1184      // Determine which legal instructions will be used.
1185      BitVector UsedInsts(NumLegalInsts, false);
1186      for (unsigned Index : Indices)
1187        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1188          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1189
1190      // Scale the cost of the load by the fraction of legal instructions that
1191      // will be used.
1192      Cost *= UsedInsts.count() / NumLegalInsts;
1193    }
1194
1195    // Then plus the cost of interleave operation.
1196    if (Opcode == Instruction::Load) {
1197      // The interleave cost is similar to extract sub vectors' elements
1198      // from the wide vector, and insert them into sub vectors.
1199      //
1200      // E.g. An interleaved load of factor 2 (with one member of index 0):
1201      //      %vec = load <8 x i32>, <8 x i32>* %ptr
1202      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
1203      // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1204      // <8 x i32> vector and insert them into a <4 x i32> vector.
1205
1206      assert(Indices.size() <= Factor &&
1207             "Interleaved memory op has too many members");
1208
1209      for (unsigned Index : Indices) {
1210        assert(Index < Factor && "Invalid index for interleaved memory op");
1211
1212        // Extract elements from loaded vector for each sub vector.
1213        for (unsigned i = 0; i < NumSubElts; i++)
1214          Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1215                                              Index + i * Factor);
1216      }
1217
1218      InstructionCost InsSubCost = 0;
1219      for (unsigned i = 0; i < NumSubElts; i++)
1220        InsSubCost +=
1221            thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
1222
1223      Cost += Indices.size() * InsSubCost;
1224    } else {
1225      // The interleave cost is extract all elements from sub vectors, and
1226      // insert them into the wide vector.
1227      //
1228      // E.g. An interleaved store of factor 2:
1229      //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
1230      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
1231      // The cost is estimated as extract all elements from both <4 x i32>
1232      // vectors and insert into the <8 x i32> vector.
1233
1234      InstructionCost ExtSubCost = 0;
1235      for (unsigned i = 0; i < NumSubElts; i++)
1236        ExtSubCost +=
1237            thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1238      Cost += ExtSubCost * Factor;
1239
1240      for (unsigned i = 0; i < NumElts; i++)
1241        Cost += static_cast<T *>(this)
1242                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
1243    }
1244
1245    if (!UseMaskForCond)
1246      return Cost;
1247
1248    Type *I8Type = Type::getInt8Ty(VT->getContext());
1249    auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1250    SubVT = FixedVectorType::get(I8Type, NumSubElts);
1251
1252    // The Mask shuffling cost is extract all the elements of the Mask
1253    // and insert each of them Factor times into the wide vector:
1254    //
1255    // E.g. an interleaved group with factor 3:
1256    //    %mask = icmp ult <8 x i32> %vec1, %vec2
1257    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1258    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1259    // The cost is estimated as extract all mask elements from the <8xi1> mask
1260    // vector and insert them factor times into the <24xi1> shuffled mask
1261    // vector.
1262    for (unsigned i = 0; i < NumSubElts; i++)
1263      Cost +=
1264          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1265
1266    for (unsigned i = 0; i < NumElts; i++)
1267      Cost +=
1268          thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1269
1270    // The Gaps mask is invariant and created outside the loop, therefore the
1271    // cost of creating it is not accounted for here. However if we have both
1272    // a MaskForGaps and some other mask that guards the execution of the
1273    // memory access, we need to account for the cost of And-ing the two masks
1274    // inside the loop.
1275    if (UseMaskForGaps)
1276      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1277                                              CostKind);
1278
1279    return Cost;
1280  }
1281
1282  /// Get intrinsic cost based on arguments.
1283  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1284                                        TTI::TargetCostKind CostKind) {
1285    // Check for generically free intrinsics.
1286    if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1287      return 0;
1288
1289    // Assume that target intrinsics are cheap.
1290    Intrinsic::ID IID = ICA.getID();
1291    if (Function::isTargetIntrinsic(IID))
1292      return TargetTransformInfo::TCC_Basic;
1293
1294    if (ICA.isTypeBasedOnly())
1295      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1296
1297    Type *RetTy = ICA.getReturnType();
1298
1299    ElementCount RetVF =
1300        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1301                             : ElementCount::getFixed(1));
1302    const IntrinsicInst *I = ICA.getInst();
1303    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1304    FastMathFlags FMF = ICA.getFlags();
1305    switch (IID) {
1306    default:
1307      break;
1308
1309    case Intrinsic::cttz:
1310      // FIXME: If necessary, this should go in target-specific overrides.
1311      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1312        return TargetTransformInfo::TCC_Basic;
1313      break;
1314
1315    case Intrinsic::ctlz:
1316      // FIXME: If necessary, this should go in target-specific overrides.
1317      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1318        return TargetTransformInfo::TCC_Basic;
1319      break;
1320
1321    case Intrinsic::memcpy:
1322      return thisT()->getMemcpyCost(ICA.getInst());
1323
1324    case Intrinsic::masked_scatter: {
1325      const Value *Mask = Args[3];
1326      bool VarMask = !isa<Constant>(Mask);
1327      Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1328      return thisT()->getGatherScatterOpCost(Instruction::Store,
1329                                             ICA.getArgTypes()[0], Args[1],
1330                                             VarMask, Alignment, CostKind, I);
1331    }
1332    case Intrinsic::masked_gather: {
1333      const Value *Mask = Args[2];
1334      bool VarMask = !isa<Constant>(Mask);
1335      Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1336      return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1337                                             VarMask, Alignment, CostKind, I);
1338    }
1339    case Intrinsic::experimental_stepvector: {
1340      if (isa<ScalableVectorType>(RetTy))
1341        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1342      // The cost of materialising a constant integer vector.
1343      return TargetTransformInfo::TCC_Basic;
1344    }
1345    case Intrinsic::experimental_vector_extract: {
1346      // FIXME: Handle case where a scalable vector is extracted from a scalable
1347      // vector
1348      if (isa<ScalableVectorType>(RetTy))
1349        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1350      unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1351      return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1352                                     cast<VectorType>(Args[0]->getType()), None,
1353                                     Index, cast<VectorType>(RetTy));
1354    }
1355    case Intrinsic::experimental_vector_insert: {
1356      // FIXME: Handle case where a scalable vector is inserted into a scalable
1357      // vector
1358      if (isa<ScalableVectorType>(Args[1]->getType()))
1359        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1360      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1361      return thisT()->getShuffleCost(
1362          TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1363          Index, cast<VectorType>(Args[1]->getType()));
1364    }
1365    case Intrinsic::experimental_vector_reverse: {
1366      return thisT()->getShuffleCost(TTI::SK_Reverse,
1367                                     cast<VectorType>(Args[0]->getType()), None,
1368                                     0, cast<VectorType>(RetTy));
1369    }
1370    case Intrinsic::vector_reduce_add:
1371    case Intrinsic::vector_reduce_mul:
1372    case Intrinsic::vector_reduce_and:
1373    case Intrinsic::vector_reduce_or:
1374    case Intrinsic::vector_reduce_xor:
1375    case Intrinsic::vector_reduce_smax:
1376    case Intrinsic::vector_reduce_smin:
1377    case Intrinsic::vector_reduce_fmax:
1378    case Intrinsic::vector_reduce_fmin:
1379    case Intrinsic::vector_reduce_umax:
1380    case Intrinsic::vector_reduce_umin: {
1381      IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1382      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1383    }
1384    case Intrinsic::vector_reduce_fadd:
1385    case Intrinsic::vector_reduce_fmul: {
1386      IntrinsicCostAttributes Attrs(
1387          IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1388      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1389    }
1390    case Intrinsic::fshl:
1391    case Intrinsic::fshr: {
1392      if (isa<ScalableVectorType>(RetTy))
1393        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1394      const Value *X = Args[0];
1395      const Value *Y = Args[1];
1396      const Value *Z = Args[2];
1397      TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1398      TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1399      TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1400      TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1401      TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1402      OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1403                                                              : TTI::OP_None;
1404      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1405      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1406      InstructionCost Cost = 0;
1407      Cost +=
1408          thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1409      Cost +=
1410          thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1411      Cost += thisT()->getArithmeticInstrCost(
1412          BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1413      Cost += thisT()->getArithmeticInstrCost(
1414          BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1415      // Non-constant shift amounts requires a modulo.
1416      if (OpKindZ != TTI::OK_UniformConstantValue &&
1417          OpKindZ != TTI::OK_NonUniformConstantValue)
1418        Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1419                                                CostKind, OpKindZ, OpKindBW,
1420                                                OpPropsZ, OpPropsBW);
1421      // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1422      if (X != Y) {
1423        Type *CondTy = RetTy->getWithNewBitWidth(1);
1424        Cost +=
1425            thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1426                                        CmpInst::BAD_ICMP_PREDICATE, CostKind);
1427        Cost +=
1428            thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1429                                        CmpInst::BAD_ICMP_PREDICATE, CostKind);
1430      }
1431      return Cost;
1432    }
1433    }
1434
1435    // Assume that we need to scalarize this intrinsic.
1436    // Compute the scalarization overhead based on Args for a vector
1437    // intrinsic.
1438    InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1439    if (RetVF.isVector() && !RetVF.isScalable()) {
1440      ScalarizationCost = 0;
1441      if (!RetTy->isVoidTy())
1442        ScalarizationCost +=
1443            getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1444      ScalarizationCost +=
1445          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1446    }
1447
1448    IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1449                                  ScalarizationCost);
1450    return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1451  }
1452
1453  /// Get intrinsic cost based on argument types.
1454  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1455  /// cost of scalarizing the arguments and the return value will be computed
1456  /// based on types.
1457  InstructionCost
1458  getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1459                                 TTI::TargetCostKind CostKind) {
1460    Intrinsic::ID IID = ICA.getID();
1461    Type *RetTy = ICA.getReturnType();
1462    const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1463    FastMathFlags FMF = ICA.getFlags();
1464    InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1465    bool SkipScalarizationCost = ICA.skipScalarizationCost();
1466
1467    VectorType *VecOpTy = nullptr;
1468    if (!Tys.empty()) {
1469      // The vector reduction operand is operand 0 except for fadd/fmul.
1470      // Their operand 0 is a scalar start value, so the vector op is operand 1.
1471      unsigned VecTyIndex = 0;
1472      if (IID == Intrinsic::vector_reduce_fadd ||
1473          IID == Intrinsic::vector_reduce_fmul)
1474        VecTyIndex = 1;
1475      assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1476      VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1477    }
1478
1479    // Library call cost - other than size, make it expensive.
1480    unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1481    SmallVector<unsigned, 2> ISDs;
1482    switch (IID) {
1483    default: {
1484      // Scalable vectors cannot be scalarized, so return Invalid.
1485      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1486            return isa<ScalableVectorType>(Ty);
1487          }))
1488        return InstructionCost::getInvalid();
1489
1490      // Assume that we need to scalarize this intrinsic.
1491      InstructionCost ScalarizationCost =
1492          SkipScalarizationCost ? ScalarizationCostPassed : 0;
1493      unsigned ScalarCalls = 1;
1494      Type *ScalarRetTy = RetTy;
1495      if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1496        if (!SkipScalarizationCost)
1497          ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1498        ScalarCalls = std::max(ScalarCalls,
1499                               cast<FixedVectorType>(RetVTy)->getNumElements());
1500        ScalarRetTy = RetTy->getScalarType();
1501      }
1502      SmallVector<Type *, 4> ScalarTys;
1503      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1504        Type *Ty = Tys[i];
1505        if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1506          if (!SkipScalarizationCost)
1507            ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1508          ScalarCalls = std::max(ScalarCalls,
1509                                 cast<FixedVectorType>(VTy)->getNumElements());
1510          Ty = Ty->getScalarType();
1511        }
1512        ScalarTys.push_back(Ty);
1513      }
1514      if (ScalarCalls == 1)
1515        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1516
1517      IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1518      InstructionCost ScalarCost =
1519          thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1520
1521      return ScalarCalls * ScalarCost + ScalarizationCost;
1522    }
1523    // Look for intrinsics that can be lowered directly or turned into a scalar
1524    // intrinsic call.
1525    case Intrinsic::sqrt:
1526      ISDs.push_back(ISD::FSQRT);
1527      break;
1528    case Intrinsic::sin:
1529      ISDs.push_back(ISD::FSIN);
1530      break;
1531    case Intrinsic::cos:
1532      ISDs.push_back(ISD::FCOS);
1533      break;
1534    case Intrinsic::exp:
1535      ISDs.push_back(ISD::FEXP);
1536      break;
1537    case Intrinsic::exp2:
1538      ISDs.push_back(ISD::FEXP2);
1539      break;
1540    case Intrinsic::log:
1541      ISDs.push_back(ISD::FLOG);
1542      break;
1543    case Intrinsic::log10:
1544      ISDs.push_back(ISD::FLOG10);
1545      break;
1546    case Intrinsic::log2:
1547      ISDs.push_back(ISD::FLOG2);
1548      break;
1549    case Intrinsic::fabs:
1550      ISDs.push_back(ISD::FABS);
1551      break;
1552    case Intrinsic::canonicalize:
1553      ISDs.push_back(ISD::FCANONICALIZE);
1554      break;
1555    case Intrinsic::minnum:
1556      ISDs.push_back(ISD::FMINNUM);
1557      break;
1558    case Intrinsic::maxnum:
1559      ISDs.push_back(ISD::FMAXNUM);
1560      break;
1561    case Intrinsic::minimum:
1562      ISDs.push_back(ISD::FMINIMUM);
1563      break;
1564    case Intrinsic::maximum:
1565      ISDs.push_back(ISD::FMAXIMUM);
1566      break;
1567    case Intrinsic::copysign:
1568      ISDs.push_back(ISD::FCOPYSIGN);
1569      break;
1570    case Intrinsic::floor:
1571      ISDs.push_back(ISD::FFLOOR);
1572      break;
1573    case Intrinsic::ceil:
1574      ISDs.push_back(ISD::FCEIL);
1575      break;
1576    case Intrinsic::trunc:
1577      ISDs.push_back(ISD::FTRUNC);
1578      break;
1579    case Intrinsic::nearbyint:
1580      ISDs.push_back(ISD::FNEARBYINT);
1581      break;
1582    case Intrinsic::rint:
1583      ISDs.push_back(ISD::FRINT);
1584      break;
1585    case Intrinsic::round:
1586      ISDs.push_back(ISD::FROUND);
1587      break;
1588    case Intrinsic::roundeven:
1589      ISDs.push_back(ISD::FROUNDEVEN);
1590      break;
1591    case Intrinsic::pow:
1592      ISDs.push_back(ISD::FPOW);
1593      break;
1594    case Intrinsic::fma:
1595      ISDs.push_back(ISD::FMA);
1596      break;
1597    case Intrinsic::fmuladd:
1598      ISDs.push_back(ISD::FMA);
1599      break;
1600    case Intrinsic::experimental_constrained_fmuladd:
1601      ISDs.push_back(ISD::STRICT_FMA);
1602      break;
1603    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1604    case Intrinsic::lifetime_start:
1605    case Intrinsic::lifetime_end:
1606    case Intrinsic::sideeffect:
1607    case Intrinsic::pseudoprobe:
1608      return 0;
1609    case Intrinsic::masked_store: {
1610      Type *Ty = Tys[0];
1611      Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1612      return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1613                                            CostKind);
1614    }
1615    case Intrinsic::masked_load: {
1616      Type *Ty = RetTy;
1617      Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1618      return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1619                                            CostKind);
1620    }
1621    case Intrinsic::vector_reduce_add:
1622      return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1623                                                 /*IsPairwiseForm=*/false,
1624                                                 CostKind);
1625    case Intrinsic::vector_reduce_mul:
1626      return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1627                                                 /*IsPairwiseForm=*/false,
1628                                                 CostKind);
1629    case Intrinsic::vector_reduce_and:
1630      return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1631                                                 /*IsPairwiseForm=*/false,
1632                                                 CostKind);
1633    case Intrinsic::vector_reduce_or:
1634      return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1635                                                 /*IsPairwiseForm=*/false,
1636                                                 CostKind);
1637    case Intrinsic::vector_reduce_xor:
1638      return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1639                                                 /*IsPairwiseForm=*/false,
1640                                                 CostKind);
1641    case Intrinsic::vector_reduce_fadd:
1642      // FIXME: Add new flag for cost of strict reductions.
1643      return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1644                                                 /*IsPairwiseForm=*/false,
1645                                                 CostKind);
1646    case Intrinsic::vector_reduce_fmul:
1647      // FIXME: Add new flag for cost of strict reductions.
1648      return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1649                                                 /*IsPairwiseForm=*/false,
1650                                                 CostKind);
1651    case Intrinsic::vector_reduce_smax:
1652    case Intrinsic::vector_reduce_smin:
1653    case Intrinsic::vector_reduce_fmax:
1654    case Intrinsic::vector_reduce_fmin:
1655      return thisT()->getMinMaxReductionCost(
1656          VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1657          /*IsPairwiseForm=*/false,
1658          /*IsUnsigned=*/false, CostKind);
1659    case Intrinsic::vector_reduce_umax:
1660    case Intrinsic::vector_reduce_umin:
1661      return thisT()->getMinMaxReductionCost(
1662          VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1663          /*IsPairwiseForm=*/false,
1664          /*IsUnsigned=*/true, CostKind);
1665    case Intrinsic::abs:
1666    case Intrinsic::smax:
1667    case Intrinsic::smin:
1668    case Intrinsic::umax:
1669    case Intrinsic::umin: {
1670      // abs(X) = select(icmp(X,0),X,sub(0,X))
1671      // minmax(X,Y) = select(icmp(X,Y),X,Y)
1672      Type *CondTy = RetTy->getWithNewBitWidth(1);
1673      InstructionCost Cost = 0;
1674      // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1675      Cost +=
1676          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1677                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
1678      Cost +=
1679          thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1680                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
1681      // TODO: Should we add an OperandValueProperties::OP_Zero property?
1682      if (IID == Intrinsic::abs)
1683        Cost += thisT()->getArithmeticInstrCost(
1684            BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1685      return Cost;
1686    }
1687    case Intrinsic::sadd_sat:
1688    case Intrinsic::ssub_sat: {
1689      Type *CondTy = RetTy->getWithNewBitWidth(1);
1690
1691      Type *OpTy = StructType::create({RetTy, CondTy});
1692      Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1693                                     ? Intrinsic::sadd_with_overflow
1694                                     : Intrinsic::ssub_with_overflow;
1695
1696      // SatMax -> Overflow && SumDiff < 0
1697      // SatMin -> Overflow && SumDiff >= 0
1698      InstructionCost Cost = 0;
1699      IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1700                                    nullptr, ScalarizationCostPassed);
1701      Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1702      Cost +=
1703          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1704                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
1705      Cost += 2 * thisT()->getCmpSelInstrCost(
1706                      BinaryOperator::Select, RetTy, CondTy,
1707                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
1708      return Cost;
1709    }
1710    case Intrinsic::uadd_sat:
1711    case Intrinsic::usub_sat: {
1712      Type *CondTy = RetTy->getWithNewBitWidth(1);
1713
1714      Type *OpTy = StructType::create({RetTy, CondTy});
1715      Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1716                                     ? Intrinsic::uadd_with_overflow
1717                                     : Intrinsic::usub_with_overflow;
1718
1719      InstructionCost Cost = 0;
1720      IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1721                                    nullptr, ScalarizationCostPassed);
1722      Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1723      Cost +=
1724          thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1725                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
1726      return Cost;
1727    }
1728    case Intrinsic::smul_fix:
1729    case Intrinsic::umul_fix: {
1730      unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1731      Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1732
1733      unsigned ExtOp =
1734          IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1735      TTI::CastContextHint CCH = TTI::CastContextHint::None;
1736
1737      InstructionCost Cost = 0;
1738      Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1739      Cost +=
1740          thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1741      Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1742                                            CCH, CostKind);
1743      Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1744                                              CostKind, TTI::OK_AnyValue,
1745                                              TTI::OK_UniformConstantValue);
1746      Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1747                                              TTI::OK_AnyValue,
1748                                              TTI::OK_UniformConstantValue);
1749      Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1750      return Cost;
1751    }
1752    case Intrinsic::sadd_with_overflow:
1753    case Intrinsic::ssub_with_overflow: {
1754      Type *SumTy = RetTy->getContainedType(0);
1755      Type *OverflowTy = RetTy->getContainedType(1);
1756      unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1757                            ? BinaryOperator::Add
1758                            : BinaryOperator::Sub;
1759
1760      //   LHSSign -> LHS >= 0
1761      //   RHSSign -> RHS >= 0
1762      //   SumSign -> Sum >= 0
1763      //
1764      //   Add:
1765      //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1766      //   Sub:
1767      //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
1768      InstructionCost Cost = 0;
1769      Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1770      Cost += 3 * thisT()->getCmpSelInstrCost(
1771                      Instruction::ICmp, SumTy, OverflowTy,
1772                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
1773      Cost += 2 * thisT()->getCmpSelInstrCost(
1774                      Instruction::Select, OverflowTy, OverflowTy,
1775                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
1776      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1777                                              CostKind);
1778      return Cost;
1779    }
1780    case Intrinsic::uadd_with_overflow:
1781    case Intrinsic::usub_with_overflow: {
1782      Type *SumTy = RetTy->getContainedType(0);
1783      Type *OverflowTy = RetTy->getContainedType(1);
1784      unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1785                            ? BinaryOperator::Add
1786                            : BinaryOperator::Sub;
1787
1788      InstructionCost Cost = 0;
1789      Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1790      Cost +=
1791          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1792                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
1793      return Cost;
1794    }
1795    case Intrinsic::smul_with_overflow:
1796    case Intrinsic::umul_with_overflow: {
1797      Type *MulTy = RetTy->getContainedType(0);
1798      Type *OverflowTy = RetTy->getContainedType(1);
1799      unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1800      Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1801
1802      unsigned ExtOp =
1803          IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1804      TTI::CastContextHint CCH = TTI::CastContextHint::None;
1805
1806      InstructionCost Cost = 0;
1807      Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1808      Cost +=
1809          thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1810      Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1811                                            CCH, CostKind);
1812      Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1813                                              CostKind, TTI::OK_AnyValue,
1814                                              TTI::OK_UniformConstantValue);
1815
1816      if (IID == Intrinsic::smul_with_overflow)
1817        Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1818                                                CostKind, TTI::OK_AnyValue,
1819                                                TTI::OK_UniformConstantValue);
1820
1821      Cost +=
1822          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1823                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
1824      return Cost;
1825    }
1826    case Intrinsic::ctpop:
1827      ISDs.push_back(ISD::CTPOP);
1828      // In case of legalization use TCC_Expensive. This is cheaper than a
1829      // library call but still not a cheap instruction.
1830      SingleCallCost = TargetTransformInfo::TCC_Expensive;
1831      break;
1832    case Intrinsic::ctlz:
1833      ISDs.push_back(ISD::CTLZ);
1834      break;
1835    case Intrinsic::cttz:
1836      ISDs.push_back(ISD::CTTZ);
1837      break;
1838    case Intrinsic::bswap:
1839      ISDs.push_back(ISD::BSWAP);
1840      break;
1841    case Intrinsic::bitreverse:
1842      ISDs.push_back(ISD::BITREVERSE);
1843      break;
1844    }
1845
1846    const TargetLoweringBase *TLI = getTLI();
1847    std::pair<InstructionCost, MVT> LT =
1848        TLI->getTypeLegalizationCost(DL, RetTy);
1849
1850    SmallVector<InstructionCost, 2> LegalCost;
1851    SmallVector<InstructionCost, 2> CustomCost;
1852    for (unsigned ISD : ISDs) {
1853      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1854        if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1855            TLI->isFAbsFree(LT.second)) {
1856          return 0;
1857        }
1858
1859        // The operation is legal. Assume it costs 1.
1860        // If the type is split to multiple registers, assume that there is some
1861        // overhead to this.
1862        // TODO: Once we have extract/insert subvector cost we need to use them.
1863        if (LT.first > 1)
1864          LegalCost.push_back(LT.first * 2);
1865        else
1866          LegalCost.push_back(LT.first * 1);
1867      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1868        // If the operation is custom lowered then assume
1869        // that the code is twice as expensive.
1870        CustomCost.push_back(LT.first * 2);
1871      }
1872    }
1873
1874    auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1875    if (MinLegalCostI != LegalCost.end())
1876      return *MinLegalCostI;
1877
1878    auto MinCustomCostI =
1879        std::min_element(CustomCost.begin(), CustomCost.end());
1880    if (MinCustomCostI != CustomCost.end())
1881      return *MinCustomCostI;
1882
1883    // If we can't lower fmuladd into an FMA estimate the cost as a floating
1884    // point mul followed by an add.
1885    if (IID == Intrinsic::fmuladd)
1886      return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1887                                             CostKind) +
1888             thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1889                                             CostKind);
1890    if (IID == Intrinsic::experimental_constrained_fmuladd) {
1891      IntrinsicCostAttributes FMulAttrs(
1892        Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1893      IntrinsicCostAttributes FAddAttrs(
1894        Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1895      return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1896             thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1897    }
1898
1899    // Else, assume that we need to scalarize this intrinsic. For math builtins
1900    // this will emit a costly libcall, adding call overhead and spills. Make it
1901    // very expensive.
1902    if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1903      // Scalable vectors cannot be scalarized, so return Invalid.
1904      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1905            return isa<ScalableVectorType>(Ty);
1906          }))
1907        return InstructionCost::getInvalid();
1908
1909      InstructionCost ScalarizationCost =
1910          SkipScalarizationCost ? ScalarizationCostPassed
1911                                : getScalarizationOverhead(RetVTy, true, false);
1912
1913      unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1914      SmallVector<Type *, 4> ScalarTys;
1915      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1916        Type *Ty = Tys[i];
1917        if (Ty->isVectorTy())
1918          Ty = Ty->getScalarType();
1919        ScalarTys.push_back(Ty);
1920      }
1921      IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1922      InstructionCost ScalarCost =
1923          thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1924      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1925        if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1926          if (!ICA.skipScalarizationCost())
1927            ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1928          ScalarCalls = std::max(ScalarCalls,
1929                                 cast<FixedVectorType>(VTy)->getNumElements());
1930        }
1931      }
1932      return ScalarCalls * ScalarCost + ScalarizationCost;
1933    }
1934
1935    // This is going to be turned into a library call, make it expensive.
1936    return SingleCallCost;
1937  }
1938
1939  /// Compute a cost of the given call instruction.
1940  ///
1941  /// Compute the cost of calling function F with return type RetTy and
1942  /// argument types Tys. F might be nullptr, in this case the cost of an
1943  /// arbitrary call with the specified signature will be returned.
1944  /// This is used, for instance,  when we estimate call of a vector
1945  /// counterpart of the given function.
1946  /// \param F Called function, might be nullptr.
1947  /// \param RetTy Return value types.
1948  /// \param Tys Argument types.
1949  /// \returns The cost of Call instruction.
1950  InstructionCost
1951  getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1952                   TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1953    return 10;
1954  }
1955
1956  unsigned getNumberOfParts(Type *Tp) {
1957    std::pair<InstructionCost, MVT> LT =
1958        getTLI()->getTypeLegalizationCost(DL, Tp);
1959    return *LT.first.getValue();
1960  }
1961
1962  InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
1963                                            const SCEV *) {
1964    return 0;
1965  }
1966
1967  /// Try to calculate arithmetic and shuffle op costs for reduction operations.
1968  /// We're assuming that reduction operation are performing the following way:
1969  /// 1. Non-pairwise reduction
1970  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1971  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
1972  ///            \----------------v-------------/  \----------v------------/
1973  ///                            n/2 elements               n/2 elements
1974  /// %red1 = op <n x t> %val, <n x t> val1
1975  /// After this operation we have a vector %red1 where only the first n/2
1976  /// elements are meaningful, the second n/2 elements are undefined and can be
1977  /// dropped. All other operations are actually working with the vector of
1978  /// length n/2, not n, though the real vector length is still n.
1979  /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
1980  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
1981  ///            \----------------v-------------/  \----------v------------/
1982  ///                            n/4 elements               3*n/4 elements
1983  /// %red2 = op <n x t> %red1, <n x t> val2  - working with the vector of
1984  /// length n/2, the resulting vector has length n/4 etc.
1985  /// 2. Pairwise reduction:
1986  /// Everything is the same except for an additional shuffle operation which
1987  /// is used to produce operands for pairwise kind of reductions.
1988  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1989  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
1990  ///            \-------------v----------/  \----------v------------/
1991  ///                   n/2 elements               n/2 elements
1992  /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
1993  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
1994  ///            \-------------v----------/  \----------v------------/
1995  ///                   n/2 elements               n/2 elements
1996  /// %red1 = op <n x t> %val1, <n x t> val2
1997  /// Again, the operation is performed on <n x t> vector, but the resulting
1998  /// vector %red1 is <n/2 x t> vector.
1999  ///
2000  /// The cost model should take into account that the actual length of the
2001  /// vector is reduced on each iteration.
2002  InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2003                                             bool IsPairwise,
2004                                             TTI::TargetCostKind CostKind) {
2005    Type *ScalarTy = Ty->getElementType();
2006    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2007    if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2008        ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2009        NumVecElts >= 2) {
2010      // Or reduction for i1 is represented as:
2011      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2012      // %res = cmp ne iReduxWidth %val, 0
2013      // And reduction for i1 is represented as:
2014      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2015      // %res = cmp eq iReduxWidth %val, 11111
2016      Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2017      return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2018                                       TTI::CastContextHint::None, CostKind) +
2019             thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2020                                         CmpInst::makeCmpResultType(ValTy),
2021                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
2022    }
2023    unsigned NumReduxLevels = Log2_32(NumVecElts);
2024    InstructionCost ArithCost = 0;
2025    InstructionCost ShuffleCost = 0;
2026    std::pair<InstructionCost, MVT> LT =
2027        thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2028    unsigned LongVectorCount = 0;
2029    unsigned MVTLen =
2030        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2031    while (NumVecElts > MVTLen) {
2032      NumVecElts /= 2;
2033      VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2034      // Assume the pairwise shuffles add a cost.
2035      ShuffleCost += (IsPairwise + 1) *
2036                     thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2037                                             NumVecElts, SubTy);
2038      ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2039      Ty = SubTy;
2040      ++LongVectorCount;
2041    }
2042
2043    NumReduxLevels -= LongVectorCount;
2044
2045    // The minimal length of the vector is limited by the real length of vector
2046    // operations performed on the current platform. That's why several final
2047    // reduction operations are performed on the vectors with the same
2048    // architecture-dependent length.
2049
2050    // Non pairwise reductions need one shuffle per reduction level. Pairwise
2051    // reductions need two shuffles on every level, but the last one. On that
2052    // level one of the shuffles is <0, u, u, ...> which is identity.
2053    unsigned NumShuffles = NumReduxLevels;
2054    if (IsPairwise && NumReduxLevels >= 1)
2055      NumShuffles += NumReduxLevels - 1;
2056    ShuffleCost += NumShuffles * thisT()->getShuffleCost(
2057                                     TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2058    ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
2059    return ShuffleCost + ArithCost +
2060           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2061  }
2062
2063  /// Try to calculate op costs for min/max reduction operations.
2064  /// \param CondTy Conditional type for the Select instruction.
2065  InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2066                                         bool IsPairwise, bool IsUnsigned,
2067                                         TTI::TargetCostKind CostKind) {
2068    Type *ScalarTy = Ty->getElementType();
2069    Type *ScalarCondTy = CondTy->getElementType();
2070    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2071    unsigned NumReduxLevels = Log2_32(NumVecElts);
2072    unsigned CmpOpcode;
2073    if (Ty->isFPOrFPVectorTy()) {
2074      CmpOpcode = Instruction::FCmp;
2075    } else {
2076      assert(Ty->isIntOrIntVectorTy() &&
2077             "expecting floating point or integer type for min/max reduction");
2078      CmpOpcode = Instruction::ICmp;
2079    }
2080    InstructionCost MinMaxCost = 0;
2081    InstructionCost ShuffleCost = 0;
2082    std::pair<InstructionCost, MVT> LT =
2083        thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2084    unsigned LongVectorCount = 0;
2085    unsigned MVTLen =
2086        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2087    while (NumVecElts > MVTLen) {
2088      NumVecElts /= 2;
2089      auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2090      CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2091
2092      // Assume the pairwise shuffles add a cost.
2093      ShuffleCost += (IsPairwise + 1) *
2094                     thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2095                                             NumVecElts, SubTy);
2096      MinMaxCost +=
2097          thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2098                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2099          thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2100                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
2101      Ty = SubTy;
2102      ++LongVectorCount;
2103    }
2104
2105    NumReduxLevels -= LongVectorCount;
2106
2107    // The minimal length of the vector is limited by the real length of vector
2108    // operations performed on the current platform. That's why several final
2109    // reduction opertions are perfomed on the vectors with the same
2110    // architecture-dependent length.
2111
2112    // Non pairwise reductions need one shuffle per reduction level. Pairwise
2113    // reductions need two shuffles on every level, but the last one. On that
2114    // level one of the shuffles is <0, u, u, ...> which is identity.
2115    unsigned NumShuffles = NumReduxLevels;
2116    if (IsPairwise && NumReduxLevels >= 1)
2117      NumShuffles += NumReduxLevels - 1;
2118    ShuffleCost += NumShuffles * thisT()->getShuffleCost(
2119                                     TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2120    MinMaxCost +=
2121        NumReduxLevels *
2122        (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2123                                     CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2124         thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2125                                     CmpInst::BAD_ICMP_PREDICATE, CostKind));
2126    // The last min/max should be in vector registers and we counted it above.
2127    // So just need a single extractelement.
2128    return ShuffleCost + MinMaxCost +
2129           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2130  }
2131
2132  InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2133                                              Type *ResTy, VectorType *Ty,
2134                                              TTI::TargetCostKind CostKind) {
2135    // Without any native support, this is equivalent to the cost of
2136    // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
2137    VectorType *ExtTy = VectorType::get(ResTy, Ty);
2138    InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2139        Instruction::Add, ExtTy, false, CostKind);
2140    InstructionCost MulCost = 0;
2141    InstructionCost ExtCost = thisT()->getCastInstrCost(
2142        IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2143        TTI::CastContextHint::None, CostKind);
2144    if (IsMLA) {
2145      MulCost =
2146          thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2147      ExtCost *= 2;
2148    }
2149
2150    return RedCost + MulCost + ExtCost;
2151  }
2152
2153  InstructionCost getVectorSplitCost() { return 1; }
2154
2155  /// @}
2156};
2157
2158/// Concrete BasicTTIImpl that can be used if no further customization
2159/// is needed.
2160class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2161  using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2162
2163  friend class BasicTTIImplBase<BasicTTIImpl>;
2164
2165  const TargetSubtargetInfo *ST;
2166  const TargetLoweringBase *TLI;
2167
2168  const TargetSubtargetInfo *getST() const { return ST; }
2169  const TargetLoweringBase *getTLI() const { return TLI; }
2170
2171public:
2172  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2173};
2174
2175} // end namespace llvm
2176
2177#endif // LLVM_CODEGEN_BASICTTIIMPL_H
2178