1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/TargetTransformInfo.h"
26#include "llvm/Analysis/TargetTransformInfoImpl.h"
27#include "llvm/CodeGen/ISDOpcodes.h"
28#include "llvm/CodeGen/TargetLowering.h"
29#include "llvm/CodeGen/TargetSubtargetInfo.h"
30#include "llvm/CodeGen/ValueTypes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CallSite.h"
33#include "llvm/IR/Constant.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DataLayout.h"
36#include "llvm/IR/DerivedTypes.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Instruction.h"
39#include "llvm/IR/Instructions.h"
40#include "llvm/IR/Intrinsics.h"
41#include "llvm/IR/Operator.h"
42#include "llvm/IR/Type.h"
43#include "llvm/IR/Value.h"
44#include "llvm/MC/MCSchedule.h"
45#include "llvm/Support/Casting.h"
46#include "llvm/Support/CommandLine.h"
47#include "llvm/Support/ErrorHandling.h"
48#include "llvm/Support/MachineValueType.h"
49#include "llvm/Support/MathExtras.h"
50#include <algorithm>
51#include <cassert>
52#include <cstdint>
53#include <limits>
54#include <utility>
55
56namespace llvm {
57
58class Function;
59class GlobalValue;
60class LLVMContext;
61class ScalarEvolution;
62class SCEV;
63class TargetMachine;
64
65extern cl::opt<unsigned> PartialUnrollingThreshold;
66
67/// Base class which can be used to help build a TTI implementation.
68///
69/// This class provides as much implementation of the TTI interface as is
70/// possible using the target independent parts of the code generator.
71///
72/// In order to subclass it, your class must implement a getST() method to
73/// return the subtarget, and a getTLI() method to return the target lowering.
74/// We need these methods implemented in the derived class so that this class
75/// doesn't have to duplicate storage for them.
76template <typename T>
77class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
78private:
79  using BaseT = TargetTransformInfoImplCRTPBase<T>;
80  using TTI = TargetTransformInfo;
81
  /// Estimate the cost of a broadcast shuffle as an extract plus a sequence
  /// of insert operations.
84  unsigned getBroadcastShuffleOverhead(Type *Ty) {
85    assert(Ty->isVectorTy() && "Can only shuffle vectors");
86    unsigned Cost = 0;
    // The broadcast cost is the cost of extracting the zeroth element plus
    // the cost of inserting it into every element of the result vector.
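    // As a rough illustration (assuming a hypothetical target where each
    // insert/extract costs 1), broadcasting a <4 x i32> value is modeled as
    // 1 extract of lane 0 plus 4 inserts, i.e. a total overhead of 5. The
    // actual per-element numbers come from the target's getVectorInstrCost().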
89    Cost += static_cast<T *>(this)->getVectorInstrCost(
90        Instruction::ExtractElement, Ty, 0);
91
92    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
93      Cost += static_cast<T *>(this)->getVectorInstrCost(
94          Instruction::InsertElement, Ty, i);
95    }
96    return Cost;
97  }
98
  /// Estimate the cost of a permute shuffle as a sequence of extract and
  /// insert operations.
101  unsigned getPermuteShuffleOverhead(Type *Ty) {
102    assert(Ty->isVectorTy() && "Can only shuffle vectors");
103    unsigned Cost = 0;
    // The shuffle cost is the cost of extracting each element from its source
    // vector plus the cost of inserting it into the result vector.

    // E.g. a <4 x float> shuffle with mask <0,5,2,7> extracts index 0 of the
    // first vector, index 1 of the second vector, index 2 of the first vector
    // and index 3 of the second vector, and inserts them at indices <0,1,2,3>
    // of the result vector, i.e. NumElts extracts plus NumElts inserts.
111    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
112      Cost += static_cast<T *>(this)
113                  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
114      Cost += static_cast<T *>(this)
115                  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
116    }
117    return Cost;
118  }
119
120  /// Estimate a cost of subvector extraction as a sequence of extract and
121  /// insert operations.
122  unsigned getExtractSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
123    assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
124           "Can only extract subvectors from vectors");
125    int NumSubElts = SubTy->getVectorNumElements();
126    assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
127           "SK_ExtractSubvector index out of range");
128
129    unsigned Cost = 0;
    // The subvector extraction cost is the cost of extracting each element
    // from the source vector plus the cost of inserting it into the result
    // (sub)vector.
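    // As a rough illustration (assuming hypothetical unit insert/extract
    // costs), extracting a <2 x i32> subvector at Index 2 from a <4 x i32>
    // source is modeled as extracting source lanes 2 and 3 plus inserting
    // result lanes 0 and 1, i.e. an overhead of 4.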
133    for (int i = 0; i != NumSubElts; ++i) {
134      Cost += static_cast<T *>(this)->getVectorInstrCost(
135          Instruction::ExtractElement, Ty, i + Index);
136      Cost += static_cast<T *>(this)->getVectorInstrCost(
137          Instruction::InsertElement, SubTy, i);
138    }
139    return Cost;
140  }
141
142  /// Estimate a cost of subvector insertion as a sequence of extract and
143  /// insert operations.
144  unsigned getInsertSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
145    assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
146           "Can only insert subvectors into vectors");
147    int NumSubElts = SubTy->getVectorNumElements();
148    assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
149           "SK_InsertSubvector index out of range");
150
151    unsigned Cost = 0;
    // The subvector insertion cost is the cost of extracting each element
    // from the subvector plus the cost of inserting it into the result (wide)
    // vector.
155    for (int i = 0; i != NumSubElts; ++i) {
156      Cost += static_cast<T *>(this)->getVectorInstrCost(
157          Instruction::ExtractElement, SubTy, i);
158      Cost += static_cast<T *>(this)->getVectorInstrCost(
159          Instruction::InsertElement, Ty, i + Index);
160    }
161    return Cost;
162  }
163
164  /// Local query method delegates up to T which *must* implement this!
165  const TargetSubtargetInfo *getST() const {
166    return static_cast<const T *>(this)->getST();
167  }
168
169  /// Local query method delegates up to T which *must* implement this!
170  const TargetLoweringBase *getTLI() const {
171    return static_cast<const T *>(this)->getTLI();
172  }
173
174  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
175    switch (M) {
176      case TTI::MIM_Unindexed:
177        return ISD::UNINDEXED;
178      case TTI::MIM_PreInc:
179        return ISD::PRE_INC;
180      case TTI::MIM_PreDec:
181        return ISD::PRE_DEC;
182      case TTI::MIM_PostInc:
183        return ISD::POST_INC;
184      case TTI::MIM_PostDec:
185        return ISD::POST_DEC;
186    }
187    llvm_unreachable("Unexpected MemIndexedMode");
188  }
189
190protected:
191  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
192      : BaseT(DL) {}
193  virtual ~BasicTTIImplBase() = default;
194
195  using TargetTransformInfoImplBase::DL;
196
197public:
198  /// \name Scalar TTI Implementations
199  /// @{
200  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
201                                      unsigned AddressSpace, unsigned Alignment,
202                                      bool *Fast) const {
203    EVT E = EVT::getIntegerVT(Context, BitWidth);
204    return getTLI()->allowsMisalignedMemoryAccesses(
205        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
206  }
207
208  bool hasBranchDivergence() { return false; }
209
210  bool isSourceOfDivergence(const Value *V) { return false; }
211
212  bool isAlwaysUniform(const Value *V) { return false; }
213
214  unsigned getFlatAddressSpace() {
215    // Return an invalid address space.
216    return -1;
217  }
218
219  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
220                                  Intrinsic::ID IID) const {
221    return false;
222  }
223
224  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
225                                        Value *OldV, Value *NewV) const {
226    return false;
227  }
228
229  bool isLegalAddImmediate(int64_t imm) {
230    return getTLI()->isLegalAddImmediate(imm);
231  }
232
233  bool isLegalICmpImmediate(int64_t imm) {
234    return getTLI()->isLegalICmpImmediate(imm);
235  }
236
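  /// Note: the addressing mode queried below is roughly of the form
  /// BaseGV + BaseOffs + BaseReg + Scale * IndexReg. For example (purely
  /// illustrative), an access computed as "base + 4 * idx + 16" would map to
  /// HasBaseReg = true, Scale = 4 and BaseOffset = 16, and the target reports
  /// whether it can encode that combination in a single addressing mode.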
237  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
238                             bool HasBaseReg, int64_t Scale,
239                             unsigned AddrSpace, Instruction *I = nullptr) {
240    TargetLoweringBase::AddrMode AM;
241    AM.BaseGV = BaseGV;
242    AM.BaseOffs = BaseOffset;
243    AM.HasBaseReg = HasBaseReg;
244    AM.Scale = Scale;
245    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
246  }
247
248  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
249                          const DataLayout &DL) const {
250    EVT VT = getTLI()->getValueType(DL, Ty);
251    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
252  }
253
254  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
255                           const DataLayout &DL) const {
256    EVT VT = getTLI()->getValueType(DL, Ty);
257    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
258  }
259
260  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
261    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
262  }
263
264  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
265                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
266    TargetLoweringBase::AddrMode AM;
267    AM.BaseGV = BaseGV;
268    AM.BaseOffs = BaseOffset;
269    AM.HasBaseReg = HasBaseReg;
270    AM.Scale = Scale;
271    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
272  }
273
274  bool isTruncateFree(Type *Ty1, Type *Ty2) {
275    return getTLI()->isTruncateFree(Ty1, Ty2);
276  }
277
278  bool isProfitableToHoist(Instruction *I) {
279    return getTLI()->isProfitableToHoist(I);
280  }
281
282  bool useAA() const { return getST()->useAA(); }
283
284  bool isTypeLegal(Type *Ty) {
285    EVT VT = getTLI()->getValueType(DL, Ty);
286    return getTLI()->isTypeLegal(VT);
287  }
288
289  int getGEPCost(Type *PointeeType, const Value *Ptr,
290                 ArrayRef<const Value *> Operands) {
291    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
292  }
293
294  int getExtCost(const Instruction *I, const Value *Src) {
295    if (getTLI()->isExtFree(I))
296      return TargetTransformInfo::TCC_Free;
297
298    if (isa<ZExtInst>(I) || isa<SExtInst>(I))
299      if (const LoadInst *LI = dyn_cast<LoadInst>(Src))
300        if (getTLI()->isExtLoad(LI, I, DL))
301          return TargetTransformInfo::TCC_Free;
302
303    return TargetTransformInfo::TCC_Basic;
304  }
305
306  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
307                            ArrayRef<const Value *> Arguments, const User *U) {
308    return BaseT::getIntrinsicCost(IID, RetTy, Arguments, U);
309  }
310
311  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
312                            ArrayRef<Type *> ParamTys, const User *U) {
313    if (IID == Intrinsic::cttz) {
314      if (getTLI()->isCheapToSpeculateCttz())
315        return TargetTransformInfo::TCC_Basic;
316      return TargetTransformInfo::TCC_Expensive;
317    }
318
319    if (IID == Intrinsic::ctlz) {
320      if (getTLI()->isCheapToSpeculateCtlz())
321        return TargetTransformInfo::TCC_Basic;
322      return TargetTransformInfo::TCC_Expensive;
323    }
324
325    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys, U);
326  }
327
328  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
329                                            unsigned &JumpTableSize,
330                                            ProfileSummaryInfo *PSI,
331                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. It was initially
    /// intended for estimating the cost of a switch in the inline cost
    /// heuristic, but it is a generic cost model usable in other places
    /// (e.g., in loop unrolling).
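    /// Purely illustrative example: a switch over cases {0, 1, 2, 3, 9} has
    /// N = 5 and Range = 10. If jump tables are allowed, N meets the target's
    /// minimum entry count and the target considers that density suitable,
    /// this returns a single cluster with JumpTableSize = 10; otherwise it
    /// falls back to returning N.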
339    unsigned N = SI.getNumCases();
340    const TargetLoweringBase *TLI = getTLI();
341    const DataLayout &DL = this->getDataLayout();
342
343    JumpTableSize = 0;
344    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
345
    // Early exit if neither a jump table nor a bit test is allowed.
347    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
348      return N;
349
350    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
351    APInt MinCaseVal = MaxCaseVal;
352    for (auto CI : SI.cases()) {
353      const APInt &CaseVal = CI.getCaseValue()->getValue();
354      if (CaseVal.sgt(MaxCaseVal))
355        MaxCaseVal = CaseVal;
356      if (CaseVal.slt(MinCaseVal))
357        MinCaseVal = CaseVal;
358    }
359
360    // Check if suitable for a bit test
361    if (N <= DL.getIndexSizeInBits(0u)) {
362      SmallPtrSet<const BasicBlock *, 4> Dests;
363      for (auto I : SI.cases())
364        Dests.insert(I.getCaseSuccessor());
365
366      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
367                                     DL))
368        return 1;
369    }
370
371    // Check if suitable for a jump table.
372    if (IsJTAllowed) {
373      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
374        return N;
375      uint64_t Range =
376          (MaxCaseVal - MinCaseVal)
377              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
378      // Check whether a range of clusters is dense enough for a jump table
379      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
380        JumpTableSize = Range;
381        return 1;
382      }
383    }
384    return N;
385  }
386
387  bool shouldBuildLookupTables() {
388    const TargetLoweringBase *TLI = getTLI();
389    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
390           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
391  }
392
393  bool haveFastSqrt(Type *Ty) {
394    const TargetLoweringBase *TLI = getTLI();
395    EVT VT = TLI->getValueType(DL, Ty);
396    return TLI->isTypeLegal(VT) &&
397           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
398  }
399
400  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
401    return true;
402  }
403
404  unsigned getFPOpCost(Type *Ty) {
405    // Check whether FADD is available, as a proxy for floating-point in
406    // general.
407    const TargetLoweringBase *TLI = getTLI();
408    EVT VT = TLI->getValueType(DL, Ty);
409    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
410      return TargetTransformInfo::TCC_Basic;
411    return TargetTransformInfo::TCC_Expensive;
412  }
413
414  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
415    const TargetLoweringBase *TLI = getTLI();
416    switch (Opcode) {
417    default: break;
418    case Instruction::Trunc:
419      if (TLI->isTruncateFree(OpTy, Ty))
420        return TargetTransformInfo::TCC_Free;
421      return TargetTransformInfo::TCC_Basic;
422    case Instruction::ZExt:
423      if (TLI->isZExtFree(OpTy, Ty))
424        return TargetTransformInfo::TCC_Free;
425      return TargetTransformInfo::TCC_Basic;
426
427    case Instruction::AddrSpaceCast:
428      if (TLI->isFreeAddrSpaceCast(OpTy->getPointerAddressSpace(),
429                                   Ty->getPointerAddressSpace()))
430        return TargetTransformInfo::TCC_Free;
431      return TargetTransformInfo::TCC_Basic;
432    }
433
434    return BaseT::getOperationCost(Opcode, Ty, OpTy);
435  }
436
437  unsigned getInliningThresholdMultiplier() { return 1; }
438
439  int getInlinerVectorBonusPercent() { return 150; }
440
441  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
442                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, consider x86:
445
446    // According to the Intel 64 and IA-32 Architectures Optimization Reference
447    // Manual, Intel Core models and later have a loop stream detector (and
448    // associated uop queue) that can benefit from partial unrolling.
449    // The relevant requirements are:
450    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
451    //    taken, and none of them may be calls.
452    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
453
454    // According to the Software Optimization Guide for AMD Family 15h
455    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
456    // and loop buffer which can benefit from partial unrolling.
457    // The relevant requirements are:
458    //  - The loop must have fewer than 16 branches
459    //  - The loop must have less than 40 uops in all executed loop branches
460
461    // The number of taken branches in a loop is hard to estimate here, and
462    // benchmarking has revealed that it is better not to be conservative when
463    // estimating the branch count. As a result, we'll ignore the branch limits
464    // until someone finds a case where it matters in practice.
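    // As a purely illustrative example: on a subtarget reporting
    // LoopMicroOpBufferSize = 28, the PartialThreshold computed below would
    // let the unroller partially unroll a small (e.g. ~10 micro-op) loop body
    // while it still fits in the loop buffer.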
465
466    unsigned MaxOps;
467    const TargetSubtargetInfo *ST = getST();
468    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
469      MaxOps = PartialUnrollingThreshold;
470    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
471      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
472    else
473      return;
474
475    // Scan the loop: don't unroll loops with calls.
476    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
477         ++I) {
478      BasicBlock *BB = *I;
479
480      for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
481        if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
482          ImmutableCallSite CS(&*J);
483          if (const Function *F = CS.getCalledFunction()) {
484            if (!static_cast<T *>(this)->isLoweredToCall(F))
485              continue;
486          }
487
488          return;
489        }
490    }
491
492    // Enable runtime and partial unrolling up to the specified size.
493    // Enable using trip count upper bound to unroll loops.
494    UP.Partial = UP.Runtime = UP.UpperBound = true;
495    UP.PartialThreshold = MaxOps;
496
497    // Avoid unrolling when optimizing for size.
498    UP.OptSizeThreshold = 0;
499    UP.PartialOptSizeThreshold = 0;
500
    // Set the number of instructions optimized away when the "back edge"
    // becomes a "fall through" to its default value of 2.
503    UP.BEInsns = 2;
504  }
505
506  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
507                                AssumptionCache &AC,
508                                TargetLibraryInfo *LibInfo,
509                                HardwareLoopInfo &HWLoopInfo) {
510    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
511  }
512
513  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
514                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
515                                   DominatorTree *DT,
516                                   const LoopAccessInfo *LAI) {
517    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
518  }
519
520  int getInstructionLatency(const Instruction *I) {
521    if (isa<LoadInst>(I))
522      return getST()->getSchedModel().DefaultLoadLatency;
523
524    return BaseT::getInstructionLatency(I);
525  }
526
527  virtual Optional<unsigned>
528  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
529    return Optional<unsigned>(
530      getST()->getCacheSize(static_cast<unsigned>(Level)));
531  }
532
533  virtual Optional<unsigned>
534  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
535    Optional<unsigned> TargetResult =
536        getST()->getCacheAssociativity(static_cast<unsigned>(Level));
537
538    if (TargetResult)
539      return TargetResult;
540
541    return BaseT::getCacheAssociativity(Level);
542  }
543
544  virtual unsigned getCacheLineSize() const {
545    return getST()->getCacheLineSize();
546  }
547
548  virtual unsigned getPrefetchDistance() const {
549    return getST()->getPrefetchDistance();
550  }
551
552  virtual unsigned getMinPrefetchStride() const {
553    return getST()->getMinPrefetchStride();
554  }
555
556  virtual unsigned getMaxPrefetchIterationsAhead() const {
557    return getST()->getMaxPrefetchIterationsAhead();
558  }
559
560  /// @}
561
562  /// \name Vector TTI Implementations
563  /// @{
564
565  unsigned getRegisterBitWidth(bool Vector) const { return 32; }
566
567  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
568  /// are set if the result needs to be inserted and/or extracted from vectors.
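  /// For example (with hypothetical unit insert/extract costs), scalarizing a
  /// <4 x i32> value with both Insert and Extract set is modeled as 4
  /// insertelement plus 4 extractelement operations, i.e. an overhead of 8.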
569  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
570    assert(Ty->isVectorTy() && "Can only scalarize vectors");
571    unsigned Cost = 0;
572
573    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
574      if (Insert)
575        Cost += static_cast<T *>(this)
576                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
577      if (Extract)
578        Cost += static_cast<T *>(this)
579                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
580    }
581
582    return Cost;
583  }
584
  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied by VF.
588  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
589                                            unsigned VF) {
590    unsigned Cost = 0;
591    SmallPtrSet<const Value*, 4> UniqueOperands;
592    for (const Value *A : Args) {
593      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
594        Type *VecTy = nullptr;
595        if (A->getType()->isVectorTy()) {
596          VecTy = A->getType();
597          // If A is a vector operand, VF should be 1 or correspond to A.
598          assert((VF == 1 || VF == VecTy->getVectorNumElements()) &&
599                 "Vector argument does not match VF");
600        }
601        else
602          VecTy = VectorType::get(A->getType(), VF);
603
604        Cost += getScalarizationOverhead(VecTy, false, true);
605      }
606    }
607
608    return Cost;
609  }
610
611  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
612    assert(VecTy->isVectorTy());
613
614    unsigned Cost = 0;
615
616    Cost += getScalarizationOverhead(VecTy, true, false);
617    if (!Args.empty())
618      Cost += getOperandsScalarizationOverhead(Args,
619                                               VecTy->getVectorNumElements());
620    else
621      // When no information on arguments is provided, we add the cost
622      // associated with one argument as a heuristic.
623      Cost += getScalarizationOverhead(VecTy, false, true);
624
625    return Cost;
626  }
627
628  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
629
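  /// The arithmetic cost below is tiered on how the type legalizes: if the
  /// operation is legal (or promoted) on the legalized type it costs
  /// LT.first * OpCost, if it is custom lowered it is assumed to cost twice
  /// that, and if it must be expanded on a vector type it is costed as full
  /// scalarization. As a purely illustrative example, an i64 add on a
  /// hypothetical 32-bit target that splits i64 into two i32 registers would
  /// be costed as roughly 2 * 1.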
630  unsigned getArithmeticInstrCost(
631      unsigned Opcode, Type *Ty,
632      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
633      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
634      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
635      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
636      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
637      const Instruction *CxtI = nullptr) {
638    // Check if any of the operands are vector operands.
639    const TargetLoweringBase *TLI = getTLI();
640    int ISD = TLI->InstructionOpcodeToISD(Opcode);
641    assert(ISD && "Invalid opcode");
642
643    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
644
645    bool IsFloat = Ty->isFPOrFPVectorTy();
646    // Assume that floating point arithmetic operations cost twice as much as
647    // integer operations.
648    unsigned OpCost = (IsFloat ? 2 : 1);
649
650    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
651      // The operation is legal. Assume it costs 1.
652      // TODO: Once we have extract/insert subvector cost we need to use them.
653      return LT.first * OpCost;
654    }
655
656    if (!TLI->isOperationExpand(ISD, LT.second)) {
657      // If the operation is custom lowered, then assume that the code is twice
658      // as expensive.
659      return LT.first * 2 * OpCost;
660    }
661
662    // Else, assume that we need to scalarize this op.
663    // TODO: If one of the types get legalized by splitting, handle this
664    // similarly to what getCastInstrCost() does.
665    if (Ty->isVectorTy()) {
666      unsigned Num = Ty->getVectorNumElements();
667      unsigned Cost = static_cast<T *>(this)
668                          ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
671      return getScalarizationOverhead(Ty, Args) + Num * Cost;
672    }
673
674    // We don't know anything about this scalar instruction.
675    return OpCost;
676  }
677
678  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
679                          Type *SubTp) {
680    switch (Kind) {
681    case TTI::SK_Broadcast:
682      return getBroadcastShuffleOverhead(Tp);
683    case TTI::SK_Select:
684    case TTI::SK_Reverse:
685    case TTI::SK_Transpose:
686    case TTI::SK_PermuteSingleSrc:
687    case TTI::SK_PermuteTwoSrc:
688      return getPermuteShuffleOverhead(Tp);
689    case TTI::SK_ExtractSubvector:
690      return getExtractSubvectorOverhead(Tp, Index, SubTp);
691    case TTI::SK_InsertSubvector:
692      return getInsertSubvectorOverhead(Tp, Index, SubTp);
693    }
694    llvm_unreachable("Unknown TTI::ShuffleKind");
695  }
696
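  /// The cast cost below is derived from how the source and destination types
  /// legalize: free conversions (no-op bitcasts/truncs, free zext/sext of
  /// loads) return 0, casts that remain legal after legalization return a
  /// small constant, and illegal vector casts fall back to splitting or full
  /// scalarization. For instance, a zext between vector types that legalize
  /// to the same-sized register is assumed below to cost 1 (a single AND) and
  /// a sext 2 (a shift-left plus arithmetic shift-right).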
697  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
698                            const Instruction *I = nullptr) {
699    const TargetLoweringBase *TLI = getTLI();
700    int ISD = TLI->InstructionOpcodeToISD(Opcode);
701    assert(ISD && "Invalid opcode");
702    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
703    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);
704
705    // Check for NOOP conversions.
706    if (SrcLT.first == DstLT.first &&
707        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
708
      // Bitcasts and truncates between types that are legalized to the same
      // type are free.
710      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
711        return 0;
712    }
713
714    if (Opcode == Instruction::Trunc &&
715        TLI->isTruncateFree(SrcLT.second, DstLT.second))
716      return 0;
717
718    if (Opcode == Instruction::ZExt &&
719        TLI->isZExtFree(SrcLT.second, DstLT.second))
720      return 0;
721
722    if (Opcode == Instruction::AddrSpaceCast &&
723        TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
724                                 Dst->getPointerAddressSpace()))
725      return 0;
726
    // If this is a zext/sext of a load, return 0 if the corresponding
    // extending load exists on the target.
729    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
730        I && isa<LoadInst>(I->getOperand(0))) {
731        EVT ExtVT = EVT::getEVT(Dst);
732        EVT LoadVT = EVT::getEVT(Src);
733        unsigned LType =
734          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
735        if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
736          return 0;
737    }
738
739    // If the cast is marked as legal (or promote) then assume low cost.
740    if (SrcLT.first == DstLT.first &&
741        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
742      return 1;
743
744    // Handle scalar conversions.
745    if (!Src->isVectorTy() && !Dst->isVectorTy()) {
746      // Scalar bitcasts are usually free.
747      if (Opcode == Instruction::BitCast)
748        return 0;
749
      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
752      if (!TLI->isOperationExpand(ISD, DstLT.second))
753        return 1;
754
      // Assume that illegal scalar instructions are expensive.
756      return 4;
757    }
758
759    // Check vector-to-vector casts.
760    if (Dst->isVectorTy() && Src->isVectorTy()) {
761      // If the cast is between same-sized registers, then the check is simple.
762      if (SrcLT.first == DstLT.first &&
763          SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
764
765        // Assume that Zext is done using AND.
766        if (Opcode == Instruction::ZExt)
767          return 1;
768
769        // Assume that sext is done using SHL and SRA.
770        if (Opcode == Instruction::SExt)
771          return 2;
772
        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
776        if (!TLI->isOperationExpand(ISD, DstLT.second))
777          return SrcLT.first * 1;
778      }
779
780      // If we are legalizing by splitting, query the concrete TTI for the cost
781      // of casting the original vector twice. We also need to factor in the
782      // cost of the split itself. Count that as 1, to be consistent with
783      // TLI->getTypeLegalizationCost().
784      if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
785               TargetLowering::TypeSplitVector ||
786           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
787               TargetLowering::TypeSplitVector) &&
788          Src->getVectorNumElements() > 1 && Dst->getVectorNumElements() > 1) {
789        Type *SplitDst = VectorType::get(Dst->getVectorElementType(),
790                                         Dst->getVectorNumElements() / 2);
791        Type *SplitSrc = VectorType::get(Src->getVectorElementType(),
792                                         Src->getVectorNumElements() / 2);
793        T *TTI = static_cast<T *>(this);
794        return TTI->getVectorSplitCost() +
795               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
796      }
797
798      // In other cases where the source or destination are illegal, assume
799      // the operation will get scalarized.
800      unsigned Num = Dst->getVectorNumElements();
801      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
802          Opcode, Dst->getScalarType(), Src->getScalarType(), I);
803
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
806      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
807    }
808
    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcasts between vectors and scalars. We need
    // to assume that the conversion is scalarized in one way or another.
813    if (Opcode == Instruction::BitCast)
814      // Illegal bitcasts are done by storing and loading from a stack slot.
815      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
816                                : 0) +
817             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
818                                : 0);
819
820    llvm_unreachable("Unhandled cast");
821  }
822
823  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
824                                    VectorType *VecTy, unsigned Index) {
825    return static_cast<T *>(this)->getVectorInstrCost(
826               Instruction::ExtractElement, VecTy, Index) +
827           static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
828                                                    VecTy->getElementType());
829  }
830
831  unsigned getCFInstrCost(unsigned Opcode) {
832    // Branches are assumed to be predicted.
833    return 0;
834  }
835
836  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
837                              const Instruction *I) {
838    const TargetLoweringBase *TLI = getTLI();
839    int ISD = TLI->InstructionOpcodeToISD(Opcode);
840    assert(ISD && "Invalid opcode");
841
842    // Selects on vectors are actually vector selects.
843    if (ISD == ISD::SELECT) {
844      assert(CondTy && "CondTy must exist");
845      if (CondTy->isVectorTy())
846        ISD = ISD::VSELECT;
847    }
848    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
849
850    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
851        !TLI->isOperationExpand(ISD, LT.second)) {
852      // The operation is legal. Assume it costs 1. Multiply
853      // by the type-legalization overhead.
854      return LT.first * 1;
855    }
856
    // Otherwise, assume that the cmp/select is scalarized.
858    // TODO: If one of the types get legalized by splitting, handle this
859    // similarly to what getCastInstrCost() does.
860    if (ValTy->isVectorTy()) {
861      unsigned Num = ValTy->getVectorNumElements();
862      if (CondTy)
863        CondTy = CondTy->getScalarType();
864      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
865          Opcode, ValTy->getScalarType(), CondTy, I);
866
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
869      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
870    }
871
872    // Unknown scalar opcode.
873    return 1;
874  }
875
876  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
877    std::pair<unsigned, MVT> LT =
878        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
879
880    return LT.first;
881  }
882
883  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
884                           unsigned AddressSpace,
885                           const Instruction *I = nullptr) {
886    assert(!Src->isVoidTy() && "Invalid type");
887    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);
888
    // Assume that all loads/stores of legal types cost 1.
890    unsigned Cost = LT.first;
891
892    if (Src->isVectorTy() &&
893        Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
      // This is a vector load/store that legalizes to a larger type than the
      // vector itself. Unless the corresponding extending load or truncating
      // store is legal, this will scalarize.
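      // For example, a load of <4 x i8> on a hypothetical target whose legal
      // type is a wider vector only stays cheap if the corresponding
      // extending load (EXTLOAD from the memory type) is legal or custom;
      // otherwise the scalarization overhead below also charges for inserting
      // each loaded element into the result vector.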
897      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
898      EVT MemVT = getTLI()->getValueType(DL, Src);
899      if (Opcode == Instruction::Store)
900        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
901      else
902        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
903
904      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
905        // This is a vector load/store for some illegal type that is scalarized.
906        // We must account for the cost of building or decomposing the vector.
907        Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
908                                         Opcode == Instruction::Store);
909      }
910    }
911
912    return Cost;
913  }
914
915  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
916                                      unsigned Factor,
917                                      ArrayRef<unsigned> Indices,
918                                      unsigned Alignment, unsigned AddressSpace,
919                                      bool UseMaskForCond = false,
920                                      bool UseMaskForGaps = false) {
921    VectorType *VT = dyn_cast<VectorType>(VecTy);
922    assert(VT && "Expect a vector type for interleaved memory op");
923
924    unsigned NumElts = VT->getNumElements();
925    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
926
927    unsigned NumSubElts = NumElts / Factor;
928    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
929
    // First, the cost of the load/store operation.
931    unsigned Cost;
932    if (UseMaskForCond || UseMaskForGaps)
933      Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
934          Opcode, VecTy, Alignment, AddressSpace);
935    else
936      Cost = static_cast<T *>(this)->getMemoryOpCost(
937          Opcode, VecTy, MaybeAlign(Alignment), AddressSpace);
938
939    // Legalize the vector type, and get the legalized and unlegalized type
940    // sizes.
941    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
942    unsigned VecTySize =
943        static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
944    unsigned VecTyLTSize = VecTyLT.getStoreSize();
945
946    // Return the ceiling of dividing A by B.
947    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
948
949    // Scale the cost of the memory operation by the fraction of legalized
950    // instructions that will actually be used. We shouldn't account for the
951    // cost of dead instructions since they will be removed.
952    //
953    // E.g., An interleaved load of factor 8:
954    //       %vec = load <16 x i64>, <16 x i64>* %ptr
955    //       %v0 = shufflevector %vec, undef, <0, 8>
956    //
957    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
958    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
959    // type). The other loads are unused.
960    //
961    // We only scale the cost of loads since interleaved store groups aren't
962    // allowed to have gaps.
963    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
964      // The number of loads of a legal type it will take to represent a load
965      // of the unlegalized vector type.
966      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);
967
968      // The number of elements of the unlegalized type that correspond to a
969      // single legal instruction.
970      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);
971
972      // Determine which legal instructions will be used.
973      BitVector UsedInsts(NumLegalInsts, false);
974      for (unsigned Index : Indices)
975        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
976          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
977
      // Scale the cost of the load by the fraction of legal instructions that
      // will be used, rounding up so that a partially used interleave group
      // never becomes free.
      Cost = ceil(Cost * UsedInsts.count(), NumLegalInsts);
981    }
982
    // Next, add the cost of the interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is modeled as extracting each sub vector's
      // elements from the wide vector and inserting them into the sub vector.
987      //
988      // E.g. An interleaved load of factor 2 (with one member of index 0):
989      //      %vec = load <8 x i32>, <8 x i32>* %ptr
990      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
991      // The cost is estimated as extract elements at 0, 2, 4, 6 from the
992      // <8 x i32> vector and insert them into a <4 x i32> vector.
993
994      assert(Indices.size() <= Factor &&
995             "Interleaved memory op has too many members");
996
997      for (unsigned Index : Indices) {
998        assert(Index < Factor && "Invalid index for interleaved memory op");
999
1000        // Extract elements from loaded vector for each sub vector.
1001        for (unsigned i = 0; i < NumSubElts; i++)
1002          Cost += static_cast<T *>(this)->getVectorInstrCost(
1003              Instruction::ExtractElement, VT, Index + i * Factor);
1004      }
1005
1006      unsigned InsSubCost = 0;
1007      for (unsigned i = 0; i < NumSubElts; i++)
1008        InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
1009            Instruction::InsertElement, SubVT, i);
1010
1011      Cost += Indices.size() * InsSubCost;
1012    } else {
      // The interleave cost is modeled as extracting all elements from the
      // sub vectors and inserting them into the wide vector.
      //
      // E.g. An interleaved store of factor 2:
      //      %interleaved.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extracting all elements from both <4 x i32>
      // vectors and inserting them into the <8 x i32> vector.
1021
1022      unsigned ExtSubCost = 0;
1023      for (unsigned i = 0; i < NumSubElts; i++)
1024        ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
1025            Instruction::ExtractElement, SubVT, i);
1026      Cost += ExtSubCost * Factor;
1027
1028      for (unsigned i = 0; i < NumElts; i++)
1029        Cost += static_cast<T *>(this)
1030                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
1031    }
1032
1033    if (!UseMaskForCond)
1034      return Cost;
1035
1036    Type *I8Type = Type::getInt8Ty(VT->getContext());
1037    VectorType *MaskVT = VectorType::get(I8Type, NumElts);
1038    SubVT = VectorType::get(I8Type, NumSubElts);
1039
    // The mask shuffling cost is modeled as extracting all elements of the
    // mask vector and inserting each of them Factor times into the wide
    // vector:
1042    //
1043    // E.g. an interleaved group with factor 3:
1044    //    %mask = icmp ult <8 x i32> %vec1, %vec2
1045    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1046    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
    // The cost is estimated as extracting all mask elements from the <8 x i1>
    // mask vector and inserting them Factor times into the <24 x i1> shuffled
    // mask vector.
1050    for (unsigned i = 0; i < NumSubElts; i++)
1051      Cost += static_cast<T *>(this)->getVectorInstrCost(
1052          Instruction::ExtractElement, SubVT, i);
1053
1054    for (unsigned i = 0; i < NumElts; i++)
1055      Cost += static_cast<T *>(this)->getVectorInstrCost(
1056          Instruction::InsertElement, MaskVT, i);
1057
1058    // The Gaps mask is invariant and created outside the loop, therefore the
1059    // cost of creating it is not accounted for here. However if we have both
1060    // a MaskForGaps and some other mask that guards the execution of the
1061    // memory access, we need to account for the cost of And-ing the two masks
1062    // inside the loop.
1063    if (UseMaskForGaps)
1064      Cost += static_cast<T *>(this)->getArithmeticInstrCost(
1065          BinaryOperator::And, MaskVT);
1066
1067    return Cost;
1068  }
1069
1070  /// Get intrinsic cost based on arguments.
1071  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1072                                 ArrayRef<Value *> Args, FastMathFlags FMF,
1073                                 unsigned VF = 1) {
1074    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
1075    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
1076    auto *ConcreteTTI = static_cast<T *>(this);
1077
1078    switch (IID) {
1079    default: {
1080      // Assume that we need to scalarize this intrinsic.
1081      SmallVector<Type *, 4> Types;
1082      for (Value *Op : Args) {
1083        Type *OpTy = Op->getType();
1084        assert(VF == 1 || !OpTy->isVectorTy());
1085        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
1086      }
1087
1088      if (VF > 1 && !RetTy->isVoidTy())
1089        RetTy = VectorType::get(RetTy, VF);
1090
1091      // Compute the scalarization overhead based on Args for a vector
1092      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
1093      // CostModel will pass a vector RetTy and VF is 1.
1094      unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
1095      if (RetVF > 1 || VF > 1) {
1096        ScalarizationCost = 0;
1097        if (!RetTy->isVoidTy())
1098          ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
1099        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
1100      }
1101
1102      return ConcreteTTI->getIntrinsicInstrCost(IID, RetTy, Types, FMF,
1103                                                ScalarizationCost);
1104    }
1105    case Intrinsic::masked_scatter: {
1106      assert(VF == 1 && "Can't vectorize types here.");
1107      Value *Mask = Args[3];
1108      bool VarMask = !isa<Constant>(Mask);
1109      unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
1110      return ConcreteTTI->getGatherScatterOpCost(
1111          Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment);
1112    }
1113    case Intrinsic::masked_gather: {
1114      assert(VF == 1 && "Can't vectorize types here.");
1115      Value *Mask = Args[2];
1116      bool VarMask = !isa<Constant>(Mask);
1117      unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
1118      return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy,
1119                                                 Args[0], VarMask, Alignment);
1120    }
1121    case Intrinsic::experimental_vector_reduce_add:
1122    case Intrinsic::experimental_vector_reduce_mul:
1123    case Intrinsic::experimental_vector_reduce_and:
1124    case Intrinsic::experimental_vector_reduce_or:
1125    case Intrinsic::experimental_vector_reduce_xor:
1126    case Intrinsic::experimental_vector_reduce_v2_fadd:
1127    case Intrinsic::experimental_vector_reduce_v2_fmul:
1128    case Intrinsic::experimental_vector_reduce_smax:
1129    case Intrinsic::experimental_vector_reduce_smin:
1130    case Intrinsic::experimental_vector_reduce_fmax:
1131    case Intrinsic::experimental_vector_reduce_fmin:
1132    case Intrinsic::experimental_vector_reduce_umax:
1133    case Intrinsic::experimental_vector_reduce_umin:
1134      return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF);
1135    case Intrinsic::fshl:
1136    case Intrinsic::fshr: {
1137      Value *X = Args[0];
1138      Value *Y = Args[1];
1139      Value *Z = Args[2];
1140      TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1141      TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1142      TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1143      TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1144      TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1145      OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1146                                                              : TTI::OP_None;
1147      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1148      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1149      unsigned Cost = 0;
1150      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Or, RetTy);
1151      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Sub, RetTy);
1152      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Shl, RetTy,
1153                                                  OpKindX, OpKindZ, OpPropsX);
1154      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::LShr, RetTy,
1155                                                  OpKindY, OpKindZ, OpPropsY);
      // Non-constant shift amounts require a modulo.
1157      if (OpKindZ != TTI::OK_UniformConstantValue &&
1158          OpKindZ != TTI::OK_NonUniformConstantValue)
1159        Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1160                                                    OpKindZ, OpKindBW, OpPropsZ,
1161                                                    OpPropsBW);
1162      // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1163      if (X != Y) {
1164        Type *CondTy = RetTy->getWithNewBitWidth(1);
1165        Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy,
1166                                                CondTy, nullptr);
1167        Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
1168                                                CondTy, nullptr);
1169      }
1170      return Cost;
1171    }
1172    }
1173  }
1174
1175  /// Get intrinsic cost based on argument types.
1176  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1177  /// cost of scalarizing the arguments and the return value will be computed
1178  /// based on types.
1179  unsigned getIntrinsicInstrCost(
1180      Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
1181      unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
1182    auto *ConcreteTTI = static_cast<T *>(this);
1183
1184    SmallVector<unsigned, 2> ISDs;
1185    unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
1186    switch (IID) {
1187    default: {
1188      // Assume that we need to scalarize this intrinsic.
1189      unsigned ScalarizationCost = ScalarizationCostPassed;
1190      unsigned ScalarCalls = 1;
1191      Type *ScalarRetTy = RetTy;
1192      if (RetTy->isVectorTy()) {
1193        if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
1194          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
1195        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
1196        ScalarRetTy = RetTy->getScalarType();
1197      }
1198      SmallVector<Type *, 4> ScalarTys;
1199      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1200        Type *Ty = Tys[i];
1201        if (Ty->isVectorTy()) {
1202          if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
1203            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
1204          ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
1205          Ty = Ty->getScalarType();
1206        }
1207        ScalarTys.push_back(Ty);
1208      }
1209      if (ScalarCalls == 1)
1210        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1211
1212      unsigned ScalarCost =
1213          ConcreteTTI->getIntrinsicInstrCost(IID, ScalarRetTy, ScalarTys, FMF);
1214
1215      return ScalarCalls * ScalarCost + ScalarizationCost;
1216    }
1217    // Look for intrinsics that can be lowered directly or turned into a scalar
1218    // intrinsic call.
1219    case Intrinsic::sqrt:
1220      ISDs.push_back(ISD::FSQRT);
1221      break;
1222    case Intrinsic::sin:
1223      ISDs.push_back(ISD::FSIN);
1224      break;
1225    case Intrinsic::cos:
1226      ISDs.push_back(ISD::FCOS);
1227      break;
1228    case Intrinsic::exp:
1229      ISDs.push_back(ISD::FEXP);
1230      break;
1231    case Intrinsic::exp2:
1232      ISDs.push_back(ISD::FEXP2);
1233      break;
1234    case Intrinsic::log:
1235      ISDs.push_back(ISD::FLOG);
1236      break;
1237    case Intrinsic::log10:
1238      ISDs.push_back(ISD::FLOG10);
1239      break;
1240    case Intrinsic::log2:
1241      ISDs.push_back(ISD::FLOG2);
1242      break;
1243    case Intrinsic::fabs:
1244      ISDs.push_back(ISD::FABS);
1245      break;
1246    case Intrinsic::canonicalize:
1247      ISDs.push_back(ISD::FCANONICALIZE);
1248      break;
1249    case Intrinsic::minnum:
1250      ISDs.push_back(ISD::FMINNUM);
1251      if (FMF.noNaNs())
1252        ISDs.push_back(ISD::FMINIMUM);
1253      break;
1254    case Intrinsic::maxnum:
1255      ISDs.push_back(ISD::FMAXNUM);
1256      if (FMF.noNaNs())
1257        ISDs.push_back(ISD::FMAXIMUM);
1258      break;
1259    case Intrinsic::copysign:
1260      ISDs.push_back(ISD::FCOPYSIGN);
1261      break;
1262    case Intrinsic::floor:
1263      ISDs.push_back(ISD::FFLOOR);
1264      break;
1265    case Intrinsic::ceil:
1266      ISDs.push_back(ISD::FCEIL);
1267      break;
1268    case Intrinsic::trunc:
1269      ISDs.push_back(ISD::FTRUNC);
1270      break;
1271    case Intrinsic::nearbyint:
1272      ISDs.push_back(ISD::FNEARBYINT);
1273      break;
1274    case Intrinsic::rint:
1275      ISDs.push_back(ISD::FRINT);
1276      break;
1277    case Intrinsic::round:
1278      ISDs.push_back(ISD::FROUND);
1279      break;
1280    case Intrinsic::pow:
1281      ISDs.push_back(ISD::FPOW);
1282      break;
1283    case Intrinsic::fma:
1284      ISDs.push_back(ISD::FMA);
1285      break;
1286    case Intrinsic::fmuladd:
1287      ISDs.push_back(ISD::FMA);
1288      break;
1289    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1290    case Intrinsic::lifetime_start:
1291    case Intrinsic::lifetime_end:
1292    case Intrinsic::sideeffect:
1293      return 0;
1294    case Intrinsic::masked_store:
1295      return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0,
1296                                                0);
1297    case Intrinsic::masked_load:
1298      return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
1299    case Intrinsic::experimental_vector_reduce_add:
1300      return ConcreteTTI->getArithmeticReductionCost(Instruction::Add, Tys[0],
1301                                                     /*IsPairwiseForm=*/false);
1302    case Intrinsic::experimental_vector_reduce_mul:
1303      return ConcreteTTI->getArithmeticReductionCost(Instruction::Mul, Tys[0],
1304                                                     /*IsPairwiseForm=*/false);
1305    case Intrinsic::experimental_vector_reduce_and:
1306      return ConcreteTTI->getArithmeticReductionCost(Instruction::And, Tys[0],
1307                                                     /*IsPairwiseForm=*/false);
1308    case Intrinsic::experimental_vector_reduce_or:
1309      return ConcreteTTI->getArithmeticReductionCost(Instruction::Or, Tys[0],
1310                                                     /*IsPairwiseForm=*/false);
1311    case Intrinsic::experimental_vector_reduce_xor:
1312      return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0],
1313                                                     /*IsPairwiseForm=*/false);
1314    case Intrinsic::experimental_vector_reduce_v2_fadd:
1315      return ConcreteTTI->getArithmeticReductionCost(
1316          Instruction::FAdd, Tys[0],
1317          /*IsPairwiseForm=*/false); // FIXME: Add new flag for cost of strict
1318                                     // reductions.
1319    case Intrinsic::experimental_vector_reduce_v2_fmul:
1320      return ConcreteTTI->getArithmeticReductionCost(
1321          Instruction::FMul, Tys[0],
1322          /*IsPairwiseForm=*/false); // FIXME: Add new flag for cost of strict
1323                                     // reductions.
1324    case Intrinsic::experimental_vector_reduce_smax:
1325    case Intrinsic::experimental_vector_reduce_smin:
1326    case Intrinsic::experimental_vector_reduce_fmax:
1327    case Intrinsic::experimental_vector_reduce_fmin:
1328      return ConcreteTTI->getMinMaxReductionCost(
1329          Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
          /*IsUnsigned=*/false);
1331    case Intrinsic::experimental_vector_reduce_umax:
1332    case Intrinsic::experimental_vector_reduce_umin:
1333      return ConcreteTTI->getMinMaxReductionCost(
1334          Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
          /*IsUnsigned=*/true);
    case Intrinsic::sadd_sat:
    case Intrinsic::ssub_sat: {
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
      Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
                                     ? Intrinsic::sadd_with_overflow
                                     : Intrinsic::ssub_with_overflow;

      // SatMax -> Overflow && SumDiff < 0
      // SatMin -> Overflow && SumDiff >= 0
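      //
      // A rough sketch of the expansion this cost model assumes below
      // (illustrative IR only, not necessarily what the target emits), e.g.
      // for i32 sadd.sat:
      //   %x   = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
      //   %sum = extractvalue {i32, i1} %x, 0
      //   %ovf = extractvalue {i32, i1} %x, 1
      //   %neg = icmp slt i32 %sum, 0
      //   %sat = select i1 %neg, i32 2147483647, i32 -2147483648
      //   %res = select i1 %ovf, i32 %sat, i32 %sum
      // i.e. one overflow intrinsic, one compare and two selects.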
      unsigned Cost = 0;
      Cost += ConcreteTTI->getIntrinsicInstrCost(
          OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed);
      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy,
                                              CondTy, nullptr);
      Cost += 2 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
                                                  CondTy, nullptr);
      return Cost;
    }
    case Intrinsic::uadd_sat:
    case Intrinsic::usub_sat: {
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
      Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
                                     ? Intrinsic::uadd_with_overflow
                                     : Intrinsic::usub_with_overflow;

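      // A minimal sketch of the assumed expansion (illustrative IR only),
      // e.g. for i32 uadd.sat:
      //   %x   = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
      //   %sum = extractvalue {i32, i1} %x, 0
      //   %ovf = extractvalue {i32, i1} %x, 1
      //   %res = select i1 %ovf, i32 -1, i32 %sum   ; clamp to UINT_MAX
      // (usub.sat clamps to 0 instead), i.e. one overflow intrinsic plus a
      // single select.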
      unsigned Cost = 0;
      Cost += ConcreteTTI->getIntrinsicInstrCost(
          OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed);
      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
                                              CondTy, nullptr);
      return Cost;
    }
    case Intrinsic::smul_fix:
    case Intrinsic::umul_fix: {
      unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
      Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);

      unsigned ExtOp =
          IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;

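      // The costs below model a widen-multiply-narrow expansion (a sketch of
      // the assumption rather than the exact lowering): extend both operands
      // to twice the bit width, multiply, then recombine the fixed-point
      // result from the high and low halves with an lshr/shl/or sequence by
      // the scale amount.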
      unsigned Cost = 0;
      Cost += 2 * ConcreteTTI->getCastInstrCost(ExtOp, ExtTy, RetTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Mul, ExtTy);
      Cost +=
          2 * ConcreteTTI->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::LShr, RetTy,
                                                  TTI::OK_AnyValue,
                                                  TTI::OK_UniformConstantValue);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Shl, RetTy,
                                                  TTI::OK_AnyValue,
                                                  TTI::OK_UniformConstantValue);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Or, RetTy);
      return Cost;
    }
    case Intrinsic::sadd_with_overflow:
    case Intrinsic::ssub_with_overflow: {
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned Opcode = IID == Intrinsic::sadd_with_overflow
                            ? BinaryOperator::Add
                            : BinaryOperator::Sub;

      //   LHSSign -> LHS >= 0
      //   RHSSign -> RHS >= 0
      //   SumSign -> Sum >= 0
      //
      //   Add:
      //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
      //   Sub:
      //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
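      //
      // As an illustrative sketch of the assumed lowering (e.g. i32 sadd):
      //   %sum      = add i32 %lhs, %rhs
      //   %lhssign  = icmp sge i32 %lhs, 0
      //   %rhssign  = icmp sge i32 %rhs, 0
      //   %sumsign  = icmp sge i32 %sum, 0
      //   %samesign = icmp eq  i1 %lhssign, %rhssign
      //   %wrapped  = icmp ne  i1 %lhssign, %sumsign
      //   %ovf      = and i1 %samesign, %wrapped
      // i.e. one add/sub, three compares on SumTy, two compares on the sign
      // bits, and a final and, which is what is summed below.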
      unsigned Cost = 0;
      Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy);
      Cost += 3 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
                                                  OverflowTy, nullptr);
      Cost += 2 * ConcreteTTI->getCmpSelInstrCost(
                      BinaryOperator::ICmp, OverflowTy, OverflowTy, nullptr);
      Cost +=
          ConcreteTTI->getArithmeticInstrCost(BinaryOperator::And, OverflowTy);
      return Cost;
    }
    case Intrinsic::uadd_with_overflow:
    case Intrinsic::usub_with_overflow: {
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned Opcode = IID == Intrinsic::uadd_with_overflow
                            ? BinaryOperator::Add
                            : BinaryOperator::Sub;

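      // Unsigned overflow is assumed to need only a single wrap-around check,
      // e.g. %ovf = icmp ult i32 %sum, %lhs for uadd and
      // %ovf = icmp ult i32 %lhs, %rhs for usub, so the cost is one add/sub
      // plus one compare.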
      unsigned Cost = 0;
      Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy);
      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
                                              OverflowTy, nullptr);
      return Cost;
    }
    case Intrinsic::smul_with_overflow:
    case Intrinsic::umul_with_overflow: {
      Type *MulTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
      Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);

      unsigned ExtOp = IID == Intrinsic::smul_with_overflow ? Instruction::SExt
                                                            : Instruction::ZExt;

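      // The assumed expansion (a sketch, not necessarily the exact lowering)
      // is a widening multiply followed by an overflow check on the high
      // half: extend both operands, multiply in the wide type, truncate back
      // down and shift out the high half, then compare the high half against
      // zero (unsigned) or against the sign of the low half (signed, which
      // accounts for the extra ashr costed below).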
      unsigned Cost = 0;
      Cost += 2 * ConcreteTTI->getCastInstrCost(ExtOp, ExtTy, MulTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Mul, ExtTy);
      Cost +=
          2 * ConcreteTTI->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::LShr, MulTy,
                                                  TTI::OK_AnyValue,
                                                  TTI::OK_UniformConstantValue);

      if (IID == Intrinsic::smul_with_overflow)
        Cost += ConcreteTTI->getArithmeticInstrCost(
            Instruction::AShr, MulTy, TTI::OK_AnyValue,
            TTI::OK_UniformConstantValue);

      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy,
                                              OverflowTy, nullptr);
      return Cost;
    }
    case Intrinsic::ctpop:
      ISDs.push_back(ISD::CTPOP);
      // In case of legalization use TCC_Expensive. This is cheaper than a
      // library call but still not a cheap instruction.
      SingleCallCost = TargetTransformInfo::TCC_Expensive;
      break;
    // FIXME: ctlz, cttz, ...
    }

    const TargetLoweringBase *TLI = getTLI();
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

    SmallVector<unsigned, 2> LegalCost;
    SmallVector<unsigned, 2> CustomCost;
    for (unsigned ISD : ISDs) {
      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
        if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
            TLI->isFAbsFree(LT.second)) {
          return 0;
        }

        // The operation is legal. Assume it costs 1.
        // If the type is split into multiple registers, assume that there is
        // some overhead to this.
        // TODO: Once we have extract/insert subvector cost we need to use them.
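        // For example (illustrative only): if RetTy is <8 x float> but the
        // target's widest legal vector holds four floats, legalization gives
        // LT.first == 2 and the cost below becomes 2 * 2 = 4.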
        if (LT.first > 1)
          LegalCost.push_back(LT.first * 2);
        else
          LegalCost.push_back(LT.first * 1);
      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
        // If the operation is custom lowered, assume that the code is twice
        // as expensive.
        CustomCost.push_back(LT.first * 2);
      }
    }

    auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
    if (MinLegalCostI != LegalCost.end())
      return *MinLegalCostI;

    auto MinCustomCostI =
        std::min_element(CustomCost.begin(), CustomCost.end());
    if (MinCustomCostI != CustomCost.end())
      return *MinCustomCostI;

    // If we can't lower fmuladd into an FMA, estimate the cost as a
    // floating-point multiply followed by an add.
    if (IID == Intrinsic::fmuladd)
      return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
             ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);

    // Otherwise, assume that we need to scalarize this intrinsic. For math
    // builtins this will emit a costly libcall, adding call overhead and
    // spills. Make it very expensive.
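    // For instance (a rough illustration), a <4 x float> @llvm.sin call with
    // no vector lowering is modelled as four scalar sin calls plus the
    // insert/extract overhead needed to unpack and repack the vector.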
    if (RetTy->isVectorTy()) {
      unsigned ScalarizationCost =
          ((ScalarizationCostPassed != std::numeric_limits<unsigned>::max())
               ? ScalarizationCostPassed
               : getScalarizationOverhead(RetTy, true, false));
      unsigned ScalarCalls = RetTy->getVectorNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy())
          Ty = Ty->getScalarType();
        ScalarTys.push_back(Ty);
      }
      unsigned ScalarCost = ConcreteTTI->getIntrinsicInstrCost(
          IID, RetTy->getScalarType(), ScalarTys, FMF);
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        if (Tys[i]->isVectorTy()) {
          if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
            ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
          ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
        }
      }

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }

    // This is going to be turned into a library call; make it expensive.
    return SingleCallCost;
  }

  /// Compute the cost of the given call instruction.
  ///
  /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr, in which case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate the cost of calling a
  /// vector counterpart of the given function.
  /// \param F Called function, might be nullptr.
  /// \param RetTy Return value type.
  /// \param Tys Argument types.
  /// \returns The cost of the call instruction.
  unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
    return 10;
  }

  unsigned getNumberOfParts(Type *Tp) {
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
    return LT.first;
  }

  unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                     const SCEV *) {
    return 0;
  }

  /// Try to calculate arithmetic and shuffle op costs for reduction operations.
  /// We assume that reductions are performed in the following way:
  /// 1. Non-pairwise reduction
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n-1, i32 undef, ..., i32 undef>
  ///            \-----------------v--------------/  \----------v------------/
  ///                            n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val, <n x t> %val1
  /// After this operation we have a vector %red1 where only the first n/2
  /// elements are meaningful; the second n/2 elements are undefined and can be
  /// dropped. All other operations are actually working with the vector of
  /// length n/2, not n, though the real vector length is still n.
  /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2-1, i32 undef, ..., i32 undef>
  ///            \------------------v--------------/  \----------v------------/
  ///                            n/4 elements               3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
  /// length n/2, the resulting vector has length n/4, and so on.
  /// 2. Pairwise reduction:
  /// Everything is the same except for an additional shuffle operation which
  /// is used to produce operands for the pairwise kind of reductions.
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                   n/2 elements               n/2 elements
  /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                   n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val1, <n x t> %val2
  /// Again, the operation is performed on an <n x t> vector, but the resulting
  /// vector %red1 is an <n/2 x t> vector.
  ///
  /// The cost model should take into account that the actual length of the
  /// vector is reduced on each iteration.
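  ///
  /// As a worked example (assuming a target whose widest legal vector holds
  /// four elements), a non-pairwise add reduction of <8 x i32> is modelled as
  /// one extract-subvector shuffle plus one <4 x i32> add to reach the legal
  /// width, then two more shuffle-plus-add levels on <4 x i32>, and a final
  /// extractelement of lane 0.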
  unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                      bool IsPairwise) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned ArithCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, SubTy);
      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, SubTy);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of the
    // vector operations performed on the current platform. That's why several
    // final reduction operations are performed on vectors with the same
    // architecture-dependent length.

    // Non-pairwise reductions need one shuffle per reduction level. Pairwise
    // reductions need two shuffles on every level but the last one; on that
    // level one of the shuffles is <0, u, u, ...>, which is an identity
    // shuffle.
    unsigned NumShuffles = NumReduxLevels;
    if (IsPairwise && NumReduxLevels >= 1)
      NumShuffles += NumReduxLevels - 1;
    ShuffleCost += NumShuffles *
                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                               0, Ty);
    ArithCost += NumReduxLevels *
                 ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
    return ShuffleCost + ArithCost +
           ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  }

  /// Try to calculate op costs for min/max reduction operations.
  /// \param CondTy Conditional type for the Select instruction.
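  ///
  /// The structure mirrors getArithmeticReductionCost above, except that each
  /// reduction level is modelled as a shuffle plus a compare-and-select
  /// (icmp/fcmp followed by a select) instead of a single arithmetic
  /// instruction.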
  unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise,
                                  bool) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    Type *ScalarCondTy = CondTy->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned CmpOpcode;
    if (Ty->isFPOrFPVectorTy()) {
      CmpOpcode = Instruction::FCmp;
    } else {
      assert(Ty->isIntOrIntVectorTy() &&
             "expecting floating point or integer type for min/max reduction");
      CmpOpcode = Instruction::ICmp;
    }
    unsigned MinMaxCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
      CondTy = VectorType::get(ScalarCondTy, NumVecElts);

      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, SubTy);
      MinMaxCost +=
          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, nullptr) +
          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
                                          nullptr);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of the
    // vector operations performed on the current platform. That's why several
    // final reduction operations are performed on vectors with the same
    // architecture-dependent length.

    // Non-pairwise reductions need one shuffle per reduction level. Pairwise
    // reductions need two shuffles on every level but the last one; on that
    // level one of the shuffles is <0, u, u, ...>, which is an identity
    // shuffle.
    unsigned NumShuffles = NumReduxLevels;
    if (IsPairwise && NumReduxLevels >= 1)
      NumShuffles += NumReduxLevels - 1;
    ShuffleCost += NumShuffles *
                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                               0, Ty);
    MinMaxCost +=
        NumReduxLevels *
        (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
         ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                         nullptr));
    // The last min/max is done in vector registers and was counted above, so
    // we only need a single extractelement here.
    return ShuffleCost + MinMaxCost +
           ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  }

  unsigned getVectorSplitCost() { return 1; }

  /// @}
};

/// Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  using BaseT = BasicTTIImplBase<BasicTTIImpl>;

  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};

} // end namespace llvm

#endif // LLVM_CODEGEN_BASICTTIIMPL_H