1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "RISCVTargetTransformInfo.h"
10#include "MCTargetDesc/RISCVMatInt.h"
11#include "llvm/Analysis/TargetTransformInfo.h"
12#include "llvm/CodeGen/BasicTTIImpl.h"
13#include "llvm/CodeGen/CostTable.h"
14#include "llvm/CodeGen/TargetLowering.h"
15#include <cmath>
16#include <optional>
17using namespace llvm;
18
19#define DEBUG_TYPE "riscvtti"
20
// Command-line override for the LMUL assumed when answering register-width
// queries; this indirectly controls the LMUL chosen by autovectorized code.
static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(1), cl::Hidden);

// Cap on the vectorization factor reported to the SLP vectorizer. The
// default of 1 effectively disables SLP vectorization for RISC-V.
static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Result used for getMaximumVF query which is used exclusively by "
        "SLP vectorizer.  Defaults to 1 which disables SLP."),
    cl::init(1), cl::Hidden);
34
35InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
36  // TODO: Here assume reciprocal throughput is 1 for LMUL_1, it is
37  // implementation-defined.
38  if (!VT.isVector())
39    return InstructionCost::getInvalid();
40  unsigned Cost;
41  if (VT.isScalableVector()) {
42    unsigned LMul;
43    bool Fractional;
44    std::tie(LMul, Fractional) =
45        RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT));
46    if (Fractional)
47      Cost = 1;
48    else
49      Cost = LMul;
50  } else {
51    Cost = VT.getSizeInBits() / ST->getRealMinVLen();
52  }
53  return std::max<unsigned>(Cost, 1);
54}
55
56InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
57                                            TTI::TargetCostKind CostKind) {
58  assert(Ty->isIntegerTy() &&
59         "getIntImmCost can only estimate cost of materialising integers");
60
61  // We have a Zero register, so 0 is always free.
62  if (Imm == 0)
63    return TTI::TCC_Free;
64
65  // Otherwise, we check how many instructions it will take to materialise.
66  const DataLayout &DL = getDataLayout();
67  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
68                                    getST()->getFeatureBits());
69}
70
71// Look for patterns of shift followed by AND that can be turned into a pair of
72// shifts. We won't need to materialize an immediate for the AND so these can
73// be considered free.
74static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
75  uint64_t Mask = Imm.getZExtValue();
76  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
77  if (!BO || !BO->hasOneUse())
78    return false;
79
80  if (BO->getOpcode() != Instruction::Shl)
81    return false;
82
83  if (!isa<ConstantInt>(BO->getOperand(1)))
84    return false;
85
86  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
87  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
88  // is a mask shifted by c2 bits with c3 leading zeros.
89  if (isShiftedMask_64(Mask)) {
90    unsigned Trailing = countTrailingZeros(Mask);
91    if (ShAmt == Trailing)
92      return true;
93  }
94
95  return false;
96}
97
// Cost of materialising Imm when it appears as operand Idx of an instruction
// with the given Opcode. Returns TCC_Free when the immediate can be encoded
// directly (or otherwise folded away), so ConstantHoisting leaves it in place.
InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative, in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    // Masks matching a single zero/sign-extension or bit-clear instruction
    // from the bit-manipulation extensions need no materialised constant.
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    // A shl feeding this AND may fold to a slli+srli pair, in which case the
    // mask is never materialised (see canUseShiftPair above).
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Negated power of 2 is a shift and a negate.
    if (Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Non-commutative: only the second operand can be an immediate.
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getMinSignedBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}
180
181InstructionCost
182RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
183                                  const APInt &Imm, Type *Ty,
184                                  TTI::TargetCostKind CostKind) {
185  // Prevent hoisting in unknown cases.
186  return TTI::TCC_Free;
187}
188
189TargetTransformInfo::PopcntSupportKind
190RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
191  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
192  return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
193}
194
195bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
196  // Currently, the ExpandReductions pass can't expand scalable-vector
197  // reductions, but we still request expansion as RVV doesn't support certain
198  // reductions and the SelectionDAG can't legalize them either.
199  switch (II->getIntrinsicID()) {
200  default:
201    return false;
202  // These reductions have no equivalent in RVV
203  case Intrinsic::vector_reduce_mul:
204  case Intrinsic::vector_reduce_fmul:
205    return true;
206  }
207}
208
209std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
210  if (ST->hasVInstructions())
211    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
212  return BaseT::getMaxVScale();
213}
214
215std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
216  if (ST->hasVInstructions())
217    if (unsigned MinVLen = ST->getRealMinVLen();
218        MinVLen >= RISCV::RVVBitsPerBlock)
219      return MinVLen / RISCV::RVVBitsPerBlock;
220  return BaseT::getVScaleForTuning();
221}
222
223TypeSize
224RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
225  unsigned LMUL = PowerOf2Floor(
226      std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
227  switch (K) {
228  case TargetTransformInfo::RGK_Scalar:
229    return TypeSize::getFixed(ST->getXLen());
230  case TargetTransformInfo::RGK_FixedWidthVector:
231    return TypeSize::getFixed(
232        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
233  case TargetTransformInfo::RGK_ScalableVector:
234    return TypeSize::getScalable(
235        (ST->hasVInstructions() &&
236         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
237            ? LMUL * RISCV::RVVBitsPerBlock
238            : 0);
239  }
240
241  llvm_unreachable("Unsupported register kind");
242}
243
244InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) {
245  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
246
247  unsigned Cost = 2; // vslidedown+vslideup.
248  // TODO: Multiplying by LT.first implies this legalizes into multiple copies
249  // of similar code, but I think we expand through memory.
250  return Cost * LT.first * getLMULCost(LT.second);
251}
252
InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  // Scalable vectors: cost the shuffle kinds we know how to lower on RVV.
  if (isa<ScalableVectorType>(Tp)) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
    switch (Kind) {
    default:
      // Fallthrough to generic handling.
      // TODO: Most of these cases will return getInvalid in generic code, and
      // must be implemented here.
      break;
    case TTI::SK_Broadcast: {
      // One splat instruction per legalized part.
      return LT.first * 1;
    }
    case TTI::SK_Splice:
      return getSpliceCost(Tp, Index);
    case TTI::SK_Reverse:
      // Most of the cost here is producing the vrgather index register
      // Example sequence:
      //   csrr a0, vlenb
      //   srli a0, a0, 3
      //   addi a0, a0, -1
      //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
      //   vid.v v9
      //   vrsub.vx v10, v9, a0
      //   vrgather.vv v9, v8, v10
      if (Tp->getElementType()->isIntegerTy(1))
        // Mask operation additionally required extend and truncate
        return LT.first * 9;
      return LT.first * 6;
    }
  }

  // Fixed vectors: only broadcasts get a target-specific cost. The cost
  // depends on whether the element type is i1 (mask registers need extra
  // work) and whether the splatted value is already a scalar.
  if (isa<FixedVectorType>(Tp) && Kind == TargetTransformInfo::SK_Broadcast) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
    // HasScalar: the broadcast source is an insertelement, so the scalar
    // value is directly available and can be splatted with vmv.v.x.
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        //   andi a0, a0, 1
        //   vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        //   vmv.v.x v8, a0
        //   vmsne.vi v0, v8, 0
        return LT.first * getLMULCost(LT.second) * 3;
      }
      // Splatting a mask bit requires moving it through an integer vector.
      // Example sequence:
      //   vsetivli  zero, 2, e8, mf8, ta, mu (ignored)
      //   vmv.v.i v8, 0
      //   vmerge.vim      v8, v8, 1, v0
      //   vmv.x.s a0, v8
      //   andi    a0, a0, 1
      //   vmv.v.x v8, a0
      //   vmsne.vi  v0, v8, 0

      return LT.first * getLMULCost(LT.second) * 6;
    }

    if (HasScalar) {
      // Example sequence:
      //   vmv.v.x v8, a0
      return LT.first * getLMULCost(LT.second);
    }

    // Example sequence:
    //   vrgather.vi     v9, v8, 0
    // TODO: vrgather could be slower than vmv.v.x. It is
    // implementation-dependent.
    return LT.first * getLMULCost(LT.second);
  }

  // Everything else falls back to the generic cost model.
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}
328
329InstructionCost
330RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
331                                    unsigned AddressSpace,
332                                    TTI::TargetCostKind CostKind) {
333  if (!isLegalMaskedLoadStore(Src, Alignment) ||
334      CostKind != TTI::TCK_RecipThroughput)
335    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
336                                        CostKind);
337
338  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
339}
340
341InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
342    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
343    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
344  if (CostKind != TTI::TCK_RecipThroughput)
345    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
346                                         Alignment, CostKind, I);
347
348  if ((Opcode == Instruction::Load &&
349       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
350      (Opcode == Instruction::Store &&
351       !isLegalMaskedScatter(DataTy, Align(Alignment))))
352    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
353                                         Alignment, CostKind, I);
354
355  // Cost is proportional to the number of memory operations implied.  For
356  // scalable vectors, we use an estimate on that number since we don't
357  // know exactly what VL will be.
358  auto &VTy = *cast<VectorType>(DataTy);
359  InstructionCost MemOpCost =
360      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
361                      {TTI::OK_AnyValue, TTI::OP_None}, I);
362  unsigned NumLoads = getEstimatedVLFor(&VTy);
363  return NumLoads * MemOpCost;
364}
365
366// Currently, these represent both throughput and codesize costs
367// for the respective intrinsics.  The costs in this table are simply
368// instruction counts with the following adjustments made:
369// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    // FP rounding intrinsics: vsetvli-fenced sequence of mask setup,
    // vfcvt round-trip and merge. trunc skips the rounding-mode swap, hence
    // the lower count.
    {Intrinsic::floor, MVT::v2f32, 9},
    {Intrinsic::floor, MVT::v4f32, 9},
    {Intrinsic::floor, MVT::v8f32, 9},
    {Intrinsic::floor, MVT::v16f32, 9},
    {Intrinsic::floor, MVT::nxv1f32, 9},
    {Intrinsic::floor, MVT::nxv2f32, 9},
    {Intrinsic::floor, MVT::nxv4f32, 9},
    {Intrinsic::floor, MVT::nxv8f32, 9},
    {Intrinsic::floor, MVT::nxv16f32, 9},
    {Intrinsic::floor, MVT::v2f64, 9},
    {Intrinsic::floor, MVT::v4f64, 9},
    {Intrinsic::floor, MVT::v8f64, 9},
    {Intrinsic::floor, MVT::v16f64, 9},
    {Intrinsic::floor, MVT::nxv1f64, 9},
    {Intrinsic::floor, MVT::nxv2f64, 9},
    {Intrinsic::floor, MVT::nxv4f64, 9},
    {Intrinsic::floor, MVT::nxv8f64, 9},
    {Intrinsic::ceil, MVT::v2f32, 9},
    {Intrinsic::ceil, MVT::v4f32, 9},
    {Intrinsic::ceil, MVT::v8f32, 9},
    {Intrinsic::ceil, MVT::v16f32, 9},
    {Intrinsic::ceil, MVT::nxv1f32, 9},
    {Intrinsic::ceil, MVT::nxv2f32, 9},
    {Intrinsic::ceil, MVT::nxv4f32, 9},
    {Intrinsic::ceil, MVT::nxv8f32, 9},
    {Intrinsic::ceil, MVT::nxv16f32, 9},
    {Intrinsic::ceil, MVT::v2f64, 9},
    {Intrinsic::ceil, MVT::v4f64, 9},
    {Intrinsic::ceil, MVT::v8f64, 9},
    {Intrinsic::ceil, MVT::v16f64, 9},
    {Intrinsic::ceil, MVT::nxv1f64, 9},
    {Intrinsic::ceil, MVT::nxv2f64, 9},
    {Intrinsic::ceil, MVT::nxv4f64, 9},
    {Intrinsic::ceil, MVT::nxv8f64, 9},
    {Intrinsic::trunc, MVT::v2f32, 7},
    {Intrinsic::trunc, MVT::v4f32, 7},
    {Intrinsic::trunc, MVT::v8f32, 7},
    {Intrinsic::trunc, MVT::v16f32, 7},
    {Intrinsic::trunc, MVT::nxv1f32, 7},
    {Intrinsic::trunc, MVT::nxv2f32, 7},
    {Intrinsic::trunc, MVT::nxv4f32, 7},
    {Intrinsic::trunc, MVT::nxv8f32, 7},
    {Intrinsic::trunc, MVT::nxv16f32, 7},
    {Intrinsic::trunc, MVT::v2f64, 7},
    {Intrinsic::trunc, MVT::v4f64, 7},
    {Intrinsic::trunc, MVT::v8f64, 7},
    {Intrinsic::trunc, MVT::v16f64, 7},
    {Intrinsic::trunc, MVT::nxv1f64, 7},
    {Intrinsic::trunc, MVT::nxv2f64, 7},
    {Intrinsic::trunc, MVT::nxv4f64, 7},
    {Intrinsic::trunc, MVT::nxv8f64, 7},
    {Intrinsic::round, MVT::v2f32, 9},
    {Intrinsic::round, MVT::v4f32, 9},
    {Intrinsic::round, MVT::v8f32, 9},
    {Intrinsic::round, MVT::v16f32, 9},
    {Intrinsic::round, MVT::nxv1f32, 9},
    {Intrinsic::round, MVT::nxv2f32, 9},
    {Intrinsic::round, MVT::nxv4f32, 9},
    {Intrinsic::round, MVT::nxv8f32, 9},
    {Intrinsic::round, MVT::nxv16f32, 9},
    {Intrinsic::round, MVT::v2f64, 9},
    {Intrinsic::round, MVT::v4f64, 9},
    {Intrinsic::round, MVT::v8f64, 9},
    {Intrinsic::round, MVT::v16f64, 9},
    {Intrinsic::round, MVT::nxv1f64, 9},
    {Intrinsic::round, MVT::nxv2f64, 9},
    {Intrinsic::round, MVT::nxv4f64, 9},
    {Intrinsic::round, MVT::nxv8f64, 9},
    {Intrinsic::roundeven, MVT::v2f32, 9},
    {Intrinsic::roundeven, MVT::v4f32, 9},
    {Intrinsic::roundeven, MVT::v8f32, 9},
    {Intrinsic::roundeven, MVT::v16f32, 9},
    {Intrinsic::roundeven, MVT::nxv1f32, 9},
    {Intrinsic::roundeven, MVT::nxv2f32, 9},
    {Intrinsic::roundeven, MVT::nxv4f32, 9},
    {Intrinsic::roundeven, MVT::nxv8f32, 9},
    {Intrinsic::roundeven, MVT::nxv16f32, 9},
    {Intrinsic::roundeven, MVT::v2f64, 9},
    {Intrinsic::roundeven, MVT::v4f64, 9},
    {Intrinsic::roundeven, MVT::v8f64, 9},
    {Intrinsic::roundeven, MVT::v16f64, 9},
    {Intrinsic::roundeven, MVT::nxv1f64, 9},
    {Intrinsic::roundeven, MVT::nxv2f64, 9},
    {Intrinsic::roundeven, MVT::nxv4f64, 9},
    {Intrinsic::roundeven, MVT::nxv8f64, 9},
    // bswap / vp.bswap: shift-and-or byte swap; cost grows with element
    // width (i16 -> i32 -> i64).
    {Intrinsic::bswap, MVT::v2i16, 3},
    {Intrinsic::bswap, MVT::v4i16, 3},
    {Intrinsic::bswap, MVT::v8i16, 3},
    {Intrinsic::bswap, MVT::v16i16, 3},
    {Intrinsic::bswap, MVT::nxv1i16, 3},
    {Intrinsic::bswap, MVT::nxv2i16, 3},
    {Intrinsic::bswap, MVT::nxv4i16, 3},
    {Intrinsic::bswap, MVT::nxv8i16, 3},
    {Intrinsic::bswap, MVT::nxv16i16, 3},
    {Intrinsic::bswap, MVT::v2i32, 12},
    {Intrinsic::bswap, MVT::v4i32, 12},
    {Intrinsic::bswap, MVT::v8i32, 12},
    {Intrinsic::bswap, MVT::v16i32, 12},
    {Intrinsic::bswap, MVT::nxv1i32, 12},
    {Intrinsic::bswap, MVT::nxv2i32, 12},
    {Intrinsic::bswap, MVT::nxv4i32, 12},
    {Intrinsic::bswap, MVT::nxv8i32, 12},
    {Intrinsic::bswap, MVT::nxv16i32, 12},
    {Intrinsic::bswap, MVT::v2i64, 31},
    {Intrinsic::bswap, MVT::v4i64, 31},
    {Intrinsic::bswap, MVT::v8i64, 31},
    {Intrinsic::bswap, MVT::v16i64, 31},
    {Intrinsic::bswap, MVT::nxv1i64, 31},
    {Intrinsic::bswap, MVT::nxv2i64, 31},
    {Intrinsic::bswap, MVT::nxv4i64, 31},
    {Intrinsic::bswap, MVT::nxv8i64, 31},
    {Intrinsic::vp_bswap, MVT::v2i16, 3},
    {Intrinsic::vp_bswap, MVT::v4i16, 3},
    {Intrinsic::vp_bswap, MVT::v8i16, 3},
    {Intrinsic::vp_bswap, MVT::v16i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
    {Intrinsic::vp_bswap, MVT::v2i32, 12},
    {Intrinsic::vp_bswap, MVT::v4i32, 12},
    {Intrinsic::vp_bswap, MVT::v8i32, 12},
    {Intrinsic::vp_bswap, MVT::v16i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
    {Intrinsic::vp_bswap, MVT::v2i64, 31},
    {Intrinsic::vp_bswap, MVT::v4i64, 31},
    {Intrinsic::vp_bswap, MVT::v8i64, 31},
    {Intrinsic::vp_bswap, MVT::v16i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
    // vp.fshl / vp.fshr: funnel shifts expand to a fixed-length shift/or
    // sequence, independent of element width.
    {Intrinsic::vp_fshl, MVT::v2i8, 7},
    {Intrinsic::vp_fshl, MVT::v4i8, 7},
    {Intrinsic::vp_fshl, MVT::v8i8, 7},
    {Intrinsic::vp_fshl, MVT::v16i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
    {Intrinsic::vp_fshl, MVT::v2i16, 7},
    {Intrinsic::vp_fshl, MVT::v4i16, 7},
    {Intrinsic::vp_fshl, MVT::v8i16, 7},
    {Intrinsic::vp_fshl, MVT::v16i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
    {Intrinsic::vp_fshl, MVT::v2i32, 7},
    {Intrinsic::vp_fshl, MVT::v4i32, 7},
    {Intrinsic::vp_fshl, MVT::v8i32, 7},
    {Intrinsic::vp_fshl, MVT::v16i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
    {Intrinsic::vp_fshl, MVT::v2i64, 7},
    {Intrinsic::vp_fshl, MVT::v4i64, 7},
    {Intrinsic::vp_fshl, MVT::v8i64, 7},
    {Intrinsic::vp_fshl, MVT::v16i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
    {Intrinsic::vp_fshr, MVT::v2i8, 7},
    {Intrinsic::vp_fshr, MVT::v4i8, 7},
    {Intrinsic::vp_fshr, MVT::v8i8, 7},
    {Intrinsic::vp_fshr, MVT::v16i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
    {Intrinsic::vp_fshr, MVT::v2i16, 7},
    {Intrinsic::vp_fshr, MVT::v4i16, 7},
    {Intrinsic::vp_fshr, MVT::v8i16, 7},
    {Intrinsic::vp_fshr, MVT::v16i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
    {Intrinsic::vp_fshr, MVT::v2i32, 7},
    {Intrinsic::vp_fshr, MVT::v4i32, 7},
    {Intrinsic::vp_fshr, MVT::v8i32, 7},
    {Intrinsic::vp_fshr, MVT::v16i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
    {Intrinsic::vp_fshr, MVT::v2i64, 7},
    {Intrinsic::vp_fshr, MVT::v4i64, 7},
    {Intrinsic::vp_fshr, MVT::v8i64, 7},
    {Intrinsic::vp_fshr, MVT::v16i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
    // bitreverse / vp.bitreverse: swar-style bit reversal; wider elements
    // need more shift/mask rounds.
    {Intrinsic::bitreverse, MVT::v2i8, 17},
    {Intrinsic::bitreverse, MVT::v4i8, 17},
    {Intrinsic::bitreverse, MVT::v8i8, 17},
    {Intrinsic::bitreverse, MVT::v16i8, 17},
    {Intrinsic::bitreverse, MVT::nxv1i8, 17},
    {Intrinsic::bitreverse, MVT::nxv2i8, 17},
    {Intrinsic::bitreverse, MVT::nxv4i8, 17},
    {Intrinsic::bitreverse, MVT::nxv8i8, 17},
    {Intrinsic::bitreverse, MVT::nxv16i8, 17},
    {Intrinsic::bitreverse, MVT::v2i16, 24},
    {Intrinsic::bitreverse, MVT::v4i16, 24},
    {Intrinsic::bitreverse, MVT::v8i16, 24},
    {Intrinsic::bitreverse, MVT::v16i16, 24},
    {Intrinsic::bitreverse, MVT::nxv1i16, 24},
    {Intrinsic::bitreverse, MVT::nxv2i16, 24},
    {Intrinsic::bitreverse, MVT::nxv4i16, 24},
    {Intrinsic::bitreverse, MVT::nxv8i16, 24},
    {Intrinsic::bitreverse, MVT::nxv16i16, 24},
    {Intrinsic::bitreverse, MVT::v2i32, 33},
    {Intrinsic::bitreverse, MVT::v4i32, 33},
    {Intrinsic::bitreverse, MVT::v8i32, 33},
    {Intrinsic::bitreverse, MVT::v16i32, 33},
    {Intrinsic::bitreverse, MVT::nxv1i32, 33},
    {Intrinsic::bitreverse, MVT::nxv2i32, 33},
    {Intrinsic::bitreverse, MVT::nxv4i32, 33},
    {Intrinsic::bitreverse, MVT::nxv8i32, 33},
    {Intrinsic::bitreverse, MVT::nxv16i32, 33},
    {Intrinsic::bitreverse, MVT::v2i64, 52},
    {Intrinsic::bitreverse, MVT::v4i64, 52},
    {Intrinsic::bitreverse, MVT::v8i64, 52},
    {Intrinsic::bitreverse, MVT::v16i64, 52},
    {Intrinsic::bitreverse, MVT::nxv1i64, 52},
    {Intrinsic::bitreverse, MVT::nxv2i64, 52},
    {Intrinsic::bitreverse, MVT::nxv4i64, 52},
    {Intrinsic::bitreverse, MVT::nxv8i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
    // ctpop / vp.ctpop: bit-counting expansion (no vector cpop instruction
    // in the base V extension).
    {Intrinsic::ctpop, MVT::v2i8, 12},
    {Intrinsic::ctpop, MVT::v4i8, 12},
    {Intrinsic::ctpop, MVT::v8i8, 12},
    {Intrinsic::ctpop, MVT::v16i8, 12},
    {Intrinsic::ctpop, MVT::nxv1i8, 12},
    {Intrinsic::ctpop, MVT::nxv2i8, 12},
    {Intrinsic::ctpop, MVT::nxv4i8, 12},
    {Intrinsic::ctpop, MVT::nxv8i8, 12},
    {Intrinsic::ctpop, MVT::nxv16i8, 12},
    {Intrinsic::ctpop, MVT::v2i16, 19},
    {Intrinsic::ctpop, MVT::v4i16, 19},
    {Intrinsic::ctpop, MVT::v8i16, 19},
    {Intrinsic::ctpop, MVT::v16i16, 19},
    {Intrinsic::ctpop, MVT::nxv1i16, 19},
    {Intrinsic::ctpop, MVT::nxv2i16, 19},
    {Intrinsic::ctpop, MVT::nxv4i16, 19},
    {Intrinsic::ctpop, MVT::nxv8i16, 19},
    {Intrinsic::ctpop, MVT::nxv16i16, 19},
    {Intrinsic::ctpop, MVT::v2i32, 20},
    {Intrinsic::ctpop, MVT::v4i32, 20},
    {Intrinsic::ctpop, MVT::v8i32, 20},
    {Intrinsic::ctpop, MVT::v16i32, 20},
    {Intrinsic::ctpop, MVT::nxv1i32, 20},
    {Intrinsic::ctpop, MVT::nxv2i32, 20},
    {Intrinsic::ctpop, MVT::nxv4i32, 20},
    {Intrinsic::ctpop, MVT::nxv8i32, 20},
    {Intrinsic::ctpop, MVT::nxv16i32, 20},
    {Intrinsic::ctpop, MVT::v2i64, 21},
    {Intrinsic::ctpop, MVT::v4i64, 21},
    {Intrinsic::ctpop, MVT::v8i64, 21},
    {Intrinsic::ctpop, MVT::v16i64, 21},
    {Intrinsic::ctpop, MVT::nxv1i64, 21},
    {Intrinsic::ctpop, MVT::nxv2i64, 21},
    {Intrinsic::ctpop, MVT::nxv4i64, 21},
    {Intrinsic::ctpop, MVT::nxv8i64, 21},
    {Intrinsic::vp_ctpop, MVT::v2i8, 12},
    {Intrinsic::vp_ctpop, MVT::v4i8, 12},
    {Intrinsic::vp_ctpop, MVT::v8i8, 12},
    {Intrinsic::vp_ctpop, MVT::v16i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
    {Intrinsic::vp_ctpop, MVT::v2i16, 19},
    {Intrinsic::vp_ctpop, MVT::v4i16, 19},
    {Intrinsic::vp_ctpop, MVT::v8i16, 19},
    {Intrinsic::vp_ctpop, MVT::v16i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
    {Intrinsic::vp_ctpop, MVT::v2i32, 20},
    {Intrinsic::vp_ctpop, MVT::v4i32, 20},
    {Intrinsic::vp_ctpop, MVT::v8i32, 20},
    {Intrinsic::vp_ctpop, MVT::v16i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
    {Intrinsic::vp_ctpop, MVT::v2i64, 21},
    {Intrinsic::vp_ctpop, MVT::v4i64, 21},
    {Intrinsic::vp_ctpop, MVT::v8i64, 21},
    {Intrinsic::vp_ctpop, MVT::v16i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
    // vp.ctlz / vp.cttz: leading/trailing zero count expansions built on
    // the ctpop sequence.
    {Intrinsic::vp_ctlz, MVT::v2i8, 19},
    {Intrinsic::vp_ctlz, MVT::v4i8, 19},
    {Intrinsic::vp_ctlz, MVT::v8i8, 19},
    {Intrinsic::vp_ctlz, MVT::v16i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
    {Intrinsic::vp_ctlz, MVT::v2i16, 28},
    {Intrinsic::vp_ctlz, MVT::v4i16, 28},
    {Intrinsic::vp_ctlz, MVT::v8i16, 28},
    {Intrinsic::vp_ctlz, MVT::v16i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
    {Intrinsic::vp_ctlz, MVT::v2i32, 31},
    {Intrinsic::vp_ctlz, MVT::v4i32, 31},
    {Intrinsic::vp_ctlz, MVT::v8i32, 31},
    {Intrinsic::vp_ctlz, MVT::v16i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
    {Intrinsic::vp_ctlz, MVT::v2i64, 35},
    {Intrinsic::vp_ctlz, MVT::v4i64, 35},
    {Intrinsic::vp_ctlz, MVT::v8i64, 35},
    {Intrinsic::vp_ctlz, MVT::v16i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
    {Intrinsic::vp_cttz, MVT::v2i8, 16},
    {Intrinsic::vp_cttz, MVT::v4i8, 16},
    {Intrinsic::vp_cttz, MVT::v8i8, 16},
    {Intrinsic::vp_cttz, MVT::v16i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
    {Intrinsic::vp_cttz, MVT::v2i16, 23},
    {Intrinsic::vp_cttz, MVT::v4i16, 23},
    {Intrinsic::vp_cttz, MVT::v8i16, 23},
    {Intrinsic::vp_cttz, MVT::v16i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
    {Intrinsic::vp_cttz, MVT::v2i32, 24},
    {Intrinsic::vp_cttz, MVT::v4i32, 24},
    {Intrinsic::vp_cttz, MVT::v8i32, 24},
    {Intrinsic::vp_cttz, MVT::v16i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
    {Intrinsic::vp_cttz, MVT::v2i64, 25},
    {Intrinsic::vp_cttz, MVT::v4i64, 25},
    {Intrinsic::vp_cttz, MVT::v8i64, 25},
    {Intrinsic::vp_cttz, MVT::v16i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
};
801
/// Map a VP intrinsic ID to its corresponding VP SelectionDAG opcode using
/// the VPID -> VPSD table expanded from VPIntrinsics.def.  Returns
/// ISD::DELETED_NODE when \p ID is not a VP intrinsic.
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}
812
// Compute the cost of an intrinsic call.  Intrinsics with a known cheap RVV
// lowering are costed explicitly below; other vector intrinsics are looked up
// in VectorIntrinsicCostTable, and anything unhandled falls back to the
// target-independent implementation.
InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    // Single instruction when a vector min/max (V) or scalar min/max (Zbb)
    // instruction is available.
    auto LT = getTypeLegalizationCost(RetTy);
    if ((ST->hasVInstructions() && LT.second.isVector()) ||
        (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    // Saturating add/sub map to single vsadd/vssub-family instructions.
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    // Single vfabs/vfsqrt instruction.
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  // TODO: add more intrinsic
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = getTypeLegalizationCost(RetTy);
    // Each extra part after legalization splitting costs one more instruction.
    return Cost + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // More one read and one write for fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  // Fall back to the pre-computed per-type cost table for the remaining
  // vector intrinsics (bswap, ctpop, vp_ctlz, ...).
  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    auto LT = getTypeLegalizationCost(RetTy);
    if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                            ICA.getID(), LT.second))
      return LT.first * Entry->Cost;
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
912
// Cost model for vector casts.  For legal vector-to-vector casts the cost is
// derived from the number of narrowing/widening steps between the source and
// destination element widths; mask (i1) vectors need special expansion
// sequences.  Everything else defers to the base implementation.
InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELEN() ||
        Dst->getScalarSizeInBits() > ST->getELEN())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // Log2 distance between the element widths; each power-of-two step is
    // one narrow/widen conversion instruction.
    // FIXME: Need to consider vsetvli and lmul.
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      if (Src->getScalarSizeInBits() == 1) {
        // We do not use vsext/vzext to extend from mask vector.
        // Instead we use the following instructions to extend from mask vector:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        return 2;
      }
      return 1;
    case ISD::TRUNCATE:
      if (Dst->getScalarSizeInBits() == 1) {
        // We do not use several vncvt to truncate to mask vector. So we could
        // not use PowDiff to calculate it.
        // Instead we use the following instructions to truncate to mask vector:
        // vand.vi v8, v8, 1
        // vmsne.vi v0, v8, 0
        return 2;
      }
      [[fallthrough]];
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
        // The cost of convert from or to mask vector is different from other
        // cases. We could not use PowDiff to calculate it.
        // For mask vector to fp, we should use the following instructions:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        // vfcvt.f.x.v v8, v8

        // And for fp vector to mask, we use:
        // vfncvt.rtz.x.f.w v9, v8
        // vand.vi v8, v9, 1
        // vmsne.vi v0, v8, 0
        return 3;
      }
      if (std::abs(PowDiff) <= 1)
        return 1;
      // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only need two conversion.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
989
990unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
991  if (isa<ScalableVectorType>(Ty)) {
992    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
993    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
994    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
995    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
996  }
997  return cast<FixedVectorType>(Ty)->getNumElements();
998}
999
1000InstructionCost
1001RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1002                                     bool IsUnsigned,
1003                                     TTI::TargetCostKind CostKind) {
1004  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1005    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1006
1007  // Skip if scalar size of Ty is bigger than ELEN.
1008  if (Ty->getScalarSizeInBits() > ST->getELEN())
1009    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1010
1011  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1012  if (Ty->getElementType()->isIntegerTy(1))
1013    // vcpop sequences, see vreduction-mask.ll.  umax, smin actually only
1014    // cost 2, but we don't have enough info here so we slightly over cost.
1015    return (LT.first - 1) + 3;
1016
1017  // IR Reduction is composed by two vmv and one rvv reduction instruction.
1018  InstructionCost BaseCost = 2;
1019  unsigned VL = getEstimatedVLFor(Ty);
1020  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1021}
1022
1023InstructionCost
1024RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1025                                         std::optional<FastMathFlags> FMF,
1026                                         TTI::TargetCostKind CostKind) {
1027  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1028    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1029
1030  // Skip if scalar size of Ty is bigger than ELEN.
1031  if (Ty->getScalarSizeInBits() > ST->getELEN())
1032    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1033
1034  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1035  assert(ISD && "Invalid opcode");
1036
1037  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1038      ISD != ISD::FADD)
1039    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1040
1041  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1042  if (Ty->getElementType()->isIntegerTy(1))
1043    // vcpop sequences, see vreduction-mask.ll
1044    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
1045
1046  // IR Reduction is composed by two vmv and one rvv reduction instruction.
1047  InstructionCost BaseCost = 2;
1048  unsigned VL = getEstimatedVLFor(Ty);
1049  if (TTI::requiresOrderedReduction(FMF))
1050    return (LT.first - 1) + BaseCost + VL;
1051  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1052}
1053
1054InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1055    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1056    std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
1057  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1058    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1059                                           FMF, CostKind);
1060
1061  // Skip if scalar size of ResTy is bigger than ELEN.
1062  if (ResTy->getScalarSizeInBits() > ST->getELEN())
1063    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1064                                           FMF, CostKind);
1065
1066  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1067    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1068                                           FMF, CostKind);
1069
1070  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1071
1072  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1073    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1074                                           FMF, CostKind);
1075
1076  return (LT.first - 1) +
1077         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1078}
1079
1080InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1081                                              TTI::OperandValueInfo OpInfo,
1082                                              TTI::TargetCostKind CostKind) {
1083  assert(OpInfo.isConstant() && "non constant operand?");
1084  if (!isa<VectorType>(Ty))
1085    // FIXME: We need to account for immediate materialization here, but doing
1086    // a decent job requires more knowledge about the immediate than we
1087    // currently have here.
1088    return 0;
1089
1090  if (OpInfo.isUniform())
1091    // vmv.x.i, vmv.v.x, or vfmv.v.f
1092    // We ignore the cost of the scalar constant materialization to be consistent
1093    // with how we treat scalar constants themselves just above.
1094    return 1;
1095
1096  // Add a cost of address generation + the cost of the vector load. The
1097  // address is expected to be a PC relative offset to a constant pool entry
1098  // using auipc/addi.
1099  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
1100                             /*AddressSpace=*/0, CostKind);
1101}
1102
1103
1104InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1105                                              MaybeAlign Alignment,
1106                                              unsigned AddressSpace,
1107                                              TTI::TargetCostKind CostKind,
1108                                              TTI::OperandValueInfo OpInfo,
1109                                              const Instruction *I) {
1110  InstructionCost Cost = 0;
1111  if (Opcode == Instruction::Store && OpInfo.isConstant())
1112    Cost += getStoreImmCost(Src, OpInfo, CostKind);
1113  return Cost + BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1114                                       CostKind, OpInfo, I);
1115}
1116
1117InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1118                                                 Type *CondTy,
1119                                                 CmpInst::Predicate VecPred,
1120                                                 TTI::TargetCostKind CostKind,
1121                                                 const Instruction *I) {
1122  if (CostKind != TTI::TCK_RecipThroughput)
1123    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1124                                     I);
1125
1126  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1127    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1128                                     I);
1129
1130  // Skip if scalar size of ValTy is bigger than ELEN.
1131  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELEN())
1132    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1133                                     I);
1134
1135  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1136    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1137    if (CondTy->isVectorTy()) {
1138      if (ValTy->getScalarSizeInBits() == 1) {
1139        // vmandn.mm v8, v8, v9
1140        // vmand.mm v9, v0, v9
1141        // vmor.mm v0, v9, v8
1142        return LT.first * 3;
1143      }
1144      // vselect and max/min are supported natively.
1145      return LT.first * 1;
1146    }
1147
1148    if (ValTy->getScalarSizeInBits() == 1) {
1149      //  vmv.v.x v9, a0
1150      //  vmsne.vi v9, v9, 0
1151      //  vmandn.mm v8, v8, v9
1152      //  vmand.mm v9, v0, v9
1153      //  vmor.mm v0, v9, v8
1154      return LT.first * 5;
1155    }
1156
1157    // vmv.v.x v10, a0
1158    // vmsne.vi v0, v10, 0
1159    // vmerge.vvm v8, v9, v8, v0
1160    return LT.first * 3;
1161  }
1162
1163  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1164      ValTy->isVectorTy()) {
1165    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1166
1167    // Support natively.
1168    if (CmpInst::isIntPredicate(VecPred))
1169      return LT.first * 1;
1170
1171    // If we do not support the input floating point vector type, use the base
1172    // one which will calculate as:
1173    // ScalarizeCost + Num * Cost for fixed vector,
1174    // InvalidCost for scalable vector.
1175    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1176        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1177        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1178      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1179                                       I);
1180    switch (VecPred) {
1181      // Support natively.
1182    case CmpInst::FCMP_OEQ:
1183    case CmpInst::FCMP_OGT:
1184    case CmpInst::FCMP_OGE:
1185    case CmpInst::FCMP_OLT:
1186    case CmpInst::FCMP_OLE:
1187    case CmpInst::FCMP_UNE:
1188      return LT.first * 1;
1189    // TODO: Other comparisons?
1190    default:
1191      break;
1192    }
1193  }
1194
1195  // TODO: Add cost for scalar type.
1196
1197  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1198}
1199
// Cost of extractelement/insertelement.  The base cost covers vmv.x.s /
// vmv.s.x, the slide cost covers vslidedown/vslideup plus any index addi;
// mask (i1) elements and i64 elements on RV32 need longer expansions.
InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector())
    return 0;

  // For unsupported scalable vector.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
  // and vslideup + vmv.s.x to insert element to vector.
  unsigned BaseCost = 1;
  // When insertelement we should add the index with 1 as the input of vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  // Index == -1U means the index is unknown at compile time.
  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We could extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // Mask vector extract/insert element is different from normal case.
  if (Val->getScalarSizeInBits() == 1) {
    // For extractelement, we need the following instructions:
    // vmv.v.i v8, 0
    // vmerge.vim v8, v8, 1, v0
    // vsetivli zero, 1, e8, m2, ta, mu (not count)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8

    // For insertelement, we need the following instructions:
    // vsetvli a2, zero, e8, m1, ta, mu (not count)
    // vmv.s.x v8, a0
    // vmv.v.i v9, 0
    // vmerge.vim v9, v9, 1, v0
    // addi a0, a1, 1
    // vsetvli zero, a0, e8, m1, tu, mu (not count)
    // vslideup.vx v9, v8, a1
    // vsetvli a0, zero, e8, m1, ta, mu (not count)
    // vand.vi v8, v9, 1
    // vmsne.vi v0, v8, 0

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
  }
  // Extract i64 in the target that has XLEN=32 need more instruction.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not count)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not count)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not count)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}
1294
1295InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1296    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1297    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1298    ArrayRef<const Value *> Args, const Instruction *CxtI) {
1299
1300  // TODO: Handle more cost kinds.
1301  if (CostKind != TTI::TCK_RecipThroughput)
1302    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1303                                         Args, CxtI);
1304
1305  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1306    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1307                                         Args, CxtI);
1308
1309  // Skip if scalar size of Ty is bigger than ELEN.
1310  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELEN())
1311    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1312                                         Args, CxtI);
1313
1314  // Legalize the type.
1315  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1316
1317  // TODO: Handle scalar type.
1318  if (!LT.second.isVector())
1319    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1320                                         Args, CxtI);
1321
1322
1323  auto getConstantMatCost =
1324    [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1325    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1326      // Two sub-cases:
1327      // * Has a 5 bit immediate operand which can be splatted.
1328      // * Has a larger immediate which must be materialized in scalar register
1329      // We return 0 for both as we currently ignore the cost of materializing
1330      // scalar constants in GPRs.
1331      return 0;
1332
1333    // Add a cost of address generation + the cost of the vector load. The
1334    // address is expected to be a PC relative offset to a constant pool entry
1335    // using auipc/addi.
1336    return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
1337                               /*AddressSpace=*/0, CostKind);
1338  };
1339
1340  // Add the cost of materializing any constant vectors required.
1341  InstructionCost ConstantMatCost = 0;
1342  if (Op1Info.isConstant())
1343    ConstantMatCost += getConstantMatCost(0, Op1Info);
1344  if (Op2Info.isConstant())
1345    ConstantMatCost += getConstantMatCost(1, Op2Info);
1346
1347  switch (TLI->InstructionOpcodeToISD(Opcode)) {
1348  case ISD::ADD:
1349  case ISD::SUB:
1350  case ISD::AND:
1351  case ISD::OR:
1352  case ISD::XOR:
1353  case ISD::SHL:
1354  case ISD::SRL:
1355  case ISD::SRA:
1356  case ISD::MUL:
1357  case ISD::MULHS:
1358  case ISD::MULHU:
1359  case ISD::FADD:
1360  case ISD::FSUB:
1361  case ISD::FMUL:
1362  case ISD::FNEG: {
1363    return ConstantMatCost + getLMULCost(LT.second) * LT.first * 1;
1364  }
1365  default:
1366    return ConstantMatCost +
1367           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1368                                         Args, CxtI);
1369  }
1370}
1371
1372void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1373                                           TTI::UnrollingPreferences &UP,
1374                                           OptimizationRemarkEmitter *ORE) {
1375  // TODO: More tuning on benchmarks and metrics with changes as needed
1376  //       would apply to all settings below to enable performance.
1377
1378
1379  if (ST->enableDefaultUnroll())
1380    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1381
1382  // Enable Upper bound unrolling universally, not dependant upon the conditions
1383  // below.
1384  UP.UpperBound = true;
1385
1386  // Disable loop unrolling for Oz and Os.
1387  UP.OptSizeThreshold = 0;
1388  UP.PartialOptSizeThreshold = 0;
1389  if (L->getHeader()->getParent()->hasOptSize())
1390    return;
1391
1392  SmallVector<BasicBlock *, 4> ExitingBlocks;
1393  L->getExitingBlocks(ExitingBlocks);
1394  LLVM_DEBUG(dbgs() << "Loop has:\n"
1395                    << "Blocks: " << L->getNumBlocks() << "\n"
1396                    << "Exit blocks: " << ExitingBlocks.size() << "\n");
1397
1398  // Only allow another exit other than the latch. This acts as an early exit
1399  // as it mirrors the profitability calculation of the runtime unroller.
1400  if (ExitingBlocks.size() > 2)
1401    return;
1402
1403  // Limit the CFG of the loop body for targets with a branch predictor.
1404  // Allowing 4 blocks permits if-then-else diamonds in the body.
1405  if (L->getNumBlocks() > 4)
1406    return;
1407
1408  // Don't unroll vectorized loops, including the remainder loop
1409  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1410    return;
1411
1412  // Scan the loop: don't unroll loops with calls as this could prevent
1413  // inlining.
1414  InstructionCost Cost = 0;
1415  for (auto *BB : L->getBlocks()) {
1416    for (auto &I : *BB) {
1417      // Initial setting - Don't unroll loops containing vectorized
1418      // instructions.
1419      if (I.getType()->isVectorTy())
1420        return;
1421
1422      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1423        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1424          if (!isLoweredToCall(F))
1425            continue;
1426        }
1427        return;
1428      }
1429
1430      SmallVector<const Value *> Operands(I.operand_values());
1431      Cost += getInstructionCost(&I, Operands,
1432                                 TargetTransformInfo::TCK_SizeAndLatency);
1433    }
1434  }
1435
1436  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1437
1438  UP.Partial = true;
1439  UP.Runtime = true;
1440  UP.UnrollRemainder = true;
1441  UP.UnrollAndJam = true;
1442  UP.UnrollAndJamInnerLoopThreshold = 60;
1443
1444  // Force unrolling small loops can be very useful because of the branch
1445  // taken cost of the backedge.
1446  if (Cost < 12)
1447    UP.Force = true;
1448}
1449
// No RISC-V specific peeling tuning yet; use the generic preferences.
void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
1454
1455unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1456  TypeSize Size = DL.getTypeSizeInBits(Ty);
1457  if (Ty->isVectorTy()) {
1458    if (Size.isScalable() && ST->hasVInstructions())
1459      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1460
1461    if (ST->useRVVForFixedLengthVectors())
1462      return divideCeil(Size, ST->getRealMinVLen());
1463  }
1464
1465  return BaseT::getRegUsageForType(Ty);
1466}
1467
/// Maximum vectorization factor for SLP.  \p ElemWidth and \p Opcode are
/// currently ignored; the result is taken from the riscv-v-slp-max-vf option.
unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  // This interface is currently only used by SLP.  Returning 1 (which is the
  // default value for SLPMaxVF) disables SLP. We currently have a cost modeling
  // problem w/ constant materialization which causes SLP to perform majorly
  // unprofitable transformations.
  // TODO: Figure out constant materialization cost modeling and remove.
  return SLPMaxVF;
}
1476
1477bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1478                                 const TargetTransformInfo::LSRCost &C2) {
1479  // RISCV specific here are "instruction number 1st priority".
1480  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1481                  C1.NumIVMuls, C1.NumBaseAdds,
1482                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1483         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1484                  C2.NumIVMuls, C2.NumBaseAdds,
1485                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1486}
1487