//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include <cmath>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                      TTI::TargetCostKind CostKind) {
  // Check if the type is valid for all CostKind
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  size_t NumInstr = OpCodes.size();
  if (CostKind == TTI::TCK_CodeSize)
    return NumInstr;
  InstructionCost LMULCost = TLI->getLMULCost(VT);
  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
    return LMULCost * NumInstr;
  InstructionCost Cost = 0;
  for (auto Op : OpCodes) {
    switch (Op) {
    case RISCV::VRGATHER_VI:
      Cost += TLI->getVRGatherVICost(VT);
      break;
    case RISCV::VRGATHER_VV:
      Cost += TLI->getVRGatherVVCost(VT);
      break;
    case RISCV::VSLIDEUP_VI:
    case RISCV::VSLIDEDOWN_VI:
      Cost += TLI->getVSlideVICost(VT);
      break;
    case RISCV::VSLIDEUP_VX:
    case RISCV::VSLIDEDOWN_VX:
      Cost += TLI->getVSlideVXCost(VT);
      break;
    case RISCV::VREDMAX_VS:
    case RISCV::VREDMIN_VS:
    case RISCV::VREDMAXU_VS:
    case RISCV::VREDMINU_VS:
    case RISCV::VREDSUM_VS:
    case RISCV::VREDAND_VS:
    case RISCV::VREDOR_VS:
    case RISCV::VREDXOR_VS:
    case RISCV::VFREDMAX_VS:
    case RISCV::VFREDMIN_VS:
    case RISCV::VFREDUSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += Log2_32_Ceil(VL);
      break;
    }
    case RISCV::VFREDOSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += VL;
      break;
    }
    case RISCV::VMV_X_S:
    case RISCV::VMV_S_X:
      Cost += 1;
      break;
    default:
      Cost += LMULCost;
    }
  }
  return Cost;
}

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
}

// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
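  // For example, on RV64 (and (shl x, 4), 0xff0) can instead be lowered as
  // (srli (slli x, 56), 52), so the 0xff0 mask never has to be materialized.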
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}

InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these
  // are commutative; in others the immediate must come from a specific
  // argument index.
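  // For example, add/and/or/xor are commutative and their immediate forms
  // (addi/andi/ori/xori) accept a sign-extended 12-bit value, while for the
  // shifts only the second operand (the shift amount) can be an immediate.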
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
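  // Illustrative sequence for a vector constant:
  //   auipc   a0, %pcrel_hi(.LCPI0_0)
  //   addi    a0, a0, %pcrel_lo(...)
  //   vle32.v v8, (a0)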
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the size of the element is < ELEN then shuffles of interleaves and
        // deinterleaves of 2 vectors can be lowered into the following
        // sequences
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          // Example sequence:
          //   vsetivli     zero, 4, e8, mf4, ta, ma (ignored)
          //   vwaddu.vv    v10, v8, v9
          //   li       a0, -1                   (ignored)
          //   vwmaccu.vx   v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // Example sequence:
            //   vnsrl.wi   v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
                                                        LT.second, CostKind);
          }
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost +
               getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
      }
      [[fallthrough]];
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
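      // Illustrative sequence (index vectors and select mask loaded from the
      // constant pool):
      //   vrgather.vv  v12, v8, v10
      //   vrgather.vv  v12, v9, v11, v0.t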
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost +
               getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
                                       LT.second, CostKind) +
               MaskCost;
      }
      [[fallthrough]];
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. We provide an accurate cost only for splits
      // where the element type remains the same.
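      // The shuffle is modeled as NumRegs sub-shuffles of SubVF elements; each
      // sub-mask is classified as single- or two-source and costed
      // recursively below.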
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0; I < NumRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(Mask.slice(I * SubVF,
                               I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
                    SubMask.begin(), [&](int I) {
                      bool SingleSubVector = I / VF == 0;
                      IsSingleVector &= SingleSubVector;
                      return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
                    });
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
      }
      break;
    }
    }
  }

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Example sequence:
    // vsetivli     zero, 4, e8, mf2, tu, ma (ignored)
    // vslidedown.vi  v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  case TTI::SK_InsertSubvector:
    // Example sequence:
    // vsetivli     zero, 4, e8, mf2, tu, ma (ignored)
    // vslideup.vi  v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
  case TTI::SK_Select: {
    // Example sequence:
    // li           a0, 90
    // vsetivli     zero, 8, e8, mf2, ta, ma (ignored)
    // vmv.s.x      v0, a0
    // vmerge.vvm   v8, v9, v8, v0
    // We use 2 for the cost of the mask materialization as this is the true
    // cost for small masks and most shuffles are small.  At worst, this cost
    // should be a very small constant for the constant pool load.  As such,
    // we may bias towards large selects slightly more than truly warranted.
    return LT.first *
           (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
                                        LT.second, CostKind));
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        //   andi a0, a0, 1
        //   vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        //   vmv.v.x v8, a0
        //   vmsne.vi v0, v8, 0
        return LT.first *
               (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi
                getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                        LT.second, CostKind));
      }
      // Example sequence:
      //   vsetivli  zero, 2, e8, mf8, ta, mu (ignored)
      //   vmv.v.i v8, 0
      //   vmerge.vim      v8, v8, 1, v0
      //   vmv.x.s a0, v8
      //   andi    a0, a0, 1
      //   vmv.v.x v8, a0
      //   vmsne.vi  v0, v8, 0

      return LT.first *
             (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi
              getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
                                       RISCV::VMV_X_S, RISCV::VMV_V_X,
                                       RISCV::VMSNE_VI},
                                      LT.second, CostKind));
    }

    if (HasScalar) {
      // Example sequence:
      //   vmv.v.x v8, a0
      return LT.first *
             getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
    }

    // Example sequence:
    //   vrgather.vi     v9, v8, 0
    return LT.first *
           getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
  }
  case TTI::SK_Splice: {
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
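    // Illustrative sequence for a splice with a small positive index:
    //   vslidedown.vi v8, v8, 2
    //   vslideup.vx   v8, v9, a0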
    unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
    if (Index >= 0 && Index < 32)
      Opcodes[0] = RISCV::VSLIDEDOWN_VI;
    else if (Index < 0 && Index > -32)
      Opcodes[1] = RISCV::VSLIDEUP_VI;
    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // * i64 on RV32
    // * i1 vector
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // Example sequence:
    //   csrr a0, vlenb
    //   srli a0, a0, 3
    //   addi a0, a0, -1
    //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    //   vid.v v9
    //   vrsub.vx v10, v9, a0
    //   vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX}
    InstructionCost GatherCost =
        2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
    // Mask operations additionally require an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();
  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store combined with a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat it as if it's just one (legal)
  // memory op.
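  // For example, a Factor=2 load of <8 x i32> together with its two
  // deinterleaving shuffles is expected to become a single vlseg2e32.v.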
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
    // Need to make sure type hasn't been scalarized
    if (LT.second.isFixedLengthVector()) {
      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
                                             LT.second.getVectorNumElements());
      // FIXME: We use the memory op cost of the *legalized* type here, because
      // its getMemoryOpCost returns a really expensive cost for types like
      // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
      // Should the memory op cost of these be cheaper?
      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
        return LT.first + LegalMemCost;
      }
    }
  }

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied.  For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
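  // For example, a gather from <vscale x 4 x i32> with a tuning vscale of 2
  // is modeled as 8 element loads.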
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

// Currently, these represent both throughput and codesize costs
// for the respective intrinsics.  The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
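// Entries are keyed by the legalized element type; getIntrinsicInstrCost
// scales the table cost by LT.first when the vector is split across
// registers.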
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::lrint, MVT::i32, 1},
    {Intrinsic::lrint, MVT::i64, 1},
    {Intrinsic::llrint, MVT::i64, 1},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
};

static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if ((ST->hasVInstructions() && LT.second.isVector()) ||
        (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  // TODO: add more intrinsics
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = getTypeLegalizationCost(RetTy);
    return Cost + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
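    // Illustrative lowering (threshold constant and vsetvli omitted), assuming
    // the usual mask-and-convert pattern:
    //   vfabs.v     v12, v8
    //   vmflt.vf    v0, v12, fa5
    //   vfcvt.x.f.v v12, v8, v0.t
    //   vfcvt.f.x.v v12, v12, v0.t
    //   vfsgnj.vv   v8, v12, v8, v0.t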
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one more write of fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    if (auto LT = getTypeLegalizationCost(RetTy);
        LT.second.isVector()) {
      MVT EltTy = LT.second.getVectorElementType();
      if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                              ICA.getID(), EltTy))
        return LT.first * Entry->Cost;
    }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELen() ||
        Dst->getScalarSizeInBits() > ST->getELen())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      if (Src->getScalarSizeInBits() == 1) {
        // We do not use vsext/vzext to extend from mask vector.
        // Instead we use the following instructions to extend from mask vector:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        return 2;
      }
      return 1;
    case ISD::TRUNCATE:
      if (Dst->getScalarSizeInBits() == 1) {
        // We do not use several vncvt instructions to truncate to a mask
        // vector, so we cannot use PowDiff to calculate the cost.
        // Instead we use the following instructions to truncate to mask vector:
        // vand.vi v8, v8, 1
        // vmsne.vi v0, v8, 0
        return 2;
      }
      [[fallthrough]];
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
        // The cost of converting from or to a mask vector differs from the
        // other cases, so we cannot use PowDiff to calculate it.
        // For mask vector to fp, we should use the following instructions:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        // vfcvt.f.x.v v8, v8

        // And for fp vector to mask, we use:
        // vfncvt.rtz.x.f.w v9, v8
        // vand.vi v8, v9, 1
        // vmsne.vi v0, v8, 0
        return 3;
      }
      if (std::abs(PowDiff) <= 1)
        return 1;
      // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only needs two conversions.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll.  umax and smin actually only
    // cost 2, but we don't have enough info here so we slightly overestimate.
    return (LT.first - 1) + 3;

  // IR Reduction is composed by two vmv and one rvv reduction instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // IR Reduction is composed by two vmv and one rvv reduction instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.v.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
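    // For example, a store of splat(i32 300) is roughly:
    //   li      a1, 300
    //   vmv.v.x v8, a1
    //   vse32.v v8, (a0)
    // and only the vmv.v.x is counted here.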
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  InstructionCost BaseCost =
    BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                           CostKind, OpInfo, I);
  // Assume memory ops cost scale with the number of vector registers
  // possibly accessed by the instruction.  Note that BasicTTI already
  // handles the LT.first term for us.
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}

InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first * 3;
      }
      // vselect and max/min are supported natively.
      return LT.first * 1;
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      //  vmv.v.x v9, a0
      //  vmsne.vi v9, v9, 0
      //  vmandn.mm v8, v8, v9
      //  vmand.mm v9, v0, v9
      //  vmor.mm v0, v9, v8
      return LT.first * 5;
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * 3;
  }

  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    // Support natively.
    if (CmpInst::isIntPredicate(VecPred))
      return LT.first * 1;

    // If we do not support the input floating point vector type, use the base
    // one which will calculate as:
    // ScalarizeCost + Num * Cost for fixed vector,
    // InvalidCost for scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);
    switch (VecPred) {
      // Support natively.
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return LT.first * 1;
    // TODO: Other comparisons?
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with non-constant index is very costly when
    // scalarized; estimate cost of loads/stores sequence via the stack:
    // ExtractElement cost: store vector to stack, load scalar;
    // InsertElement cost: store vector to stack, store scalar, load vector.
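    // For example, extracting one element of a <4 x i32> this way is modeled
    // as 4 element stores plus 1 scalar load.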
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vector.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
      VectorType::get(IntegerType::get(Val->getContext(), 8),
                      cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost
        = getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost
        = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost
      = getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost
      = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost
      = getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
  // and vslideup + vmv.s.x to insert element to vector.
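  // Illustrative sequences (vsetvli omitted):
  //   extract:  vslidedown.vx v9, v8, a0
  //             vmv.x.s       a0, v9
  //   insert:   vmv.s.x       v9, a0
  //             addi          a1, a2, 1
  //             vslideup.vx   v8, v9, a2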
1223  unsigned BaseCost = 1;
1224  // When insertelement we should add the index with 1 as the input of vslideup.
1225  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1226
1227  if (Index != -1U) {
1228    // The type may be split. For fixed-width vectors we can normalize the
1229    // index to the new type.
1230    if (LT.second.isFixedLengthVector()) {
1231      unsigned Width = LT.second.getVectorNumElements();
1232      Index = Index % Width;
1233    }
1234
1235    // We could extract/insert the first element without vslidedown/vslideup.
1236    if (Index == 0)
1237      SlideCost = 0;
1238    else if (Opcode == Instruction::InsertElement)
1239      SlideCost = 1; // With a constant index, we do not need to use addi.
1240  }
1241
1242  // Extract i64 in the target that has XLEN=32 need more instruction.
1243  if (Val->getScalarType()->isIntegerTy() &&
1244      ST->getXLen() < Val->getScalarSizeInBits()) {
1245    // For extractelement, we need the following instructions:
1246    // vsetivli zero, 1, e64, m1, ta, mu (not count)
1247    // vslidedown.vx v8, v8, a0
1248    // vmv.x.s a0, v8
1249    // li a1, 32
1250    // vsrl.vx v8, v8, a1
1251    // vmv.x.s a1, v8
1252
1253    // For insertelement, we need the following instructions:
1254    // vsetivli zero, 2, e32, m4, ta, mu (not count)
1255    // vmv.v.i v12, 0
1256    // vslide1up.vx v16, v12, a1
1257    // vslide1up.vx v12, v16, a0
1258    // addi a0, a2, 1
1259    // vsetvli zero, a0, e64, m4, tu, mu (not count)
1260    // vslideup.vx v8, v12, a2
1261
1262    // TODO: should we count these special vsetvlis?
1263    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1264  }
1265  return BaseCost + SlideCost;
1266}
1267
1268InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1269    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1270    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1271    ArrayRef<const Value *> Args, const Instruction *CxtI) {
1272
1273  // TODO: Handle more cost kinds.
1274  if (CostKind != TTI::TCK_RecipThroughput)
1275    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1276                                         Args, CxtI);
1277
1278  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1279    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1280                                         Args, CxtI);
1281
1282  // Skip if scalar size of Ty is bigger than ELEN.
1283  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1284    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1285                                         Args, CxtI);
1286
1287  // Legalize the type.
1288  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1289
1290  // TODO: Handle scalar type.
1291  if (!LT.second.isVector())
1292    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1293                                         Args, CxtI);
1294
1295
1296  auto getConstantMatCost =
1297    [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1298    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1299      // Two sub-cases:
1300      // * Has a 5 bit immediate operand which can be splatted.
1301      // * Has a larger immediate which must be materialized in scalar register
1302      // We return 0 for both as we currently ignore the cost of materializing
1303      // scalar constants in GPRs.
1304      return 0;
1305
1306    return getConstantPoolLoadCost(Ty, CostKind);
1307  };
1308
1309  // Add the cost of materializing any constant vectors required.
1310  InstructionCost ConstantMatCost = 0;
1311  if (Op1Info.isConstant())
1312    ConstantMatCost += getConstantMatCost(0, Op1Info);
1313  if (Op2Info.isConstant())
1314    ConstantMatCost += getConstantMatCost(1, Op2Info);
1315
1316  switch (TLI->InstructionOpcodeToISD(Opcode)) {
1317  case ISD::ADD:
1318  case ISD::SUB:
1319  case ISD::AND:
1320  case ISD::OR:
1321  case ISD::XOR:
1322  case ISD::SHL:
1323  case ISD::SRL:
1324  case ISD::SRA:
1325  case ISD::MUL:
1326  case ISD::MULHS:
1327  case ISD::MULHU:
1328  case ISD::FADD:
1329  case ISD::FSUB:
1330  case ISD::FMUL:
1331  case ISD::FNEG: {
1332    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
1333  }
1334  default:
1335    return ConstantMatCost +
1336           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1337                                         Args, CxtI);
1338  }
1339}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we only account for GEP instructions (although the
  // pointers may also be allocas, plain values, constants and/or constant
  // expressions, PHIs, bitcasts ... anything that is allowed to be used as a
  // pointer). Typically, if Base is not a GEP instruction and all the
  // pointers are relative to the same base address, the rest are either
  // GEP instructions, PHIs, bitcasts or constants. When all pointers share
  // the same base, we charge each non-Base GEP as an ADD operation if any of
  // its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // computed as the sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
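      // Illustrative sketch (assuming the usual RISC-V base + 12-bit
      // immediate addressing mode): for a unit-stride chain of i32 accesses
      // the non-Base pointers are Base+4, Base+8, ..., which fold into the
      // memory-operand offset (e.g. 4(a0), 8(a0)) and are treated as free
      // whenever isLegalAddressingMode accepts Stride * I as an offset.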
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: All of the settings below would benefit from more tuning on
  //       benchmarks and metrics, with changes applied as needed.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally; it does not depend on the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow one exit other than the latch. This acts as an early bail-out
  // and mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
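  // Illustrative example: an if-then-else diamond in the body uses exactly
  // four blocks (the header with the condition branch, the "then" block, the
  // "else" block, and the latch where both paths rejoin), so it still passes
  // the check below.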
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing unrolling of small loops can be very useful because of the
  // taken-branch cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
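    // Worked example (illustrative): a <vscale x 4 x i32> value has a known
    // minimum size of 128 bits; with RVVBitsPerBlock == 64 that occupies
    // divideCeil(128, 64) == 2 vector registers, i.e. an LMUL=2 value.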
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth.  This is the
  // same routine as used in LoopVectorizer.  We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
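  // Illustrative example: assuming getRegisterBitWidth(RGK_FixedWidthVector)
  // reports 256 bits (say, VLEN=128 scaled by an LMUL of 2), the maximum SLP
  // VF for 32-bit elements would be 256 / 32 == 8.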
  TypeSize RegWidth =
    getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If there are no vector registers, or the element width is absurdly large,
  // disable vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V specific policy here is that instruction count has first
  // priority; the remaining LSR cost components only break ties.
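  // Illustrative example of the lexicographic comparison below: a candidate
  // with {Insns = 3, NumRegs = 6, ...} is preferred over one with
  // {Insns = 4, NumRegs = 2, ...}, because Insns is compared before any
  // other field.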
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}