//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
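  // For example (illustrative): a caller built with "+neon,+fullfp16" may
  // inline a callee built with only "+neon", but not vice versa, since the
  // callee could rely on features the caller's subtarget lacks.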
  return (CallerBits & CalleeBits) == CalleeBits;
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

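  // Illustrative cost examples (assuming the usual MOVZ/MOVK expansion):
  // 0x1234 can be built with a single MOVZ (cost 1), while a value such as
  // 0x1234567890abcdef needs a MOVZ plus three MOVKs (cost 4).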
  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

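  // For example (illustrative), an i128 immediate is costed as two independent
  // 64-bit chunks: each half is fed to the 64-bit helper above and the
  // per-chunk costs are summed.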
  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  int Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1, Cost);
}

int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                      const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
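  // (The shift amount is typically encoded directly in the instruction, so it
  // never needs to be materialized in a register.)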
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}

int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                        const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {

  // A helper that returns a vector type from the given type. The number of
  // elements in type Ty determines the vector width.
  auto toVectorTy = [&](Type *ArgTy) {
    return VectorType::get(ArgTy->getScalarType(),
                           DstTy->getVectorNumElements());
  };

  // Exit early if DstTy is not a vector type whose elements are at least
  // 16 bits wide.
  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
    return false;

  // Determine if the operation has a widening variant. We consider both the
  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
  // instructions.
  //
  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
  //       verify that their extending operands are eliminated during code
  //       generation.
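  // Illustrative example (hypothetical IR): for
  //   %e = zext <8 x i8> %b to <8 x i16>
  //   %a = add <8 x i16> %x, %e
  // the extend folds into a single "uaddw" ("wide" form); if both operands are
  // extends of <8 x i8> values, the "long" form "uaddl" is used instead.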
  switch (Opcode) {
  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
    break;
  default:
    return false;
  }

  // To be a widening instruction (either the "wide" or "long" version), the
  // second operand must be a sign- or zero-extend having a single user. We
  // only consider extends having a single user because they may otherwise not
  // be eliminated.
  if (Args.size() != 2 ||
      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
      !Args[1]->hasOneUse())
    return false;
  auto *Extend = cast<CastInst>(Args[1]);

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  Type *SrcTy = toVectorTy(Extend->getSrcTy());
  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
  unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
  unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();

  // Return true if the legalized types have the same number of vector elements
  // and the destination element type size is twice that of the source type.
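  // (E.g., a legalized v8i16 destination fed by a v8i8 extend qualifies: both
  // have eight elements and 16 == 2 * 8.)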
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
}

int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                     const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // If the cast is observable, and it is used by a widening instruction (e.g.,
  // uaddl, saddw, etc.), it may be free.
  if (I && I->hasOneUse()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
      // If the cast is the second operand, it is free. We will generate either
      // a "wide" or "long" version of the widening instruction.
      if (I == SingleUser->getOperand(1))
        return 0;
      // If the cast is not the second operand, it will be free if it looks the
      // same as the second operand. In this case, we will generate a "long"
      // version of the widening instruction.
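      // (Illustrative: "add (zext <8 x i8> %a), (zext <8 x i8> %b)" becomes a
      // single uaddl, so the first zext is also free.)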
      if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
        if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
            cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
          return 0;
    }
  }

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return BaseT::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry
  ConversionTbl[] = {
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },

    // The number of shll instructions for the extension.
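    // (E.g., v4i16 -> v4i64 is one shll to v4i32 followed by a shll/shll2 pair
    // producing the two v2i64 halves, i.e. three instructions.)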
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v8f32
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },

    // Complex: to v16f32
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },

    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },

    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
  };

  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
                                                 DstTy.getSimpleVT(),
                                                 SrcTy.getSimpleVT()))
    return Entry->Cost;

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                             VectorType *VecTy,
                                             unsigned Index) {

  // Make sure we were given a valid extend opcode.
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
         "Invalid opcode");

  // We are extending an element we extract from a vector, so the source type
  // of the extend is the element type of the vector.
  auto *Src = VecTy->getElementType();

  // Sign- and zero-extends are for integer types only.
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  // Get the cost for the extract. We compute the cost (if any) for the extend
  // below.
  auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);

  // Legalize the types.
  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
  auto DstVT = TLI->getValueType(DL, Dst);
  auto SrcVT = TLI->getValueType(DL, Src);

  // If the resulting type is still a vector and the destination type is legal,
  // we may get the extension for free. If not, get the default cost for the
  // extend.
  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
    return Cost + getCastInstrCost(Opcode, Dst, Src);

  // The destination type should be larger than the element type. If not, get
  // the default cost for the extend.
  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
    return Cost + getCastInstrCost(Opcode, Dst, Src);

  switch (Opcode) {
  default:
    llvm_unreachable("Opcode should be either SExt or ZExt");

  // For sign-extends, we only need a smov, which performs the extension
  // automatically.
  case Instruction::SExt:
    return Cost;

  // For zero-extends, the extend is performed automatically by a umov unless
  // the destination type is i64 and the element type is i8 or i16.
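  // (E.g., an i8 -> i32 extract-and-extend is a single "umov w0, v0.b[i]",
  // whereas the i8/i16 -> i64 case is modeled as needing an extra extend.)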
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
      return Cost;
  }

  // If we are unable to perform the extend for free, get the default cost.
  return Cost + getCastInstrCost(Opcode, Dst, Src);
}

int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                       unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other insert/extracts cost this much.
  return ST->getVectorInsertExtractBaseCost();
}

int AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
  // add in the widening overhead specified by the sub-target. Since the
  // extends feeding widening instructions are performed automatically, they
  // aren't present in the generated code and have a zero cost. By adding a
  // widening overhead here, we attach the total cost of the combined operation
  // to the widening instruction.
  int Cost = 0;
  if (isWideningInstruction(Ty, Opcode, Args))
    Cost += ST->getWideningBaseCost();

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  case ISD::SDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
      // On AArch64, scalar signed division by a power-of-two constant is
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties may not be the same as those of the
      // previous operation; conservatively assume OP_None.
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      return Cost;
    }
    LLVM_FALLTHROUGH;
  case ISD::UDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
      auto VT = TLI->getValueType(DL, Ty);
      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by a constant is expanded to the sequence
        // MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division to
        // MULHU + SUB + SRL + ADD + SRL.
        int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
                                             Opd2Info,
                                             TargetTransformInfo::OP_None,
                                             TargetTransformInfo::OP_None);
        int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
                                             Opd2Info,
                                             TargetTransformInfo::OP_None,
                                             TargetTransformInfo::OP_None);
        int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
                                             Opd2Info,
                                             TargetTransformInfo::OP_None,
                                             TargetTransformInfo::OP_None);
        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
      }
    }

    Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                          Opd1PropInfo, Opd2PropInfo);
    if (Ty->isVectorTy()) {
      // On AArch64, vector divisions are not supported natively and are
      // expanded into scalar divisions of each pair of elements.
      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
                                     Opd2Info, Opd1PropInfo, Opd2PropInfo);
      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
                                     Opd2Info, Opd1PropInfo, Opd2PropInfo);
      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;

  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return (Cost + 1) * LT.first;
  }
}

int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                              const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (Ty->isVectorTy() && SE &&
      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}

int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                       Type *CondTy, const Instruction *I) {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower vector selects well when they are wider than the register
  // width.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization happening.
    const int AmortizationCost = 20;
    static const TypeConversionCostTblEntry
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;
  // TODO: Though vector loads usually perform well on AArch64, in some targets
  // they may wake up the FP unit, which raises the power consumption.  Perhaps
  // they could be used with no holds barred (-O3).
  Options.LoadSizes = {8, 4, 2, 1};
  return Options;
}

int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                    MaybeAlign Alignment, unsigned AddressSpace,
                                    const Instruction *I) {
  auto LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because of the negative impact that splitting
    // has shown in practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
    unsigned ProfitableNumElements;
    if (Opcode == Instruction::Store)
      // We use a custom trunc store lowering so v.4b should be profitable.
      ProfitableNumElements = 4;
    else
      // We scalarize the loads because there is no v.4b register and we
      // would have to promote the elements to v.2.
      ProfitableNumElements = 8;

    if (Ty->getVectorNumElements() < ProfitableNumElements) {
      unsigned NumVecElts = Ty->getVectorNumElements();
      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
      // We generate 2 instructions per vector element.
      return NumVectorizableInstsToAmortize * NumVecElts * 2;
    }
  }

  return LT.first;
}

int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                               unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
                                               unsigned AddressSpace,
                                               bool UseMaskForCond,
                                               bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned NumElts = VecTy->getVectorNumElements();
    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // ldN/stN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one ldN/stN instruction.
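    // (Illustrative: a Factor == 4 access on v16i32 uses SubVecTy == v4i32,
    // a single legal 128-bit ldN/stN block, giving a cost of 4 * 1.)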
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace,
                                           UseMaskForCond, UseMaskForGaps);
}

int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  int Cost = 0;
  for (auto *I : Tys) {
    if (!I->isVectorTy())
      continue;
    if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
      Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0) +
              getMemoryOpCost(Instruction::Load, I, Align(128), 0);
  }
  return Cost;
}

unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return ST->getMaxInterleaveFactor();
}

// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources.  We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
// many strided loads.
static void
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                              TargetTransformInfo::UnrollingPreferences &UP) {
  enum { MaxStridedLoads = 7 };
  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
    int StridedLoads = 0;
    // FIXME? We could make this more precise by looking at the CFG and
    // e.g. not counting loads in each side of an if-then-else diamond.
    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(&I);
        if (!LMemI)
          continue;

        Value *PtrValue = LMemI->getPointerOperand();
        if (L->isLoopInvariant(PtrValue))
          continue;

        const SCEV *LSCEV = SE.getSCEV(PtrValue);
        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
          continue;

        // FIXME? We could take pairing of unrolled load copies into account
        // by looking at the AddRec, but we would probably have to limit this
        // to loops with no stores or other memory optimization barriers.
        ++StridedLoads;
        // We've seen enough strided loads that seeing more won't make a
        // difference.
        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;
      }
    }
    return StridedLoads;
  };

  int StridedLoads = countStridedLoads(L, SE);
  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
                    << " strided loads\n");
  // Pick the largest power of 2 unroll count that won't result in too many
  // strided loads.
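  // (E.g., with MaxStridedLoads == 7 and 3 strided loads detected, the largest
  // power of two not exceeding 7 / 3 == 2 is chosen, so MaxCount becomes 2.)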
  if (StridedLoads) {
    UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
                      << UP.MaxCount << '\n');
  }
}

void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  // Enable partial unrolling and runtime unrolling.
  BaseT::getUnrollingPreferences(L, SE, UP);

  // An inner loop is more likely to be hot, and the runtime check can be
  // hoisted out by the LICM pass, so the overhead is lower; use a larger
  // threshold to unroll more loops.
  if (L->getLoopDepth() > 1)
    UP.PartialThreshold *= 2;

  // Disable partial & runtime unrolling on -Os.
  UP.PartialOptSizeThreshold = 0;

  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
      EnableFalkorHWPFUnrollFix)
    getFalkorUnrollingPreferences(L, SE, UP);
}

Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {
  switch (Inst->getIntrinsicID()) {
  default:
    return nullptr;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // Create a struct type
    StructType *ST = dyn_cast<StructType>(ExpectedType);
    if (!ST)
      return nullptr;
    unsigned NumElts = Inst->getNumArgOperands() - 1;
    if (ST->getNumElements() != NumElts)
      return nullptr;
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
        return nullptr;
    }
    Value *Res = UndefValue::get(ExpectedType);
    IRBuilder<> Builder(Inst);
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      Value *L = Inst->getArgOperand(i);
      Res = Builder.CreateInsertValue(Res, L, i);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
      return Inst;
    return nullptr;
  }
}

bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  default:
    break;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.PtrVal = Inst->getArgOperand(0);
    break;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
    break;
  }

  switch (Inst->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  }
  return true;
}

/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with the right type that is used in memory accesses. If it is
/// used in a "complex" getelementptr, we allow it to be promoted without
/// finding other sext instructions that sign extended the same initial value.
/// A getelementptr is considered "complex" if it has more than 2 operands.
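/// (E.g., hypothetically, "%idx = sext i32 %i to i64" feeding
/// "getelementptr [10 x i32], [10 x i32]* %p, i64 0, i64 %idx" has more than
/// two GEP operands, so promotion without a common header is allowed.)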
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))
    return false;
  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;
  // See if the sext is the one with the right type and used in at least one
  // GetElementPtrInst.
  for (const User *U : I.users()) {
    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
      Considerable = true;
      // A getelementptr is considered "complex" if it has more than 2
      // operands. We will promote a SExt used in such a complex GEP, as we
      // expect some computation to be merged if it is done on 64 bits.
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
        break;
      }
    }
  }
  return Considerable;
}

bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                           TTI::ReductionFlags Flags) const {
  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  switch (Opcode) {
  case Instruction::FAdd:
  case Instruction::FMul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
    return false;
  case Instruction::Add:
    return ScalarBits * Ty->getVectorNumElements() >= 128;
  case Instruction::ICmp:
    return (ScalarBits < 64) &&
           (ScalarBits * Ty->getVectorNumElements() >= 128);
  case Instruction::FCmp:
    return Flags.NoNaN;
  default:
    llvm_unreachable("Unhandled reduction opcode");
  }
  return false;
}

int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
                                               bool IsPairwiseForm) {

  if (IsPairwiseForm)
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as normal vector adds. This is the only arithmetic vector
  // reduction operation for which we have an instruction.
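  // (E.g., an add reduction of a v4i32 lowers to a single "addv s0, v0.4s",
  // so it is costed like one vector add.)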
  static const CostTblEntry CostTblNoPairwise[]{
      {ISD::ADD, MVT::v8i8,  1},
      {ISD::ADD, MVT::v16i8, 1},
      {ISD::ADD, MVT::v4i16, 1},
      {ISD::ADD, MVT::v8i16, 1},
      {ISD::ADD, MVT::v4i32, 1},
  };

  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
    return LT.first * Entry->Cost;

  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
}

int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) {
  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
    static const CostTblEntry ShuffleTbl[] = {
      // Broadcast shuffle kinds can be performed with 'dup'.
      { TTI::SK_Broadcast, MVT::v8i8,  1 },
      { TTI::SK_Broadcast, MVT::v16i8, 1 },
      { TTI::SK_Broadcast, MVT::v4i16, 1 },
      { TTI::SK_Broadcast, MVT::v8i16, 1 },
      { TTI::SK_Broadcast, MVT::v2i32, 1 },
      { TTI::SK_Broadcast, MVT::v4i32, 1 },
      { TTI::SK_Broadcast, MVT::v2i64, 1 },
      { TTI::SK_Broadcast, MVT::v2f32, 1 },
      { TTI::SK_Broadcast, MVT::v4f32, 1 },
      { TTI::SK_Broadcast, MVT::v2f64, 1 },
      // Transpose shuffle kinds can be performed with 'trn1/trn2' and
      // 'zip1/zip2' instructions.
      { TTI::SK_Transpose, MVT::v8i8,  1 },
      { TTI::SK_Transpose, MVT::v16i8, 1 },
      { TTI::SK_Transpose, MVT::v4i16, 1 },
      { TTI::SK_Transpose, MVT::v8i16, 1 },
      { TTI::SK_Transpose, MVT::v2i32, 1 },
      { TTI::SK_Transpose, MVT::v4i32, 1 },
      { TTI::SK_Transpose, MVT::v2i64, 1 },
      { TTI::SK_Transpose, MVT::v2f32, 1 },
      { TTI::SK_Transpose, MVT::v4f32, 1 },
      { TTI::SK_Transpose, MVT::v2f64, 1 },
      // Select shuffle kinds.
      // TODO: handle vXi8/vXi16.
      { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
      { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
      // PermuteSingleSrc shuffle kinds.
      // TODO: handle vXi8/vXi16.
      { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
    };
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}