1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9/// \file 10/// This file provides a helper that implements much of the TTI interface in 11/// terms of the target-independent code generator and TargetLowering 12/// interfaces. 13// 14//===----------------------------------------------------------------------===// 15 16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H 17#define LLVM_CODEGEN_BASICTTIIMPL_H 18 19#include "llvm/ADT/APInt.h" 20#include "llvm/ADT/ArrayRef.h" 21#include "llvm/ADT/BitVector.h" 22#include "llvm/ADT/SmallPtrSet.h" 23#include "llvm/ADT/SmallVector.h" 24#include "llvm/Analysis/LoopInfo.h" 25#include "llvm/Analysis/TargetTransformInfo.h" 26#include "llvm/Analysis/TargetTransformInfoImpl.h" 27#include "llvm/CodeGen/ISDOpcodes.h" 28#include "llvm/CodeGen/TargetLowering.h" 29#include "llvm/CodeGen/TargetSubtargetInfo.h" 30#include "llvm/CodeGen/ValueTypes.h" 31#include "llvm/IR/BasicBlock.h" 32#include "llvm/IR/Constant.h" 33#include "llvm/IR/Constants.h" 34#include "llvm/IR/DataLayout.h" 35#include "llvm/IR/DerivedTypes.h" 36#include "llvm/IR/InstrTypes.h" 37#include "llvm/IR/Instruction.h" 38#include "llvm/IR/Instructions.h" 39#include "llvm/IR/Intrinsics.h" 40#include "llvm/IR/Operator.h" 41#include "llvm/IR/Type.h" 42#include "llvm/IR/Value.h" 43#include "llvm/Support/Casting.h" 44#include "llvm/Support/CommandLine.h" 45#include "llvm/Support/ErrorHandling.h" 46#include "llvm/Support/MachineValueType.h" 47#include "llvm/Support/MathExtras.h" 48#include "llvm/Target/TargetMachine.h" 49#include <algorithm> 50#include <cassert> 51#include <cstdint> 52#include <limits> 53#include <utility> 54 55namespace llvm { 56 57class Function; 58class GlobalValue; 59class LLVMContext; 60class ScalarEvolution; 61class SCEV; 62class TargetMachine; 63 64extern cl::opt<unsigned> PartialUnrollingThreshold; 65 66/// Base class which can be used to help build a TTI implementation. 67/// 68/// This class provides as much implementation of the TTI interface as is 69/// possible using the target independent parts of the code generator. 70/// 71/// In order to subclass it, your class must implement a getST() method to 72/// return the subtarget, and a getTLI() method to return the target lowering. 73/// We need these methods implemented in the derived class so that this class 74/// doesn't have to duplicate storage for them. 75template <typename T> 76class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { 77private: 78 using BaseT = TargetTransformInfoImplCRTPBase<T>; 79 using TTI = TargetTransformInfo; 80 81 /// Helper function to access this as a T. 82 T *thisT() { return static_cast<T *>(this); } 83 84 /// Estimate a cost of Broadcast as an extract and sequence of insert 85 /// operations. 86 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) { 87 InstructionCost Cost = 0; 88 // Broadcast cost is equal to the cost of extracting the zero'th element 89 // plus the cost of inserting it into every element of the result vector. 
90 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0); 91 92 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 93 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); 94 } 95 return Cost; 96 } 97 98 /// Estimate a cost of shuffle as a sequence of extract and insert 99 /// operations. 100 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) { 101 InstructionCost Cost = 0; 102 // Shuffle cost is equal to the cost of extracting element from its argument 103 // plus the cost of inserting them onto the result vector. 104 105 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from 106 // index 0 of first vector, index 1 of second vector,index 2 of first 107 // vector and finally index 3 of second vector and insert them at index 108 // <0,1,2,3> of result vector. 109 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 110 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); 111 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i); 112 } 113 return Cost; 114 } 115 116 /// Estimate a cost of subvector extraction as a sequence of extract and 117 /// insert operations. 118 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index, 119 FixedVectorType *SubVTy) { 120 assert(VTy && SubVTy && 121 "Can only extract subvectors from vectors"); 122 int NumSubElts = SubVTy->getNumElements(); 123 assert((!isa<FixedVectorType>(VTy) || 124 (Index + NumSubElts) <= 125 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 126 "SK_ExtractSubvector index out of range"); 127 128 InstructionCost Cost = 0; 129 // Subvector extraction cost is equal to the cost of extracting element from 130 // the source type plus the cost of inserting them into the result vector 131 // type. 132 for (int i = 0; i != NumSubElts; ++i) { 133 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 134 i + Index); 135 Cost += 136 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i); 137 } 138 return Cost; 139 } 140 141 /// Estimate a cost of subvector insertion as a sequence of extract and 142 /// insert operations. 143 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index, 144 FixedVectorType *SubVTy) { 145 assert(VTy && SubVTy && 146 "Can only insert subvectors into vectors"); 147 int NumSubElts = SubVTy->getNumElements(); 148 assert((!isa<FixedVectorType>(VTy) || 149 (Index + NumSubElts) <= 150 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 151 "SK_InsertSubvector index out of range"); 152 153 InstructionCost Cost = 0; 154 // Subvector insertion cost is equal to the cost of extracting element from 155 // the source type plus the cost of inserting them into the result vector 156 // type. 157 for (int i = 0; i != NumSubElts; ++i) { 158 Cost += 159 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i); 160 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, 161 i + Index); 162 } 163 return Cost; 164 } 165 166 /// Local query method delegates up to T which *must* implement this! 167 const TargetSubtargetInfo *getST() const { 168 return static_cast<const T *>(this)->getST(); 169 } 170 171 /// Local query method delegates up to T which *must* implement this! 
172 const TargetLoweringBase *getTLI() const { 173 return static_cast<const T *>(this)->getTLI(); 174 } 175 176 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { 177 switch (M) { 178 case TTI::MIM_Unindexed: 179 return ISD::UNINDEXED; 180 case TTI::MIM_PreInc: 181 return ISD::PRE_INC; 182 case TTI::MIM_PreDec: 183 return ISD::PRE_DEC; 184 case TTI::MIM_PostInc: 185 return ISD::POST_INC; 186 case TTI::MIM_PostDec: 187 return ISD::POST_DEC; 188 } 189 llvm_unreachable("Unexpected MemIndexedMode"); 190 } 191 192 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 193 Align Alignment, 194 bool VariableMask, 195 bool IsGatherScatter, 196 TTI::TargetCostKind CostKind) { 197 auto *VT = cast<FixedVectorType>(DataTy); 198 // Assume the target does not have support for gather/scatter operations 199 // and provide a rough estimate. 200 // 201 // First, compute the cost of the individual memory operations. 202 InstructionCost AddrExtractCost = 203 IsGatherScatter 204 ? getVectorInstrCost(Instruction::ExtractElement, 205 FixedVectorType::get( 206 PointerType::get(VT->getElementType(), 0), 207 VT->getNumElements()), 208 -1) 209 : 0; 210 InstructionCost LoadCost = 211 VT->getNumElements() * 212 (AddrExtractCost + 213 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind)); 214 215 // Next, compute the cost of packing the result in a vector. 216 InstructionCost PackingCost = getScalarizationOverhead( 217 VT, Opcode != Instruction::Store, Opcode == Instruction::Store); 218 219 InstructionCost ConditionalCost = 0; 220 if (VariableMask) { 221 // Compute the cost of conditionally executing the memory operations with 222 // variable masks. This includes extracting the individual conditions, a 223 // branches and PHIs to combine the results. 224 // NOTE: Estimating the cost of conditionally executing the memory 225 // operations accurately is quite difficult and the current solution 226 // provides a very rough estimate only. 227 ConditionalCost = 228 VT->getNumElements() * 229 (getVectorInstrCost( 230 Instruction::ExtractElement, 231 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), 232 VT->getNumElements()), 233 -1) + 234 getCFInstrCost(Instruction::Br, CostKind) + 235 getCFInstrCost(Instruction::PHI, CostKind)); 236 } 237 238 return LoadCost + PackingCost + ConditionalCost; 239 } 240 241protected: 242 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) 243 : BaseT(DL) {} 244 virtual ~BasicTTIImplBase() = default; 245 246 using TargetTransformInfoImplBase::DL; 247 248public: 249 /// \name Scalar TTI Implementations 250 /// @{ 251 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, 252 unsigned AddressSpace, Align Alignment, 253 bool *Fast) const { 254 EVT E = EVT::getIntegerVT(Context, BitWidth); 255 return getTLI()->allowsMisalignedMemoryAccesses( 256 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); 257 } 258 259 bool hasBranchDivergence() { return false; } 260 261 bool useGPUDivergenceAnalysis() { return false; } 262 263 bool isSourceOfDivergence(const Value *V) { return false; } 264 265 bool isAlwaysUniform(const Value *V) { return false; } 266 267 unsigned getFlatAddressSpace() { 268 // Return an invalid address space. 
269 return -1; 270 } 271 272 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, 273 Intrinsic::ID IID) const { 274 return false; 275 } 276 277 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 278 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); 279 } 280 281 unsigned getAssumedAddrSpace(const Value *V) const { 282 return getTLI()->getTargetMachine().getAssumedAddrSpace(V); 283 } 284 285 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, 286 Value *NewV) const { 287 return nullptr; 288 } 289 290 bool isLegalAddImmediate(int64_t imm) { 291 return getTLI()->isLegalAddImmediate(imm); 292 } 293 294 bool isLegalICmpImmediate(int64_t imm) { 295 return getTLI()->isLegalICmpImmediate(imm); 296 } 297 298 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, 299 bool HasBaseReg, int64_t Scale, 300 unsigned AddrSpace, Instruction *I = nullptr) { 301 TargetLoweringBase::AddrMode AM; 302 AM.BaseGV = BaseGV; 303 AM.BaseOffs = BaseOffset; 304 AM.HasBaseReg = HasBaseReg; 305 AM.Scale = Scale; 306 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); 307 } 308 309 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, 310 const DataLayout &DL) const { 311 EVT VT = getTLI()->getValueType(DL, Ty); 312 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); 313 } 314 315 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, 316 const DataLayout &DL) const { 317 EVT VT = getTLI()->getValueType(DL, Ty); 318 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); 319 } 320 321 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { 322 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); 323 } 324 325 bool isNumRegsMajorCostOfLSR() { 326 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); 327 } 328 329 bool isProfitableLSRChainElement(Instruction *I) { 330 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); 331 } 332 333 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, 334 int64_t BaseOffset, bool HasBaseReg, 335 int64_t Scale, unsigned AddrSpace) { 336 TargetLoweringBase::AddrMode AM; 337 AM.BaseGV = BaseGV; 338 AM.BaseOffs = BaseOffset; 339 AM.HasBaseReg = HasBaseReg; 340 AM.Scale = Scale; 341 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); 342 } 343 344 bool isTruncateFree(Type *Ty1, Type *Ty2) { 345 return getTLI()->isTruncateFree(Ty1, Ty2); 346 } 347 348 bool isProfitableToHoist(Instruction *I) { 349 return getTLI()->isProfitableToHoist(I); 350 } 351 352 bool useAA() const { return getST()->useAA(); } 353 354 bool isTypeLegal(Type *Ty) { 355 EVT VT = getTLI()->getValueType(DL, Ty); 356 return getTLI()->isTypeLegal(VT); 357 } 358 359 InstructionCost getRegUsageForType(Type *Ty) { 360 InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first; 361 assert(Val >= 0 && "Negative cost!"); 362 return Val; 363 } 364 365 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, 366 ArrayRef<const Value *> Operands) { 367 return BaseT::getGEPCost(PointeeType, Ptr, Operands); 368 } 369 370 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, 371 unsigned &JumpTableSize, 372 ProfileSummaryInfo *PSI, 373 BlockFrequencyInfo *BFI) { 374 /// Try to find the estimated number of clusters. Note that the number of 375 /// clusters identified in this function could be different from the actual 376 /// numbers found in lowering. 
This function ignore switches that are 377 /// lowered with a mix of jump table / bit test / BTree. This function was 378 /// initially intended to be used when estimating the cost of switch in 379 /// inline cost heuristic, but it's a generic cost model to be used in other 380 /// places (e.g., in loop unrolling). 381 unsigned N = SI.getNumCases(); 382 const TargetLoweringBase *TLI = getTLI(); 383 const DataLayout &DL = this->getDataLayout(); 384 385 JumpTableSize = 0; 386 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent()); 387 388 // Early exit if both a jump table and bit test are not allowed. 389 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N)) 390 return N; 391 392 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); 393 APInt MinCaseVal = MaxCaseVal; 394 for (auto CI : SI.cases()) { 395 const APInt &CaseVal = CI.getCaseValue()->getValue(); 396 if (CaseVal.sgt(MaxCaseVal)) 397 MaxCaseVal = CaseVal; 398 if (CaseVal.slt(MinCaseVal)) 399 MinCaseVal = CaseVal; 400 } 401 402 // Check if suitable for a bit test 403 if (N <= DL.getIndexSizeInBits(0u)) { 404 SmallPtrSet<const BasicBlock *, 4> Dests; 405 for (auto I : SI.cases()) 406 Dests.insert(I.getCaseSuccessor()); 407 408 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, 409 DL)) 410 return 1; 411 } 412 413 // Check if suitable for a jump table. 414 if (IsJTAllowed) { 415 if (N < 2 || N < TLI->getMinimumJumpTableEntries()) 416 return N; 417 uint64_t Range = 418 (MaxCaseVal - MinCaseVal) 419 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1; 420 // Check whether a range of clusters is dense enough for a jump table 421 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { 422 JumpTableSize = Range; 423 return 1; 424 } 425 } 426 return N; 427 } 428 429 bool shouldBuildLookupTables() { 430 const TargetLoweringBase *TLI = getTLI(); 431 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || 432 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); 433 } 434 435 bool shouldBuildRelLookupTables() const { 436 const TargetMachine &TM = getTLI()->getTargetMachine(); 437 // If non-PIC mode, do not generate a relative lookup table. 438 if (!TM.isPositionIndependent()) 439 return false; 440 441 /// Relative lookup table entries consist of 32-bit offsets. 442 /// Do not generate relative lookup tables for large code models 443 /// in 64-bit achitectures where 32-bit offsets might not be enough. 444 if (TM.getCodeModel() == CodeModel::Medium || 445 TM.getCodeModel() == CodeModel::Large) 446 return false; 447 448 Triple TargetTriple = TM.getTargetTriple(); 449 if (!TargetTriple.isArch64Bit()) 450 return false; 451 452 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it 453 // there. 454 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) 455 return false; 456 457 return true; 458 } 459 460 bool haveFastSqrt(Type *Ty) { 461 const TargetLoweringBase *TLI = getTLI(); 462 EVT VT = TLI->getValueType(DL, Ty); 463 return TLI->isTypeLegal(VT) && 464 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); 465 } 466 467 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { 468 return true; 469 } 470 471 InstructionCost getFPOpCost(Type *Ty) { 472 // Check whether FADD is available, as a proxy for floating-point in 473 // general. 
474 const TargetLoweringBase *TLI = getTLI(); 475 EVT VT = TLI->getValueType(DL, Ty); 476 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) 477 return TargetTransformInfo::TCC_Basic; 478 return TargetTransformInfo::TCC_Expensive; 479 } 480 481 unsigned getInliningThresholdMultiplier() { return 1; } 482 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } 483 484 int getInlinerVectorBonusPercent() { return 150; } 485 486 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 487 TTI::UnrollingPreferences &UP) { 488 // This unrolling functionality is target independent, but to provide some 489 // motivation for its intended use, for x86: 490 491 // According to the Intel 64 and IA-32 Architectures Optimization Reference 492 // Manual, Intel Core models and later have a loop stream detector (and 493 // associated uop queue) that can benefit from partial unrolling. 494 // The relevant requirements are: 495 // - The loop must have no more than 4 (8 for Nehalem and later) branches 496 // taken, and none of them may be calls. 497 // - The loop can have no more than 18 (28 for Nehalem and later) uops. 498 499 // According to the Software Optimization Guide for AMD Family 15h 500 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor 501 // and loop buffer which can benefit from partial unrolling. 502 // The relevant requirements are: 503 // - The loop must have fewer than 16 branches 504 // - The loop must have less than 40 uops in all executed loop branches 505 506 // The number of taken branches in a loop is hard to estimate here, and 507 // benchmarking has revealed that it is better not to be conservative when 508 // estimating the branch count. As a result, we'll ignore the branch limits 509 // until someone finds a case where it matters in practice. 510 511 unsigned MaxOps; 512 const TargetSubtargetInfo *ST = getST(); 513 if (PartialUnrollingThreshold.getNumOccurrences() > 0) 514 MaxOps = PartialUnrollingThreshold; 515 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) 516 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; 517 else 518 return; 519 520 // Scan the loop: don't unroll loops with calls. 521 for (BasicBlock *BB : L->blocks()) { 522 for (Instruction &I : *BB) { 523 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 524 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 525 if (!thisT()->isLoweredToCall(F)) 526 continue; 527 } 528 529 return; 530 } 531 } 532 } 533 534 // Enable runtime and partial unrolling up to the specified size. 535 // Enable using trip count upper bound to unroll loops. 536 UP.Partial = UP.Runtime = UP.UpperBound = true; 537 UP.PartialThreshold = MaxOps; 538 539 // Avoid unrolling when optimizing for size. 540 UP.OptSizeThreshold = 0; 541 UP.PartialOptSizeThreshold = 0; 542 543 // Set number of instructions optimized when "back edge" 544 // becomes "fall through" to default value of 2. 
545 UP.BEInsns = 2; 546 } 547 548 void getPeelingPreferences(Loop *L, ScalarEvolution &SE, 549 TTI::PeelingPreferences &PP) { 550 PP.PeelCount = 0; 551 PP.AllowPeeling = true; 552 PP.AllowLoopNestsPeeling = false; 553 PP.PeelProfiledIterations = true; 554 } 555 556 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 557 AssumptionCache &AC, 558 TargetLibraryInfo *LibInfo, 559 HardwareLoopInfo &HWLoopInfo) { 560 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); 561 } 562 563 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, 564 AssumptionCache &AC, TargetLibraryInfo *TLI, 565 DominatorTree *DT, 566 const LoopAccessInfo *LAI) { 567 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); 568 } 569 570 bool emitGetActiveLaneMask() { 571 return BaseT::emitGetActiveLaneMask(); 572 } 573 574 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, 575 IntrinsicInst &II) { 576 return BaseT::instCombineIntrinsic(IC, II); 577 } 578 579 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, 580 IntrinsicInst &II, 581 APInt DemandedMask, 582 KnownBits &Known, 583 bool &KnownBitsComputed) { 584 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, 585 KnownBitsComputed); 586 } 587 588 Optional<Value *> simplifyDemandedVectorEltsIntrinsic( 589 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 590 APInt &UndefElts2, APInt &UndefElts3, 591 std::function<void(Instruction *, unsigned, APInt, APInt &)> 592 SimplifyAndSetOp) { 593 return BaseT::simplifyDemandedVectorEltsIntrinsic( 594 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, 595 SimplifyAndSetOp); 596 } 597 598 InstructionCost getInstructionLatency(const Instruction *I) { 599 if (isa<LoadInst>(I)) 600 return getST()->getSchedModel().DefaultLoadLatency; 601 602 return BaseT::getInstructionLatency(I); 603 } 604 605 virtual Optional<unsigned> 606 getCacheSize(TargetTransformInfo::CacheLevel Level) const { 607 return Optional<unsigned>( 608 getST()->getCacheSize(static_cast<unsigned>(Level))); 609 } 610 611 virtual Optional<unsigned> 612 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { 613 Optional<unsigned> TargetResult = 614 getST()->getCacheAssociativity(static_cast<unsigned>(Level)); 615 616 if (TargetResult) 617 return TargetResult; 618 619 return BaseT::getCacheAssociativity(Level); 620 } 621 622 virtual unsigned getCacheLineSize() const { 623 return getST()->getCacheLineSize(); 624 } 625 626 virtual unsigned getPrefetchDistance() const { 627 return getST()->getPrefetchDistance(); 628 } 629 630 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, 631 unsigned NumStridedMemAccesses, 632 unsigned NumPrefetches, 633 bool HasCall) const { 634 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, 635 NumPrefetches, HasCall); 636 } 637 638 virtual unsigned getMaxPrefetchIterationsAhead() const { 639 return getST()->getMaxPrefetchIterationsAhead(); 640 } 641 642 virtual bool enableWritePrefetching() const { 643 return getST()->enableWritePrefetching(); 644 } 645 646 /// @} 647 648 /// \name Vector TTI Implementations 649 /// @{ 650 651 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 652 return TypeSize::getFixed(32); 653 } 654 655 Optional<unsigned> getMaxVScale() const { return None; } 656 657 /// Estimate the overhead of scalarizing an instruction. 
Insert and Extract 658 /// are set if the demanded result elements need to be inserted and/or 659 /// extracted from vectors. 660 InstructionCost getScalarizationOverhead(VectorType *InTy, 661 const APInt &DemandedElts, 662 bool Insert, bool Extract) { 663 /// FIXME: a bitfield is not a reasonable abstraction for talking about 664 /// which elements are needed from a scalable vector 665 auto *Ty = cast<FixedVectorType>(InTy); 666 667 assert(DemandedElts.getBitWidth() == Ty->getNumElements() && 668 "Vector size mismatch"); 669 670 InstructionCost Cost = 0; 671 672 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { 673 if (!DemandedElts[i]) 674 continue; 675 if (Insert) 676 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i); 677 if (Extract) 678 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i); 679 } 680 681 return Cost; 682 } 683 684 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. 685 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, 686 bool Extract) { 687 auto *Ty = cast<FixedVectorType>(InTy); 688 689 APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements()); 690 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); 691 } 692 693 /// Estimate the overhead of scalarizing an instructions unique 694 /// non-constant operands. The (potentially vector) types to use for each of 695 /// argument are passes via Tys. 696 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, 697 ArrayRef<Type *> Tys) { 698 assert(Args.size() == Tys.size() && "Expected matching Args and Tys"); 699 700 InstructionCost Cost = 0; 701 SmallPtrSet<const Value*, 4> UniqueOperands; 702 for (int I = 0, E = Args.size(); I != E; I++) { 703 // Disregard things like metadata arguments. 704 const Value *A = Args[I]; 705 Type *Ty = Tys[I]; 706 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && 707 !Ty->isPtrOrPtrVectorTy()) 708 continue; 709 710 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { 711 if (auto *VecTy = dyn_cast<VectorType>(Ty)) 712 Cost += getScalarizationOverhead(VecTy, false, true); 713 } 714 } 715 716 return Cost; 717 } 718 719 /// Estimate the overhead of scalarizing the inputs and outputs of an 720 /// instruction, with return type RetTy and arguments Args of type Tys. If 721 /// Args are unknown (empty), then the cost associated with one argument is 722 /// added as a heuristic. 723 InstructionCost getScalarizationOverhead(VectorType *RetTy, 724 ArrayRef<const Value *> Args, 725 ArrayRef<Type *> Tys) { 726 InstructionCost Cost = getScalarizationOverhead(RetTy, true, false); 727 if (!Args.empty()) 728 Cost += getOperandsScalarizationOverhead(Args, Tys); 729 else 730 // When no information on arguments is provided, we add the cost 731 // associated with one argument as a heuristic. 
732 Cost += getScalarizationOverhead(RetTy, false, true); 733 734 return Cost; 735 } 736 737 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } 738 739 InstructionCost getArithmeticInstrCost( 740 unsigned Opcode, Type *Ty, 741 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, 742 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, 743 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, 744 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, 745 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, 746 ArrayRef<const Value *> Args = ArrayRef<const Value *>(), 747 const Instruction *CxtI = nullptr) { 748 // Check if any of the operands are vector operands. 749 const TargetLoweringBase *TLI = getTLI(); 750 int ISD = TLI->InstructionOpcodeToISD(Opcode); 751 assert(ISD && "Invalid opcode"); 752 753 // TODO: Handle more cost kinds. 754 if (CostKind != TTI::TCK_RecipThroughput) 755 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, 756 Opd1Info, Opd2Info, 757 Opd1PropInfo, Opd2PropInfo, 758 Args, CxtI); 759 760 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 761 762 bool IsFloat = Ty->isFPOrFPVectorTy(); 763 // Assume that floating point arithmetic operations cost twice as much as 764 // integer operations. 765 InstructionCost OpCost = (IsFloat ? 2 : 1); 766 767 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 768 // The operation is legal. Assume it costs 1. 769 // TODO: Once we have extract/insert subvector cost we need to use them. 770 return LT.first * OpCost; 771 } 772 773 if (!TLI->isOperationExpand(ISD, LT.second)) { 774 // If the operation is custom lowered, then assume that the code is twice 775 // as expensive. 776 return LT.first * 2 * OpCost; 777 } 778 779 // Else, assume that we need to scalarize this op. 780 // TODO: If one of the types get legalized by splitting, handle this 781 // similarly to what getCastInstrCost() does. 782 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 783 unsigned Num = cast<FixedVectorType>(VTy)->getNumElements(); 784 InstructionCost Cost = thisT()->getArithmeticInstrCost( 785 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, 786 Opd1PropInfo, Opd2PropInfo, Args, CxtI); 787 // Return the cost of multiple scalar invocation plus the cost of 788 // inserting and extracting the values. 789 SmallVector<Type *> Tys(Args.size(), Ty); 790 return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; 791 } 792 793 // We don't know anything about this scalar instruction. 794 return OpCost; 795 } 796 797 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, 798 ArrayRef<int> Mask) const { 799 int Limit = Mask.size() * 2; 800 if (Mask.empty() || 801 // Extra check required by isSingleSourceMaskImpl function (called by 802 // ShuffleVectorInst::isSingleSourceMask). 
803 any_of(Mask, [Limit](int I) { return I >= Limit; })) 804 return Kind; 805 switch (Kind) { 806 case TTI::SK_PermuteSingleSrc: 807 if (ShuffleVectorInst::isReverseMask(Mask)) 808 return TTI::SK_Reverse; 809 if (ShuffleVectorInst::isZeroEltSplatMask(Mask)) 810 return TTI::SK_Broadcast; 811 break; 812 case TTI::SK_PermuteTwoSrc: 813 if (ShuffleVectorInst::isSelectMask(Mask)) 814 return TTI::SK_Select; 815 if (ShuffleVectorInst::isTransposeMask(Mask)) 816 return TTI::SK_Transpose; 817 break; 818 case TTI::SK_Select: 819 case TTI::SK_Reverse: 820 case TTI::SK_Broadcast: 821 case TTI::SK_Transpose: 822 case TTI::SK_InsertSubvector: 823 case TTI::SK_ExtractSubvector: 824 break; 825 } 826 return Kind; 827 } 828 829 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 830 ArrayRef<int> Mask, int Index, 831 VectorType *SubTp) { 832 833 switch (improveShuffleKindFromMask(Kind, Mask)) { 834 case TTI::SK_Broadcast: 835 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp)); 836 case TTI::SK_Select: 837 case TTI::SK_Reverse: 838 case TTI::SK_Transpose: 839 case TTI::SK_PermuteSingleSrc: 840 case TTI::SK_PermuteTwoSrc: 841 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp)); 842 case TTI::SK_ExtractSubvector: 843 return getExtractSubvectorOverhead(Tp, Index, 844 cast<FixedVectorType>(SubTp)); 845 case TTI::SK_InsertSubvector: 846 return getInsertSubvectorOverhead(Tp, Index, 847 cast<FixedVectorType>(SubTp)); 848 } 849 llvm_unreachable("Unknown TTI::ShuffleKind"); 850 } 851 852 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, 853 TTI::CastContextHint CCH, 854 TTI::TargetCostKind CostKind, 855 const Instruction *I = nullptr) { 856 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) 857 return 0; 858 859 const TargetLoweringBase *TLI = getTLI(); 860 int ISD = TLI->InstructionOpcodeToISD(Opcode); 861 assert(ISD && "Invalid opcode"); 862 std::pair<InstructionCost, MVT> SrcLT = 863 TLI->getTypeLegalizationCost(DL, Src); 864 std::pair<InstructionCost, MVT> DstLT = 865 TLI->getTypeLegalizationCost(DL, Dst); 866 867 TypeSize SrcSize = SrcLT.second.getSizeInBits(); 868 TypeSize DstSize = DstLT.second.getSizeInBits(); 869 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); 870 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); 871 872 switch (Opcode) { 873 default: 874 break; 875 case Instruction::Trunc: 876 // Check for NOOP conversions. 877 if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) 878 return 0; 879 LLVM_FALLTHROUGH; 880 case Instruction::BitCast: 881 // Bitcast between types that are legalized to the same type are free and 882 // assume int to/from ptr of the same size is also free. 883 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && 884 SrcSize == DstSize) 885 return 0; 886 break; 887 case Instruction::FPExt: 888 if (I && getTLI()->isExtFree(I)) 889 return 0; 890 break; 891 case Instruction::ZExt: 892 if (TLI->isZExtFree(SrcLT.second, DstLT.second)) 893 return 0; 894 LLVM_FALLTHROUGH; 895 case Instruction::SExt: 896 if (I && getTLI()->isExtFree(I)) 897 return 0; 898 899 // If this is a zext/sext of a load, return 0 if the corresponding 900 // extending load exists on target and the result type is legal. 901 if (CCH == TTI::CastContextHint::Normal) { 902 EVT ExtVT = EVT::getEVT(Dst); 903 EVT LoadVT = EVT::getEVT(Src); 904 unsigned LType = 905 ((Opcode == Instruction::ZExt) ? 
ISD::ZEXTLOAD : ISD::SEXTLOAD); 906 if (DstLT.first == SrcLT.first && 907 TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) 908 return 0; 909 } 910 break; 911 case Instruction::AddrSpaceCast: 912 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), 913 Dst->getPointerAddressSpace())) 914 return 0; 915 break; 916 } 917 918 auto *SrcVTy = dyn_cast<VectorType>(Src); 919 auto *DstVTy = dyn_cast<VectorType>(Dst); 920 921 // If the cast is marked as legal (or promote) then assume low cost. 922 if (SrcLT.first == DstLT.first && 923 TLI->isOperationLegalOrPromote(ISD, DstLT.second)) 924 return SrcLT.first; 925 926 // Handle scalar conversions. 927 if (!SrcVTy && !DstVTy) { 928 // Just check the op cost. If the operation is legal then assume it costs 929 // 1. 930 if (!TLI->isOperationExpand(ISD, DstLT.second)) 931 return 1; 932 933 // Assume that illegal scalar instruction are expensive. 934 return 4; 935 } 936 937 // Check vector-to-vector casts. 938 if (DstVTy && SrcVTy) { 939 // If the cast is between same-sized registers, then the check is simple. 940 if (SrcLT.first == DstLT.first && SrcSize == DstSize) { 941 942 // Assume that Zext is done using AND. 943 if (Opcode == Instruction::ZExt) 944 return SrcLT.first; 945 946 // Assume that sext is done using SHL and SRA. 947 if (Opcode == Instruction::SExt) 948 return SrcLT.first * 2; 949 950 // Just check the op cost. If the operation is legal then assume it 951 // costs 952 // 1 and multiply by the type-legalization overhead. 953 if (!TLI->isOperationExpand(ISD, DstLT.second)) 954 return SrcLT.first * 1; 955 } 956 957 // If we are legalizing by splitting, query the concrete TTI for the cost 958 // of casting the original vector twice. We also need to factor in the 959 // cost of the split itself. Count that as 1, to be consistent with 960 // TLI->getTypeLegalizationCost(). 961 bool SplitSrc = 962 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == 963 TargetLowering::TypeSplitVector; 964 bool SplitDst = 965 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == 966 TargetLowering::TypeSplitVector; 967 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && 968 DstVTy->getElementCount().isVector()) { 969 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); 970 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); 971 T *TTI = static_cast<T *>(this); 972 // If both types need to be split then the split is free. 973 InstructionCost SplitCost = 974 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; 975 return SplitCost + 976 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, 977 CostKind, I)); 978 } 979 980 // In other cases where the source or destination are illegal, assume 981 // the operation will get scalarized. 982 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); 983 InstructionCost Cost = thisT()->getCastInstrCost( 984 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); 985 986 // Return the cost of multiple scalar invocation plus the cost of 987 // inserting and extracting the values. 988 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost; 989 } 990 991 // We already handled vector-to-vector and scalar-to-scalar conversions. 992 // This 993 // is where we handle bitcast between vectors and scalars. We need to assume 994 // that the conversion is scalarized in one way or another. 995 if (Opcode == Instruction::BitCast) { 996 // Illegal bitcasts are done by storing and loading from a stack slot. 
997 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) + 998 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0); 999 } 1000 1001 llvm_unreachable("Unhandled cast"); 1002 } 1003 1004 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, 1005 VectorType *VecTy, unsigned Index) { 1006 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1007 Index) + 1008 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), 1009 TTI::CastContextHint::None, 1010 TTI::TCK_RecipThroughput); 1011 } 1012 1013 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, 1014 const Instruction *I = nullptr) { 1015 return BaseT::getCFInstrCost(Opcode, CostKind, I); 1016 } 1017 1018 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 1019 CmpInst::Predicate VecPred, 1020 TTI::TargetCostKind CostKind, 1021 const Instruction *I = nullptr) { 1022 const TargetLoweringBase *TLI = getTLI(); 1023 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1024 assert(ISD && "Invalid opcode"); 1025 1026 // TODO: Handle other cost kinds. 1027 if (CostKind != TTI::TCK_RecipThroughput) 1028 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1029 I); 1030 1031 // Selects on vectors are actually vector selects. 1032 if (ISD == ISD::SELECT) { 1033 assert(CondTy && "CondTy must exist"); 1034 if (CondTy->isVectorTy()) 1035 ISD = ISD::VSELECT; 1036 } 1037 std::pair<InstructionCost, MVT> LT = 1038 TLI->getTypeLegalizationCost(DL, ValTy); 1039 1040 if (!(ValTy->isVectorTy() && !LT.second.isVector()) && 1041 !TLI->isOperationExpand(ISD, LT.second)) { 1042 // The operation is legal. Assume it costs 1. Multiply 1043 // by the type-legalization overhead. 1044 return LT.first * 1; 1045 } 1046 1047 // Otherwise, assume that the cast is scalarized. 1048 // TODO: If one of the types get legalized by splitting, handle this 1049 // similarly to what getCastInstrCost() does. 1050 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { 1051 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); 1052 if (CondTy) 1053 CondTy = CondTy->getScalarType(); 1054 InstructionCost Cost = thisT()->getCmpSelInstrCost( 1055 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); 1056 1057 // Return the cost of multiple scalar invocation plus the cost of 1058 // inserting and extracting the values. 1059 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost; 1060 } 1061 1062 // Unknown scalar opcode. 1063 return 1; 1064 } 1065 1066 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, 1067 unsigned Index) { 1068 std::pair<InstructionCost, MVT> LT = 1069 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); 1070 1071 return LT.first; 1072 } 1073 1074 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, 1075 MaybeAlign Alignment, unsigned AddressSpace, 1076 TTI::TargetCostKind CostKind, 1077 const Instruction *I = nullptr) { 1078 assert(!Src->isVoidTy() && "Invalid type"); 1079 // Assume types, such as structs, are expensive. 1080 if (getTLI()->getValueType(DL, Src, true) == MVT::Other) 1081 return 4; 1082 std::pair<InstructionCost, MVT> LT = 1083 getTLI()->getTypeLegalizationCost(DL, Src); 1084 1085 // Assuming that all loads of legal types cost 1. 
1086 InstructionCost Cost = LT.first; 1087 if (CostKind != TTI::TCK_RecipThroughput) 1088 return Cost; 1089 1090 if (Src->isVectorTy() && 1091 // In practice it's not currently possible to have a change in lane 1092 // length for extending loads or truncating stores so both types should 1093 // have the same scalable property. 1094 TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(), 1095 LT.second.getSizeInBits())) { 1096 // This is a vector load that legalizes to a larger type than the vector 1097 // itself. Unless the corresponding extending load or truncating store is 1098 // legal, then this will scalarize. 1099 TargetLowering::LegalizeAction LA = TargetLowering::Expand; 1100 EVT MemVT = getTLI()->getValueType(DL, Src); 1101 if (Opcode == Instruction::Store) 1102 LA = getTLI()->getTruncStoreAction(LT.second, MemVT); 1103 else 1104 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); 1105 1106 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { 1107 // This is a vector load/store for some illegal type that is scalarized. 1108 // We must account for the cost of building or decomposing the vector. 1109 Cost += getScalarizationOverhead(cast<VectorType>(Src), 1110 Opcode != Instruction::Store, 1111 Opcode == Instruction::Store); 1112 } 1113 } 1114 1115 return Cost; 1116 } 1117 1118 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 1119 Align Alignment, unsigned AddressSpace, 1120 TTI::TargetCostKind CostKind) { 1121 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, 1122 CostKind); 1123 } 1124 1125 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, 1126 const Value *Ptr, bool VariableMask, 1127 Align Alignment, 1128 TTI::TargetCostKind CostKind, 1129 const Instruction *I = nullptr) { 1130 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, 1131 true, CostKind); 1132 } 1133 1134 InstructionCost getInterleavedMemoryOpCost( 1135 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1136 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1137 bool UseMaskForCond = false, bool UseMaskForGaps = false) { 1138 auto *VT = cast<FixedVectorType>(VecTy); 1139 1140 unsigned NumElts = VT->getNumElements(); 1141 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); 1142 1143 unsigned NumSubElts = NumElts / Factor; 1144 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); 1145 1146 // Firstly, the cost of load/store operation. 1147 InstructionCost Cost; 1148 if (UseMaskForCond || UseMaskForGaps) 1149 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, 1150 AddressSpace, CostKind); 1151 else 1152 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, 1153 CostKind); 1154 1155 // Legalize the vector type, and get the legalized and unlegalized type 1156 // sizes. 1157 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; 1158 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); 1159 unsigned VecTyLTSize = VecTyLT.getStoreSize(); 1160 1161 // Scale the cost of the memory operation by the fraction of legalized 1162 // instructions that will actually be used. We shouldn't account for the 1163 // cost of dead instructions since they will be removed. 
1164 // 1165 // E.g., An interleaved load of factor 8: 1166 // %vec = load <16 x i64>, <16 x i64>* %ptr 1167 // %v0 = shufflevector %vec, undef, <0, 8> 1168 // 1169 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be 1170 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized 1171 // type). The other loads are unused. 1172 // 1173 // We only scale the cost of loads since interleaved store groups aren't 1174 // allowed to have gaps. 1175 if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) { 1176 // The number of loads of a legal type it will take to represent a load 1177 // of the unlegalized vector type. 1178 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); 1179 1180 // The number of elements of the unlegalized type that correspond to a 1181 // single legal instruction. 1182 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts); 1183 1184 // Determine which legal instructions will be used. 1185 BitVector UsedInsts(NumLegalInsts, false); 1186 for (unsigned Index : Indices) 1187 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) 1188 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); 1189 1190 // Scale the cost of the load by the fraction of legal instructions that 1191 // will be used. 1192 Cost *= UsedInsts.count() / NumLegalInsts; 1193 } 1194 1195 // Then plus the cost of interleave operation. 1196 if (Opcode == Instruction::Load) { 1197 // The interleave cost is similar to extract sub vectors' elements 1198 // from the wide vector, and insert them into sub vectors. 1199 // 1200 // E.g. An interleaved load of factor 2 (with one member of index 0): 1201 // %vec = load <8 x i32>, <8 x i32>* %ptr 1202 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 1203 // The cost is estimated as extract elements at 0, 2, 4, 6 from the 1204 // <8 x i32> vector and insert them into a <4 x i32> vector. 1205 1206 assert(Indices.size() <= Factor && 1207 "Interleaved memory op has too many members"); 1208 1209 for (unsigned Index : Indices) { 1210 assert(Index < Factor && "Invalid index for interleaved memory op"); 1211 1212 // Extract elements from loaded vector for each sub vector. 1213 for (unsigned i = 0; i < NumSubElts; i++) 1214 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT, 1215 Index + i * Factor); 1216 } 1217 1218 InstructionCost InsSubCost = 0; 1219 for (unsigned i = 0; i < NumSubElts; i++) 1220 InsSubCost += 1221 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i); 1222 1223 Cost += Indices.size() * InsSubCost; 1224 } else { 1225 // The interleave cost is extract all elements from sub vectors, and 1226 // insert them into the wide vector. 1227 // 1228 // E.g. An interleaved store of factor 2: 1229 // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7> 1230 // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr 1231 // The cost is estimated as extract all elements from both <4 x i32> 1232 // vectors and insert into the <8 x i32> vector. 
1233 1234 InstructionCost ExtSubCost = 0; 1235 for (unsigned i = 0; i < NumSubElts; i++) 1236 ExtSubCost += 1237 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i); 1238 Cost += ExtSubCost * Factor; 1239 1240 for (unsigned i = 0; i < NumElts; i++) 1241 Cost += static_cast<T *>(this) 1242 ->getVectorInstrCost(Instruction::InsertElement, VT, i); 1243 } 1244 1245 if (!UseMaskForCond) 1246 return Cost; 1247 1248 Type *I8Type = Type::getInt8Ty(VT->getContext()); 1249 auto *MaskVT = FixedVectorType::get(I8Type, NumElts); 1250 SubVT = FixedVectorType::get(I8Type, NumSubElts); 1251 1252 // The Mask shuffling cost is extract all the elements of the Mask 1253 // and insert each of them Factor times into the wide vector: 1254 // 1255 // E.g. an interleaved group with factor 3: 1256 // %mask = icmp ult <8 x i32> %vec1, %vec2 1257 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, 1258 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> 1259 // The cost is estimated as extract all mask elements from the <8xi1> mask 1260 // vector and insert them factor times into the <24xi1> shuffled mask 1261 // vector. 1262 for (unsigned i = 0; i < NumSubElts; i++) 1263 Cost += 1264 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i); 1265 1266 for (unsigned i = 0; i < NumElts; i++) 1267 Cost += 1268 thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i); 1269 1270 // The Gaps mask is invariant and created outside the loop, therefore the 1271 // cost of creating it is not accounted for here. However if we have both 1272 // a MaskForGaps and some other mask that guards the execution of the 1273 // memory access, we need to account for the cost of And-ing the two masks 1274 // inside the loop. 1275 if (UseMaskForGaps) 1276 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, 1277 CostKind); 1278 1279 return Cost; 1280 } 1281 1282 /// Get intrinsic cost based on arguments. 1283 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1284 TTI::TargetCostKind CostKind) { 1285 // Check for generically free intrinsics. 1286 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) 1287 return 0; 1288 1289 // Assume that target intrinsics are cheap. 1290 Intrinsic::ID IID = ICA.getID(); 1291 if (Function::isTargetIntrinsic(IID)) 1292 return TargetTransformInfo::TCC_Basic; 1293 1294 if (ICA.isTypeBasedOnly()) 1295 return getTypeBasedIntrinsicInstrCost(ICA, CostKind); 1296 1297 Type *RetTy = ICA.getReturnType(); 1298 1299 ElementCount RetVF = 1300 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() 1301 : ElementCount::getFixed(1)); 1302 const IntrinsicInst *I = ICA.getInst(); 1303 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 1304 FastMathFlags FMF = ICA.getFlags(); 1305 switch (IID) { 1306 default: 1307 break; 1308 1309 case Intrinsic::cttz: 1310 // FIXME: If necessary, this should go in target-specific overrides. 1311 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz()) 1312 return TargetTransformInfo::TCC_Basic; 1313 break; 1314 1315 case Intrinsic::ctlz: 1316 // FIXME: If necessary, this should go in target-specific overrides. 
1317 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz()) 1318 return TargetTransformInfo::TCC_Basic; 1319 break; 1320 1321 case Intrinsic::memcpy: 1322 return thisT()->getMemcpyCost(ICA.getInst()); 1323 1324 case Intrinsic::masked_scatter: { 1325 const Value *Mask = Args[3]; 1326 bool VarMask = !isa<Constant>(Mask); 1327 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); 1328 return thisT()->getGatherScatterOpCost(Instruction::Store, 1329 ICA.getArgTypes()[0], Args[1], 1330 VarMask, Alignment, CostKind, I); 1331 } 1332 case Intrinsic::masked_gather: { 1333 const Value *Mask = Args[2]; 1334 bool VarMask = !isa<Constant>(Mask); 1335 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); 1336 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], 1337 VarMask, Alignment, CostKind, I); 1338 } 1339 case Intrinsic::experimental_stepvector: { 1340 if (isa<ScalableVectorType>(RetTy)) 1341 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1342 // The cost of materialising a constant integer vector. 1343 return TargetTransformInfo::TCC_Basic; 1344 } 1345 case Intrinsic::experimental_vector_extract: { 1346 // FIXME: Handle case where a scalable vector is extracted from a scalable 1347 // vector 1348 if (isa<ScalableVectorType>(RetTy)) 1349 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1350 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); 1351 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, 1352 cast<VectorType>(Args[0]->getType()), None, 1353 Index, cast<VectorType>(RetTy)); 1354 } 1355 case Intrinsic::experimental_vector_insert: { 1356 // FIXME: Handle case where a scalable vector is inserted into a scalable 1357 // vector 1358 if (isa<ScalableVectorType>(Args[1]->getType())) 1359 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1360 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1361 return thisT()->getShuffleCost( 1362 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None, 1363 Index, cast<VectorType>(Args[1]->getType())); 1364 } 1365 case Intrinsic::experimental_vector_reverse: { 1366 return thisT()->getShuffleCost(TTI::SK_Reverse, 1367 cast<VectorType>(Args[0]->getType()), None, 1368 0, cast<VectorType>(RetTy)); 1369 } 1370 case Intrinsic::vector_reduce_add: 1371 case Intrinsic::vector_reduce_mul: 1372 case Intrinsic::vector_reduce_and: 1373 case Intrinsic::vector_reduce_or: 1374 case Intrinsic::vector_reduce_xor: 1375 case Intrinsic::vector_reduce_smax: 1376 case Intrinsic::vector_reduce_smin: 1377 case Intrinsic::vector_reduce_fmax: 1378 case Intrinsic::vector_reduce_fmin: 1379 case Intrinsic::vector_reduce_umax: 1380 case Intrinsic::vector_reduce_umin: { 1381 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); 1382 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1383 } 1384 case Intrinsic::vector_reduce_fadd: 1385 case Intrinsic::vector_reduce_fmul: { 1386 IntrinsicCostAttributes Attrs( 1387 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); 1388 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1389 } 1390 case Intrinsic::fshl: 1391 case Intrinsic::fshr: { 1392 if (isa<ScalableVectorType>(RetTy)) 1393 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1394 const Value *X = Args[0]; 1395 const Value *Y = Args[1]; 1396 const Value *Z = Args[2]; 1397 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW; 1398 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX); 1399 TTI::OperandValueKind OpKindY = 
TTI::getOperandInfo(Y, OpPropsY); 1400 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ); 1401 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue; 1402 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 1403 : TTI::OP_None; 1404 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) 1405 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 1406 InstructionCost Cost = 0; 1407 Cost += 1408 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); 1409 Cost += 1410 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); 1411 Cost += thisT()->getArithmeticInstrCost( 1412 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX); 1413 Cost += thisT()->getArithmeticInstrCost( 1414 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY); 1415 // Non-constant shift amounts requires a modulo. 1416 if (OpKindZ != TTI::OK_UniformConstantValue && 1417 OpKindZ != TTI::OK_NonUniformConstantValue) 1418 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, 1419 CostKind, OpKindZ, OpKindBW, 1420 OpPropsZ, OpPropsBW); 1421 // For non-rotates (X != Y) we must add shift-by-zero handling costs. 1422 if (X != Y) { 1423 Type *CondTy = RetTy->getWithNewBitWidth(1); 1424 Cost += 1425 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1426 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1427 Cost += 1428 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1429 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1430 } 1431 return Cost; 1432 } 1433 } 1434 1435 // Assume that we need to scalarize this intrinsic. 1436 // Compute the scalarization overhead based on Args for a vector 1437 // intrinsic. 1438 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 1439 if (RetVF.isVector() && !RetVF.isScalable()) { 1440 ScalarizationCost = 0; 1441 if (!RetTy->isVoidTy()) 1442 ScalarizationCost += 1443 getScalarizationOverhead(cast<VectorType>(RetTy), true, false); 1444 ScalarizationCost += 1445 getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); 1446 } 1447 1448 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, 1449 ScalarizationCost); 1450 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1451 } 1452 1453 /// Get intrinsic cost based on argument types. 1454 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the 1455 /// cost of scalarizing the arguments and the return value will be computed 1456 /// based on types. 1457 InstructionCost 1458 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1459 TTI::TargetCostKind CostKind) { 1460 Intrinsic::ID IID = ICA.getID(); 1461 Type *RetTy = ICA.getReturnType(); 1462 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); 1463 FastMathFlags FMF = ICA.getFlags(); 1464 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); 1465 bool SkipScalarizationCost = ICA.skipScalarizationCost(); 1466 1467 VectorType *VecOpTy = nullptr; 1468 if (!Tys.empty()) { 1469 // The vector reduction operand is operand 0 except for fadd/fmul. 1470 // Their operand 0 is a scalar start value, so the vector op is operand 1. 1471 unsigned VecTyIndex = 0; 1472 if (IID == Intrinsic::vector_reduce_fadd || 1473 IID == Intrinsic::vector_reduce_fmul) 1474 VecTyIndex = 1; 1475 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes"); 1476 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); 1477 } 1478 1479 // Library call cost - other than size, make it expensive. 
1480 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; 1481 SmallVector<unsigned, 2> ISDs; 1482 switch (IID) { 1483 default: { 1484 // Scalable vectors cannot be scalarized, so return Invalid. 1485 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 1486 return isa<ScalableVectorType>(Ty); 1487 })) 1488 return InstructionCost::getInvalid(); 1489 1490 // Assume that we need to scalarize this intrinsic. 1491 InstructionCost ScalarizationCost = 1492 SkipScalarizationCost ? ScalarizationCostPassed : 0; 1493 unsigned ScalarCalls = 1; 1494 Type *ScalarRetTy = RetTy; 1495 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 1496 if (!SkipScalarizationCost) 1497 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false); 1498 ScalarCalls = std::max(ScalarCalls, 1499 cast<FixedVectorType>(RetVTy)->getNumElements()); 1500 ScalarRetTy = RetTy->getScalarType(); 1501 } 1502 SmallVector<Type *, 4> ScalarTys; 1503 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1504 Type *Ty = Tys[i]; 1505 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 1506 if (!SkipScalarizationCost) 1507 ScalarizationCost += getScalarizationOverhead(VTy, false, true); 1508 ScalarCalls = std::max(ScalarCalls, 1509 cast<FixedVectorType>(VTy)->getNumElements()); 1510 Ty = Ty->getScalarType(); 1511 } 1512 ScalarTys.push_back(Ty); 1513 } 1514 if (ScalarCalls == 1) 1515 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. 1516 1517 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); 1518 InstructionCost ScalarCost = 1519 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); 1520 1521 return ScalarCalls * ScalarCost + ScalarizationCost; 1522 } 1523 // Look for intrinsics that can be lowered directly or turned into a scalar 1524 // intrinsic call. 
    case Intrinsic::sqrt:
      ISDs.push_back(ISD::FSQRT);
      break;
    case Intrinsic::sin:
      ISDs.push_back(ISD::FSIN);
      break;
    case Intrinsic::cos:
      ISDs.push_back(ISD::FCOS);
      break;
    case Intrinsic::exp:
      ISDs.push_back(ISD::FEXP);
      break;
    case Intrinsic::exp2:
      ISDs.push_back(ISD::FEXP2);
      break;
    case Intrinsic::log:
      ISDs.push_back(ISD::FLOG);
      break;
    case Intrinsic::log10:
      ISDs.push_back(ISD::FLOG10);
      break;
    case Intrinsic::log2:
      ISDs.push_back(ISD::FLOG2);
      break;
    case Intrinsic::fabs:
      ISDs.push_back(ISD::FABS);
      break;
    case Intrinsic::canonicalize:
      ISDs.push_back(ISD::FCANONICALIZE);
      break;
    case Intrinsic::minnum:
      ISDs.push_back(ISD::FMINNUM);
      break;
    case Intrinsic::maxnum:
      ISDs.push_back(ISD::FMAXNUM);
      break;
    case Intrinsic::minimum:
      ISDs.push_back(ISD::FMINIMUM);
      break;
    case Intrinsic::maximum:
      ISDs.push_back(ISD::FMAXIMUM);
      break;
    case Intrinsic::copysign:
      ISDs.push_back(ISD::FCOPYSIGN);
      break;
    case Intrinsic::floor:
      ISDs.push_back(ISD::FFLOOR);
      break;
    case Intrinsic::ceil:
      ISDs.push_back(ISD::FCEIL);
      break;
    case Intrinsic::trunc:
      ISDs.push_back(ISD::FTRUNC);
      break;
    case Intrinsic::nearbyint:
      ISDs.push_back(ISD::FNEARBYINT);
      break;
    case Intrinsic::rint:
      ISDs.push_back(ISD::FRINT);
      break;
    case Intrinsic::round:
      ISDs.push_back(ISD::FROUND);
      break;
    case Intrinsic::roundeven:
      ISDs.push_back(ISD::FROUNDEVEN);
      break;
    case Intrinsic::pow:
      ISDs.push_back(ISD::FPOW);
      break;
    case Intrinsic::fma:
      ISDs.push_back(ISD::FMA);
      break;
    case Intrinsic::fmuladd:
      ISDs.push_back(ISD::FMA);
      break;
    case Intrinsic::experimental_constrained_fmuladd:
      ISDs.push_back(ISD::STRICT_FMA);
      break;
    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
    case Intrinsic::sideeffect:
    case Intrinsic::pseudoprobe:
      return 0;
    case Intrinsic::masked_store: {
      Type *Ty = Tys[0];
      Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
      return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
                                            CostKind);
    }
    case Intrinsic::masked_load: {
      Type *Ty = RetTy;
      Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
      return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
                                            CostKind);
    }
    case Intrinsic::vector_reduce_add:
      return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
                                                 /*IsPairwiseForm=*/false,
                                                 CostKind);
    case Intrinsic::vector_reduce_mul:
      return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
                                                 /*IsPairwiseForm=*/false,
                                                 CostKind);
    case Intrinsic::vector_reduce_and:
      return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
                                                 /*IsPairwiseForm=*/false,
                                                 CostKind);
    case Intrinsic::vector_reduce_or:
      return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
                                                 /*IsPairwiseForm=*/false,
                                                 CostKind);
    case Intrinsic::vector_reduce_xor:
      return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
                                                 /*IsPairwiseForm=*/false,
                                                 CostKind);
    case Intrinsic::vector_reduce_fadd:
      // FIXME: Add new flag for cost of strict reductions.
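      // For now a strict (ordered) FP reduction is costed like a relaxed tree
      // reduction; see the FIXME above.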
      return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
                                                 /*IsPairwiseForm=*/false,
                                                 CostKind);
    case Intrinsic::vector_reduce_fmul:
      // FIXME: Add new flag for cost of strict reductions.
      return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
                                                 /*IsPairwiseForm=*/false,
                                                 CostKind);
    case Intrinsic::vector_reduce_smax:
    case Intrinsic::vector_reduce_smin:
    case Intrinsic::vector_reduce_fmax:
    case Intrinsic::vector_reduce_fmin:
      return thisT()->getMinMaxReductionCost(
          VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
          /*IsPairwiseForm=*/false,
          /*IsUnsigned=*/false, CostKind);
    case Intrinsic::vector_reduce_umax:
    case Intrinsic::vector_reduce_umin:
      return thisT()->getMinMaxReductionCost(
          VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
          /*IsPairwiseForm=*/false,
          /*IsUnsigned=*/true, CostKind);
    case Intrinsic::abs:
    case Intrinsic::smax:
    case Intrinsic::smin:
    case Intrinsic::umax:
    case Intrinsic::umin: {
      // abs(X) = select(icmp(X,0),X,sub(0,X))
      // minmax(X,Y) = select(icmp(X,Y),X,Y)
      Type *CondTy = RetTy->getWithNewBitWidth(1);
      InstructionCost Cost = 0;
      // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      // TODO: Should we add an OperandValueProperties::OP_Zero property?
      if (IID == Intrinsic::abs)
        Cost += thisT()->getArithmeticInstrCost(
            BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
      return Cost;
    }
    case Intrinsic::sadd_sat:
    case Intrinsic::ssub_sat: {
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
      Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
                                     ? Intrinsic::sadd_with_overflow
                                     : Intrinsic::ssub_with_overflow;

      // SatMax -> Overflow && SumDiff < 0
      // SatMin -> Overflow && SumDiff >= 0
      InstructionCost Cost = 0;
      IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
                                    nullptr, ScalarizationCostPassed);
      Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Cost += 2 * thisT()->getCmpSelInstrCost(
                      BinaryOperator::Select, RetTy, CondTy,
                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost;
    }
    case Intrinsic::uadd_sat:
    case Intrinsic::usub_sat: {
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
      Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
                                     ? Intrinsic::uadd_with_overflow
                                     : Intrinsic::usub_with_overflow;

      InstructionCost Cost = 0;
      IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
                                    nullptr, ScalarizationCostPassed);
      Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost;
    }
    case Intrinsic::smul_fix:
    case Intrinsic::umul_fix: {
      unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
      Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);

      unsigned ExtOp =
          IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
      TTI::CastContextHint CCH = TTI::CastContextHint::None;

      InstructionCost Cost = 0;
      Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
      Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
                                            CCH, CostKind);
      Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
                                              CostKind, TTI::OK_AnyValue,
                                              TTI::OK_UniformConstantValue);
      Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
                                              TTI::OK_AnyValue,
                                              TTI::OK_UniformConstantValue);
      Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
      return Cost;
    }
    case Intrinsic::sadd_with_overflow:
    case Intrinsic::ssub_with_overflow: {
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned Opcode = IID == Intrinsic::sadd_with_overflow
                            ? BinaryOperator::Add
                            : BinaryOperator::Sub;

      // LHSSign -> LHS >= 0
      // RHSSign -> RHS >= 0
      // SumSign -> Sum >= 0
      //
      // Add:
      // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
      // Sub:
      // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
      InstructionCost Cost = 0;
      Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
      Cost += 3 * thisT()->getCmpSelInstrCost(
                      Instruction::ICmp, SumTy, OverflowTy,
                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Cost += 2 * thisT()->getCmpSelInstrCost(
                      Instruction::Select, OverflowTy, OverflowTy,
                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
                                              CostKind);
      return Cost;
    }
    case Intrinsic::uadd_with_overflow:
    case Intrinsic::usub_with_overflow: {
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned Opcode = IID == Intrinsic::uadd_with_overflow
                            ? BinaryOperator::Add
                            : BinaryOperator::Sub;

      InstructionCost Cost = 0;
      Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost;
    }
    case Intrinsic::smul_with_overflow:
    case Intrinsic::umul_with_overflow: {
      Type *MulTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
      Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);

      unsigned ExtOp = IID == Intrinsic::smul_with_overflow
                           ? Instruction::SExt
                           : Instruction::ZExt;
      TTI::CastContextHint CCH = TTI::CastContextHint::None;

      InstructionCost Cost = 0;
      Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
      Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
                                            CCH, CostKind);
      Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
                                              CostKind, TTI::OK_AnyValue,
                                              TTI::OK_UniformConstantValue);

      if (IID == Intrinsic::smul_with_overflow)
        Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
                                                CostKind, TTI::OK_AnyValue,
                                                TTI::OK_UniformConstantValue);

      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost;
    }
    case Intrinsic::ctpop:
      ISDs.push_back(ISD::CTPOP);
      // In case of legalization use TCC_Expensive. This is cheaper than a
      // library call but still not a cheap instruction.
      SingleCallCost = TargetTransformInfo::TCC_Expensive;
      break;
    case Intrinsic::ctlz:
      ISDs.push_back(ISD::CTLZ);
      break;
    case Intrinsic::cttz:
      ISDs.push_back(ISD::CTTZ);
      break;
    case Intrinsic::bswap:
      ISDs.push_back(ISD::BSWAP);
      break;
    case Intrinsic::bitreverse:
      ISDs.push_back(ISD::BITREVERSE);
      break;
    }

    const TargetLoweringBase *TLI = getTLI();
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, RetTy);

    SmallVector<InstructionCost, 2> LegalCost;
    SmallVector<InstructionCost, 2> CustomCost;
    for (unsigned ISD : ISDs) {
      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
        if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
            TLI->isFAbsFree(LT.second)) {
          return 0;
        }

        // The operation is legal. Assume it costs 1.
        // If the type is split to multiple registers, assume that there is
        // some overhead to this.
        // TODO: Once we have extract/insert subvector cost we need to use them.
        if (LT.first > 1)
          LegalCost.push_back(LT.first * 2);
        else
          LegalCost.push_back(LT.first * 1);
      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
        // If the operation is custom lowered then assume
        // that the code is twice as expensive.
        CustomCost.push_back(LT.first * 2);
      }
    }

    auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
    if (MinLegalCostI != LegalCost.end())
      return *MinLegalCostI;

    auto MinCustomCostI =
        std::min_element(CustomCost.begin(), CustomCost.end());
    if (MinCustomCostI != CustomCost.end())
      return *MinCustomCostI;

    // If we can't lower fmuladd into an FMA estimate the cost as a floating
    // point mul followed by an add.
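    // E.g. for <4 x float> this is the cost of one vector fmul plus one vector
    // fadd of that type.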
    if (IID == Intrinsic::fmuladd)
      return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
                                             CostKind) +
             thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
                                             CostKind);
    if (IID == Intrinsic::experimental_constrained_fmuladd) {
      IntrinsicCostAttributes FMulAttrs(
          Intrinsic::experimental_constrained_fmul, RetTy, Tys);
      IntrinsicCostAttributes FAddAttrs(
          Intrinsic::experimental_constrained_fadd, RetTy, Tys);
      return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
             thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
    }

    // Else, assume that we need to scalarize this intrinsic. For math builtins
    // this will emit a costly libcall, adding call overhead and spills. Make it
    // very expensive.
    if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
      // Scalable vectors cannot be scalarized, so return Invalid.
      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
            return isa<ScalableVectorType>(Ty);
          }))
        return InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          SkipScalarizationCost ? ScalarizationCostPassed
                                : getScalarizationOverhead(RetVTy, true, false);

      unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy())
          Ty = Ty->getScalarType();
        ScalarTys.push_back(Ty);
      }
      IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys,
                                    FMF);
      InstructionCost ScalarCost =
          thisT()->getIntrinsicInstrCost(Attrs, CostKind);
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
          if (!ICA.skipScalarizationCost())
            ScalarizationCost += getScalarizationOverhead(VTy, false, true);
          ScalarCalls = std::max(ScalarCalls,
                                 cast<FixedVectorType>(VTy)->getNumElements());
        }
      }
      return ScalarCalls * ScalarCost + ScalarizationCost;
    }

    // This is going to be turned into a library call, make it expensive.
    return SingleCallCost;
  }

  /// Compute the cost of the given call instruction.
  ///
  /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr; in this case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate the cost of calling a
  /// vector counterpart of the given function.
  /// \param F Called function, might be nullptr.
  /// \param RetTy Return value type.
  /// \param Tys Argument types.
  /// \returns The cost of the Call instruction.
  InstructionCost
  getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
                   TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
    return 10;
  }

  unsigned getNumberOfParts(Type *Tp) {
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Tp);
    return *LT.first.getValue();
  }

  InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                            const SCEV *) {
    return 0;
  }

  /// Try to calculate arithmetic and shuffle op costs for reduction
  /// operations. We're assuming that reduction operations are performed in
  /// the following way:
  /// 1. Non-pairwise reduction
  /// %val1 = shufflevector <n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n - 1, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                     n/2 elements                  n/2 elements
  /// %red1 = op <n x t> %val, <n x t> %val1
  /// After this operation we have a vector %red1 where only the first n/2
  /// elements are meaningful, the second n/2 elements are undefined and can be
  /// dropped. All other operations are actually working with the vector of
  /// length n/2, not n, though the real vector length is still n.
  /// %val2 = shufflevector <n x t> %red1, <n x t> %undef,
  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2 - 1, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                     n/4 elements                 3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
  /// length n/2, the resulting vector has length n/4 etc.
  /// 2. Pairwise reduction:
  /// Everything is the same except for an additional shuffle operation which
  /// is used to produce operands for pairwise kind of reductions.
  /// %val1 = shufflevector <n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                  n/2 elements               n/2 elements
  /// %val2 = shufflevector <n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                  n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val1, <n x t> %val2
  /// Again, the operation is performed on <n x t> vector, but the resulting
  /// vector %red1 is <n/2 x t> vector.
  ///
  /// The cost model should take into account that the actual length of the
  /// vector is reduced on each iteration.
  InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                             bool IsPairwise,
                                             TTI::TargetCostKind CostKind) {
    Type *ScalarTy = Ty->getElementType();
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
        ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
        NumVecElts >= 2) {
      // Or reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp ne iReduxWidth %val, 0
      // And reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp eq iReduxWidth %val, 11111
      Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
      return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
                                       TTI::CastContextHint::None, CostKind) +
             thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
                                         CmpInst::makeCmpResultType(ValTy),
                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    InstructionCost ArithCost = 0;
    InstructionCost ShuffleCost = 0;
    std::pair<InstructionCost, MVT> LT =
        thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
      // Assume the pairwise shuffles add a cost.
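      // (IsPairwise + 1) evaluates to 1 for a non-pairwise and 2 for a
      // pairwise reduction, so the extra shuffle is only charged in the
      // pairwise case.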
      ShuffleCost += (IsPairwise + 1) *
                     thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
                                             NumVecElts, SubTy);
      ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.

    // Non-pairwise reductions need one shuffle per reduction level. Pairwise
    // reductions need two shuffles on every level except the last one. On that
    // level one of the shuffles is <0, u, u, ...> which is identity.
    unsigned NumShuffles = NumReduxLevels;
    if (IsPairwise && NumReduxLevels >= 1)
      NumShuffles += NumReduxLevels - 1;
    ShuffleCost += NumShuffles * thisT()->getShuffleCost(
                                     TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
    ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
    return ShuffleCost + ArithCost +
           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  }

  /// Try to calculate op costs for min/max reduction operations.
  /// \param CondTy Conditional type for the Select instruction.
  InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                         bool IsPairwise, bool IsUnsigned,
                                         TTI::TargetCostKind CostKind) {
    Type *ScalarTy = Ty->getElementType();
    Type *ScalarCondTy = CondTy->getElementType();
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned CmpOpcode;
    if (Ty->isFPOrFPVectorTy()) {
      CmpOpcode = Instruction::FCmp;
    } else {
      assert(Ty->isIntOrIntVectorTy() &&
             "expecting floating point or integer type for min/max reduction");
      CmpOpcode = Instruction::ICmp;
    }
    InstructionCost MinMaxCost = 0;
    InstructionCost ShuffleCost = 0;
    std::pair<InstructionCost, MVT> LT =
        thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
      CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);

      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
                                             NumVecElts, SubTy);
      MinMaxCost +=
          thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
          thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.

    // Non-pairwise reductions need one shuffle per reduction level. Pairwise
    // reductions need two shuffles on every level except the last one. On that
    // level one of the shuffles is <0, u, u, ...> which is identity.
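    // Illustrative example (non-pairwise): reducing <16 x i32> on a
    // hypothetical target whose widest legal vector holds 4 x i32 costs two
    // split levels in the loop above (16 -> 8 -> 4), leaving
    // NumReduxLevels = Log2_32(16) - 2 = 2 levels of shuffle + cmp/select
    // here, plus the final extractelement.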
    unsigned NumShuffles = NumReduxLevels;
    if (IsPairwise && NumReduxLevels >= 1)
      NumShuffles += NumReduxLevels - 1;
    ShuffleCost += NumShuffles * thisT()->getShuffleCost(
                                     TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
    MinMaxCost +=
        NumReduxLevels *
        (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind) +
         thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind));
    // The last min/max should be in vector registers and we counted it above.
    // So we just need a single extractelement.
    return ShuffleCost + MinMaxCost +
           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  }

  InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                              Type *ResTy, VectorType *Ty,
                                              TTI::TargetCostKind CostKind) {
    // Without any native support, this is equivalent to the cost of
    // vecreduce.add(ext) or, if IsMLA, vecreduce.add(mul(ext, ext)).
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
    InstructionCost RedCost = thisT()->getArithmeticReductionCost(
        Instruction::Add, ExtTy, false, CostKind);
    InstructionCost MulCost = 0;
    InstructionCost ExtCost = thisT()->getCastInstrCost(
        IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
        TTI::CastContextHint::None, CostKind);
    if (IsMLA) {
      MulCost =
          thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
      ExtCost *= 2;
    }

    return RedCost + MulCost + ExtCost;
  }

  InstructionCost getVectorSplitCost() { return 1; }

  /// @}
};

/// Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  using BaseT = BasicTTIImplBase<BasicTTIImpl>;

  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};

} // end namespace llvm

#endif // LLVM_CODEGEN_BASICTTIIMPL_H