//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }

  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
                                              TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zeroth element
    // plus the cost of inserting it into every element of the result vector.
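    // As a worked example (assuming hypothetical per-lane costs of 1 for both
    // extract and insert), broadcasting lane 0 of a <4 x float> would be
    // modeled as 1 extract + 4 inserts = 5:
    //   %e  = extractelement <4 x float> %v, i32 0
    //   %b0 = insertelement <4 x float> poison, float %e, i32 0
    //   ...repeated for lanes 1..3.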
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                        CostKind, 0, nullptr, nullptr);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
                                            TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting elements from the
    // source vectors plus the cost of inserting them into the result vector.

    // e.g. a <4 x float> shuffle with a mask of <0,5,2,7> extracts from
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector, and index 3 of the second vector, and inserts them
    // at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
                                              TTI::TargetCostKind CostKind,
                                              int Index,
                                              FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting elements
    // from the source type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                      CostKind, i + Index, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
                                             TTI::TargetCostKind CostKind,
                                             int Index,
                                             FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting elements
    // from the source type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
                                      i + Index, nullptr, nullptr);
    }
    return Cost;
  }

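  // A minimal sketch of the subclassing contract described in the class
  // comment, for a hypothetical target "Foo"; real targets (e.g. X86TTIImpl)
  // follow the same shape:
  //
  //   class FooTTIImpl : public BasicTTIImplBase<FooTTIImpl> {
  //     const FooSubtarget *ST;
  //     const FooTargetLowering *TLI;
  //
  //   public:
  //     const FooSubtarget *getST() const { return ST; }
  //     const FooTargetLowering *getTLI() const { return TLI; }
  //   };
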
  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
    case TTI::MIM_Unindexed:
      return ISD::UNINDEXED;
    case TTI::MIM_PreInc:
      return ISD::PRE_INC;
    case TTI::MIM_PreDec:
      return ISD::PRE_DEC;
    case TTI::MIM_PostInc:
      return ISD::POST_INC;
    case TTI::MIM_PostDec:
      return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind) {
    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(DataTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(DataTy);
    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of the individual memory operations.
    InstructionCost AddrExtractCost =
        IsGatherScatter
            ? getVectorInstrCost(Instruction::ExtractElement,
                                 FixedVectorType::get(
                                     PointerType::get(VT->getElementType(), 0),
                                     VT->getNumElements()),
                                 CostKind, -1, nullptr, nullptr)
            : 0;
    InstructionCost LoadCost =
        VT->getNumElements() *
        (AddrExtractCost +
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));

    // Next, compute the cost of packing the result in a vector.
    InstructionCost PackingCost =
        getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                 Opcode == Instruction::Store, CostKind);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // the branches, and PHIs to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
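      //
      // For one lane of a masked load, the modeled expansion looks roughly
      // like (hypothetical IR; one condition extract, one branch, one PHI):
      //   %c0 = extractelement <4 x i1> %mask, i32 0
      //   br i1 %c0, label %cond.load, label %join
      // cond.load:
      //   %v0 = load float, ptr %p0
      //   br label %join
      // join:
      //   %r0 = phi float [ %v0, %cond.load ], [ poison, %entry ]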
      ConditionalCost =
          VT->getNumElements() *
          (getVectorInstrCost(
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
               CostKind, -1, nullptr, nullptr) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
    }

    return LoadCost + PackingCost + ConditionalCost;
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, Align Alignment,
                                      unsigned *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence(const Function *F = nullptr) { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return false;
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return true;
  }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }

  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }

  bool isSingleThreaded() const {
    return getTLI()->getTargetMachine().Options.ThreadModel ==
           ThreadModel::Single;
  }

  std::pair<const Value *, unsigned>
  getPredicatedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
    return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
  }

  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                             Type *ScalarValTy) const {
    auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
      auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
      EVT VT = getTLI()->getValueType(DL, SrcTy);
      if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
          getTLI()->isOperationCustom(ISD::STORE, VT))
        return true;

      EVT ValVT =
          getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
      EVT LegalizedVT =
          getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
      return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
    };
    while (VF > 2 && IsSupportedByTarget(VF))
      VF /= 2;
    return VF;
  }

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool shouldFoldTerminatingConditionAfterLSR() const {
    return TargetTransformInfoImplBase::
        shouldFoldTerminatingConditionAfterLSR();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       int64_t BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
      return 0;
    return -1;
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  unsigned getRegUsageForType(Type *Ty) {
    EVT ETy = getTLI()->getValueType(DL, Ty);
    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands, Type *AccessType,
                             TTI::TargetCostKind CostKind) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model to be used in
    /// other places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test.
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table.
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool shouldBuildRelLookupTables() const {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    // If non-PIC mode, do not generate a relative lookup table.
    if (!TM.isPositionIndependent())
      return false;

    /// Relative lookup table entries consist of 32-bit offsets.
    /// Do not generate relative lookup tables for large code models
    /// in 64-bit architectures where 32-bit offsets might not be enough.
    if (TM.getCodeModel() == CodeModel::Medium ||
        TM.getCodeModel() == CodeModel::Large)
      return false;

    Triple TargetTriple = TM.getTargetTriple();
    if (!TargetTriple.isArch64Bit())
      return false;

    // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
    // there.
    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
      return false;

    return true;
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  InstructionCost getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  bool preferToKeepConstantsAttached(const Instruction &Inst,
                                     const Function &Fn) const {
    switch (Inst.getOpcode()) {
    default:
      break;
    case Instruction::SDiv:
    case Instruction::SRem:
    case Instruction::UDiv:
    case Instruction::URem: {
      if (!isa<ConstantInt>(Inst.getOperand(1)))
        return false;
      EVT VT = getTLI()->getValueType(DL, Inst.getType());
      return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
    }
    };

    return false;
  }

  unsigned getInliningThresholdMultiplier() const { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
    return 0;
  }

  int getInlinerVectorBonusPercent() const { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches.
    //  - The loop must have fewer than 40 uops in all executed loop branches.

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          if (ORE) {
            ORE->emit([&]() {
              return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
                                        L->getHeader())
                     << "advising against unrolling the loop because it "
                        "contains a "
                     << ore::NV("Call", &I);
            });
          }
          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
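    //
    // For example, on a hypothetical subtarget with LoopMicroOpBufferSize = 28
    // (and no PartialUnrollingThreshold override), MaxOps is 28, so a
    // call-free loop body of ~7 instructions could be partially unrolled
    // about 4x (roughly; this ignores the BEInsns adjustment below).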
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) {
    PP.PeelCount = 0;
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
  }

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
    return BaseT::preferPredicateOverEpilogue(TFI);
  }

  TailFoldingStyle
  getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
    return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
  }

  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  std::optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  virtual std::optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return std::optional<unsigned>(
        getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual std::optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    std::optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
    return getST()->shouldPrefetchAddressSpace(AS);
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    return TypeSize::getFixed(32);
  }

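  // Note: the 32-bit default above is deliberately conservative. A target with
  // (say) 128-bit SIMD registers would typically override this; a sketch would
  // be returning TypeSize::getFixed(128) when K is TTI::RGK_FixedWidthVector,
  // though nothing in this base class requires that.
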
  std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
  std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
  bool isVScaleKnownToBeAPowerOfTwo() const { return false; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
  InstructionCost getScalarizationOverhead(VectorType *InTy,
                                           const APInt &DemandedElts,
                                           bool Insert, bool Extract,
                                           TTI::TargetCostKind CostKind) {
    /// FIXME: a bitfield is not a reasonable abstraction for talking about
    /// which elements are needed from a scalable vector
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           "Vector size mismatch");

    InstructionCost Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert)
        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
                                            CostKind, i, nullptr, nullptr);
      if (Extract)
        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                            CostKind, i, nullptr, nullptr);
    }

    return Cost;
  }

  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
                                           bool Extract,
                                           TTI::TargetCostKind CostKind) {
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                             CostKind);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
  InstructionCost
  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                   ArrayRef<Type *> Tys,
                                   TTI::TargetCostKind CostKind) {
    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");

    InstructionCost Cost = 0;
    SmallPtrSet<const Value *, 4> UniqueOperands;
    for (int I = 0, E = Args.size(); I != E; I++) {
      // Disregard things like metadata arguments.
      const Value *A = Args[I];
      Type *Ty = Tys[I];
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        if (auto *VecTy = dyn_cast<VectorType>(Ty))
          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
                                           /*Extract*/ true, CostKind);
      }
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing the inputs and outputs of an
  /// instruction, with return type RetTy and arguments Args of type Tys. If
  /// Args are unknown (empty), then the cost associated with one argument is
  /// added as a heuristic.
  InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                           ArrayRef<const Value *> Args,
                                           ArrayRef<Type *> Tys,
                                           TTI::TargetCostKind CostKind) {
    InstructionCost Cost = getScalarizationOverhead(
        RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
                                       /*Extract*/ true, CostKind);

    return Cost;
  }

  /// Estimate the cost of type-legalization and the legalized type.
  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
    LLVMContext &C = Ty->getContext();
    EVT MTy = getTLI()->getValueType(DL, Ty);

    InstructionCost Cost = 1;
    // We keep legalizing the type until we find a legal kind. We assume that
    // the only operation that costs anything is the split. After splitting
    // we need to handle two types.
    while (true) {
      TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);

      if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
        // Ensure we return a sensible simple VT here, since many callers of
        // this function require it.
        MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
        return std::make_pair(InstructionCost::getInvalid(), VT);
      }

      if (LK.first == TargetLoweringBase::TypeLegal)
        return std::make_pair(Cost, MTy.getSimpleVT());

      if (LK.first == TargetLoweringBase::TypeSplitVector ||
          LK.first == TargetLoweringBase::TypeExpandInteger)
        Cost *= 2;

      // Do not loop with f128 type.
      if (MTy == LK.second)
        return std::make_pair(Cost, MTy.getSimpleVT());

      // Keep legalizing the type.
      MTy = LK.second;
    }
  }

  unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Args, CxtI);

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    InstructionCost OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
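      //
      // Worked example (hypothetical target whose widest legal vector is
      // v2i64/v2f64): an fadd on <8 x double> is split twice, so LT.first = 4
      // and OpCost = 2; a legal or promoted op above costs 4 * 2 = 8, while a
      // custom-lowered one here costs 4 * 2 * 2 = 16.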
      return LT.first * 2 * OpCost;
    }

    // An 'Expand' of URem and SRem is special because it may default
    // to expanding the operation into a sequence of sub-operations
    // i.e. X % Y -> X-(X/Y)*Y.
    if (ISD == ISD::UREM || ISD == ISD::SREM) {
      bool IsSigned = ISD == ISD::SREM;
      if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
                                        LT.second) ||
          TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
                                        LT.second)) {
        unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
        InstructionCost DivCost = thisT()->getArithmeticInstrCost(
            DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
        InstructionCost MulCost =
            thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
        InstructionCost SubCost =
            thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
        return DivCost + MulCost + SubCost;
      }
    }

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      InstructionCost Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Args, CxtI);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      SmallVector<Type *> Tys(Args.size(), Ty);
      return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
             VTy->getNumElements() * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }

  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                              ArrayRef<int> Mask,
                                              VectorType *Ty, int &Index,
                                              VectorType *&SubTy) const {
    if (Mask.empty())
      return Kind;
    int NumSrcElts = Ty->getElementCount().getKnownMinValue();
    switch (Kind) {
    case TTI::SK_PermuteSingleSrc:
      if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
        return TTI::SK_Reverse;
      if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
        return TTI::SK_Broadcast;
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
          (Index + Mask.size()) <= (size_t)NumSrcElts) {
        SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
        return TTI::SK_ExtractSubvector;
      }
      break;
    case TTI::SK_PermuteTwoSrc: {
      int NumSubElts;
      if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                                 Mask, NumSrcElts, NumSubElts, Index)) {
        if (Index + NumSubElts > NumSrcElts)
          return Kind;
        SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
        return TTI::SK_InsertSubvector;
      }
      if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
        return TTI::SK_Select;
      if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
        return TTI::SK_Transpose;
      if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
        return TTI::SK_Splice;
      break;
    }
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Broadcast:
    case TTI::SK_Transpose:
    case TTI::SK_InsertSubvector:
    case TTI::SK_ExtractSubvector:
    case TTI::SK_Splice:
      break;
    }
    return Kind;
  }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = std::nullopt) {
    switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
    case TTI::SK_Broadcast:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getBroadcastShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_Select:
    case TTI::SK_Splice:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getPermuteShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, CostKind, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                   TTI::CastContextHint CCH,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr) {
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

    TypeSize SrcSize = SrcLT.second.getSizeInBits();
    TypeSize DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      [[fallthrough]];
    case Instruction::BitCast:
      // Bitcasts between types that are legalized to the same type are free,
      // and assume int to/from ptr of the same size is also free.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      [[fallthrough]];
    case Instruction::SExt:
      if (I && getTLI()->isExtFree(I))
        return 0;

      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on target and the result type is legal.
      if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
            ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (DstLT.first == SrcLT.first &&
            TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // If the cast is marked as legal (or promote) then assume low cost.
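    // E.g. if a v2i64 -> v2f64 sitofp is Legal on the target, then
    // 'sitofp <2 x i64> %x to <2 x double>' is charged only the
    // type-legalization factor SrcLT.first by the check below (a sketch;
    // actual legality is target-dependent).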
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Handle scalar conversions.
    if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it costs
      // 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (DstVTy && SrcVTy) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
          DstVTy->getElementCount().isVector()) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // If both types need to be split then the split is free.
        InstructionCost SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
                                          CostKind, I));
      }

      // Scalarization cost is Invalid, can't assume any num elements.
      if (isa<ScalableVectorType>(DstVTy))
        return InstructionCost::getInvalid();

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      InstructionCost Cost = thisT()->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
                                      CostKind) +
             Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcasts between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast) {
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
                                                /*Extract*/ true, CostKind)
                     : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
                                                /*Extract*/ false, CostKind)
                     : 0);
    }

    llvm_unreachable("Unhandled cast");
  }

  InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                           VectorType *VecTy, unsigned Index) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                       CostKind, Index, nullptr, nullptr) +
           thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
                                     TTI::CastContextHint::None, CostKind);
  }

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) {
    return BaseT::getCFInstrCost(Opcode, CostKind, I);
  }

  InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                     CmpInst::Predicate VecPred,
                                     TTI::TargetCostKind CostKind,
                                     const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle other cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the operation is scalarized.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
      if (isa<ScalableVectorType>(ValTy))
        return InstructionCost::getInvalid();

      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      InstructionCost Cost = thisT()->getCmpSelInstrCost(
          Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValVTy, /*Insert*/ true,
                                      /*Extract*/ false, CostKind) +
             Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, Value *Op0, Value *Op1) {
    return getRegUsageForType(Val->getScalarType());
  }

  InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index) {
    Value *Op0 = nullptr;
    Value *Op1 = nullptr;
    if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
      Op0 = IE->getOperand(0);
      Op1 = IE->getOperand(1);
    }
    return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
                                       Op1);
  }

  InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                            int VF,
                                            const APInt &DemandedDstElts,
                                            TTI::TargetCostKind CostKind) {
    assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
           "Unexpected size of DemandedDstElts.");

    InstructionCost Cost;

    auto *SrcVT = FixedVectorType::get(EltTy, VF);
    auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);

    // The mask shuffling cost is to extract all the elements of the mask
    // and insert each of them Factor times into the wide vector:
    //
    // E.g. an interleaved group with factor 3:
    //   %mask = icmp ult <8 x i32> %vec1, %vec2
    //   %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
    //     <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
    // The cost is estimated as extracting all mask elements from the <8 x i1>
    // mask vector and inserting them Factor times into the <24 x i1> shuffled
    // mask vector.
    APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
    Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
                                              /*Insert*/ false,
                                              /*Extract*/ true, CostKind);
    Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
                                              /*Insert*/ true,
                                              /*Extract*/ false, CostKind);

    return Cost;
  }

  InstructionCost
  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
                  TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
                  const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    // Assume types, such as structs, are expensive.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
      return 4;
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);

    // Assume that all loads of legal types cost 1.
    InstructionCost Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    const DataLayout &DL = this->getDataLayout();
    if (Src->isVectorTy() &&
        // In practice it's not currently possible to have a change in lane
        // length for extending loads or truncating stores so both types should
        // have the same scalable property.
        TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
                            LT.second.getSizeInBits())) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, then this will scalarize.
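      //
      // E.g. a load of <4 x i8> promoted to v4i32 keeps the cost of LT.first
      // only if the i8 -> i32 extending vector load is Legal or Custom;
      // otherwise the scalarization overhead below is added (a rough model;
      // actual lowering may differ per target).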
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(
            cast<VectorType>(Src), Opcode != Instruction::Store,
            Opcode == Instruction::Store, CostKind);
      }
    }

    return Cost;
  }

  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                        Align Alignment, unsigned AddressSpace,
                                        TTI::TargetCostKind CostKind) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
                                       CostKind);
  }

  InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                         const Value *Ptr, bool VariableMask,
                                         Align Alignment,
                                         TTI::TargetCostKind CostKind,
                                         const Instruction *I = nullptr) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
                                       true, CostKind);
  }

  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) {

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(VecTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(VecTy);

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation.
    InstructionCost Cost;
    if (UseMaskForCond || UseMaskForGaps)
      Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
                                            AddressSpace, CostKind);
    else
      Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
                                      CostKind);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
    unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., an interleaved load of factor 8:
    //   %vec = load <16 x i64>, <16 x i64>* %ptr
    //   %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // TODO: Note that legalization can turn masked loads/stores into unmasked
    // (legalized) loads/stores. This can be reflected in the cost.
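    //
    // Working the factor-8 example through the code below: NumLegalInsts = 8,
    // NumEltsPerLegalInst = 2, and with Indices = {0} and NumSubElts = 2 the
    // used instructions are {0, 4}, so the memory cost is scaled by 2/8.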
    if (Cost.isValid() && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used.
      Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
    }

    // Next, add the cost of the interleave operation.
    assert(Indices.size() <= Factor &&
           "Interleaved memory op has too many members");

    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);

    APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op");
      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the subvectors' elements
      // from the wide vector and inserting them into the subvectors.
      //
      // E.g. an interleaved load of factor 2 (with one member of index 0):
      //   %vec = load <8 x i32>, <8 x i32>* %ptr
      //   %v0 = shuffle %vec, undef, <0, 2, 4, 6>   ; Index 0
      // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
      // <8 x i32> vector and inserting them into a <4 x i32> vector.
      InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
          SubVT, DemandedAllSubElts,
          /*Insert*/ true, /*Extract*/ false, CostKind);
      Cost += Indices.size() * InsSubCost;
      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                /*Insert*/ false,
                                                /*Extract*/ true, CostKind);
    } else {
      // The interleave cost is to extract elements from the subvectors and
      // insert them into the wide vector.
      //
      // E.g. an interleaved store of factor 3 with 2 members at indices 0,1
      // (using VF=4):
      //   %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
      //   %gaps.mask = <true, true, false, true, true, false,
      //                 true, true, false, true, true, false>
      //   call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
      //        i32 Align, <12 x i1> %gaps.mask
      // The cost is estimated as extracting all elements (of actual members,
      // excluding gaps) from both <4 x i32> vectors and inserting them into
      // the <12 x i32> vector.
      InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
          SubVT, DemandedAllSubElts,
          /*Insert*/ false, /*Extract*/ true, CostKind);
      Cost += ExtSubCost * Indices.size();
      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                /*Insert*/ true,
                                                /*Extract*/ false, CostKind);
    }

    if (!UseMaskForCond)
      return Cost;

    Type *I8Type = Type::getInt8Ty(VT->getContext());

    Cost += thisT()->getReplicationShuffleCost(
        I8Type, Factor, NumSubElts,
        UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However, if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
                                              CostKind);
    }

    return Cost;
  }

  /// Get intrinsic cost based on arguments.
  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind) {
    // Check for generically free intrinsics.
    if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
      return 0;

    // Assume that target intrinsics are cheap.
    Intrinsic::ID IID = ICA.getID();
    if (Function::isTargetIntrinsic(IID))
      return TargetTransformInfo::TCC_Basic;

    if (ICA.isTypeBasedOnly())
      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

    Type *RetTy = ICA.getReturnType();

    ElementCount RetVF =
        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                             : ElementCount::getFixed(1));
    const IntrinsicInst *I = ICA.getInst();
    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
    FastMathFlags FMF = ICA.getFlags();
    switch (IID) {
    default:
      break;

    case Intrinsic::powi:
      if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
        bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
        if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
                                               ShouldOptForSize)) {
          // The cost is modeled on the expansion performed by ExpandPowI in
          // SelectionDAGBuilder: for example, powi(x, 11) with exponent
          // 0b1011 (ActiveBits = 4, PopCount = 3) expands via repeated
          // squaring into 4 + 3 - 2 = 5 multiplies, plus one divide if the
          // exponent is negative.
          APInt Exponent = RHSC->getValue().abs();
          unsigned ActiveBits = Exponent.getActiveBits();
          unsigned PopCount = Exponent.popcount();
          InstructionCost Cost = (ActiveBits + PopCount - 2) *
                                 thisT()->getArithmeticInstrCost(
                                     Instruction::FMul, RetTy, CostKind);
          if (RHSC->isNegative())
            Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
                                                    CostKind);
          return Cost;
        }
      }
      break;
    case Intrinsic::cttz:
      // FIXME: If necessary, this should go in target-specific overrides.
      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
        return TargetTransformInfo::TCC_Basic;
      break;

    case Intrinsic::ctlz:
      // FIXME: If necessary, this should go in target-specific overrides.
1576 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy)) 1577 return TargetTransformInfo::TCC_Basic; 1578 break; 1579 1580 case Intrinsic::memcpy: 1581 return thisT()->getMemcpyCost(ICA.getInst()); 1582 1583 case Intrinsic::masked_scatter: { 1584 const Value *Mask = Args[3]; 1585 bool VarMask = !isa<Constant>(Mask); 1586 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); 1587 return thisT()->getGatherScatterOpCost(Instruction::Store, 1588 ICA.getArgTypes()[0], Args[1], 1589 VarMask, Alignment, CostKind, I); 1590 } 1591 case Intrinsic::masked_gather: { 1592 const Value *Mask = Args[2]; 1593 bool VarMask = !isa<Constant>(Mask); 1594 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); 1595 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], 1596 VarMask, Alignment, CostKind, I); 1597 } 1598 case Intrinsic::experimental_stepvector: { 1599 if (isa<ScalableVectorType>(RetTy)) 1600 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1601 // The cost of materialising a constant integer vector. 1602 return TargetTransformInfo::TCC_Basic; 1603 } 1604 case Intrinsic::vector_extract: { 1605 // FIXME: Handle case where a scalable vector is extracted from a scalable 1606 // vector 1607 if (isa<ScalableVectorType>(RetTy)) 1608 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1609 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); 1610 return thisT()->getShuffleCost( 1611 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()), 1612 std::nullopt, CostKind, Index, cast<VectorType>(RetTy)); 1613 } 1614 case Intrinsic::vector_insert: { 1615 // FIXME: Handle case where a scalable vector is inserted into a scalable 1616 // vector 1617 if (isa<ScalableVectorType>(Args[1]->getType())) 1618 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1619 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1620 return thisT()->getShuffleCost( 1621 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), 1622 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType())); 1623 } 1624 case Intrinsic::experimental_vector_reverse: { 1625 return thisT()->getShuffleCost( 1626 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt, 1627 CostKind, 0, cast<VectorType>(RetTy)); 1628 } 1629 case Intrinsic::experimental_vector_splice: { 1630 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1631 return thisT()->getShuffleCost( 1632 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt, 1633 CostKind, Index, cast<VectorType>(RetTy)); 1634 } 1635 case Intrinsic::vector_reduce_add: 1636 case Intrinsic::vector_reduce_mul: 1637 case Intrinsic::vector_reduce_and: 1638 case Intrinsic::vector_reduce_or: 1639 case Intrinsic::vector_reduce_xor: 1640 case Intrinsic::vector_reduce_smax: 1641 case Intrinsic::vector_reduce_smin: 1642 case Intrinsic::vector_reduce_fmax: 1643 case Intrinsic::vector_reduce_fmin: 1644 case Intrinsic::vector_reduce_fmaximum: 1645 case Intrinsic::vector_reduce_fminimum: 1646 case Intrinsic::vector_reduce_umax: 1647 case Intrinsic::vector_reduce_umin: { 1648 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); 1649 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1650 } 1651 case Intrinsic::vector_reduce_fadd: 1652 case Intrinsic::vector_reduce_fmul: { 1653 IntrinsicCostAttributes Attrs( 1654 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); 1655 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1656 } 1657 case Intrinsic::fshl: 1658 case 
Intrinsic::fshr: { 1659 const Value *X = Args[0]; 1660 const Value *Y = Args[1]; 1661 const Value *Z = Args[2]; 1662 const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X); 1663 const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y); 1664 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z); 1665 const TTI::OperandValueInfo OpInfoBW = 1666 {TTI::OK_UniformConstantValue, 1667 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 1668 : TTI::OP_None}; 1669 1670 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) 1671 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 1672 InstructionCost Cost = 0; 1673 Cost += 1674 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); 1675 Cost += 1676 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); 1677 Cost += thisT()->getArithmeticInstrCost( 1678 BinaryOperator::Shl, RetTy, CostKind, OpInfoX, 1679 {OpInfoZ.Kind, TTI::OP_None}); 1680 Cost += thisT()->getArithmeticInstrCost( 1681 BinaryOperator::LShr, RetTy, CostKind, OpInfoY, 1682 {OpInfoZ.Kind, TTI::OP_None}); 1683 // Non-constant shift amounts require a modulo. 1684 if (!OpInfoZ.isConstant()) 1685 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, 1686 CostKind, OpInfoZ, OpInfoBW); 1687 // For non-rotates (X != Y) we must add shift-by-zero handling costs. 1688 if (X != Y) { 1689 Type *CondTy = RetTy->getWithNewBitWidth(1); 1690 Cost += 1691 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1692 CmpInst::ICMP_EQ, CostKind); 1693 Cost += 1694 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1695 CmpInst::ICMP_EQ, CostKind); 1696 } 1697 return Cost; 1698 } 1699 case Intrinsic::get_active_lane_mask: { 1700 EVT ResVT = getTLI()->getValueType(DL, RetTy, true); 1701 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); 1702 1703 // If we're not expanding the intrinsic, then we assume this is cheap 1704 // to implement. 1705 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { 1706 return getTypeLegalizationCost(RetTy).first; 1707 } 1708 1709 // Create the expanded types that will be used to calculate the uadd_sat 1710 // operation. 1711 Type *ExpRetTy = VectorType::get( 1712 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount()); 1713 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); 1714 InstructionCost Cost = 1715 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1716 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, 1717 CmpInst::ICMP_ULT, CostKind); 1718 return Cost; 1719 } 1720 } 1721 1722 // VP Intrinsics should have the same cost as their non-vp counterpart. 1723 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp 1724 // counterpart when the vector length argument is smaller than the maximum 1725 // vector length.
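// For example, under the current model the cost of
//   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %x, <4 x i32> %y,
//                                          <4 x i1> %m, i32 %evl)
// is the same as the cost of a plain <4 x i32> add, regardless of %evl.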
1726 // TODO: Support other kinds of VPIntrinsics 1727 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) { 1728 std::optional<unsigned> FOp = 1729 VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID()); 1730 if (FOp) { 1731 if (ICA.getID() == Intrinsic::vp_load) { 1732 Align Alignment; 1733 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst())) 1734 Alignment = VPI->getPointerAlignment().valueOrOne(); 1735 unsigned AS = 0; 1736 if (ICA.getArgs().size() > 1) 1737 if (auto *PtrTy = 1738 dyn_cast<PointerType>(ICA.getArgs()[0]->getType())) 1739 AS = PtrTy->getAddressSpace(); 1740 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment, 1741 AS, CostKind); 1742 } 1743 if (ICA.getID() == Intrinsic::vp_store) { 1744 Align Alignment; 1745 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst())) 1746 Alignment = VPI->getPointerAlignment().valueOrOne(); 1747 unsigned AS = 0; 1748 if (ICA.getArgs().size() >= 2) 1749 if (auto *PtrTy = 1750 dyn_cast<PointerType>(ICA.getArgs()[1]->getType())) 1751 AS = PtrTy->getAddressSpace(); 1752 return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment, 1753 AS, CostKind); 1754 } 1755 if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) { 1756 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(), 1757 CostKind); 1758 } 1759 } 1760 1761 std::optional<Intrinsic::ID> FID = 1762 VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID()); 1763 if (FID) { 1764 // The non-vp version will have the same Args/Tys, except for the mask and vector length. 1765 assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 && 1766 "Expected VPIntrinsic to have Mask and Vector Length args and " 1767 "types"); 1768 ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2); 1769 1770 // VPReduction intrinsics have a start value argument that their non-vp 1771 // counterparts do not have, except for the fadd and fmul non-vp 1772 // counterparts. 1773 if (VPReductionIntrinsic::isVPReduction(ICA.getID()) && 1774 *FID != Intrinsic::vector_reduce_fadd && 1775 *FID != Intrinsic::vector_reduce_fmul) 1776 NewTys = NewTys.drop_front(); 1777 1778 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys, 1779 ICA.getFlags()); 1780 return thisT()->getIntrinsicInstrCost(NewICA, CostKind); 1781 } 1782 } 1783 1784 // Assume that we need to scalarize this intrinsic. 1785 // Compute the scalarization overhead based on Args for a vector 1786 // intrinsic. 1787 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 1788 if (RetVF.isVector() && !RetVF.isScalable()) { 1789 ScalarizationCost = 0; 1790 if (!RetTy->isVoidTy()) 1791 ScalarizationCost += getScalarizationOverhead( 1792 cast<VectorType>(RetTy), 1793 /*Insert*/ true, /*Extract*/ false, CostKind); 1794 ScalarizationCost += 1795 getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind); 1796 } 1797 1798 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, 1799 ScalarizationCost); 1800 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1801 } 1802 1803 /// Get intrinsic cost based on argument types. 1804 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the 1805 /// cost of scalarizing the arguments and the return value will be computed 1806 /// based on types.
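/// For example, an intrinsic whose ISD counterpart is legal for the
/// legalized return type is costed at roughly LT.first (doubled if the
/// type is split across registers), while an intrinsic that must be
/// scalarized is costed as the number of scalar calls times the scalar
/// call cost, plus the insert/extract overhead.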
1807 InstructionCost 1808 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1809 TTI::TargetCostKind CostKind) { 1810 Intrinsic::ID IID = ICA.getID(); 1811 Type *RetTy = ICA.getReturnType(); 1812 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); 1813 FastMathFlags FMF = ICA.getFlags(); 1814 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); 1815 bool SkipScalarizationCost = ICA.skipScalarizationCost(); 1816 1817 VectorType *VecOpTy = nullptr; 1818 if (!Tys.empty()) { 1819 // The vector reduction operand is operand 0 except for fadd/fmul. 1820 // Their operand 0 is a scalar start value, so the vector op is operand 1. 1821 unsigned VecTyIndex = 0; 1822 if (IID == Intrinsic::vector_reduce_fadd || 1823 IID == Intrinsic::vector_reduce_fmul) 1824 VecTyIndex = 1; 1825 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes"); 1826 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); 1827 } 1828 1829 // Library call cost - other than size, make it expensive. 1830 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; 1831 unsigned ISD = 0; 1832 switch (IID) { 1833 default: { 1834 // Scalable vectors cannot be scalarized, so return Invalid. 1835 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 1836 return isa<ScalableVectorType>(Ty); 1837 })) 1838 return InstructionCost::getInvalid(); 1839 1840 // Assume that we need to scalarize this intrinsic. 1841 InstructionCost ScalarizationCost = 1842 SkipScalarizationCost ? ScalarizationCostPassed : 0; 1843 unsigned ScalarCalls = 1; 1844 Type *ScalarRetTy = RetTy; 1845 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 1846 if (!SkipScalarizationCost) 1847 ScalarizationCost = getScalarizationOverhead( 1848 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); 1849 ScalarCalls = std::max(ScalarCalls, 1850 cast<FixedVectorType>(RetVTy)->getNumElements()); 1851 ScalarRetTy = RetTy->getScalarType(); 1852 } 1853 SmallVector<Type *, 4> ScalarTys; 1854 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1855 Type *Ty = Tys[i]; 1856 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 1857 if (!SkipScalarizationCost) 1858 ScalarizationCost += getScalarizationOverhead( 1859 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 1860 ScalarCalls = std::max(ScalarCalls, 1861 cast<FixedVectorType>(VTy)->getNumElements()); 1862 Ty = Ty->getScalarType(); 1863 } 1864 ScalarTys.push_back(Ty); 1865 } 1866 if (ScalarCalls == 1) 1867 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. 1868 1869 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); 1870 InstructionCost ScalarCost = 1871 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); 1872 1873 return ScalarCalls * ScalarCost + ScalarizationCost; 1874 } 1875 // Look for intrinsics that can be lowered directly or turned into a scalar 1876 // intrinsic call. 
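// Each case below maps the intrinsic to the ISD opcode that is used to
// query the target's lowering; e.g. @llvm.sqrt maps to ISD::FSQRT, and the
// final cost then depends on how that opcode legalizes for the return
// type: roughly LT.first if it is legal, and an expensive library call if
// it must be expanded.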
1877 case Intrinsic::sqrt: 1878 ISD = ISD::FSQRT; 1879 break; 1880 case Intrinsic::sin: 1881 ISD = ISD::FSIN; 1882 break; 1883 case Intrinsic::cos: 1884 ISD = ISD::FCOS; 1885 break; 1886 case Intrinsic::exp: 1887 ISD = ISD::FEXP; 1888 break; 1889 case Intrinsic::exp2: 1890 ISD = ISD::FEXP2; 1891 break; 1892 case Intrinsic::exp10: 1893 ISD = ISD::FEXP10; 1894 break; 1895 case Intrinsic::log: 1896 ISD = ISD::FLOG; 1897 break; 1898 case Intrinsic::log10: 1899 ISD = ISD::FLOG10; 1900 break; 1901 case Intrinsic::log2: 1902 ISD = ISD::FLOG2; 1903 break; 1904 case Intrinsic::fabs: 1905 ISD = ISD::FABS; 1906 break; 1907 case Intrinsic::canonicalize: 1908 ISD = ISD::FCANONICALIZE; 1909 break; 1910 case Intrinsic::minnum: 1911 ISD = ISD::FMINNUM; 1912 break; 1913 case Intrinsic::maxnum: 1914 ISD = ISD::FMAXNUM; 1915 break; 1916 case Intrinsic::minimum: 1917 ISD = ISD::FMINIMUM; 1918 break; 1919 case Intrinsic::maximum: 1920 ISD = ISD::FMAXIMUM; 1921 break; 1922 case Intrinsic::copysign: 1923 ISD = ISD::FCOPYSIGN; 1924 break; 1925 case Intrinsic::floor: 1926 ISD = ISD::FFLOOR; 1927 break; 1928 case Intrinsic::ceil: 1929 ISD = ISD::FCEIL; 1930 break; 1931 case Intrinsic::trunc: 1932 ISD = ISD::FTRUNC; 1933 break; 1934 case Intrinsic::nearbyint: 1935 ISD = ISD::FNEARBYINT; 1936 break; 1937 case Intrinsic::rint: 1938 ISD = ISD::FRINT; 1939 break; 1940 case Intrinsic::lrint: 1941 ISD = ISD::LRINT; 1942 break; 1943 case Intrinsic::llrint: 1944 ISD = ISD::LLRINT; 1945 break; 1946 case Intrinsic::round: 1947 ISD = ISD::FROUND; 1948 break; 1949 case Intrinsic::roundeven: 1950 ISD = ISD::FROUNDEVEN; 1951 break; 1952 case Intrinsic::pow: 1953 ISD = ISD::FPOW; 1954 break; 1955 case Intrinsic::fma: 1956 ISD = ISD::FMA; 1957 break; 1958 case Intrinsic::fmuladd: 1959 ISD = ISD::FMA; 1960 break; 1961 case Intrinsic::experimental_constrained_fmuladd: 1962 ISD = ISD::STRICT_FMA; 1963 break; 1964 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 
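// The intrinsics below expand to no machine code, so their cost is zero.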
1965 case Intrinsic::lifetime_start: 1966 case Intrinsic::lifetime_end: 1967 case Intrinsic::sideeffect: 1968 case Intrinsic::pseudoprobe: 1969 case Intrinsic::arithmetic_fence: 1970 return 0; 1971 case Intrinsic::masked_store: { 1972 Type *Ty = Tys[0]; 1973 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 1974 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, 1975 CostKind); 1976 } 1977 case Intrinsic::masked_load: { 1978 Type *Ty = RetTy; 1979 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 1980 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, 1981 CostKind); 1982 } 1983 case Intrinsic::vector_reduce_add: 1984 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, 1985 std::nullopt, CostKind); 1986 case Intrinsic::vector_reduce_mul: 1987 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, 1988 std::nullopt, CostKind); 1989 case Intrinsic::vector_reduce_and: 1990 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, 1991 std::nullopt, CostKind); 1992 case Intrinsic::vector_reduce_or: 1993 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, 1994 std::nullopt, CostKind); 1995 case Intrinsic::vector_reduce_xor: 1996 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, 1997 std::nullopt, CostKind); 1998 case Intrinsic::vector_reduce_fadd: 1999 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, 2000 FMF, CostKind); 2001 case Intrinsic::vector_reduce_fmul: 2002 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, 2003 FMF, CostKind); 2004 case Intrinsic::vector_reduce_smax: 2005 return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy, 2006 ICA.getFlags(), CostKind); 2007 case Intrinsic::vector_reduce_smin: 2008 return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy, 2009 ICA.getFlags(), CostKind); 2010 case Intrinsic::vector_reduce_umax: 2011 return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy, 2012 ICA.getFlags(), CostKind); 2013 case Intrinsic::vector_reduce_umin: 2014 return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy, 2015 ICA.getFlags(), CostKind); 2016 case Intrinsic::vector_reduce_fmax: 2017 return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy, 2018 ICA.getFlags(), CostKind); 2019 case Intrinsic::vector_reduce_fmin: 2020 return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy, 2021 ICA.getFlags(), CostKind); 2022 case Intrinsic::vector_reduce_fmaximum: 2023 return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy, 2024 ICA.getFlags(), CostKind); 2025 case Intrinsic::vector_reduce_fminimum: 2026 return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy, 2027 ICA.getFlags(), CostKind); 2028 case Intrinsic::abs: { 2029 // abs(X) = select(icmp(X,0),X,sub(0,X)) 2030 Type *CondTy = RetTy->getWithNewBitWidth(1); 2031 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2032 InstructionCost Cost = 0; 2033 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2034 Pred, CostKind); 2035 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2036 Pred, CostKind); 2037 // TODO: Should we add an OperandValueProperties::OP_Zero property? 
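// In the sub(0, X) above, the first operand is the constant zero, which
// is why OK_UniformConstantValue is passed as the operand info below.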
2038 Cost += thisT()->getArithmeticInstrCost( 2039 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None}); 2040 return Cost; 2041 } 2042 case Intrinsic::smax: 2043 case Intrinsic::smin: 2044 case Intrinsic::umax: 2045 case Intrinsic::umin: { 2046 // minmax(X,Y) = select(icmp(X,Y),X,Y) 2047 Type *CondTy = RetTy->getWithNewBitWidth(1); 2048 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; 2049 CmpInst::Predicate Pred = 2050 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; 2051 InstructionCost Cost = 0; 2052 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2053 Pred, CostKind); 2054 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2055 Pred, CostKind); 2056 return Cost; 2057 } 2058 case Intrinsic::sadd_sat: 2059 case Intrinsic::ssub_sat: { 2060 Type *CondTy = RetTy->getWithNewBitWidth(1); 2061 2062 Type *OpTy = StructType::create({RetTy, CondTy}); 2063 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat 2064 ? Intrinsic::sadd_with_overflow 2065 : Intrinsic::ssub_with_overflow; 2066 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2067 2068 // SatMax -> Overflow && SumDiff < 0 2069 // SatMin -> Overflow && SumDiff >= 0 2070 InstructionCost Cost = 0; 2071 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2072 nullptr, ScalarizationCostPassed); 2073 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2074 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2075 Pred, CostKind); 2076 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, 2077 CondTy, Pred, CostKind); 2078 return Cost; 2079 } 2080 case Intrinsic::uadd_sat: 2081 case Intrinsic::usub_sat: { 2082 Type *CondTy = RetTy->getWithNewBitWidth(1); 2083 2084 Type *OpTy = StructType::create({RetTy, CondTy}); 2085 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat 2086 ? Intrinsic::uadd_with_overflow 2087 : Intrinsic::usub_with_overflow; 2088 2089 InstructionCost Cost = 0; 2090 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2091 nullptr, ScalarizationCostPassed); 2092 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2093 Cost += 2094 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2095 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2096 return Cost; 2097 } 2098 case Intrinsic::smul_fix: 2099 case Intrinsic::umul_fix: { 2100 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; 2101 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); 2102 2103 unsigned ExtOp = 2104 IID == Intrinsic::smul_fix ? 
Instruction::SExt : Instruction::ZExt; 2105 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2106 2107 InstructionCost Cost = 0; 2108 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); 2109 Cost += 2110 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2111 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, 2112 CCH, CostKind); 2113 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, 2114 CostKind, 2115 {TTI::OK_AnyValue, TTI::OP_None}, 2116 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2117 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, 2118 {TTI::OK_AnyValue, TTI::OP_None}, 2119 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2120 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); 2121 return Cost; 2122 } 2123 case Intrinsic::sadd_with_overflow: 2124 case Intrinsic::ssub_with_overflow: { 2125 Type *SumTy = RetTy->getContainedType(0); 2126 Type *OverflowTy = RetTy->getContainedType(1); 2127 unsigned Opcode = IID == Intrinsic::sadd_with_overflow 2128 ? BinaryOperator::Add 2129 : BinaryOperator::Sub; 2130 2131 // Add: 2132 // Overflow -> (Result < LHS) ^ (RHS < 0) 2133 // Sub: 2134 // Overflow -> (Result < LHS) ^ (RHS > 0) 2135 InstructionCost Cost = 0; 2136 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2137 Cost += 2 * thisT()->getCmpSelInstrCost( 2138 Instruction::ICmp, SumTy, OverflowTy, 2139 CmpInst::ICMP_SGT, CostKind); 2140 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, 2141 CostKind); 2142 return Cost; 2143 } 2144 case Intrinsic::uadd_with_overflow: 2145 case Intrinsic::usub_with_overflow: { 2146 Type *SumTy = RetTy->getContainedType(0); 2147 Type *OverflowTy = RetTy->getContainedType(1); 2148 unsigned Opcode = IID == Intrinsic::uadd_with_overflow 2149 ? BinaryOperator::Add 2150 : BinaryOperator::Sub; 2151 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow 2152 ? CmpInst::ICMP_ULT 2153 : CmpInst::ICMP_UGT; 2154 2155 InstructionCost Cost = 0; 2156 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2157 Cost += 2158 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, 2159 Pred, CostKind); 2160 return Cost; 2161 } 2162 case Intrinsic::smul_with_overflow: 2163 case Intrinsic::umul_with_overflow: { 2164 Type *MulTy = RetTy->getContainedType(0); 2165 Type *OverflowTy = RetTy->getContainedType(1); 2166 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; 2167 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); 2168 bool IsSigned = IID == Intrinsic::smul_with_overflow; 2169 2170 unsigned ExtOp = IsSigned ? 
Instruction::SExt : Instruction::ZExt; 2171 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2172 2173 InstructionCost Cost = 0; 2174 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); 2175 Cost += 2176 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2177 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, 2178 CCH, CostKind); 2179 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, 2180 CostKind, 2181 {TTI::OK_AnyValue, TTI::OP_None}, 2182 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2183 2184 if (IsSigned) 2185 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, 2186 CostKind, 2187 {TTI::OK_AnyValue, TTI::OP_None}, 2188 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2189 2190 Cost += thisT()->getCmpSelInstrCost( 2191 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); 2192 return Cost; 2193 } 2194 case Intrinsic::fptosi_sat: 2195 case Intrinsic::fptoui_sat: { 2196 if (Tys.empty()) 2197 break; 2198 Type *FromTy = Tys[0]; 2199 bool IsSigned = IID == Intrinsic::fptosi_sat; 2200 2201 InstructionCost Cost = 0; 2202 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, 2203 {FromTy, FromTy}); 2204 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); 2205 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, 2206 {FromTy, FromTy}); 2207 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); 2208 Cost += thisT()->getCastInstrCost( 2209 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, 2210 TTI::CastContextHint::None, CostKind); 2211 if (IsSigned) { 2212 Type *CondTy = RetTy->getWithNewBitWidth(1); 2213 Cost += thisT()->getCmpSelInstrCost( 2214 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2215 Cost += thisT()->getCmpSelInstrCost( 2216 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2217 } 2218 return Cost; 2219 } 2220 case Intrinsic::ctpop: 2221 ISD = ISD::CTPOP; 2222 // In case of legalization, use TCC_Expensive. This is cheaper than a 2223 // library call but still not a cheap instruction. 2224 SingleCallCost = TargetTransformInfo::TCC_Expensive; 2225 break; 2226 case Intrinsic::ctlz: 2227 ISD = ISD::CTLZ; 2228 break; 2229 case Intrinsic::cttz: 2230 ISD = ISD::CTTZ; 2231 break; 2232 case Intrinsic::bswap: 2233 ISD = ISD::BSWAP; 2234 break; 2235 case Intrinsic::bitreverse: 2236 ISD = ISD::BITREVERSE; 2237 break; 2238 } 2239 2240 const TargetLoweringBase *TLI = getTLI(); 2241 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy); 2242 2243 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 2244 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && 2245 TLI->isFAbsFree(LT.second)) { 2246 return 0; 2247 } 2248 2249 // The operation is legal. Assume it costs 1. 2250 // If the type is split to multiple registers, assume that there is some 2251 // overhead to this. 2252 // TODO: Once we have extract/insert subvector cost we need to use them. 2253 if (LT.first > 1) 2254 return (LT.first * 2); 2255 else 2256 return (LT.first * 1); 2257 } else if (!TLI->isOperationExpand(ISD, LT.second)) { 2258 // If the operation is custom lowered, then assume 2259 // that the code is twice as expensive. 2260 return (LT.first * 2); 2261 } 2262 2263 // If we can't lower fmuladd into an FMA, estimate the cost as a floating 2264 // point mul followed by an add.
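// For example, a <2 x double> fmuladd on such a target is costed as the
// sum of a <2 x double> fmul and a <2 x double> fadd.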
2265 if (IID == Intrinsic::fmuladd) 2266 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, 2267 CostKind) + 2268 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, 2269 CostKind); 2270 if (IID == Intrinsic::experimental_constrained_fmuladd) { 2271 IntrinsicCostAttributes FMulAttrs( 2272 Intrinsic::experimental_constrained_fmul, RetTy, Tys); 2273 IntrinsicCostAttributes FAddAttrs( 2274 Intrinsic::experimental_constrained_fadd, RetTy, Tys); 2275 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + 2276 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); 2277 } 2278 2279 // Else, assume that we need to scalarize this intrinsic. For math builtins 2280 // this will emit a costly libcall, adding call overhead and spills. Make it 2281 // very expensive. 2282 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 2283 // Scalable vectors cannot be scalarized, so return Invalid. 2284 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 2285 return isa<ScalableVectorType>(Ty); 2286 })) 2287 return InstructionCost::getInvalid(); 2288 2289 InstructionCost ScalarizationCost = 2290 SkipScalarizationCost 2291 ? ScalarizationCostPassed 2292 : getScalarizationOverhead(RetVTy, /*Insert*/ true, 2293 /*Extract*/ false, CostKind); 2294 2295 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); 2296 SmallVector<Type *, 4> ScalarTys; 2297 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 2298 Type *Ty = Tys[i]; 2299 if (Ty->isVectorTy()) 2300 Ty = Ty->getScalarType(); 2301 ScalarTys.push_back(Ty); 2302 } 2303 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); 2304 InstructionCost ScalarCost = 2305 thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2306 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 2307 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { 2308 if (!ICA.skipScalarizationCost()) 2309 ScalarizationCost += getScalarizationOverhead( 2310 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 2311 ScalarCalls = std::max(ScalarCalls, 2312 cast<FixedVectorType>(VTy)->getNumElements()); 2313 } 2314 } 2315 return ScalarCalls * ScalarCost + ScalarizationCost; 2316 } 2317 2318 // This is going to be turned into a library call; make it expensive. 2319 return SingleCallCost; 2320 } 2321 2322 /// Compute a cost of the given call instruction. 2323 /// 2324 /// Compute the cost of calling function F with return type RetTy and 2325 /// argument types Tys. F might be nullptr; in that case the cost of an 2326 /// arbitrary call with the specified signature will be returned. 2327 /// This is used, for instance, when we estimate the call of a vector 2328 /// counterpart of the given function. 2329 /// \param F Called function, might be nullptr. 2330 /// \param RetTy Return value type. 2331 /// \param Tys Argument types. 2332 /// \returns The cost of the call instruction. 2333 InstructionCost getCallInstrCost(Function *F, Type *RetTy, 2334 ArrayRef<Type *> Tys, 2335 TTI::TargetCostKind CostKind) { 2336 return 10; 2337 } 2338 2339 unsigned getNumberOfParts(Type *Tp) { 2340 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 2341 return LT.first.isValid() ? *LT.first.getValue() : 0; 2342 } 2343 2344 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, 2345 const SCEV *) { 2346 return 0; 2347 } 2348 2349 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2350 /// We're assuming that reduction operations are performed in the following way: 2351 /// 2352 /// %val1 = shufflevector<n x t> %val, <n x t> %undef, 2353 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n - 1, i32 undef, ..., i32 undef> 2354 /// \----------------v-------------/ \----------v------------/ 2355 /// n/2 elements n/2 elements 2356 /// %red1 = op <n x t> %val, <n x t> %val1 2357 /// After this operation we have a vector %red1 where only the first n/2 2358 /// elements are meaningful; the second n/2 elements are undefined and can be 2359 /// dropped. All other operations are actually working with the vector of 2360 /// length n/2, not n, though the real vector length is still n. 2361 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, 2362 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2 - 1, i32 undef, ..., i32 undef> 2363 /// \----------------v-------------/ \----------v------------/ 2364 /// n/4 elements 3*n/4 elements 2365 /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of 2366 /// length n/2, the resulting vector has length n/4, etc. 2367 /// 2368 /// The cost model should take into account that the actual length of the 2369 /// vector is reduced on each iteration. 2370 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, 2371 TTI::TargetCostKind CostKind) { 2372 // Targets must implement a default value for the scalable case, since 2373 // we don't know how many lanes the vector has. 2374 if (isa<ScalableVectorType>(Ty)) 2375 return InstructionCost::getInvalid(); 2376 2377 Type *ScalarTy = Ty->getElementType(); 2378 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2379 if ((Opcode == Instruction::Or || Opcode == Instruction::And) && 2380 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && 2381 NumVecElts >= 2) { 2382 // Or reduction for i1 is represented as: 2383 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2384 // %res = cmp ne iReduxWidth %val, 0 2385 // And reduction for i1 is represented as: 2386 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2387 // %res = cmp eq iReduxWidth %val, 11111 2388 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); 2389 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, 2390 TTI::CastContextHint::None, CostKind) + 2391 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, 2392 CmpInst::makeCmpResultType(ValTy), 2393 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2394 } 2395 unsigned NumReduxLevels = Log2_32(NumVecElts); 2396 InstructionCost ArithCost = 0; 2397 InstructionCost ShuffleCost = 0; 2398 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); 2399 unsigned LongVectorCount = 0; 2400 unsigned MVTLen = 2401 LT.second.isVector() ? LT.second.getVectorNumElements() : 1; 2402 while (NumVecElts > MVTLen) { 2403 NumVecElts /= 2; 2404 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2405 ShuffleCost += 2406 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 2407 CostKind, NumVecElts, SubTy); 2408 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); 2409 Ty = SubTy; 2410 ++LongVectorCount; 2411 } 2412 2413 NumReduxLevels -= LongVectorCount; 2414 2415 // The minimal length of the vector is limited by the real length of vector 2416 // operations performed on the current platform. That's why several final 2417 // reduction operations are performed on the vectors with the same 2418 // architecture-dependent length. 2419 2420 // By default, reductions need one shuffle per reduction level.
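// For example, for an add reduction of <16 x i32> on a target whose
// widest legal vector is <4 x i32>: the loop above halves 16 -> 8 -> 4,
// paying one extract-subvector shuffle and one add at each step; then
// log2(4) = 2 shuffle+add levels are paid at the legal width below, plus
// a final extractelement of lane 0.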
2421 ShuffleCost += 2422 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 2423 std::nullopt, CostKind, 0, Ty); 2424 ArithCost += 2425 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); 2426 return ShuffleCost + ArithCost + 2427 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 2428 CostKind, 0, nullptr, nullptr); 2429 } 2430 2431 /// Try to calculate the cost of performing strict (in-order) reductions, 2432 /// which involves doing a sequence of floating point additions in lane 2433 /// order, starting with an initial value. For example, consider a scalar 2434 /// initial value 'InitVal' of type float and a vector of type <4 x float>: 2435 /// 2436 /// Vector = <float %v0, float %v1, float %v2, float %v3> 2437 /// 2438 /// %add1 = %InitVal + %v0 2439 /// %add2 = %add1 + %v1 2440 /// %add3 = %add2 + %v2 2441 /// %add4 = %add3 + %v3 2442 /// 2443 /// As a simple estimate, we can say the cost of such a reduction is 4 times 2444 /// the cost of a scalar FP addition. We can only estimate the costs for 2445 /// fixed-width vectors here because for scalable vectors we do not know the 2446 /// runtime number of operations. 2447 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, 2448 TTI::TargetCostKind CostKind) { 2449 // Targets must implement a default value for the scalable case, since 2450 // we don't know how many lanes the vector has. 2451 if (isa<ScalableVectorType>(Ty)) 2452 return InstructionCost::getInvalid(); 2453 2454 auto *VTy = cast<FixedVectorType>(Ty); 2455 InstructionCost ExtractCost = getScalarizationOverhead( 2456 VTy, /*Insert=*/false, /*Extract=*/true, CostKind); 2457 InstructionCost ArithCost = thisT()->getArithmeticInstrCost( 2458 Opcode, VTy->getElementType(), CostKind); 2459 ArithCost *= VTy->getNumElements(); 2460 2461 return ExtractCost + ArithCost; 2462 } 2463 2464 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, 2465 std::optional<FastMathFlags> FMF, 2466 TTI::TargetCostKind CostKind) { 2467 assert(Ty && "Unknown reduction vector type"); 2468 if (TTI::requiresOrderedReduction(FMF)) 2469 return getOrderedReductionCost(Opcode, Ty, CostKind); 2470 return getTreeReductionCost(Opcode, Ty, CostKind); 2471 } 2472 2473 /// Try to calculate op costs for min/max reduction operations. 2475 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, 2476 FastMathFlags FMF, 2477 TTI::TargetCostKind CostKind) { 2478 // Targets must implement a default value for the scalable case, since 2479 // we don't know how many lanes the vector has. 2480 if (isa<ScalableVectorType>(Ty)) 2481 return InstructionCost::getInvalid(); 2482 2483 Type *ScalarTy = Ty->getElementType(); 2484 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2485 unsigned NumReduxLevels = Log2_32(NumVecElts); 2486 InstructionCost MinMaxCost = 0; 2487 InstructionCost ShuffleCost = 0; 2488 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); 2489 unsigned LongVectorCount = 0; 2490 unsigned MVTLen = 2491 LT.second.isVector() ?
LT.second.getVectorNumElements() : 1; 2492 while (NumVecElts > MVTLen) { 2493 NumVecElts /= 2; 2494 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2495 2496 ShuffleCost += 2497 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 2498 CostKind, NumVecElts, SubTy); 2499 2500 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF); 2501 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind); 2502 Ty = SubTy; 2503 ++LongVectorCount; 2504 } 2505 2506 NumReduxLevels -= LongVectorCount; 2507 2508 // The minimal length of the vector is limited by the real length of vector 2509 // operations performed on the current platform. That's why several final 2510 // reduction operations are performed on the vectors with the same 2511 // architecture-dependent length. 2512 ShuffleCost += 2513 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 2514 std::nullopt, CostKind, 0, Ty); 2515 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF); 2516 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind); 2517 // The last min/max should be in vector registers, and we counted it above. 2518 // So we just need a single extractelement. 2519 return ShuffleCost + MinMaxCost + 2520 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 2521 CostKind, 0, nullptr, nullptr); 2522 } 2523 2524 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, 2525 Type *ResTy, VectorType *Ty, 2526 FastMathFlags FMF, 2527 TTI::TargetCostKind CostKind) { 2528 // Without any native support, this is equivalent to the cost of 2529 // vecreduce.opcode(ext(Ty A)). 2530 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2531 InstructionCost RedCost = 2532 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind); 2533 InstructionCost ExtCost = thisT()->getCastInstrCost( 2534 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2535 TTI::CastContextHint::None, CostKind); 2536 2537 return RedCost + ExtCost; 2538 } 2539 2540 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, 2541 VectorType *Ty, 2542 TTI::TargetCostKind CostKind) { 2543 // Without any native support, this is equivalent to the cost of 2544 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or 2545 // vecreduce.add(mul(A, B)). 2546 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2547 InstructionCost RedCost = thisT()->getArithmeticReductionCost( 2548 Instruction::Add, ExtTy, std::nullopt, CostKind); 2549 InstructionCost ExtCost = thisT()->getCastInstrCost( 2550 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2551 TTI::CastContextHint::None, CostKind); 2552 2553 InstructionCost MulCost = 2554 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2555 2556 return RedCost + MulCost + 2 * ExtCost; 2557 } 2558 2559 InstructionCost getVectorSplitCost() { return 1; } 2560 2561 /// @} 2562}; 2563 2564/// Concrete BasicTTIImpl that can be used if no further customization 2565/// is needed. 2566class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { 2567 using BaseT = BasicTTIImplBase<BasicTTIImpl>; 2568 2569 friend class BasicTTIImplBase<BasicTTIImpl>; 2570 2571 const TargetSubtargetInfo *ST; 2572 const TargetLoweringBase *TLI; 2573 2574 const TargetSubtargetInfo *getST() const { return ST; } 2575 const TargetLoweringBase *getTLI() const { return TLI; } 2576 2577public: 2578 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); 2579}; 2580 2581} // end namespace llvm 2582 2583#endif // LLVM_CODEGEN_BASICTTIIMPL_H 2584