//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(150), cl::Hidden);

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
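  // A condition that is not an instruction (e.g. a constant or a function
  // argument) cannot be defined by a PHI inside the loop, so there is nothing
  // to walk.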
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop *SubLoop) {
            return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth + 1))
      return true;
  }
  return false;
}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop *SubLoop) {
          return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
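      //
      // A hypothetical sketch of the shape this rewards (not from the source):
      //
      //   for (int i = 0; i < n; ++i) {
      //     v = phi(init, v.next);   // PHI defined in this loop
      //     if (v > 0) { ... }       // branch condition chains back to it
      //   }
      //
      // Unrolling makes each iteration's PHI value explicit, so the "if"
      // regions (and possibly the PHI itself) can fold away.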
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not to a variable; most likely we will be unable to combine it.
        // Do not unroll too-deeply-nested inner loops for local memory, to
        // give an outer loop a chance to be unrolled for a more important
        // reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop *SubLoop) {
              return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs.
      // If this loop does an address calculation on an alloca ptr, then we
      // want to use a higher than normal loop unroll threshold. This will
      // give SROA a better chance to eliminate these allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return 256;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element sizes of less than 32 bits?
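    // Clamp the factor so the combined access fits in one 128-bit register
    // (LoadSize is the element size in bits).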
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
      AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 128;

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  llvm_unreachable("unhandled address space");
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            unsigned Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
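  // A VF of 1 means the loop was not vectorized, so interleaving is disabled
  // entirely rather than merely reduced.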
  if (VF == 1)
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal >
        static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::OperandValueKind Opd1Info,
                                       TTI::OperandValueKind Opd2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but we do have legal
  // vector types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
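      // For example, a 64-bit AND is lowered to two v_and_b32 operations, one
      // on each 32-bit half of the register pair.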
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!HasFP32Denormals) {
        // FP mode switches.
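        // With f32 denormals disabled, the division sequence has to toggle
        // the denormal mode around its intermediate steps (mode-register
        // writes, presumably via s_setreg); model that as two extra
        // full-rate instructions.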
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

template <typename T>
int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                      ArrayRef<T *> Args,
                                      FastMathFlags FMF, unsigned VF) {
  if (ID != Intrinsic::fma)
    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);

  EVT OrigTy = TLI->getValueType(DL, RetTy);
  if (!OrigTy.isSimple()) {
    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost();

  if (ST->has16BitInsts() && SLT == MVT::f16)
    NElts = (NElts + 1) / 2;

  return LT.first * NElts * (ST->hasFastFMAF32() ? getHalfRateInstrCost()
                                                 : getQuarterRateInstrCost());
}

int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                      ArrayRef<Value*> Args, FastMathFlags FMF,
                                      unsigned VF) {
  return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF);
}

int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
                                      unsigned ScalarizationCostPassed) {
  return getIntrinsicInstrCost<Type>(ID, RetTy, Tys, FMF,
                                     ScalarizationCostPassed);
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                           bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which
  // support 16-bit types only).
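  // Packed math (e.g. v_pk_add_u16) processes two 16-bit lanes per 32-bit
  // register, so each reduction step on a 16-bit element type costs a single
  // full-rate instruction.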
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                       bool IsPairwise,
                                       bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or
    // byval. Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(),
                                                Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(),
                                                Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
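    // Conservatively assume the argument arrives in a VGPR, i.e. treat it as
    // potentially divergent.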
    return false;
  }
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
      return true;
    }
  }
  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
    IntrinsicInst *II, Value *OldV, Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
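    // The volatile operand must be a compile-time zero: a volatile DS access
    // is left untouched in its original (flat) address space.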
    if (!IsVolatile->isZero())
      return false;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return true;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    II->replaceAllUsesWith(NewVal);
    II->eraseFromParent();
    return true;
  }
  default:
    return false;
  }
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low or high
      // half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
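  // Feature bits alone are not enough: the floating-point mode defaults
  // (denormal and DX10-clamp settings, reflected in the hardware MODE
  // register) must also be compatible, since the mode is established per
  // function rather than per call site.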
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  return CallerMode.isInlineCompatible(CalleeMode);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

unsigned GCNTTIImpl::getUserCost(const User *U,
                                 ArrayRef<const Value *> Operands) {
  const Instruction *I = dyn_cast<Instruction>(U);
  if (!I)
    return BaseT::getUserCost(U, Operands);

  // Estimate different operations to be optimized out.
  switch (I->getOpcode()) {
  case Instruction::ExtractElement: {
    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
    unsigned Idx = -1;
    if (CI)
      Idx = CI->getZExtValue();
    return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx);
  }
  case Instruction::InsertElement: {
    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
    unsigned Idx = -1;
    if (CI)
      Idx = CI->getZExtValue();
    return getVectorInstrCost(I->getOpcode(), I->getType(), Idx);
  }
  case Instruction::Call: {
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
      SmallVector<Value *, 4> Args(II->arg_operands());
      FastMathFlags FMF;
      if (auto *FPMO = dyn_cast<FPMathOperator>(II))
        FMF = FPMO->getFastMathFlags();
      return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
                                   FMF);
    }
    return BaseT::getUserCost(U, Operands);
  }
  case Instruction::ShuffleVector: {
    const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
    Type *Ty = Shuffle->getType();
    Type *SrcTy = Shuffle->getOperand(0)->getType();

    // TODO: Identify and add costs for insert subvector, etc.
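    // Classify the mask from most specific to most generic; anything not
    // recognized falls through to a generic two-source permute.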
    int SubIndex;
    if (Shuffle->isExtractSubvectorMask(SubIndex))
      return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty);

    if (Shuffle->changesLength())
      return BaseT::getUserCost(U, Operands);

    if (Shuffle->isIdentity())
      return 0;

    if (Shuffle->isReverse())
      return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr);

    if (Shuffle->isSelect())
      return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr);

    if (Shuffle->isTranspose())
      return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr);

    if (Shuffle->isZeroEltSplat())
      return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr);

    if (Shuffle->isSingleSource())
      return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr);

    return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast: {
    return getCastInstrCost(I->getOpcode(), I->getType(),
                            I->getOperand(0)->getType(), I);
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::FNeg: {
    return getArithmeticInstrCost(I->getOpcode(), I->getType(),
                                  TTI::OK_AnyValue, TTI::OK_AnyValue,
                                  TTI::OP_None, TTI::OP_None, Operands, I);
  }
  default:
    break;
  }

  return BaseT::getUserCost(U, Operands);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
        AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
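  // Control flow is comparatively expensive on GPUs, so branches and returns
  // carry a high fixed cost (10), presumably to discourage transforms that
  // would introduce extra control flow.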
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}