//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = UINT_MAX;
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    for (const Instruction &I : *BB) {
      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
        continue;

      const Value *Ptr = GEP->getPointerOperand();
      const AllocaInst *Alloca =
          dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
      if (Alloca) {
        // We want to do whatever we can to limit the number of alloca
        // instructions that make it through to the code generator. Allocas
        // require us to use indirect addressing, which is slow and prone to
        // compiler bugs. If this loop does an address calculation on an
        // alloca pointer, then we want to use a higher than normal loop
        // unroll threshold. This will give SROA a better chance to eliminate
        // these allocas.
        //
        // Don't use the maximum allowed value here as it will make some
        // programs way too big.
        UP.Threshold = 800;
      }
    }
  }
}

unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
  if (Vec)
    return 0;

  // Number of VGPRs on SI.
  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return 256;

  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
  return Vector ? 0 : 32;
}

unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Semi-arbitrary large amount.
  return 64;
}

unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
    // Dynamic indexing isn't free and is best avoided. An Index of ~0u
    // means the index is not known at compile time.
    return Index == ~0u ? 2 : 0;
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}
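
// Divergence here means a value may differ between lanes of a wavefront even
// when every lane executes the producing instruction with identical operands.
// Reads of the lane/workitem ID (e.g. the mbcnt and tidig intrinsics handled
// below) are the canonical example: each lane sees its own ID, so the result
// must be treated as divergent and kept in VGPRs rather than SGPRs.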
static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
                                          const IntrinsicInst *I) {
  switch (I->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::not_intrinsic:
    // This means we have an intrinsic that isn't defined in
    // IntrinsicsAMDGPU.td
    break;

  case Intrinsic::amdgcn_interp_p1:
  case Intrinsic::amdgcn_interp_p2:
  case Intrinsic::amdgcn_mbcnt_hi:
  case Intrinsic::amdgcn_mbcnt_lo:
  case Intrinsic::r600_read_tidig_x:
  case Intrinsic::r600_read_tidig_y:
  case Intrinsic::r600_read_tidig_z:
    return true;
  }

  // Fall back to the target's intrinsic name table for intrinsics that are
  // not defined in IntrinsicsAMDGPU.td.
  StringRef Name = I->getCalledFunction()->getName();
  switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
  default:
    return false;
  case AMDGPUIntrinsic::SI_tid:
  case AMDGPUIntrinsic::SI_fs_interp:
    return true;
  }
}

/// \returns True if the argument \p A is known to be passed in an SGPR,
/// i.e. its value is uniform across the wavefront.
static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();
  unsigned ShaderType = AMDGPU::getShaderType(*F);

  // Arguments to compute shaders are never a source of divergence.
  if (ShaderType == ShaderType::COMPUTE)
    return true;

  // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
  if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
      F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
    return true;

  // Everything else is in VGPRs.
  return false;
}

/// \returns True if the result of the value \p V could potentially be
/// different across workitems in a wavefront.
bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private address space are divergent, because threads
  // can execute the load instruction with the same inputs and get different
  // results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
  }

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}