1//===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9/// \file 10/// This file implements a TargetTransformInfo analysis pass specific to the 11/// PPC target machine. It uses the target's detailed information to provide 12/// more precise answers to certain TTI queries, while letting the target 13/// independent and default TTI implementations handle the rest. 14/// 15//===----------------------------------------------------------------------===// 16 17#define DEBUG_TYPE "ppctti" 18#include "PPC.h" 19#include "PPCTargetMachine.h" 20#include "llvm/Analysis/TargetTransformInfo.h" 21#include "llvm/Support/Debug.h" 22#include "llvm/Target/TargetLowering.h" 23#include "llvm/Target/CostTable.h" 24using namespace llvm; 25 26// Declare the pass initialization routine locally as target-specific passes 27// don't havve a target-wide initialization entry point, and so we rely on the 28// pass constructor initialization. 29namespace llvm { 30void initializePPCTTIPass(PassRegistry &); 31} 32 33namespace { 34 35class PPCTTI : public ImmutablePass, public TargetTransformInfo { 36 const PPCTargetMachine *TM; 37 const PPCSubtarget *ST; 38 const PPCTargetLowering *TLI; 39 40 /// Estimate the overhead of scalarizing an instruction. Insert and Extract 41 /// are set if the result needs to be inserted and/or extracted from vectors. 42 unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; 43 44public: 45 PPCTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { 46 llvm_unreachable("This pass cannot be directly constructed"); 47 } 48 49 PPCTTI(const PPCTargetMachine *TM) 50 : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), 51 TLI(TM->getTargetLowering()) { 52 initializePPCTTIPass(*PassRegistry::getPassRegistry()); 53 } 54 55 virtual void initializePass() { 56 pushTTIStack(this); 57 } 58 59 virtual void finalizePass() { 60 popTTIStack(); 61 } 62 63 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 64 TargetTransformInfo::getAnalysisUsage(AU); 65 } 66 67 /// Pass identification. 68 static char ID; 69 70 /// Provide necessary pointer adjustments for the two base classes. 71 virtual void *getAdjustedAnalysisPointer(const void *ID) { 72 if (ID == &TargetTransformInfo::ID) 73 return (TargetTransformInfo*)this; 74 return this; 75 } 76 77 /// \name Scalar TTI Implementations 78 /// @{ 79 virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; 80 virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const; 81 82 /// @} 83 84 /// \name Vector TTI Implementations 85 /// @{ 86 87 virtual unsigned getNumberOfRegisters(bool Vector) const; 88 virtual unsigned getRegisterBitWidth(bool Vector) const; 89 virtual unsigned getMaximumUnrollFactor() const; 90 virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, 91 OperandValueKind, 92 OperandValueKind) const; 93 virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, 94 int Index, Type *SubTp) const; 95 virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, 96 Type *Src) const; 97 virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 98 Type *CondTy) const; 99 virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, 100 unsigned Index) const; 101 virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, 102 unsigned Alignment, 103 unsigned AddressSpace) const; 104 105 /// @} 106}; 107 108} // end anonymous namespace 109 110INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti", 111 "PPC Target Transform Info", true, true, false) 112char PPCTTI::ID = 0; 113 114ImmutablePass * 115llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) { 116 return new PPCTTI(TM); 117} 118 119 120//===----------------------------------------------------------------------===// 121// 122// PPC cost model. 123// 124//===----------------------------------------------------------------------===// 125 126PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const { 127 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 128 if (ST->hasPOPCNTD() && TyWidth <= 64) 129 return PSK_FastHardware; 130 return PSK_Software; 131} 132 133void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const { 134 if (ST->getDarwinDirective() == PPC::DIR_A2) { 135 // The A2 is in-order with a deep pipeline, and concatenation unrolling 136 // helps expose latency-hiding opportunities to the instruction scheduler. 137 UP.Partial = UP.Runtime = true; 138 } 139} 140 141unsigned PPCTTI::getNumberOfRegisters(bool Vector) const { 142 if (Vector && !ST->hasAltivec()) 143 return 0; 144 return 32; 145} 146 147unsigned PPCTTI::getRegisterBitWidth(bool Vector) const { 148 if (Vector) { 149 if (ST->hasAltivec()) return 128; 150 return 0; 151 } 152 153 if (ST->isPPC64()) 154 return 64; 155 return 32; 156 157} 158 159unsigned PPCTTI::getMaximumUnrollFactor() const { 160 unsigned Directive = ST->getDarwinDirective(); 161 // The 440 has no SIMD support, but floating-point instructions 162 // have a 5-cycle latency, so unroll by 5x for latency hiding. 163 if (Directive == PPC::DIR_440) 164 return 5; 165 166 // The A2 has no SIMD support, but floating-point instructions 167 // have a 6-cycle latency, so unroll by 6x for latency hiding. 168 if (Directive == PPC::DIR_A2) 169 return 6; 170 171 // FIXME: For lack of any better information, do no harm... 172 if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) 173 return 1; 174 175 // For most things, modern systems have two execution units (and 176 // out-of-order execution). 177 return 2; 178} 179 180unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, 181 OperandValueKind Op1Info, 182 OperandValueKind Op2Info) const { 183 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); 184 185 // Fallback to the default implementation. 186 return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, 187 Op2Info); 188} 189 190unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, 191 Type *SubTp) const { 192 return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); 193} 194 195unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { 196 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); 197 198 return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); 199} 200 201unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 202 Type *CondTy) const { 203 return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); 204} 205 206unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val, 207 unsigned Index) const { 208 assert(Val->isVectorTy() && "This must be a vector type"); 209 210 int ISD = TLI->InstructionOpcodeToISD(Opcode); 211 assert(ISD && "Invalid opcode"); 212 213 // Estimated cost of a load-hit-store delay. This was obtained 214 // experimentally as a minimum needed to prevent unprofitable 215 // vectorization for the paq8p benchmark. It may need to be 216 // raised further if other unprofitable cases remain. 217 unsigned LHSPenalty = 12; 218 219 // Vector element insert/extract with Altivec is very expensive, 220 // because they require store and reload with the attendant 221 // processor stall for load-hit-store. Until VSX is available, 222 // these need to be estimated as very costly. 223 if (ISD == ISD::EXTRACT_VECTOR_ELT || 224 ISD == ISD::INSERT_VECTOR_ELT) 225 return LHSPenalty + 226 TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); 227 228 return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); 229} 230 231unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, 232 unsigned AddressSpace) const { 233 // Legalize the type. 234 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); 235 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 236 "Invalid Opcode"); 237 238 // Each load/store unit costs 1. 239 unsigned Cost = LT.first * 1; 240 241 // PPC in general does not support unaligned loads and stores. They'll need 242 // to be decomposed based on the alignment factor. 243 unsigned SrcBytes = LT.second.getStoreSize(); 244 if (SrcBytes && Alignment && Alignment < SrcBytes) 245 Cost *= (SrcBytes/Alignment); 246 247 return Cost; 248} 249 250