//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86tti"
#include "X86.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/CostTable.h"
using namespace llvm;

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeX86TTIPass(PassRegistry &);
}

namespace {

/// X86-specific implementation of the TargetTransformInfo analysis group.
/// Registered as an ImmutablePass so it can be pushed onto the TTI analysis
/// stack ahead of the target-independent default implementation.
class X86TTI : public ImmutablePass, public TargetTransformInfo {
  const X86TargetMachine *TM;
  const X86Subtarget *ST;
  const X86TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  // Default constructor exists only to satisfy the pass machinery; this pass
  // must be created through createX86TargetTransformInfoPass() below.
  X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  X86TTI(const X86TargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializeX86TTIPass(*PassRegistry::getPassRegistry());
  }

  // Push/pop this implementation on the TTI analysis-group stack so queries
  // reach the X86 overrides first and fall back to the defaults otherwise.
  virtual void initializePass() {
    pushTTIStack(this);
  }

  virtual void finalizePass() {
    popTTIStack();
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  virtual void *getAdjustedAnalysisPointer(const void *ID) {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo*)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  virtual unsigned getNumberOfRegisters(bool Vector) const;
  virtual unsigned getRegisterBitWidth(bool Vector) const;
  virtual unsigned getMaximumUnrollFactor() const;
  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                          OperandValueKind,
                                          OperandValueKind) const;
  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                                  int Index, Type *SubTp) const;
  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                                    Type *Src) const;
  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                      Type *CondTy) const;
  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index) const;
  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
                                   unsigned Alignment,
                                   unsigned AddressSpace) const;

  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
                   "X86 Target Transform Info", true, true, false)
char X86TTI::ID = 0;

ImmutablePass *
llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
  return new X86TTI(TM);
}
//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

/// Report how well the subtarget supports the population-count intrinsic for
/// an integer of width \p TyWidth (must be a power of two).
X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasSSE4().
  return ST->hasSSE41() ? PSK_FastHardware : PSK_Software;
}

/// Number of architectural registers available to the register allocator.
/// Vector registers require at least SSE1; x86-64 doubles the register count.
unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit())
    return 16;
  return 8;
}

/// Width in bits of the widest register of the requested kind:
/// 256-bit YMM with AVX, 128-bit XMM with SSE1, otherwise no vector
/// registers; scalar GPRs are 64-bit on x86-64 and 32-bit elsewhere.
unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasAVX()) return 256;
    if (ST->hasSSE1()) return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;
  return 32;

}

/// Maximum interleave/unroll factor the vectorizer should consider.
unsigned X86TTI::getMaximumUnrollFactor() const {
  // Atom is in-order; unrolling gains nothing.
  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

/// Cost of a vector/scalar arithmetic instruction of the given IR opcode on
/// type \p Ty. The subtarget-specific tables below are consulted from the
/// newest feature set down (AVX2, SSE2-with-constant-splat, SSE2, AVX1,
/// custom lowerings) before falling back to the generic implementation.
/// Table costs are per legalized operation; the result is scaled by the
/// legalization split factor (LT.first).
unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                        OperandValueKind Op1Info,
                                        OperandValueKind Op2Info) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry<MVT> AVX2CostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare to
    // customize them to detect the cases where shift amount is a scalar one.
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    1 },
    { ISD::SRA,     MVT::v4i32,    1 },
    { ISD::SHL,     MVT::v8i32,    1 },
    { ISD::SRL,     MVT::v8i32,    1 },
    { ISD::SRA,     MVT::v8i32,    1 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    1 },
    { ISD::SHL,     MVT::v4i64,    1 },
    { ISD::SRL,     MVT::v4i64,    1 },

    { ISD::SHL,  MVT::v32i8,      42 }, // cmpeqb sequence.
    { ISD::SHL,  MVT::v16i16,  16*10 }, // Scalarized.

    { ISD::SRL,  MVT::v32i8,   32*10 }, // Scalarized.
    { ISD::SRL,  MVT::v16i16,   8*10 }, // Scalarized.

    { ISD::SRA,  MVT::v32i8,   32*10 }, // Scalarized.
    { ISD::SRA,  MVT::v16i16,  16*10 }, // Scalarized.
    { ISD::SRA,  MVT::v4i64,    4*10 }, // Scalarized.
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable),
                                   ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // Constant splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i8,  1 }, // psllw.
    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.

    { ISD::SRL,  MVT::v16i8,  1 }, // psrlw.
    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.

    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
  };

  // A shift by a uniform constant amount maps directly to a single immediate
  // shift instruction on SSE2, so check this table before the generic one.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable,
                                   array_lengthof(SSE2UniformConstCostTable),
                                   ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
  }


  static const CostTblEntry<MVT> SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // For some cases, where the shift amount is a scalar we would be able
    // to generate better code. Unfortunately, when this is the case the value
    // (the splat) will get hoisted out of the loop, thereby making it invisible
    // to ISel. The cost model must return worst case assumptions because it is
    // used for vectorization and we don't want to make vectorized code worse
    // than scalar code.
    { ISD::SHL,  MVT::v16i8,    30 }, // cmpeqb sequence.
    { ISD::SHL,  MVT::v8i16,  8*10 }, // Scalarized.
    { ISD::SHL,  MVT::v4i32,   2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,  2*10 }, // Scalarized.

    { ISD::SRL,  MVT::v16i8,  16*10 }, // Scalarized.
    { ISD::SRL,  MVT::v8i16,   8*10 }, // Scalarized.
    { ISD::SRL,  MVT::v4i32,   4*10 }, // Scalarized.
    { ISD::SRL,  MVT::v2i64,   2*10 }, // Scalarized.

    { ISD::SRA,  MVT::v16i8,  16*10 }, // Scalarized.
    { ISD::SRA,  MVT::v8i16,   8*10 }, // Scalarized.
    { ISD::SRA,  MVT::v4i32,   4*10 }, // Scalarized.
    { ISD::SRA,  MVT::v2i64,   2*10 }, // Scalarized.
  };

  if (ST->hasSSE2()) {
    int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable),
                                   ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT> AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,     MVT::v8i32,    4 },
    { ISD::SUB,     MVT::v8i32,    4 },
    { ISD::ADD,     MVT::v8i32,    4 },
    { ISD::SUB,     MVT::v4i64,    4 },
    { ISD::ADD,     MVT::v4i64,    4 },
    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(4) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // split factor of two in the cost table. Therefore, the cost here is 18
    // instead of 9.
    { ISD::MUL,     MVT::v4i64,    18 },
  };

  // Look for AVX1 lowering tricks.
  if (ST->hasAVX() && !ST->hasAVX2()) {
    int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
                                   ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX1CostTable[Idx].Cost;
  }

  // Custom lowering of vectors.
  static const CostTblEntry<MVT> CustomLowered[] = {
    // A v2i64/v4i64 and multiply is custom lowered as a series of long
    // multiplies(3), shifts(4) and adds(2).
    { ISD::MUL,     MVT::v2i64,    9 },
    { ISD::MUL,     MVT::v4i64,    9 },
  };
  int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
                                 ISD, LT.second);
  if (Idx != -1)
    return LT.first * CustomLowered[Idx].Cost;

  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
  // 2x pmuludq, 2x shuffle.
  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
      !ST->hasSSE41())
    return 6;

  // Fallback to the default implementation.
  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
                                                     Op2Info);
}

/// Cost of a vector shuffle of type \p Tp. Only reverse shuffles get an
/// X86-specific estimate here; every other kind is delegated to the default.
unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) const {
  // We only estimate the cost of reverse shuffles.
  if (Kind != SK_Reverse)
    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);

  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
  unsigned Cost = 1;
  // Reversing a type wider than one 128-bit lane needs cross-lane movement.
  if (LT.second.getSizeInBits() > 128)
    Cost = 3; // Extract + insert + copy.

  // Multiply by the number of parts.
  return Cost * LT.first;
}

/// Cost of a cast/conversion from \p Src to \p Dst. Pre-AVX SSE2 targets use
/// a scalarization-heavy table keyed on legalized types; AVX targets use a
/// table keyed on the exact (simple) source/destination types.
unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
  std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);

  static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = {
    // These are somewhat magic numbers justified by looking at the output of
    // Intel's IACA, running some kernels and making sure when we take
    // legalization into account the throughput will be overestimated.
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    // There are faster sequences for float conversions.
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
  };

  if (ST->hasSSE2() && !ST->hasAVX()) {
    int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl,
                                          array_lengthof(SSE2ConvTbl),
                                          ISD, LTDest.second, LTSrc.second);
    if (Idx != -1)
      return LTSrc.first * SSE2ConvTbl[Idx].Cost;
  }

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  // The function getSimpleVT only handles simple value types.
  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 1 },
    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i1,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i8,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },

    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i1,  6 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  5 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i8,  2 },
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },

    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1,  6 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1,  9 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1,  8 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8,  6 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64, 3 },
  };

  if (ST->hasAVX()) {
    int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl,
                                 array_lengthof(AVXConversionTbl),
                                 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
    if (Idx != -1)
      return AVXConversionTbl[Idx].Cost;
  }

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

/// Cost of a compare or select on value type \p ValTy. Tables are consulted
/// newest feature set first (AVX2, then AVX1, then SSE4.2); costs scale by
/// the legalization split factor.
unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                    Type *CondTy) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry<MVT> SSE42CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   1 },
    { ISD::SETCC,   MVT::v4f32,   1 },
    { ISD::SETCC,   MVT::v2i64,   1 },
    { ISD::SETCC,   MVT::v4i32,   1 },
    { ISD::SETCC,   MVT::v8i16,   1 },
    { ISD::SETCC,   MVT::v16i8,   1 },
  };

  static const CostTblEntry<MVT> AVX1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f64,   1 },
    { ISD::SETCC,   MVT::v8f32,   1 },
    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC,   MVT::v4i64,   4 },
    { ISD::SETCC,   MVT::v8i32,   4 },
    { ISD::SETCC,   MVT::v16i16,  4 },
    { ISD::SETCC,   MVT::v32i8,   4 },
  };

  static const CostTblEntry<MVT> AVX2CostTbl[] = {
    { ISD::SETCC,   MVT::v4i64,   1 },
    { ISD::SETCC,   MVT::v8i32,   1 },
    { ISD::SETCC,   MVT::v16i16,  1 },
    { ISD::SETCC,   MVT::v32i8,   1 },
  };

  if (ST->hasAVX2()) {
    int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX2CostTbl[Idx].Cost;
  }

  if (ST->hasAVX()) {
    int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX1CostTbl[Idx].Cost;
  }

  if (ST->hasSSE42()) {
    int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
    if (Idx != -1)
      return LT.first * SSE42CostTbl[Idx].Cost;
  }

  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

/// Cost of an extractelement/insertelement at \p Index on vector type
/// \p Val. Index == -1U means the index is unknown; in that case we skip the
/// lane-specific shortcuts below and defer to the default implementation.
unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                    unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // Floating point scalars are already located in index #0.
    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
      return 0;
  }

  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
}

/// Cost of a load or store of type \p Src: one unit per legalized part,
/// doubled for 256-bit accesses on pre-AVX2 hardware.
unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  // Each load/store unit costs 1.
  unsigned Cost = LT.first * 1;

  // On Sandybridge 256bit load/stores are double pumped
  // (but not on Haswell).
  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
    Cost*=2;

  return Cost;
}