1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About Cost Model numbers used below it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU instead of usage of
17/// concrete CPU model. Usually the numbers correspond to CPU where the feature
18/// apeared at the first time. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost.
21/// Some examples of other technologies/CPUs:
22///   SSE 3   - Pentium4 / Athlon64
23///   SSE 4.1 - Penryn
24///   SSE 4.2 - Nehalem
25///   AVX     - Sandy Bridge
26///   AVX2    - Haswell
27///   AVX-512 - Xeon Phi / Skylake
28/// And some examples of instruction target dependent costs (latency)
29///                   divss     sqrtss          rsqrtss
30///   AMD K7            11-16     19              3
31///   Piledriver        9-24      13-15           5
32///   Jaguar            14        16              2
33///   Pentium II,III    18        30              2
34///   Nehalem           7-14      7-18            3
35///   Haswell           10-13     11              5
36/// TODO: Develop and implement  the target dependent cost model and
37/// specialize cost numbers for different Cost Model Targets such as throughput,
38/// code size, latency and uop count.
39//===----------------------------------------------------------------------===//
40
41#include "X86TargetTransformInfo.h"
42#include "llvm/Analysis/TargetTransformInfo.h"
43#include "llvm/CodeGen/BasicTTIImpl.h"
44#include "llvm/CodeGen/CostTable.h"
45#include "llvm/CodeGen/TargetLowering.h"
46#include "llvm/IR/IntrinsicInst.h"
47#include "llvm/Support/Debug.h"
48
49using namespace llvm;
50
51#define DEBUG_TYPE "x86tti"
52
53//===----------------------------------------------------------------------===//
54//
55// X86 cost model.
56//
57//===----------------------------------------------------------------------===//
58
59TargetTransformInfo::PopcntSupportKind
60X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
61  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
62  // TODO: Currently the __builtin_popcount() implementation using SSE3
63  //   instructions is inefficient. Once the problem is fixed, we should
64  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
65  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
66}
67
68llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
69  TargetTransformInfo::CacheLevel Level) const {
70  switch (Level) {
71  case TargetTransformInfo::CacheLevel::L1D:
72    //   - Penryn
73    //   - Nehalem
74    //   - Westmere
75    //   - Sandy Bridge
76    //   - Ivy Bridge
77    //   - Haswell
78    //   - Broadwell
79    //   - Skylake
80    //   - Kabylake
81    return 32 * 1024;  //  32 KByte
82  case TargetTransformInfo::CacheLevel::L2D:
83    //   - Penryn
84    //   - Nehalem
85    //   - Westmere
86    //   - Sandy Bridge
87    //   - Ivy Bridge
88    //   - Haswell
89    //   - Broadwell
90    //   - Skylake
91    //   - Kabylake
92    return 256 * 1024; // 256 KByte
93  }
94
95  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
96}
97
98llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
99  TargetTransformInfo::CacheLevel Level) const {
100  //   - Penryn
101  //   - Nehalem
102  //   - Westmere
103  //   - Sandy Bridge
104  //   - Ivy Bridge
105  //   - Haswell
106  //   - Broadwell
107  //   - Skylake
108  //   - Kabylake
109  switch (Level) {
110  case TargetTransformInfo::CacheLevel::L1D:
111    LLVM_FALLTHROUGH;
112  case TargetTransformInfo::CacheLevel::L2D:
113    return 8;
114  }
115
116  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
117}
118
119unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
120  bool Vector = (ClassID == 1);
121  if (Vector && !ST->hasSSE1())
122    return 0;
123
124  if (ST->is64Bit()) {
125    if (Vector && ST->hasAVX512())
126      return 32;
127    return 16;
128  }
129  return 8;
130}
131
132unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134  if (Vector) {
135    if (ST->hasAVX512() && PreferVectorWidth >= 512)
136      return 512;
137    if (ST->hasAVX() && PreferVectorWidth >= 256)
138      return 256;
139    if (ST->hasSSE1() && PreferVectorWidth >= 128)
140      return 128;
141    return 0;
142  }
143
144  if (ST->is64Bit())
145    return 64;
146
147  return 32;
148}
149
150unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151  return getRegisterBitWidth(true);
152}
153
154unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155  // If the loop will not be vectorized, don't interleave the loop.
156  // Let regular unroll to unroll the loop, which saves the overflow
157  // check and memory check cost.
158  if (VF == 1)
159    return 1;
160
161  if (ST->isAtom())
162    return 1;
163
164  // Sandybridge and Haswell have multiple execution ports and pipelined
165  // vector units.
166  if (ST->hasAVX())
167    return 4;
168
169  return 2;
170}
171
172int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
173                                       TTI::TargetCostKind CostKind,
174                                       TTI::OperandValueKind Op1Info,
175                                       TTI::OperandValueKind Op2Info,
176                                       TTI::OperandValueProperties Opd1PropInfo,
177                                       TTI::OperandValueProperties Opd2PropInfo,
178                                       ArrayRef<const Value *> Args,
179                                       const Instruction *CxtI) {
180  // TODO: Handle more cost kinds.
181  if (CostKind != TTI::TCK_RecipThroughput)
182    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
183                                         Op2Info, Opd1PropInfo,
184                                         Opd2PropInfo, Args, CxtI);
185  // Legalize the type.
186  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
187
188  int ISD = TLI->InstructionOpcodeToISD(Opcode);
189  assert(ISD && "Invalid opcode");
190
191  static const CostTblEntry GLMCostTable[] = {
192    { ISD::FDIV,  MVT::f32,   18 }, // divss
193    { ISD::FDIV,  MVT::v4f32, 35 }, // divps
194    { ISD::FDIV,  MVT::f64,   33 }, // divsd
195    { ISD::FDIV,  MVT::v2f64, 65 }, // divpd
196  };
197
198  if (ST->useGLMDivSqrtCosts())
199    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
200                                            LT.second))
201      return LT.first * Entry->Cost;
202
203  static const CostTblEntry SLMCostTable[] = {
204    { ISD::MUL,   MVT::v4i32, 11 }, // pmulld
205    { ISD::MUL,   MVT::v8i16, 2  }, // pmullw
206    { ISD::MUL,   MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
207    { ISD::FMUL,  MVT::f64,   2  }, // mulsd
208    { ISD::FMUL,  MVT::v2f64, 4  }, // mulpd
209    { ISD::FMUL,  MVT::v4f32, 2  }, // mulps
210    { ISD::FDIV,  MVT::f32,   17 }, // divss
211    { ISD::FDIV,  MVT::v4f32, 39 }, // divps
212    { ISD::FDIV,  MVT::f64,   32 }, // divsd
213    { ISD::FDIV,  MVT::v2f64, 69 }, // divpd
214    { ISD::FADD,  MVT::v2f64, 2  }, // addpd
215    { ISD::FSUB,  MVT::v2f64, 2  }, // subpd
216    // v2i64/v4i64 mul is custom lowered as a series of long:
217    // multiplies(3), shifts(3) and adds(2)
218    // slm muldq version throughput is 2 and addq throughput 4
219    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
220    //       3X4 (addq throughput) = 17
221    { ISD::MUL,   MVT::v2i64, 17 },
222    // slm addq\subq throughput is 4
223    { ISD::ADD,   MVT::v2i64, 4  },
224    { ISD::SUB,   MVT::v2i64, 4  },
225  };
226
227  if (ST->isSLM()) {
228    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
229      // Check if the operands can be shrinked into a smaller datatype.
230      bool Op1Signed = false;
231      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
232      bool Op2Signed = false;
233      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
234
235      bool signedMode = Op1Signed | Op2Signed;
236      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
237
238      if (OpMinSize <= 7)
239        return LT.first * 3; // pmullw/sext
240      if (!signedMode && OpMinSize <= 8)
241        return LT.first * 3; // pmullw/zext
242      if (OpMinSize <= 15)
243        return LT.first * 5; // pmullw/pmulhw/pshuf
244      if (!signedMode && OpMinSize <= 16)
245        return LT.first * 5; // pmullw/pmulhw/pshuf
246    }
247
248    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
249                                            LT.second)) {
250      return LT.first * Entry->Cost;
251    }
252  }
253
254  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
255       ISD == ISD::UREM) &&
256      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
257       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
258      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
259    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
260      // On X86, vector signed division by constants power-of-two are
261      // normally expanded to the sequence SRA + SRL + ADD + SRA.
262      // The OperandValue properties may not be the same as that of the previous
263      // operation; conservatively assume OP_None.
264      int Cost =
265          2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
266                                     Op2Info,
267                                     TargetTransformInfo::OP_None,
268                                     TargetTransformInfo::OP_None);
269      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
270                                     Op2Info,
271                                     TargetTransformInfo::OP_None,
272                                     TargetTransformInfo::OP_None);
273      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
274                                     Op2Info,
275                                     TargetTransformInfo::OP_None,
276                                     TargetTransformInfo::OP_None);
277
278      if (ISD == ISD::SREM) {
279        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
280        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
281                                       Op2Info);
282        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
283                                       Op2Info);
284      }
285
286      return Cost;
287    }
288
289    // Vector unsigned division/remainder will be simplified to shifts/masks.
290    if (ISD == ISD::UDIV)
291      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
292                                    Op1Info, Op2Info,
293                                    TargetTransformInfo::OP_None,
294                                    TargetTransformInfo::OP_None);
295
296    else // UREM
297      return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
298                                    Op1Info, Op2Info,
299                                    TargetTransformInfo::OP_None,
300                                    TargetTransformInfo::OP_None);
301  }
302
303  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
304    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
305    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
306    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
307  };
308
309  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
310      ST->hasBWI()) {
311    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
312                                            LT.second))
313      return LT.first * Entry->Cost;
314  }
315
316  static const CostTblEntry AVX512UniformConstCostTable[] = {
317    { ISD::SRA,  MVT::v2i64,   1 },
318    { ISD::SRA,  MVT::v4i64,   1 },
319    { ISD::SRA,  MVT::v8i64,   1 },
320
321    { ISD::SHL,  MVT::v64i8,   4 }, // psllw + pand.
322    { ISD::SRL,  MVT::v64i8,   4 }, // psrlw + pand.
323    { ISD::SRA,  MVT::v64i8,   8 }, // psrlw, pand, pxor, psubb.
324  };
325
326  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
327      ST->hasAVX512()) {
328    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
329                                            LT.second))
330      return LT.first * Entry->Cost;
331  }
332
333  static const CostTblEntry AVX2UniformConstCostTable[] = {
334    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
335    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
336    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.
337
338    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.
339  };
340
341  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
342      ST->hasAVX2()) {
343    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
344                                            LT.second))
345      return LT.first * Entry->Cost;
346  }
347
348  static const CostTblEntry SSE2UniformConstCostTable[] = {
349    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
350    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
351    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.
352
353    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
354    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
355    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
356  };
357
358  // XOP has faster vXi8 shifts.
359  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
360      ST->hasSSE2() && !ST->hasXOP()) {
361    if (const auto *Entry =
362            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
363      return LT.first * Entry->Cost;
364  }
365
366  static const CostTblEntry AVX512BWConstCostTable[] = {
367    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
368    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
369    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
370    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
371    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
372    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
373    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
374    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
375  };
376
377  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
378       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
379      ST->hasBWI()) {
380    if (const auto *Entry =
381            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
382      return LT.first * Entry->Cost;
383  }
384
385  static const CostTblEntry AVX512ConstCostTable[] = {
386    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
387    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
388    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
389    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
390    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
391    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
392    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
393    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
394    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
395    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
396    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
397    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
398  };
399
400  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
401       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
402      ST->hasAVX512()) {
403    if (const auto *Entry =
404            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
405      return LT.first * Entry->Cost;
406  }
407
408  static const CostTblEntry AVX2ConstCostTable[] = {
409    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
410    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
411    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
412    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
413    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
414    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
415    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
416    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
417    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
418    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
419    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
420    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
421  };
422
423  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
424       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
425      ST->hasAVX2()) {
426    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
427      return LT.first * Entry->Cost;
428  }
429
430  static const CostTblEntry SSE2ConstCostTable[] = {
431    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
432    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
433    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
434    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
435    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
436    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
437    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
438    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
439    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
440    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
441    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
442    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
443    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
444    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
445    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
446    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
447    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
448    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
449    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
450    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
451    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
452    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
453    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
454    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
455  };
456
457  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
458       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
459      ST->hasSSE2()) {
460    // pmuldq sequence.
461    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
462      return LT.first * 32;
463    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
464      return LT.first * 38;
465    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
466      return LT.first * 15;
467    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
468      return LT.first * 20;
469
470    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
471      return LT.first * Entry->Cost;
472  }
473
474  static const CostTblEntry AVX512BWShiftCostTable[] = {
475    { ISD::SHL,   MVT::v8i16,      1 }, // vpsllvw
476    { ISD::SRL,   MVT::v8i16,      1 }, // vpsrlvw
477    { ISD::SRA,   MVT::v8i16,      1 }, // vpsravw
478
479    { ISD::SHL,   MVT::v16i16,     1 }, // vpsllvw
480    { ISD::SRL,   MVT::v16i16,     1 }, // vpsrlvw
481    { ISD::SRA,   MVT::v16i16,     1 }, // vpsravw
482
483    { ISD::SHL,   MVT::v32i16,     1 }, // vpsllvw
484    { ISD::SRL,   MVT::v32i16,     1 }, // vpsrlvw
485    { ISD::SRA,   MVT::v32i16,     1 }, // vpsravw
486  };
487
488  if (ST->hasBWI())
489    if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
490      return LT.first * Entry->Cost;
491
492  static const CostTblEntry AVX2UniformCostTable[] = {
493    // Uniform splats are cheaper for the following instructions.
494    { ISD::SHL,  MVT::v16i16, 1 }, // psllw.
495    { ISD::SRL,  MVT::v16i16, 1 }, // psrlw.
496    { ISD::SRA,  MVT::v16i16, 1 }, // psraw.
497    { ISD::SHL,  MVT::v32i16, 2 }, // 2*psllw.
498    { ISD::SRL,  MVT::v32i16, 2 }, // 2*psrlw.
499    { ISD::SRA,  MVT::v32i16, 2 }, // 2*psraw.
500  };
501
502  if (ST->hasAVX2() &&
503      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
504       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
505    if (const auto *Entry =
506            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
507      return LT.first * Entry->Cost;
508  }
509
510  static const CostTblEntry SSE2UniformCostTable[] = {
511    // Uniform splats are cheaper for the following instructions.
512    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
513    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
514    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.
515
516    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
517    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
518    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.
519
520    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
521    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
522  };
523
524  if (ST->hasSSE2() &&
525      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
526       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
527    if (const auto *Entry =
528            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
529      return LT.first * Entry->Cost;
530  }
531
532  static const CostTblEntry AVX512DQCostTable[] = {
533    { ISD::MUL,  MVT::v2i64, 1 },
534    { ISD::MUL,  MVT::v4i64, 1 },
535    { ISD::MUL,  MVT::v8i64, 1 }
536  };
537
538  // Look for AVX512DQ lowering tricks for custom cases.
539  if (ST->hasDQI())
540    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
541      return LT.first * Entry->Cost;
542
543  static const CostTblEntry AVX512BWCostTable[] = {
544    { ISD::SHL,   MVT::v64i8,     11 }, // vpblendvb sequence.
545    { ISD::SRL,   MVT::v64i8,     11 }, // vpblendvb sequence.
546    { ISD::SRA,   MVT::v64i8,     24 }, // vpblendvb sequence.
547
548    { ISD::MUL,   MVT::v64i8,     11 }, // extend/pmullw/trunc sequence.
549    { ISD::MUL,   MVT::v32i8,      4 }, // extend/pmullw/trunc sequence.
550    { ISD::MUL,   MVT::v16i8,      4 }, // extend/pmullw/trunc sequence.
551  };
552
553  // Look for AVX512BW lowering tricks for custom cases.
554  if (ST->hasBWI())
555    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
556      return LT.first * Entry->Cost;
557
558  static const CostTblEntry AVX512CostTable[] = {
559    { ISD::SHL,     MVT::v16i32,     1 },
560    { ISD::SRL,     MVT::v16i32,     1 },
561    { ISD::SRA,     MVT::v16i32,     1 },
562
563    { ISD::SHL,     MVT::v8i64,      1 },
564    { ISD::SRL,     MVT::v8i64,      1 },
565
566    { ISD::SRA,     MVT::v2i64,      1 },
567    { ISD::SRA,     MVT::v4i64,      1 },
568    { ISD::SRA,     MVT::v8i64,      1 },
569
570    { ISD::MUL,     MVT::v64i8,     26 }, // extend/pmullw/trunc sequence.
571    { ISD::MUL,     MVT::v32i8,     13 }, // extend/pmullw/trunc sequence.
572    { ISD::MUL,     MVT::v16i8,      5 }, // extend/pmullw/trunc sequence.
573    { ISD::MUL,     MVT::v16i32,     1 }, // pmulld (Skylake from agner.org)
574    { ISD::MUL,     MVT::v8i32,      1 }, // pmulld (Skylake from agner.org)
575    { ISD::MUL,     MVT::v4i32,      1 }, // pmulld (Skylake from agner.org)
576    { ISD::MUL,     MVT::v8i64,      8 }, // 3*pmuludq/3*shift/2*add
577
578    { ISD::FADD,    MVT::v8f64,      1 }, // Skylake from http://www.agner.org/
579    { ISD::FSUB,    MVT::v8f64,      1 }, // Skylake from http://www.agner.org/
580    { ISD::FMUL,    MVT::v8f64,      1 }, // Skylake from http://www.agner.org/
581
582    { ISD::FADD,    MVT::v16f32,     1 }, // Skylake from http://www.agner.org/
583    { ISD::FSUB,    MVT::v16f32,     1 }, // Skylake from http://www.agner.org/
584    { ISD::FMUL,    MVT::v16f32,     1 }, // Skylake from http://www.agner.org/
585  };
586
587  if (ST->hasAVX512())
588    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
589      return LT.first * Entry->Cost;
590
591  static const CostTblEntry AVX2ShiftCostTable[] = {
592    // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
593    // customize them to detect the cases where shift amount is a scalar one.
594    { ISD::SHL,     MVT::v4i32,    1 },
595    { ISD::SRL,     MVT::v4i32,    1 },
596    { ISD::SRA,     MVT::v4i32,    1 },
597    { ISD::SHL,     MVT::v8i32,    1 },
598    { ISD::SRL,     MVT::v8i32,    1 },
599    { ISD::SRA,     MVT::v8i32,    1 },
600    { ISD::SHL,     MVT::v2i64,    1 },
601    { ISD::SRL,     MVT::v2i64,    1 },
602    { ISD::SHL,     MVT::v4i64,    1 },
603    { ISD::SRL,     MVT::v4i64,    1 },
604  };
605
606  if (ST->hasAVX512()) {
607    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
608        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
609         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
610      // On AVX512, a packed v32i16 shift left by a constant build_vector
611      // is lowered into a vector multiply (vpmullw).
612      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
613                                    Op1Info, Op2Info,
614                                    TargetTransformInfo::OP_None,
615                                    TargetTransformInfo::OP_None);
616  }
617
618  // Look for AVX2 lowering tricks.
619  if (ST->hasAVX2()) {
620    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
621        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
622         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
623      // On AVX2, a packed v16i16 shift left by a constant build_vector
624      // is lowered into a vector multiply (vpmullw).
625      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
626                                    Op1Info, Op2Info,
627                                    TargetTransformInfo::OP_None,
628                                    TargetTransformInfo::OP_None);
629
630    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
631      return LT.first * Entry->Cost;
632  }
633
634  static const CostTblEntry XOPShiftCostTable[] = {
635    // 128bit shifts take 1cy, but right shifts require negation beforehand.
636    { ISD::SHL,     MVT::v16i8,    1 },
637    { ISD::SRL,     MVT::v16i8,    2 },
638    { ISD::SRA,     MVT::v16i8,    2 },
639    { ISD::SHL,     MVT::v8i16,    1 },
640    { ISD::SRL,     MVT::v8i16,    2 },
641    { ISD::SRA,     MVT::v8i16,    2 },
642    { ISD::SHL,     MVT::v4i32,    1 },
643    { ISD::SRL,     MVT::v4i32,    2 },
644    { ISD::SRA,     MVT::v4i32,    2 },
645    { ISD::SHL,     MVT::v2i64,    1 },
646    { ISD::SRL,     MVT::v2i64,    2 },
647    { ISD::SRA,     MVT::v2i64,    2 },
648    // 256bit shifts require splitting if AVX2 didn't catch them above.
649    { ISD::SHL,     MVT::v32i8,  2+2 },
650    { ISD::SRL,     MVT::v32i8,  4+2 },
651    { ISD::SRA,     MVT::v32i8,  4+2 },
652    { ISD::SHL,     MVT::v16i16, 2+2 },
653    { ISD::SRL,     MVT::v16i16, 4+2 },
654    { ISD::SRA,     MVT::v16i16, 4+2 },
655    { ISD::SHL,     MVT::v8i32,  2+2 },
656    { ISD::SRL,     MVT::v8i32,  4+2 },
657    { ISD::SRA,     MVT::v8i32,  4+2 },
658    { ISD::SHL,     MVT::v4i64,  2+2 },
659    { ISD::SRL,     MVT::v4i64,  4+2 },
660    { ISD::SRA,     MVT::v4i64,  4+2 },
661  };
662
663  // Look for XOP lowering tricks.
664  if (ST->hasXOP()) {
665    // If the right shift is constant then we'll fold the negation so
666    // it's as cheap as a left shift.
667    int ShiftISD = ISD;
668    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
669        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
670         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
671      ShiftISD = ISD::SHL;
672    if (const auto *Entry =
673            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
674      return LT.first * Entry->Cost;
675  }
676
677  static const CostTblEntry SSE2UniformShiftCostTable[] = {
678    // Uniform splats are cheaper for the following instructions.
679    { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
680    { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
681    { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.
682
683    { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
684    { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
685    { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.
686
687    { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
688    { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
689    { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
690    { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
691  };
692
693  if (ST->hasSSE2() &&
694      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
695       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
696
697    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
698    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
699      return LT.first * 4; // 2*psrad + shuffle.
700
701    if (const auto *Entry =
702            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
703      return LT.first * Entry->Cost;
704  }
705
706  if (ISD == ISD::SHL &&
707      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
708    MVT VT = LT.second;
709    // Vector shift left by non uniform constant can be lowered
710    // into vector multiply.
711    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
712        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
713      ISD = ISD::MUL;
714  }
715
716  static const CostTblEntry AVX2CostTable[] = {
717    { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
718    { ISD::SHL,  MVT::v64i8,     22 }, // 2*vpblendvb sequence.
719    { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
720    { ISD::SHL,  MVT::v32i16,    20 }, // 2*extend/vpsrlvd/pack sequence.
721
722    { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
723    { ISD::SRL,  MVT::v64i8,     22 }, // 2*vpblendvb sequence.
724    { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
725    { ISD::SRL,  MVT::v32i16,    20 }, // 2*extend/vpsrlvd/pack sequence.
726
727    { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
728    { ISD::SRA,  MVT::v64i8,     48 }, // 2*vpblendvb sequence.
729    { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
730    { ISD::SRA,  MVT::v32i16,    20 }, // 2*extend/vpsravd/pack sequence.
731    { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
732    { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.
733
734    { ISD::SUB,  MVT::v32i8,      1 }, // psubb
735    { ISD::ADD,  MVT::v32i8,      1 }, // paddb
736    { ISD::SUB,  MVT::v16i16,     1 }, // psubw
737    { ISD::ADD,  MVT::v16i16,     1 }, // paddw
738    { ISD::SUB,  MVT::v8i32,      1 }, // psubd
739    { ISD::ADD,  MVT::v8i32,      1 }, // paddd
740    { ISD::SUB,  MVT::v4i64,      1 }, // psubq
741    { ISD::ADD,  MVT::v4i64,      1 }, // paddq
742
743    { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
744    { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
745    { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
746    { ISD::MUL,  MVT::v8i32,      2 }, // pmulld (Haswell from agner.org)
747    { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add
748
749    { ISD::FADD, MVT::v4f64,      1 }, // Haswell from http://www.agner.org/
750    { ISD::FADD, MVT::v8f32,      1 }, // Haswell from http://www.agner.org/
751    { ISD::FSUB, MVT::v4f64,      1 }, // Haswell from http://www.agner.org/
752    { ISD::FSUB, MVT::v8f32,      1 }, // Haswell from http://www.agner.org/
753    { ISD::FMUL, MVT::v4f64,      1 }, // Haswell from http://www.agner.org/
754    { ISD::FMUL, MVT::v8f32,      1 }, // Haswell from http://www.agner.org/
755
756    { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
757    { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
758    { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
759    { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
760    { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
761    { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
762  };
763
764  // Look for AVX2 lowering tricks for custom cases.
765  if (ST->hasAVX2())
766    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
767      return LT.first * Entry->Cost;
768
769  static const CostTblEntry AVX1CostTable[] = {
770    // We don't have to scalarize unsupported ops. We can issue two half-sized
771    // operations and we only need to extract the upper YMM half.
772    // Two ops + 1 extract + 1 insert = 4.
773    { ISD::MUL,     MVT::v16i16,     4 },
774    { ISD::MUL,     MVT::v8i32,      4 },
775    { ISD::SUB,     MVT::v32i8,      4 },
776    { ISD::ADD,     MVT::v32i8,      4 },
777    { ISD::SUB,     MVT::v16i16,     4 },
778    { ISD::ADD,     MVT::v16i16,     4 },
779    { ISD::SUB,     MVT::v8i32,      4 },
780    { ISD::ADD,     MVT::v8i32,      4 },
781    { ISD::SUB,     MVT::v4i64,      4 },
782    { ISD::ADD,     MVT::v4i64,      4 },
783
784    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
785    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
786    // Because we believe v4i64 to be a legal type, we must also include the
787    // extract+insert in the cost table. Therefore, the cost here is 18
788    // instead of 8.
789    { ISD::MUL,     MVT::v4i64,     18 },
790
791    { ISD::MUL,     MVT::v32i8,     26 }, // extend/pmullw/trunc sequence.
792
793    { ISD::FDIV,    MVT::f32,       14 }, // SNB from http://www.agner.org/
794    { ISD::FDIV,    MVT::v4f32,     14 }, // SNB from http://www.agner.org/
795    { ISD::FDIV,    MVT::v8f32,     28 }, // SNB from http://www.agner.org/
796    { ISD::FDIV,    MVT::f64,       22 }, // SNB from http://www.agner.org/
797    { ISD::FDIV,    MVT::v2f64,     22 }, // SNB from http://www.agner.org/
798    { ISD::FDIV,    MVT::v4f64,     44 }, // SNB from http://www.agner.org/
799  };
800
801  if (ST->hasAVX())
802    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
803      return LT.first * Entry->Cost;
804
805  static const CostTblEntry SSE42CostTable[] = {
806    { ISD::FADD, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
807    { ISD::FADD, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
808    { ISD::FADD, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
809    { ISD::FADD, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/
810
811    { ISD::FSUB, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
812    { ISD::FSUB, MVT::f32 ,    1 }, // Nehalem from http://www.agner.org/
813    { ISD::FSUB, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
814    { ISD::FSUB, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/
815
816    { ISD::FMUL, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
817    { ISD::FMUL, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
818    { ISD::FMUL, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
819    { ISD::FMUL, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/
820
821    { ISD::FDIV,  MVT::f32,   14 }, // Nehalem from http://www.agner.org/
822    { ISD::FDIV,  MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
823    { ISD::FDIV,  MVT::f64,   22 }, // Nehalem from http://www.agner.org/
824    { ISD::FDIV,  MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
825  };
826
827  if (ST->hasSSE42())
828    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
829      return LT.first * Entry->Cost;
830
831  static const CostTblEntry SSE41CostTable[] = {
832    { ISD::SHL,  MVT::v16i8,      11 }, // pblendvb sequence.
833    { ISD::SHL,  MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
834    { ISD::SHL,  MVT::v8i16,      14 }, // pblendvb sequence.
835    { ISD::SHL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
836    { ISD::SHL,  MVT::v4i32,       4 }, // pslld/paddd/cvttps2dq/pmulld
837    { ISD::SHL,  MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
838
839    { ISD::SRL,  MVT::v16i8,      12 }, // pblendvb sequence.
840    { ISD::SRL,  MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
841    { ISD::SRL,  MVT::v8i16,      14 }, // pblendvb sequence.
842    { ISD::SRL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
843    { ISD::SRL,  MVT::v4i32,      11 }, // Shift each lane + blend.
844    { ISD::SRL,  MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.
845
846    { ISD::SRA,  MVT::v16i8,      24 }, // pblendvb sequence.
847    { ISD::SRA,  MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
848    { ISD::SRA,  MVT::v8i16,      14 }, // pblendvb sequence.
849    { ISD::SRA,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
850    { ISD::SRA,  MVT::v4i32,      12 }, // Shift each lane + blend.
851    { ISD::SRA,  MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.
852
853    { ISD::MUL,  MVT::v4i32,       2 }  // pmulld (Nehalem from agner.org)
854  };
855
856  if (ST->hasSSE41())
857    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
858      return LT.first * Entry->Cost;
859
860  static const CostTblEntry SSE2CostTable[] = {
861    // We don't correctly identify costs of casts because they are marked as
862    // custom.
863    { ISD::SHL,  MVT::v16i8,      26 }, // cmpgtb sequence.
864    { ISD::SHL,  MVT::v8i16,      32 }, // cmpgtb sequence.
865    { ISD::SHL,  MVT::v4i32,     2*5 }, // We optimized this using mul.
866    { ISD::SHL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
867    { ISD::SHL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.
868
869    { ISD::SRL,  MVT::v16i8,      26 }, // cmpgtb sequence.
870    { ISD::SRL,  MVT::v8i16,      32 }, // cmpgtb sequence.
871    { ISD::SRL,  MVT::v4i32,      16 }, // Shift each lane + blend.
872    { ISD::SRL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
873    { ISD::SRL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.
874
875    { ISD::SRA,  MVT::v16i8,      54 }, // unpacked cmpgtb sequence.
876    { ISD::SRA,  MVT::v8i16,      32 }, // cmpgtb sequence.
877    { ISD::SRA,  MVT::v4i32,      16 }, // Shift each lane + blend.
878    { ISD::SRA,  MVT::v2i64,      12 }, // srl/xor/sub sequence.
879    { ISD::SRA,  MVT::v4i64,  2*12+2 }, // srl/xor/sub sequence+split.
880
881    { ISD::MUL,  MVT::v16i8,      12 }, // extend/pmullw/trunc sequence.
882    { ISD::MUL,  MVT::v8i16,       1 }, // pmullw
883    { ISD::MUL,  MVT::v4i32,       6 }, // 3*pmuludq/4*shuffle
884    { ISD::MUL,  MVT::v2i64,       8 }, // 3*pmuludq/3*shift/2*add
885
886    { ISD::FDIV, MVT::f32,        23 }, // Pentium IV from http://www.agner.org/
887    { ISD::FDIV, MVT::v4f32,      39 }, // Pentium IV from http://www.agner.org/
888    { ISD::FDIV, MVT::f64,        38 }, // Pentium IV from http://www.agner.org/
889    { ISD::FDIV, MVT::v2f64,      69 }, // Pentium IV from http://www.agner.org/
890
891    { ISD::FADD, MVT::f32,         2 }, // Pentium IV from http://www.agner.org/
892    { ISD::FADD, MVT::f64,         2 }, // Pentium IV from http://www.agner.org/
893
894    { ISD::FSUB, MVT::f32,         2 }, // Pentium IV from http://www.agner.org/
895    { ISD::FSUB, MVT::f64,         2 }, // Pentium IV from http://www.agner.org/
896  };
897
898  if (ST->hasSSE2())
899    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
900      return LT.first * Entry->Cost;
901
902  static const CostTblEntry SSE1CostTable[] = {
903    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
904    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
905
906    { ISD::FADD, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
907    { ISD::FADD, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/
908
909    { ISD::FSUB, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
910    { ISD::FSUB, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/
911
912    { ISD::ADD, MVT::i8,      1 }, // Pentium III from http://www.agner.org/
913    { ISD::ADD, MVT::i16,     1 }, // Pentium III from http://www.agner.org/
914    { ISD::ADD, MVT::i32,     1 }, // Pentium III from http://www.agner.org/
915
916    { ISD::SUB, MVT::i8,      1 }, // Pentium III from http://www.agner.org/
917    { ISD::SUB, MVT::i16,     1 }, // Pentium III from http://www.agner.org/
918    { ISD::SUB, MVT::i32,     1 }, // Pentium III from http://www.agner.org/
919  };
920
921  if (ST->hasSSE1())
922    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
923      return LT.first * Entry->Cost;
924
925  // It is not a good idea to vectorize division. We have to scalarize it and
926  // in the process we will often end up having to spilling regular
927  // registers. The overhead of division is going to dominate most kernels
928  // anyways so try hard to prevent vectorization of division - it is
929  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
930  // to hide "20 cycles" for each lane.
931  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
932                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
933    int ScalarCost = getArithmeticInstrCost(
934        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
935        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
936    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
937  }
938
939  // Fallback to the default implementation.
940  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
941}
942
943int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
944                               int Index, VectorType *SubTp) {
945  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
946  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
947  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
948
949  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
950  if (Kind == TTI::SK_Transpose)
951    Kind = TTI::SK_PermuteTwoSrc;
952
953  // For Broadcasts we are splatting the first element from the first input
954  // register, so only need to reference that input and all the output
955  // registers are the same.
956  if (Kind == TTI::SK_Broadcast)
957    LT.first = 1;
958
959  // Subvector extractions are free if they start at the beginning of a
960  // vector and cheap if the subvectors are aligned.
961  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
962    int NumElts = LT.second.getVectorNumElements();
963    if ((Index % NumElts) == 0)
964      return 0;
965    std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
966    if (SubLT.second.isVector()) {
967      int NumSubElts = SubLT.second.getVectorNumElements();
968      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
969        return SubLT.first;
970      // Handle some cases for widening legalization. For now we only handle
971      // cases where the original subvector was naturally aligned and evenly
972      // fit in its legalized subvector type.
973      // FIXME: Remove some of the alignment restrictions.
974      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
975      // vectors.
976      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
977      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
978          (NumSubElts % OrigSubElts) == 0 &&
979          LT.second.getVectorElementType() ==
980              SubLT.second.getVectorElementType() &&
981          LT.second.getVectorElementType().getSizeInBits() ==
982              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
983        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
984               "Unexpected number of elements!");
985        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
986                                           LT.second.getVectorNumElements());
987        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
988                                           SubLT.second.getVectorNumElements());
989        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
990        int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
991                                         ExtractIndex, SubTy);
992
993        // If the original size is 32-bits or more, we can use pshufd. Otherwise
994        // if we have SSSE3 we can use pshufb.
995        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
996          return ExtractCost + 1; // pshufd or pshufb
997
998        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
999               "Unexpected vector size");
1000
1001        return ExtractCost + 2; // worst case pshufhw + pshufd
1002      }
1003    }
1004  }
1005
1006  // Handle some common (illegal) sub-vector types as they are often very cheap
1007  // to shuffle even on targets without PSHUFB.
1008  EVT VT = TLI->getValueType(DL, BaseTp);
1009  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1010      !ST->hasSSSE3()) {
1011     static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1012      {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
1013      {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
1014      {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
1015      {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
1016      {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck
1017
1018      {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
1019      {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
1020      {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
1021      {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck
1022
1023      {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
1024      {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
1025      {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
1026      {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
1027      {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck
1028
1029      {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1030      {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1031      {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
1032      {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
1033      {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
1034    };
1035
1036    if (ST->hasSSE2())
1037      if (const auto *Entry =
1038              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1039        return Entry->Cost;
1040  }
1041
1042  // We are going to permute multiple sources and the result will be in multiple
1043  // destinations. Providing an accurate cost only for splits where the element
1044  // type remains the same.
1045  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1046    MVT LegalVT = LT.second;
1047    if (LegalVT.isVector() &&
1048        LegalVT.getVectorElementType().getSizeInBits() ==
1049            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1050        LegalVT.getVectorNumElements() <
1051            cast<FixedVectorType>(BaseTp)->getNumElements()) {
1052
1053      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1054      unsigned LegalVTSize = LegalVT.getStoreSize();
1055      // Number of source vectors after legalization:
1056      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1057      // Number of destination vectors after legalization:
1058      unsigned NumOfDests = LT.first;
1059
1060      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1061                                              LegalVT.getVectorNumElements());
1062
1063      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1064      return NumOfShuffles *
1065             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
1066    }
1067
1068    return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
1069  }
1070
1071  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1072  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1073    // We assume that source and destination have the same vector type.
1074    int NumOfDests = LT.first;
1075    int NumOfShufflesPerDest = LT.first * 2 - 1;
1076    LT.first = NumOfDests * NumOfShufflesPerDest;
1077  }
1078
1079  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1080      {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1081      {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1082
1083      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1084      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1085
1086      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1087      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1088      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2}  // vpermt2b
1089  };
1090
1091  if (ST->hasVBMI())
1092    if (const auto *Entry =
1093            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1094      return LT.first * Entry->Cost;
1095
1096  static const CostTblEntry AVX512BWShuffleTbl[] = {
1097      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1098      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb
1099
1100      {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1101      {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1102      {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2
1103
1104      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1105      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1106      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16
1107
1108      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1109      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1110      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
1111      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1112  };
1113
1114  if (ST->hasBWI())
1115    if (const auto *Entry =
1116            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1117      return LT.first * Entry->Cost;
1118
1119  static const CostTblEntry AVX512ShuffleTbl[] = {
1120      {TTI::SK_Broadcast, MVT::v8f64, 1},  // vbroadcastpd
1121      {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1122      {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
1123      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1124      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1125      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb
1126
1127      {TTI::SK_Reverse, MVT::v8f64, 1},  // vpermpd
1128      {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1129      {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
1130      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1131
1132      {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1},  // vpermpd
1133      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
1134      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1},  // vpermpd
1135      {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1136      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
1137      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1},  // vpermps
1138      {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1},  // vpermq
1139      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
1140      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1},  // vpermq
1141      {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1142      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
1143      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1},  // vpermd
1144      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1},  // pshufb
1145
1146      {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1},  // vpermt2pd
1147      {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1148      {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1},  // vpermt2q
1149      {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1150      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1},  // vpermt2pd
1151      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1},  // vpermt2ps
1152      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1},  // vpermt2q
1153      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1},  // vpermt2d
1154      {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1},  // vpermt2pd
1155      {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1},  // vpermt2ps
1156      {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1},  // vpermt2q
1157      {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1},  // vpermt2d
1158
1159      // FIXME: This just applies the type legalization cost rules above
1160      // assuming these completely split.
1161      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
1162      {TTI::SK_PermuteSingleSrc, MVT::v64i8,  14},
1163      {TTI::SK_PermuteTwoSrc,    MVT::v32i16, 42},
1164      {TTI::SK_PermuteTwoSrc,    MVT::v64i8,  42},
1165  };
1166
1167  if (ST->hasAVX512())
1168    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1169      return LT.first * Entry->Cost;
1170
1171  static const CostTblEntry AVX2ShuffleTbl[] = {
1172      {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastpd
1173      {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastps
1174      {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
1175      {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
1176      {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1177      {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb
1178
1179      {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
1180      {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
1181      {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
1182      {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
1183      {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1184      {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb
1185
1186      {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1187      {TTI::SK_Select, MVT::v32i8, 1},  // vpblendvb
1188
1189      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
1190      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
1191      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
1192      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
1193      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1194                                                  // + vpblendvb
1195      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
1196                                                  // + vpblendvb
1197
1198      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
1199      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
1200      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
1201      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
1202      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1203                                               // + vpblendvb
1204      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
1205                                               // + vpblendvb
1206  };
1207
1208  if (ST->hasAVX2())
1209    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1210      return LT.first * Entry->Cost;
1211
1212  static const CostTblEntry XOPShuffleTbl[] = {
1213      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vpermil2pd
1214      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2},  // vperm2f128 + vpermil2ps
1215      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vpermil2pd
1216      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2},  // vperm2f128 + vpermil2ps
1217      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1218                                                  // + vinsertf128
1219      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vextractf128 + 2*vpperm
1220                                                  // + vinsertf128
1221
1222      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1223                                               // + vinsertf128
1224      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpperm
1225      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9},  // 2*vextractf128 + 6*vpperm
1226                                               // + vinsertf128
1227      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1},  // vpperm
1228  };
1229
1230  if (ST->hasXOP())
1231    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1232      return LT.first * Entry->Cost;
1233
1234  static const CostTblEntry AVX1ShuffleTbl[] = {
1235      {TTI::SK_Broadcast, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
1236      {TTI::SK_Broadcast, MVT::v8f32, 2},  // vperm2f128 + vpermilps
1237      {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
1238      {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
1239      {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1240      {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128
1241
1242      {TTI::SK_Reverse, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
1243      {TTI::SK_Reverse, MVT::v8f32, 2},  // vperm2f128 + vpermilps
1244      {TTI::SK_Reverse, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
1245      {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
1246      {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1247                                         // + vinsertf128
1248      {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
1249                                         // + vinsertf128
1250
1251      {TTI::SK_Select, MVT::v4i64, 1},  // vblendpd
1252      {TTI::SK_Select, MVT::v4f64, 1},  // vblendpd
1253      {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
1254      {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
1255      {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1256      {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor
1257
1258      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vshufpd
1259      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vshufpd
1260      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4},  // 2*vperm2f128 + 2*vshufps
1261      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
1262      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1263                                                  // + 2*por + vinsertf128
1264      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
1265                                                  // + 2*por + vinsertf128
1266
1267      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},   // 2*vperm2f128 + vshufpd
1268      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},   // 2*vperm2f128 + vshufpd
1269      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4},   // 2*vperm2f128 + 2*vshufps
1270      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
1271      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1272                                                // + 4*por + vinsertf128
1273      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
1274                                                // + 4*por + vinsertf128
1275  };
1276
1277  if (ST->hasAVX())
1278    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1279      return LT.first * Entry->Cost;
1280
1281  static const CostTblEntry SSE41ShuffleTbl[] = {
1282      {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1283      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1284      {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1285      {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1286      {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1287      {TTI::SK_Select, MVT::v16i8, 1}  // pblendvb
1288  };
1289
1290  if (ST->hasSSE41())
1291    if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1292      return LT.first * Entry->Cost;
1293
1294  static const CostTblEntry SSSE3ShuffleTbl[] = {
1295      {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1296      {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1297
1298      {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1299      {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1300
1301      {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1302      {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1303
1304      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1305      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1306
1307      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1308      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1309  };
1310
1311  if (ST->hasSSSE3())
1312    if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1313      return LT.first * Entry->Cost;
1314
1315  static const CostTblEntry SSE2ShuffleTbl[] = {
1316      {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1317      {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1318      {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1319      {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1320      {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1321
1322      {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1323      {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1324      {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1325      {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1326      {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1327                                        // + 2*pshufd + 2*unpck + packus
1328
1329      {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1330      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1331      {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1332      {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1333      {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1334
1335      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1336      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1337      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1338      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1339                                                  // + pshufd/unpck
1340    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1341                                                  // + 2*pshufd + 2*unpck + 2*packus
1342
1343    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // shufpd
1344    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // shufpd
1345    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  2 }, // 2*{unpck,movsd,pshufd}
1346    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  8 }, // blend+permute
1347    { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 13 }, // blend+permute
1348  };
1349
1350  if (ST->hasSSE2())
1351    if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1352      return LT.first * Entry->Cost;
1353
1354  static const CostTblEntry SSE1ShuffleTbl[] = {
1355    { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
1356    { TTI::SK_Reverse,          MVT::v4f32, 1 }, // shufps
1357    { TTI::SK_Select,           MVT::v4f32, 2 }, // 2*shufps
1358    { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1359    { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
1360  };
1361
1362  if (ST->hasSSE1())
1363    if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1364      return LT.first * Entry->Cost;
1365
1366  return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
1367}
1368
1369int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1370                                 TTI::TargetCostKind CostKind,
1371                                 const Instruction *I) {
1372  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1373  assert(ISD && "Invalid opcode");
1374
1375  // TODO: Allow non-throughput costs that aren't binary.
1376  auto AdjustCost = [&CostKind](int Cost) {
1377    if (CostKind != TTI::TCK_RecipThroughput)
1378      return Cost == 0 ? 0 : 1;
1379    return Cost;
1380  };
1381
1382  // FIXME: Need a better design of the cost table to handle non-simple types of
1383  // potential massive combinations (elem_num x src_type x dst_type).
1384
1385  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1386    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1387    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1388
1389    // Mask sign extend has an instruction.
1390    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,  1 },
1391    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,  1 },
1392    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,  1 },
1393    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,  1 },
1394    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,  1 },
1395    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,  1 },
1396    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, 1 },
1397    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1398    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1, 1 },
1399    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
1400    { ISD::SIGN_EXTEND, MVT::v64i8,  MVT::v64i1, 1 },
1401
1402    // Mask zero extend is a sext + shift.
1403    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,  2 },
1404    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,  2 },
1405    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,  2 },
1406    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,  2 },
1407    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,  2 },
1408    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,  2 },
1409    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, 2 },
1410    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1411    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1, 2 },
1412    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
1413    { ISD::ZERO_EXTEND, MVT::v64i8,  MVT::v64i1, 2 },
1414
1415    { ISD::TRUNCATE,    MVT::v32i8,  MVT::v32i16, 2 },
1416    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // widen to zmm
1417    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 }, // widen to zmm
1418    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 }, // widen to zmm
1419    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 }, // widen to zmm
1420    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 }, // widen to zmm
1421    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 }, // widen to zmm
1422    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 }, // widen to zmm
1423    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 }, // widen to zmm
1424    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 }, // widen to zmm
1425    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 }, // widen to zmm
1426    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i16, 2 },
1427    { ISD::TRUNCATE,    MVT::v64i1,  MVT::v64i8,  2 },
1428  };
1429
1430  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1431    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
1432    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
1433
1434    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
1435    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
1436
1437    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
1438    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
1439
1440    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
1441    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
1442  };
1443
1444  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1445  // 256-bit wide vectors.
1446
1447  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1448    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
1449    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
1450    { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
1451
1452    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
1453    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
1454    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
1455    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  3 }, // sext+vpslld+vptestmd
1456    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
1457    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
1458    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
1459    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 3 }, // sext+vpslld+vptestmd
1460    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // zmm vpslld+vptestmd
1461    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // zmm vpslld+vptestmd
1462    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // zmm vpslld+vptestmd
1463    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i32, 2 }, // vpslld+vptestmd
1464    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // zmm vpsllq+vptestmq
1465    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // zmm vpsllq+vptestmq
1466    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i64,  2 }, // vpsllq+vptestmq
1467    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 2 },
1468    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 2 },
1469    { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i64,  2 },
1470    { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  2 },
1471    { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 },
1472    { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // zmm vpmovqd
1473    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
1474
1475    { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i16,  3 }, // extend to v16i32
1476    { ISD::TRUNCATE,  MVT::v32i8,  MVT::v32i16,  8 },
1477
1478    // Sign extend is zmm vpternlogd+vptruncdb.
1479    // Zero extend is zmm broadcast load+vptruncdw.
1480    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   3 },
1481    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   4 },
1482    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   3 },
1483    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   4 },
1484    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   3 },
1485    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   4 },
1486    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  3 },
1487    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  4 },
1488
1489    // Sign extend is zmm vpternlogd+vptruncdw.
1490    // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
1491    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   3 },
1492    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
1493    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   3 },
1494    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
1495    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   3 },
1496    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
1497    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  3 },
1498    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
1499
1500    { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // zmm vpternlogd
1501    { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // zmm vpternlogd+psrld
1502    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // zmm vpternlogd
1503    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // zmm vpternlogd+psrld
1504    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // zmm vpternlogd
1505    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // zmm vpternlogd+psrld
1506    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // zmm vpternlogq
1507    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // zmm vpternlogq+psrlq
1508    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // zmm vpternlogq
1509    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // zmm vpternlogq+psrlq
1510
1511    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  1 }, // vpternlogd
1512    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 }, // vpternlogd+psrld
1513    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   1 }, // vpternlogq
1514    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   2 }, // vpternlogq+psrlq
1515
1516    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
1517    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
1518    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1519    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1520    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
1521    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
1522    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
1523    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
1524    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
1525    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
1526
1527    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1528    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1529
1530    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
1531    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
1532    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
1533    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
1534    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
1535    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
1536    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
1537    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
1538
1539    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
1540    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
1541    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
1542    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
1543    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
1544    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
1545    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
1546    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
1547    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
1548    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  5 },
1549
1550    { ISD::FP_TO_SINT,  MVT::v8i8,   MVT::v8f64,  3 },
1551    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f64,  3 },
1552    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 3 },
1553    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 3 },
1554
1555    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
1556    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  3 },
1557    { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  3 },
1558    { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
1559    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 3 },
1560    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v16f32, 3 },
1561  };
1562
1563  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
1564    // Mask sign extend has an instruction.
1565    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,  1 },
1566    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,  1 },
1567    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,  1 },
1568    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,  1 },
1569    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,  1 },
1570    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,  1 },
1571    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, 1 },
1572    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1573    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1, 1 },
1574
1575    // Mask zero extend is a sext + shift.
1576    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,  2 },
1577    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,  2 },
1578    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,  2 },
1579    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,  2 },
1580    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,  2 },
1581    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,  2 },
1582    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, 2 },
1583    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1584    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1, 2 },
1585
1586    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 },
1587    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 }, // vpsllw+vptestmb
1588    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 }, // vpsllw+vptestmw
1589    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 }, // vpsllw+vptestmb
1590    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 }, // vpsllw+vptestmw
1591    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 }, // vpsllw+vptestmb
1592    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 }, // vpsllw+vptestmw
1593    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 }, // vpsllw+vptestmb
1594    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 }, // vpsllw+vptestmw
1595    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 }, // vpsllw+vptestmb
1596  };
1597
1598  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
1599    { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
1600    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
1601    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
1602    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
1603
1604    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
1605    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
1606    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
1607    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
1608
1609    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f32,  1 },
1610    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
1611    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
1612    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
1613
1614    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f32,  1 },
1615    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
1616    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
1617    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
1618  };
1619
1620  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
1621    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
1622    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
1623    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
1624    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  8 }, // split+2*v8i8
1625    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
1626    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
1627    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
1628    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 8 }, // split+2*v8i16
1629    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // vpslld+vptestmd
1630    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // vpslld+vptestmd
1631    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // vpslld+vptestmd
1632    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // vpsllq+vptestmq
1633    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // vpsllq+vptestmq
1634    { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // vpmovqd
1635
1636    // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
1637    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
1638    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   5 },
1639    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   6 },
1640    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   5 },
1641    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   6 },
1642    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   5 },
1643    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   6 },
1644    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, 10 },
1645    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, 12 },
1646
1647    // sign extend is vpcmpeq+maskedmove+vpmovdw
1648    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
1649    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
1650    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   5 },
1651    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
1652    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   5 },
1653    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
1654    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   5 },
1655    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
1656    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
1657
1658    { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // vpternlogd
1659    { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // vpternlogd+psrld
1660    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // vpternlogd
1661    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // vpternlogd+psrld
1662    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // vpternlogd
1663    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // vpternlogd+psrld
1664    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // vpternlogq
1665    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // vpternlogq+psrlq
1666    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // vpternlogq
1667    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // vpternlogq+psrlq
1668
1669    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i8,   2 },
1670    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   2 },
1671    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   2 },
1672    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i16,  5 },
1673    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
1674    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
1675    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
1676    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
1677    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
1678    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
1679    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
1680    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
1681    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
1682    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  5 },
1683
1684    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
1685    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
1686
1687    { ISD::FP_TO_SINT,  MVT::v8i8,   MVT::v8f32,  3 },
1688    { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f32,  3 },
1689
1690    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    1 },
1691    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    1 },
1692
1693    { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  1 },
1694    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
1695    { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f64,  1 },
1696    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  1 },
1697    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
1698  };
1699
1700  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1701    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1702    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1703    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1704    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1705    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   1 },
1706    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   1 },
1707    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   1 },
1708    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   1 },
1709    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
1710    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
1711    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1712    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1713    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  1 },
1714    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  1 },
1715    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1716    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1717    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1718    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1719    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1720    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
1721
1722    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
1723    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
1724
1725    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  2 },
1726    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  2 },
1727    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
1728    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
1729
1730    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
1731    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
1732
1733    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
1734  };
1735
1736  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1737    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  6 },
1738    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
1739    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  7 },
1740    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
1741    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,  4 },
1742    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,  4 },
1743    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  4 },
1744    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  4 },
1745    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1746    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
1747    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1748    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1749    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 4 },
1750    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
1751    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1752    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1753    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1754    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1755
1756    { ISD::TRUNCATE,    MVT::v4i1,  MVT::v4i64,  4 },
1757    { ISD::TRUNCATE,    MVT::v8i1,  MVT::v8i32,  5 },
1758    { ISD::TRUNCATE,    MVT::v16i1, MVT::v16i16, 4 },
1759    { ISD::TRUNCATE,    MVT::v8i1,  MVT::v8i64,  9 },
1760    { ISD::TRUNCATE,    MVT::v16i1, MVT::v16i64, 11 },
1761
1762    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16, 4 },
1763    { ISD::TRUNCATE,    MVT::v8i8,  MVT::v8i32,  4 },
1764    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32,  5 },
1765    { ISD::TRUNCATE,    MVT::v4i8,  MVT::v4i64,  4 },
1766    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i64,  4 },
1767    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64,  2 },
1768    { ISD::TRUNCATE,    MVT::v8i8,  MVT::v8i64, 11 },
1769    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i64,  9 },
1770    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64,  3 },
1771    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i64, 11 },
1772
1773    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1,  3 },
1774    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i1,  3 },
1775    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
1776    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  3 },
1777    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i8,  3 },
1778    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
1779    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
1780    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
1781    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1782    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
1783    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },
1784    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
1785
1786    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
1787    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
1788    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i1,  6 },
1789    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  2 },
1790    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i8,  2 },
1791    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  5 },
1792    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
1793    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
1794    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1795    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 6 },
1796    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
1797    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
1798    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
1799    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 5 },
1800    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 6 },
1801    // The generic code to compute the scalar overhead is currently broken.
1802    // Workaround this limitation by estimating the scalarization overhead
1803    // here. We have roughly 10 instructions per scalar element.
1804    // Multiply that by the vector width.
1805    // FIXME: remove that when PR19268 is fixed.
1806    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
1807    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
1808
1809    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 4 },
1810    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f64, 3 },
1811    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f64, 2 },
1812    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 3 },
1813
1814    { ISD::FP_TO_UINT,  MVT::v4i8,  MVT::v4f64, 3 },
1815    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f64, 2 },
1816    { ISD::FP_TO_UINT,  MVT::v8i8,  MVT::v8f32, 4 },
1817    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 3 },
1818    // This node is expanded into scalarized operations but BasicTTI is overly
1819    // optimistic estimating its cost.  It computes 3 per element (one
1820    // vector-extract, one scalar conversion and one vector-insert).  The
1821    // problem is that the inserts form a read-modify-write chain so latency
1822    // should be factored in too.  Inflating the cost per element by 1.
1823    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 8*4 },
1824    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, 4*4 },
1825
1826    { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
1827    { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
1828  };
1829
1830  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1831    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1832    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1833    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1834    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1835    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1836    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1837
1838    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1839    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   2 },
1840    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1841    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1842    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1843    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1844    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1845    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1846    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1847    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1848    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1849    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1850    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1851    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1852    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1853    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1854    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1855    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1856
1857    // These truncates end up widening elements.
1858    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   1 }, // PMOVXZBQ
1859    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  1 }, // PMOVXZWQ
1860    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   1 }, // PMOVXZBD
1861
1862    { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i16,  1 },
1863    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  1 },
1864    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  1 },
1865    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  1 },
1866    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  1 },
1867    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  3 },
1868    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  3 },
1869    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
1870    { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i64,  1 }, // PSHUFB
1871
1872    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    4 },
1873    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
1874
1875    { ISD::FP_TO_SINT,  MVT::v2i8,   MVT::v2f32,  3 },
1876    { ISD::FP_TO_SINT,  MVT::v2i8,   MVT::v2f64,  3 },
1877
1878    { ISD::FP_TO_UINT,  MVT::v2i8,   MVT::v2f32,  3 },
1879    { ISD::FP_TO_UINT,  MVT::v2i8,   MVT::v2f64,  3 },
1880    { ISD::FP_TO_UINT,  MVT::v4i16,  MVT::v4f32,  2 },
1881  };
1882
1883  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1884    // These are somewhat magic numbers justified by looking at the output of
1885    // Intel's IACA, running some kernels and making sure when we take
1886    // legalization into account the throughput will be overestimated.
1887    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1888    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1889    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1890    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1891    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
1892    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
1893    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
1894    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1895    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1896
1897    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1898    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1899    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1900    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1901    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1902    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
1903    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
1904    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1905
1906    { ISD::FP_TO_SINT,  MVT::v2i8,   MVT::v2f32,  4 },
1907    { ISD::FP_TO_SINT,  MVT::v2i16,  MVT::v2f32,  2 },
1908    { ISD::FP_TO_SINT,  MVT::v4i8,   MVT::v4f32,  3 },
1909    { ISD::FP_TO_SINT,  MVT::v4i16,  MVT::v4f32,  2 },
1910    { ISD::FP_TO_SINT,  MVT::v2i16,  MVT::v2f64,  2 },
1911    { ISD::FP_TO_SINT,  MVT::v2i8,   MVT::v2f64,  4 },
1912
1913    { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  1 },
1914
1915    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    6 },
1916    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    6 },
1917
1918    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
1919    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    4 },
1920    { ISD::FP_TO_UINT,  MVT::v2i8,   MVT::v2f32,  4 },
1921    { ISD::FP_TO_UINT,  MVT::v2i8,   MVT::v2f64,  4 },
1922    { ISD::FP_TO_UINT,  MVT::v4i8,   MVT::v4f32,  3 },
1923    { ISD::FP_TO_UINT,  MVT::v2i16,  MVT::v2f32,  2 },
1924    { ISD::FP_TO_UINT,  MVT::v2i16,  MVT::v2f64,  2 },
1925    { ISD::FP_TO_UINT,  MVT::v4i16,  MVT::v4f32,  4 },
1926
1927    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1928    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
1929    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   2 },
1930    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   3 },
1931    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   4 },
1932    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   8 },
1933    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1934    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   2 },
1935    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1936    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1937    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
1938    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
1939    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  9 },
1940    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  12 },
1941    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1942    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  2 },
1943    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1944    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  10 },
1945    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
1946    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
1947    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
1948    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
1949    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
1950    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  5 },
1951
1952    // These truncates are really widening elements.
1953    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i32,  1 }, // PSHUFD
1954    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 }, // PUNPCKLWD+DQ
1955    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   3 }, // PUNPCKLBW+WD+PSHUFD
1956    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  1 }, // PUNPCKLWD
1957    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 }, // PUNPCKLBW+WD
1958    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   1 }, // PUNPCKLBW
1959
1960    { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i16,  2 }, // PAND+PACKUSWB
1961    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 }, // PAND+PACKUSWB
1962    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 }, // PAND+PACKUSWB
1963    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
1964    { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i32,  3 }, // PAND+2*PACKUSWB
1965    { ISD::TRUNCATE,    MVT::v2i16,  MVT::v2i32,  1 },
1966    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  3 },
1967    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  3 },
1968    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
1969    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
1970    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
1971    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 10 },
1972    { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i64,  4 }, // PAND+3*PACKUSWB
1973    { ISD::TRUNCATE,    MVT::v2i16,  MVT::v2i64,  2 }, // PSHUFD+PSHUFLW
1974    { ISD::TRUNCATE,    MVT::v2i32,  MVT::v2i64,  1 }, // PSHUFD
1975  };
1976
1977  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1978  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1979
1980  if (ST->hasSSE2() && !ST->hasAVX()) {
1981    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1982                                                   LTDest.second, LTSrc.second))
1983      return AdjustCost(LTSrc.first * Entry->Cost);
1984  }
1985
1986  EVT SrcTy = TLI->getValueType(DL, Src);
1987  EVT DstTy = TLI->getValueType(DL, Dst);
1988
1989  // The function getSimpleVT only handles simple value types.
1990  if (!SrcTy.isSimple() || !DstTy.isSimple())
1991    return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind));
1992
1993  MVT SimpleSrcTy = SrcTy.getSimpleVT();
1994  MVT SimpleDstTy = DstTy.getSimpleVT();
1995
1996  if (ST->useAVX512Regs()) {
1997    if (ST->hasBWI())
1998      if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
1999                                                     SimpleDstTy, SimpleSrcTy))
2000        return AdjustCost(Entry->Cost);
2001
2002    if (ST->hasDQI())
2003      if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
2004                                                     SimpleDstTy, SimpleSrcTy))
2005        return AdjustCost(Entry->Cost);
2006
2007    if (ST->hasAVX512())
2008      if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
2009                                                     SimpleDstTy, SimpleSrcTy))
2010        return AdjustCost(Entry->Cost);
2011  }
2012
2013  if (ST->hasBWI())
2014    if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
2015                                                   SimpleDstTy, SimpleSrcTy))
2016      return AdjustCost(Entry->Cost);
2017
2018  if (ST->hasDQI())
2019    if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
2020                                                   SimpleDstTy, SimpleSrcTy))
2021      return AdjustCost(Entry->Cost);
2022
2023  if (ST->hasAVX512())
2024    if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
2025                                                   SimpleDstTy, SimpleSrcTy))
2026      return AdjustCost(Entry->Cost);
2027
2028  if (ST->hasAVX2()) {
2029    if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
2030                                                   SimpleDstTy, SimpleSrcTy))
2031      return AdjustCost(Entry->Cost);
2032  }
2033
2034  if (ST->hasAVX()) {
2035    if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
2036                                                   SimpleDstTy, SimpleSrcTy))
2037      return AdjustCost(Entry->Cost);
2038  }
2039
2040  if (ST->hasSSE41()) {
2041    if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
2042                                                   SimpleDstTy, SimpleSrcTy))
2043      return AdjustCost(Entry->Cost);
2044  }
2045
2046  if (ST->hasSSE2()) {
2047    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
2048                                                   SimpleDstTy, SimpleSrcTy))
2049      return AdjustCost(Entry->Cost);
2050  }
2051
2052  return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
2053}
2054
2055int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
2056                                   TTI::TargetCostKind CostKind,
2057                                   const Instruction *I) {
2058  // TODO: Handle other cost kinds.
2059  if (CostKind != TTI::TCK_RecipThroughput)
2060    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
2061
2062  // Legalize the type.
2063  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2064
2065  MVT MTy = LT.second;
2066
2067  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2068  assert(ISD && "Invalid opcode");
2069
2070  unsigned ExtraCost = 0;
2071  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
2072    // Some vector comparison predicates cost extra instructions.
2073    if (MTy.isVector() &&
2074        !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2075          (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2076          ST->hasBWI())) {
2077      switch (cast<CmpInst>(I)->getPredicate()) {
2078      case CmpInst::Predicate::ICMP_NE:
2079        // xor(cmpeq(x,y),-1)
2080        ExtraCost = 1;
2081        break;
2082      case CmpInst::Predicate::ICMP_SGE:
2083      case CmpInst::Predicate::ICMP_SLE:
2084        // xor(cmpgt(x,y),-1)
2085        ExtraCost = 1;
2086        break;
2087      case CmpInst::Predicate::ICMP_ULT:
2088      case CmpInst::Predicate::ICMP_UGT:
2089        // cmpgt(xor(x,signbit),xor(y,signbit))
2090        // xor(cmpeq(pmaxu(x,y),x),-1)
2091        ExtraCost = 2;
2092        break;
2093      case CmpInst::Predicate::ICMP_ULE:
2094      case CmpInst::Predicate::ICMP_UGE:
2095        if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2096            (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2097          // cmpeq(psubus(x,y),0)
2098          // cmpeq(pminu(x,y),x)
2099          ExtraCost = 1;
2100        } else {
2101          // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2102          ExtraCost = 3;
2103        }
2104        break;
2105      default:
2106        break;
2107      }
2108    }
2109  }
2110
2111  static const CostTblEntry SLMCostTbl[] = {
2112    // slm pcmpeq/pcmpgt throughput is 2
2113    { ISD::SETCC,   MVT::v2i64,   2 },
2114  };
2115
2116  static const CostTblEntry AVX512BWCostTbl[] = {
2117    { ISD::SETCC,   MVT::v32i16,  1 },
2118    { ISD::SETCC,   MVT::v64i8,   1 },
2119
2120    { ISD::SELECT,  MVT::v32i16,  1 },
2121    { ISD::SELECT,  MVT::v64i8,   1 },
2122  };
2123
2124  static const CostTblEntry AVX512CostTbl[] = {
2125    { ISD::SETCC,   MVT::v8i64,   1 },
2126    { ISD::SETCC,   MVT::v16i32,  1 },
2127    { ISD::SETCC,   MVT::v8f64,   1 },
2128    { ISD::SETCC,   MVT::v16f32,  1 },
2129
2130    { ISD::SELECT,  MVT::v8i64,   1 },
2131    { ISD::SELECT,  MVT::v16i32,  1 },
2132    { ISD::SELECT,  MVT::v8f64,   1 },
2133    { ISD::SELECT,  MVT::v16f32,  1 },
2134
2135    { ISD::SETCC,   MVT::v32i16,  2 }, // FIXME: should probably be 4
2136    { ISD::SETCC,   MVT::v64i8,   2 }, // FIXME: should probably be 4
2137
2138    { ISD::SELECT,  MVT::v32i16,  2 }, // FIXME: should be 3
2139    { ISD::SELECT,  MVT::v64i8,   2 }, // FIXME: should be 3
2140  };
2141
2142  static const CostTblEntry AVX2CostTbl[] = {
2143    { ISD::SETCC,   MVT::v4i64,   1 },
2144    { ISD::SETCC,   MVT::v8i32,   1 },
2145    { ISD::SETCC,   MVT::v16i16,  1 },
2146    { ISD::SETCC,   MVT::v32i8,   1 },
2147
2148    { ISD::SELECT,  MVT::v4i64,   1 }, // pblendvb
2149    { ISD::SELECT,  MVT::v8i32,   1 }, // pblendvb
2150    { ISD::SELECT,  MVT::v16i16,  1 }, // pblendvb
2151    { ISD::SELECT,  MVT::v32i8,   1 }, // pblendvb
2152  };
2153
2154  static const CostTblEntry AVX1CostTbl[] = {
2155    { ISD::SETCC,   MVT::v4f64,   1 },
2156    { ISD::SETCC,   MVT::v8f32,   1 },
2157    // AVX1 does not support 8-wide integer compare.
2158    { ISD::SETCC,   MVT::v4i64,   4 },
2159    { ISD::SETCC,   MVT::v8i32,   4 },
2160    { ISD::SETCC,   MVT::v16i16,  4 },
2161    { ISD::SETCC,   MVT::v32i8,   4 },
2162
2163    { ISD::SELECT,  MVT::v4f64,   1 }, // vblendvpd
2164    { ISD::SELECT,  MVT::v8f32,   1 }, // vblendvps
2165    { ISD::SELECT,  MVT::v4i64,   1 }, // vblendvpd
2166    { ISD::SELECT,  MVT::v8i32,   1 }, // vblendvps
2167    { ISD::SELECT,  MVT::v16i16,  3 }, // vandps + vandnps + vorps
2168    { ISD::SELECT,  MVT::v32i8,   3 }, // vandps + vandnps + vorps
2169  };
2170
2171  static const CostTblEntry SSE42CostTbl[] = {
2172    { ISD::SETCC,   MVT::v2f64,   1 },
2173    { ISD::SETCC,   MVT::v4f32,   1 },
2174    { ISD::SETCC,   MVT::v2i64,   1 },
2175  };
2176
2177  static const CostTblEntry SSE41CostTbl[] = {
2178    { ISD::SELECT,  MVT::v2f64,   1 }, // blendvpd
2179    { ISD::SELECT,  MVT::v4f32,   1 }, // blendvps
2180    { ISD::SELECT,  MVT::v2i64,   1 }, // pblendvb
2181    { ISD::SELECT,  MVT::v4i32,   1 }, // pblendvb
2182    { ISD::SELECT,  MVT::v8i16,   1 }, // pblendvb
2183    { ISD::SELECT,  MVT::v16i8,   1 }, // pblendvb
2184  };
2185
2186  static const CostTblEntry SSE2CostTbl[] = {
2187    { ISD::SETCC,   MVT::v2f64,   2 },
2188    { ISD::SETCC,   MVT::f64,     1 },
2189    { ISD::SETCC,   MVT::v2i64,   8 },
2190    { ISD::SETCC,   MVT::v4i32,   1 },
2191    { ISD::SETCC,   MVT::v8i16,   1 },
2192    { ISD::SETCC,   MVT::v16i8,   1 },
2193
2194    { ISD::SELECT,  MVT::v2f64,   3 }, // andpd + andnpd + orpd
2195    { ISD::SELECT,  MVT::v2i64,   3 }, // pand + pandn + por
2196    { ISD::SELECT,  MVT::v4i32,   3 }, // pand + pandn + por
2197    { ISD::SELECT,  MVT::v8i16,   3 }, // pand + pandn + por
2198    { ISD::SELECT,  MVT::v16i8,   3 }, // pand + pandn + por
2199  };
2200
2201  static const CostTblEntry SSE1CostTbl[] = {
2202    { ISD::SETCC,   MVT::v4f32,   2 },
2203    { ISD::SETCC,   MVT::f32,     1 },
2204
2205    { ISD::SELECT,  MVT::v4f32,   3 }, // andps + andnps + orps
2206  };
2207
2208  if (ST->isSLM())
2209    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2210      return LT.first * (ExtraCost + Entry->Cost);
2211
2212  if (ST->hasBWI())
2213    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2214      return LT.first * (ExtraCost + Entry->Cost);
2215
2216  if (ST->hasAVX512())
2217    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2218      return LT.first * (ExtraCost + Entry->Cost);
2219
2220  if (ST->hasAVX2())
2221    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2222      return LT.first * (ExtraCost + Entry->Cost);
2223
2224  if (ST->hasAVX())
2225    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2226      return LT.first * (ExtraCost + Entry->Cost);
2227
2228  if (ST->hasSSE42())
2229    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2230      return LT.first * (ExtraCost + Entry->Cost);
2231
2232  if (ST->hasSSE41())
2233    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2234      return LT.first * (ExtraCost + Entry->Cost);
2235
2236  if (ST->hasSSE2())
2237    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2238      return LT.first * (ExtraCost + Entry->Cost);
2239
2240  if (ST->hasSSE1())
2241    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2242      return LT.first * (ExtraCost + Entry->Cost);
2243
2244  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
2245}
2246
2247unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
2248
2249int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
2250  const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
2251
2252  // Costs should match the codegen from:
2253  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2254  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2255  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2256  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2257  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2258  static const CostTblEntry AVX512CDCostTbl[] = {
2259    { ISD::CTLZ,       MVT::v8i64,   1 },
2260    { ISD::CTLZ,       MVT::v16i32,  1 },
2261    { ISD::CTLZ,       MVT::v32i16,  8 },
2262    { ISD::CTLZ,       MVT::v64i8,  20 },
2263    { ISD::CTLZ,       MVT::v4i64,   1 },
2264    { ISD::CTLZ,       MVT::v8i32,   1 },
2265    { ISD::CTLZ,       MVT::v16i16,  4 },
2266    { ISD::CTLZ,       MVT::v32i8,  10 },
2267    { ISD::CTLZ,       MVT::v2i64,   1 },
2268    { ISD::CTLZ,       MVT::v4i32,   1 },
2269    { ISD::CTLZ,       MVT::v8i16,   4 },
2270    { ISD::CTLZ,       MVT::v16i8,   4 },
2271  };
2272  static const CostTblEntry AVX512BWCostTbl[] = {
2273    { ISD::BITREVERSE, MVT::v8i64,   5 },
2274    { ISD::BITREVERSE, MVT::v16i32,  5 },
2275    { ISD::BITREVERSE, MVT::v32i16,  5 },
2276    { ISD::BITREVERSE, MVT::v64i8,   5 },
2277    { ISD::CTLZ,       MVT::v8i64,  23 },
2278    { ISD::CTLZ,       MVT::v16i32, 22 },
2279    { ISD::CTLZ,       MVT::v32i16, 18 },
2280    { ISD::CTLZ,       MVT::v64i8,  17 },
2281    { ISD::CTPOP,      MVT::v8i64,   7 },
2282    { ISD::CTPOP,      MVT::v16i32, 11 },
2283    { ISD::CTPOP,      MVT::v32i16,  9 },
2284    { ISD::CTPOP,      MVT::v64i8,   6 },
2285    { ISD::CTTZ,       MVT::v8i64,  10 },
2286    { ISD::CTTZ,       MVT::v16i32, 14 },
2287    { ISD::CTTZ,       MVT::v32i16, 12 },
2288    { ISD::CTTZ,       MVT::v64i8,   9 },
2289    { ISD::SADDSAT,    MVT::v32i16,  1 },
2290    { ISD::SADDSAT,    MVT::v64i8,   1 },
2291    { ISD::SSUBSAT,    MVT::v32i16,  1 },
2292    { ISD::SSUBSAT,    MVT::v64i8,   1 },
2293    { ISD::UADDSAT,    MVT::v32i16,  1 },
2294    { ISD::UADDSAT,    MVT::v64i8,   1 },
2295    { ISD::USUBSAT,    MVT::v32i16,  1 },
2296    { ISD::USUBSAT,    MVT::v64i8,   1 },
2297  };
2298  static const CostTblEntry AVX512CostTbl[] = {
2299    { ISD::BITREVERSE, MVT::v8i64,  36 },
2300    { ISD::BITREVERSE, MVT::v16i32, 24 },
2301    { ISD::BITREVERSE, MVT::v32i16, 10 },
2302    { ISD::BITREVERSE, MVT::v64i8,  10 },
2303    { ISD::CTLZ,       MVT::v8i64,  29 },
2304    { ISD::CTLZ,       MVT::v16i32, 35 },
2305    { ISD::CTLZ,       MVT::v32i16, 28 },
2306    { ISD::CTLZ,       MVT::v64i8,  18 },
2307    { ISD::CTPOP,      MVT::v8i64,  16 },
2308    { ISD::CTPOP,      MVT::v16i32, 24 },
2309    { ISD::CTPOP,      MVT::v32i16, 18 },
2310    { ISD::CTPOP,      MVT::v64i8,  12 },
2311    { ISD::CTTZ,       MVT::v8i64,  20 },
2312    { ISD::CTTZ,       MVT::v16i32, 28 },
2313    { ISD::CTTZ,       MVT::v32i16, 24 },
2314    { ISD::CTTZ,       MVT::v64i8,  18 },
2315    { ISD::USUBSAT,    MVT::v16i32,  2 }, // pmaxud + psubd
2316    { ISD::USUBSAT,    MVT::v2i64,   2 }, // pmaxuq + psubq
2317    { ISD::USUBSAT,    MVT::v4i64,   2 }, // pmaxuq + psubq
2318    { ISD::USUBSAT,    MVT::v8i64,   2 }, // pmaxuq + psubq
2319    { ISD::UADDSAT,    MVT::v16i32,  3 }, // not + pminud + paddd
2320    { ISD::UADDSAT,    MVT::v2i64,   3 }, // not + pminuq + paddq
2321    { ISD::UADDSAT,    MVT::v4i64,   3 }, // not + pminuq + paddq
2322    { ISD::UADDSAT,    MVT::v8i64,   3 }, // not + pminuq + paddq
2323    { ISD::SADDSAT,    MVT::v32i16,  2 }, // FIXME: include split
2324    { ISD::SADDSAT,    MVT::v64i8,   2 }, // FIXME: include split
2325    { ISD::SSUBSAT,    MVT::v32i16,  2 }, // FIXME: include split
2326    { ISD::SSUBSAT,    MVT::v64i8,   2 }, // FIXME: include split
2327    { ISD::UADDSAT,    MVT::v32i16,  2 }, // FIXME: include split
2328    { ISD::UADDSAT,    MVT::v64i8,   2 }, // FIXME: include split
2329    { ISD::USUBSAT,    MVT::v32i16,  2 }, // FIXME: include split
2330    { ISD::USUBSAT,    MVT::v64i8,   2 }, // FIXME: include split
2331    { ISD::FMAXNUM,    MVT::f32,     2 },
2332    { ISD::FMAXNUM,    MVT::v4f32,   2 },
2333    { ISD::FMAXNUM,    MVT::v8f32,   2 },
2334    { ISD::FMAXNUM,    MVT::v16f32,  2 },
2335    { ISD::FMAXNUM,    MVT::f64,     2 },
2336    { ISD::FMAXNUM,    MVT::v2f64,   2 },
2337    { ISD::FMAXNUM,    MVT::v4f64,   2 },
2338    { ISD::FMAXNUM,    MVT::v8f64,   2 },
2339  };
2340  static const CostTblEntry XOPCostTbl[] = {
2341    { ISD::BITREVERSE, MVT::v4i64,   4 },
2342    { ISD::BITREVERSE, MVT::v8i32,   4 },
2343    { ISD::BITREVERSE, MVT::v16i16,  4 },
2344    { ISD::BITREVERSE, MVT::v32i8,   4 },
2345    { ISD::BITREVERSE, MVT::v2i64,   1 },
2346    { ISD::BITREVERSE, MVT::v4i32,   1 },
2347    { ISD::BITREVERSE, MVT::v8i16,   1 },
2348    { ISD::BITREVERSE, MVT::v16i8,   1 },
2349    { ISD::BITREVERSE, MVT::i64,     3 },
2350    { ISD::BITREVERSE, MVT::i32,     3 },
2351    { ISD::BITREVERSE, MVT::i16,     3 },
2352    { ISD::BITREVERSE, MVT::i8,      3 }
2353  };
2354  static const CostTblEntry AVX2CostTbl[] = {
2355    { ISD::BITREVERSE, MVT::v4i64,   5 },
2356    { ISD::BITREVERSE, MVT::v8i32,   5 },
2357    { ISD::BITREVERSE, MVT::v16i16,  5 },
2358    { ISD::BITREVERSE, MVT::v32i8,   5 },
2359    { ISD::BSWAP,      MVT::v4i64,   1 },
2360    { ISD::BSWAP,      MVT::v8i32,   1 },
2361    { ISD::BSWAP,      MVT::v16i16,  1 },
2362    { ISD::CTLZ,       MVT::v4i64,  23 },
2363    { ISD::CTLZ,       MVT::v8i32,  18 },
2364    { ISD::CTLZ,       MVT::v16i16, 14 },
2365    { ISD::CTLZ,       MVT::v32i8,   9 },
2366    { ISD::CTPOP,      MVT::v4i64,   7 },
2367    { ISD::CTPOP,      MVT::v8i32,  11 },
2368    { ISD::CTPOP,      MVT::v16i16,  9 },
2369    { ISD::CTPOP,      MVT::v32i8,   6 },
2370    { ISD::CTTZ,       MVT::v4i64,  10 },
2371    { ISD::CTTZ,       MVT::v8i32,  14 },
2372    { ISD::CTTZ,       MVT::v16i16, 12 },
2373    { ISD::CTTZ,       MVT::v32i8,   9 },
2374    { ISD::SADDSAT,    MVT::v16i16,  1 },
2375    { ISD::SADDSAT,    MVT::v32i8,   1 },
2376    { ISD::SSUBSAT,    MVT::v16i16,  1 },
2377    { ISD::SSUBSAT,    MVT::v32i8,   1 },
2378    { ISD::UADDSAT,    MVT::v16i16,  1 },
2379    { ISD::UADDSAT,    MVT::v32i8,   1 },
2380    { ISD::UADDSAT,    MVT::v8i32,   3 }, // not + pminud + paddd
2381    { ISD::USUBSAT,    MVT::v16i16,  1 },
2382    { ISD::USUBSAT,    MVT::v32i8,   1 },
2383    { ISD::USUBSAT,    MVT::v8i32,   2 }, // pmaxud + psubd
2384    { ISD::FSQRT,      MVT::f32,     7 }, // Haswell from http://www.agner.org/
2385    { ISD::FSQRT,      MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
2386    { ISD::FSQRT,      MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
2387    { ISD::FSQRT,      MVT::f64,    14 }, // Haswell from http://www.agner.org/
2388    { ISD::FSQRT,      MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
2389    { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
2390  };
2391  static const CostTblEntry AVX1CostTbl[] = {
2392    { ISD::BITREVERSE, MVT::v4i64,  12 }, // 2 x 128-bit Op + extract/insert
2393    { ISD::BITREVERSE, MVT::v8i32,  12 }, // 2 x 128-bit Op + extract/insert
2394    { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
2395    { ISD::BITREVERSE, MVT::v32i8,  12 }, // 2 x 128-bit Op + extract/insert
2396    { ISD::BSWAP,      MVT::v4i64,   4 },
2397    { ISD::BSWAP,      MVT::v8i32,   4 },
2398    { ISD::BSWAP,      MVT::v16i16,  4 },
2399    { ISD::CTLZ,       MVT::v4i64,  48 }, // 2 x 128-bit Op + extract/insert
2400    { ISD::CTLZ,       MVT::v8i32,  38 }, // 2 x 128-bit Op + extract/insert
2401    { ISD::CTLZ,       MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
2402    { ISD::CTLZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
2403    { ISD::CTPOP,      MVT::v4i64,  16 }, // 2 x 128-bit Op + extract/insert
2404    { ISD::CTPOP,      MVT::v8i32,  24 }, // 2 x 128-bit Op + extract/insert
2405    { ISD::CTPOP,      MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
2406    { ISD::CTPOP,      MVT::v32i8,  14 }, // 2 x 128-bit Op + extract/insert
2407    { ISD::CTTZ,       MVT::v4i64,  22 }, // 2 x 128-bit Op + extract/insert
2408    { ISD::CTTZ,       MVT::v8i32,  30 }, // 2 x 128-bit Op + extract/insert
2409    { ISD::CTTZ,       MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
2410    { ISD::CTTZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
2411    { ISD::SADDSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
2412    { ISD::SADDSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
2413    { ISD::SSUBSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
2414    { ISD::SSUBSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
2415    { ISD::UADDSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
2416    { ISD::UADDSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
2417    { ISD::UADDSAT,    MVT::v8i32,   8 }, // 2 x 128-bit Op + extract/insert
2418    { ISD::USUBSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
2419    { ISD::USUBSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
2420    { ISD::USUBSAT,    MVT::v8i32,   6 }, // 2 x 128-bit Op + extract/insert
2421    { ISD::FMAXNUM,    MVT::f32,     3 },
2422    { ISD::FMAXNUM,    MVT::v4f32,   3 },
2423    { ISD::FMAXNUM,    MVT::v8f32,   5 },
2424    { ISD::FMAXNUM,    MVT::f64,     3 },
2425    { ISD::FMAXNUM,    MVT::v2f64,   3 },
2426    { ISD::FMAXNUM,    MVT::v4f64,   5 },
2427    { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
2428    { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
2429    { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
2430    { ISD::FSQRT,      MVT::f64,    21 }, // SNB from http://www.agner.org/
2431    { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
2432    { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
2433  };
2434  static const CostTblEntry GLMCostTbl[] = {
2435    { ISD::FSQRT, MVT::f32,   19 }, // sqrtss
2436    { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
2437    { ISD::FSQRT, MVT::f64,   34 }, // sqrtsd
2438    { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
2439  };
2440  static const CostTblEntry SLMCostTbl[] = {
2441    { ISD::FSQRT, MVT::f32,   20 }, // sqrtss
2442    { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
2443    { ISD::FSQRT, MVT::f64,   35 }, // sqrtsd
2444    { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
2445  };
2446  static const CostTblEntry SSE42CostTbl[] = {
2447    { ISD::USUBSAT,    MVT::v4i32,   2 }, // pmaxud + psubd
2448    { ISD::UADDSAT,    MVT::v4i32,   3 }, // not + pminud + paddd
2449    { ISD::FSQRT,      MVT::f32,    18 }, // Nehalem from http://www.agner.org/
2450    { ISD::FSQRT,      MVT::v4f32,  18 }, // Nehalem from http://www.agner.org/
2451  };
2452  static const CostTblEntry SSSE3CostTbl[] = {
2453    { ISD::BITREVERSE, MVT::v2i64,   5 },
2454    { ISD::BITREVERSE, MVT::v4i32,   5 },
2455    { ISD::BITREVERSE, MVT::v8i16,   5 },
2456    { ISD::BITREVERSE, MVT::v16i8,   5 },
2457    { ISD::BSWAP,      MVT::v2i64,   1 },
2458    { ISD::BSWAP,      MVT::v4i32,   1 },
2459    { ISD::BSWAP,      MVT::v8i16,   1 },
2460    { ISD::CTLZ,       MVT::v2i64,  23 },
2461    { ISD::CTLZ,       MVT::v4i32,  18 },
2462    { ISD::CTLZ,       MVT::v8i16,  14 },
2463    { ISD::CTLZ,       MVT::v16i8,   9 },
2464    { ISD::CTPOP,      MVT::v2i64,   7 },
2465    { ISD::CTPOP,      MVT::v4i32,  11 },
2466    { ISD::CTPOP,      MVT::v8i16,   9 },
2467    { ISD::CTPOP,      MVT::v16i8,   6 },
2468    { ISD::CTTZ,       MVT::v2i64,  10 },
2469    { ISD::CTTZ,       MVT::v4i32,  14 },
2470    { ISD::CTTZ,       MVT::v8i16,  12 },
2471    { ISD::CTTZ,       MVT::v16i8,   9 }
2472  };
2473  static const CostTblEntry SSE2CostTbl[] = {
2474    { ISD::BITREVERSE, MVT::v2i64,  29 },
2475    { ISD::BITREVERSE, MVT::v4i32,  27 },
2476    { ISD::BITREVERSE, MVT::v8i16,  27 },
2477    { ISD::BITREVERSE, MVT::v16i8,  20 },
2478    { ISD::BSWAP,      MVT::v2i64,   7 },
2479    { ISD::BSWAP,      MVT::v4i32,   7 },
2480    { ISD::BSWAP,      MVT::v8i16,   7 },
2481    { ISD::CTLZ,       MVT::v2i64,  25 },
2482    { ISD::CTLZ,       MVT::v4i32,  26 },
2483    { ISD::CTLZ,       MVT::v8i16,  20 },
2484    { ISD::CTLZ,       MVT::v16i8,  17 },
2485    { ISD::CTPOP,      MVT::v2i64,  12 },
2486    { ISD::CTPOP,      MVT::v4i32,  15 },
2487    { ISD::CTPOP,      MVT::v8i16,  13 },
2488    { ISD::CTPOP,      MVT::v16i8,  10 },
2489    { ISD::CTTZ,       MVT::v2i64,  14 },
2490    { ISD::CTTZ,       MVT::v4i32,  18 },
2491    { ISD::CTTZ,       MVT::v8i16,  16 },
2492    { ISD::CTTZ,       MVT::v16i8,  13 },
2493    { ISD::SADDSAT,    MVT::v8i16,   1 },
2494    { ISD::SADDSAT,    MVT::v16i8,   1 },
2495    { ISD::SSUBSAT,    MVT::v8i16,   1 },
2496    { ISD::SSUBSAT,    MVT::v16i8,   1 },
2497    { ISD::UADDSAT,    MVT::v8i16,   1 },
2498    { ISD::UADDSAT,    MVT::v16i8,   1 },
2499    { ISD::USUBSAT,    MVT::v8i16,   1 },
2500    { ISD::USUBSAT,    MVT::v16i8,   1 },
2501    { ISD::FMAXNUM,    MVT::f64,     4 },
2502    { ISD::FMAXNUM,    MVT::v2f64,   4 },
2503    { ISD::FSQRT,      MVT::f64,    32 }, // Nehalem from http://www.agner.org/
2504    { ISD::FSQRT,      MVT::v2f64,  32 }, // Nehalem from http://www.agner.org/
2505  };
2506  static const CostTblEntry SSE1CostTbl[] = {
2507    { ISD::FMAXNUM,    MVT::f32,     4 },
2508    { ISD::FMAXNUM,    MVT::v4f32,   4 },
2509    { ISD::FSQRT,      MVT::f32,    28 }, // Pentium III from http://www.agner.org/
2510    { ISD::FSQRT,      MVT::v4f32,  56 }, // Pentium III from http://www.agner.org/
2511  };
2512  static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
2513    { ISD::CTTZ,       MVT::i64,     1 },
2514  };
2515  static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
2516    { ISD::CTTZ,       MVT::i32,     1 },
2517    { ISD::CTTZ,       MVT::i16,     1 },
2518    { ISD::CTTZ,       MVT::i8,      1 },
2519  };
2520  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
2521    { ISD::CTLZ,       MVT::i64,     1 },
2522  };
2523  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
2524    { ISD::CTLZ,       MVT::i32,     1 },
2525    { ISD::CTLZ,       MVT::i16,     1 },
2526    { ISD::CTLZ,       MVT::i8,      1 },
2527  };
2528  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
2529    { ISD::CTPOP,      MVT::i64,     1 },
2530  };
2531  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
2532    { ISD::CTPOP,      MVT::i32,     1 },
2533    { ISD::CTPOP,      MVT::i16,     1 },
2534    { ISD::CTPOP,      MVT::i8,      1 },
2535  };
2536  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2537    { ISD::BITREVERSE, MVT::i64,    14 },
2538    { ISD::CTLZ,       MVT::i64,     4 }, // BSR+XOR or BSR+XOR+CMOV
2539    { ISD::CTTZ,       MVT::i64,     3 }, // TEST+BSF+CMOV/BRANCH
2540    { ISD::CTPOP,      MVT::i64,    10 },
2541    { ISD::SADDO,      MVT::i64,     1 },
2542    { ISD::UADDO,      MVT::i64,     1 },
2543  };
2544  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2545    { ISD::BITREVERSE, MVT::i32,    14 },
2546    { ISD::BITREVERSE, MVT::i16,    14 },
2547    { ISD::BITREVERSE, MVT::i8,     11 },
2548    { ISD::CTLZ,       MVT::i32,     4 }, // BSR+XOR or BSR+XOR+CMOV
2549    { ISD::CTLZ,       MVT::i16,     4 }, // BSR+XOR or BSR+XOR+CMOV
2550    { ISD::CTLZ,       MVT::i8,      4 }, // BSR+XOR or BSR+XOR+CMOV
2551    { ISD::CTTZ,       MVT::i32,     3 }, // TEST+BSF+CMOV/BRANCH
2552    { ISD::CTTZ,       MVT::i16,     3 }, // TEST+BSF+CMOV/BRANCH
2553    { ISD::CTTZ,       MVT::i8,      3 }, // TEST+BSF+CMOV/BRANCH
2554    { ISD::CTPOP,      MVT::i32,     8 },
2555    { ISD::CTPOP,      MVT::i16,     9 },
2556    { ISD::CTPOP,      MVT::i8,      7 },
2557    { ISD::SADDO,      MVT::i32,     1 },
2558    { ISD::SADDO,      MVT::i16,     1 },
2559    { ISD::SADDO,      MVT::i8,      1 },
2560    { ISD::UADDO,      MVT::i32,     1 },
2561    { ISD::UADDO,      MVT::i16,     1 },
2562    { ISD::UADDO,      MVT::i8,      1 },
2563  };
2564
2565  Type *RetTy = ICA.getReturnType();
2566  Type *OpTy = RetTy;
2567  Intrinsic::ID IID = ICA.getID();
2568  unsigned ISD = ISD::DELETED_NODE;
2569  switch (IID) {
2570  default:
2571    break;
2572  case Intrinsic::bitreverse:
2573    ISD = ISD::BITREVERSE;
2574    break;
2575  case Intrinsic::bswap:
2576    ISD = ISD::BSWAP;
2577    break;
2578  case Intrinsic::ctlz:
2579    ISD = ISD::CTLZ;
2580    break;
2581  case Intrinsic::ctpop:
2582    ISD = ISD::CTPOP;
2583    break;
2584  case Intrinsic::cttz:
2585    ISD = ISD::CTTZ;
2586    break;
2587  case Intrinsic::maxnum:
2588  case Intrinsic::minnum:
2589    // FMINNUM has same costs so don't duplicate.
2590    ISD = ISD::FMAXNUM;
2591    break;
2592  case Intrinsic::sadd_sat:
2593    ISD = ISD::SADDSAT;
2594    break;
2595  case Intrinsic::ssub_sat:
2596    ISD = ISD::SSUBSAT;
2597    break;
2598  case Intrinsic::uadd_sat:
2599    ISD = ISD::UADDSAT;
2600    break;
2601  case Intrinsic::usub_sat:
2602    ISD = ISD::USUBSAT;
2603    break;
2604  case Intrinsic::sqrt:
2605    ISD = ISD::FSQRT;
2606    break;
2607  case Intrinsic::sadd_with_overflow:
2608  case Intrinsic::ssub_with_overflow:
2609    // SSUBO has same costs so don't duplicate.
2610    ISD = ISD::SADDO;
2611    OpTy = RetTy->getContainedType(0);
2612    break;
2613  case Intrinsic::uadd_with_overflow:
2614  case Intrinsic::usub_with_overflow:
2615    // USUBO has same costs so don't duplicate.
2616    ISD = ISD::UADDO;
2617    OpTy = RetTy->getContainedType(0);
2618    break;
2619  }
2620
2621  if (ISD != ISD::DELETED_NODE) {
2622    // Legalize the type.
2623    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2624    MVT MTy = LT.second;
2625
2626    // Attempt to lookup cost.
2627    if (ST->useGLMDivSqrtCosts())
2628      if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2629        return LT.first * Entry->Cost;
2630
2631    if (ST->isSLM())
2632      if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2633        return LT.first * Entry->Cost;
2634
2635    if (ST->hasCDI())
2636      if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2637        return LT.first * Entry->Cost;
2638
2639    if (ST->hasBWI())
2640      if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2641        return LT.first * Entry->Cost;
2642
2643    if (ST->hasAVX512())
2644      if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2645        return LT.first * Entry->Cost;
2646
2647    if (ST->hasXOP())
2648      if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2649        return LT.first * Entry->Cost;
2650
2651    if (ST->hasAVX2())
2652      if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2653        return LT.first * Entry->Cost;
2654
2655    if (ST->hasAVX())
2656      if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2657        return LT.first * Entry->Cost;
2658
2659    if (ST->hasSSE42())
2660      if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2661        return LT.first * Entry->Cost;
2662
2663    if (ST->hasSSSE3())
2664      if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2665        return LT.first * Entry->Cost;
2666
2667    if (ST->hasSSE2())
2668      if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2669        return LT.first * Entry->Cost;
2670
2671    if (ST->hasSSE1())
2672      if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2673        return LT.first * Entry->Cost;
2674
2675    if (ST->hasBMI()) {
2676      if (ST->is64Bit())
2677        if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
2678          return LT.first * Entry->Cost;
2679
2680      if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
2681        return LT.first * Entry->Cost;
2682    }
2683
2684    if (ST->hasLZCNT()) {
2685      if (ST->is64Bit())
2686        if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
2687          return LT.first * Entry->Cost;
2688
2689      if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
2690        return LT.first * Entry->Cost;
2691    }
2692
2693    if (ST->hasPOPCNT()) {
2694      if (ST->is64Bit())
2695        if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
2696          return LT.first * Entry->Cost;
2697
2698      if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
2699        return LT.first * Entry->Cost;
2700    }
2701
2702    // TODO - add BMI (TZCNT) scalar handling
2703
2704    if (ST->is64Bit())
2705      if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2706        return LT.first * Entry->Cost;
2707
2708    if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2709      return LT.first * Entry->Cost;
2710  }
2711
2712  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2713}
2714
2715int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2716                                      TTI::TargetCostKind CostKind) {
2717  if (CostKind != TTI::TCK_RecipThroughput)
2718    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2719
2720  if (ICA.isTypeBasedOnly())
2721    return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2722
2723  static const CostTblEntry AVX512CostTbl[] = {
2724    { ISD::ROTL,       MVT::v8i64,   1 },
2725    { ISD::ROTL,       MVT::v4i64,   1 },
2726    { ISD::ROTL,       MVT::v2i64,   1 },
2727    { ISD::ROTL,       MVT::v16i32,  1 },
2728    { ISD::ROTL,       MVT::v8i32,   1 },
2729    { ISD::ROTL,       MVT::v4i32,   1 },
2730    { ISD::ROTR,       MVT::v8i64,   1 },
2731    { ISD::ROTR,       MVT::v4i64,   1 },
2732    { ISD::ROTR,       MVT::v2i64,   1 },
2733    { ISD::ROTR,       MVT::v16i32,  1 },
2734    { ISD::ROTR,       MVT::v8i32,   1 },
2735    { ISD::ROTR,       MVT::v4i32,   1 }
2736  };
2737  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2738  static const CostTblEntry XOPCostTbl[] = {
2739    { ISD::ROTL,       MVT::v4i64,   4 },
2740    { ISD::ROTL,       MVT::v8i32,   4 },
2741    { ISD::ROTL,       MVT::v16i16,  4 },
2742    { ISD::ROTL,       MVT::v32i8,   4 },
2743    { ISD::ROTL,       MVT::v2i64,   1 },
2744    { ISD::ROTL,       MVT::v4i32,   1 },
2745    { ISD::ROTL,       MVT::v8i16,   1 },
2746    { ISD::ROTL,       MVT::v16i8,   1 },
2747    { ISD::ROTR,       MVT::v4i64,   6 },
2748    { ISD::ROTR,       MVT::v8i32,   6 },
2749    { ISD::ROTR,       MVT::v16i16,  6 },
2750    { ISD::ROTR,       MVT::v32i8,   6 },
2751    { ISD::ROTR,       MVT::v2i64,   2 },
2752    { ISD::ROTR,       MVT::v4i32,   2 },
2753    { ISD::ROTR,       MVT::v8i16,   2 },
2754    { ISD::ROTR,       MVT::v16i8,   2 }
2755  };
2756  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2757    { ISD::ROTL,       MVT::i64,     1 },
2758    { ISD::ROTR,       MVT::i64,     1 },
2759    { ISD::FSHL,       MVT::i64,     4 }
2760  };
2761  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2762    { ISD::ROTL,       MVT::i32,     1 },
2763    { ISD::ROTL,       MVT::i16,     1 },
2764    { ISD::ROTL,       MVT::i8,      1 },
2765    { ISD::ROTR,       MVT::i32,     1 },
2766    { ISD::ROTR,       MVT::i16,     1 },
2767    { ISD::ROTR,       MVT::i8,      1 },
2768    { ISD::FSHL,       MVT::i32,     4 },
2769    { ISD::FSHL,       MVT::i16,     4 },
2770    { ISD::FSHL,       MVT::i8,      4 }
2771  };
2772
2773  Intrinsic::ID IID = ICA.getID();
2774  Type *RetTy = ICA.getReturnType();
2775  const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
2776  unsigned ISD = ISD::DELETED_NODE;
2777  switch (IID) {
2778  default:
2779    break;
2780  case Intrinsic::fshl:
2781    ISD = ISD::FSHL;
2782    if (Args[0] == Args[1])
2783      ISD = ISD::ROTL;
2784    break;
2785  case Intrinsic::fshr:
2786    // FSHR has same costs so don't duplicate.
2787    ISD = ISD::FSHL;
2788    if (Args[0] == Args[1])
2789      ISD = ISD::ROTR;
2790    break;
2791  }
2792
2793  if (ISD != ISD::DELETED_NODE) {
2794    // Legalize the type.
2795    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2796    MVT MTy = LT.second;
2797
2798    // Attempt to lookup cost.
2799    if (ST->hasAVX512())
2800      if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2801        return LT.first * Entry->Cost;
2802
2803    if (ST->hasXOP())
2804      if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2805        return LT.first * Entry->Cost;
2806
2807    if (ST->is64Bit())
2808      if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2809        return LT.first * Entry->Cost;
2810
2811    if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2812      return LT.first * Entry->Cost;
2813  }
2814
2815  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2816}
2817
2818int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2819  static const CostTblEntry SLMCostTbl[] = {
2820     { ISD::EXTRACT_VECTOR_ELT,       MVT::i8,      4 },
2821     { ISD::EXTRACT_VECTOR_ELT,       MVT::i16,     4 },
2822     { ISD::EXTRACT_VECTOR_ELT,       MVT::i32,     4 },
2823     { ISD::EXTRACT_VECTOR_ELT,       MVT::i64,     7 }
2824   };
2825
2826  assert(Val->isVectorTy() && "This must be a vector type");
2827  Type *ScalarType = Val->getScalarType();
2828  int RegisterFileMoveCost = 0;
2829
2830  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
2831                       Opcode == Instruction::InsertElement)) {
2832    // Legalize the type.
2833    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2834
2835    // This type is legalized to a scalar type.
2836    if (!LT.second.isVector())
2837      return 0;
2838
2839    // The type may be split. Normalize the index to the new type.
2840    unsigned NumElts = LT.second.getVectorNumElements();
2841    unsigned SubNumElts = NumElts;
2842    Index = Index % NumElts;
2843
2844    // For >128-bit vectors, we need to extract higher 128-bit subvectors.
2845    // For inserts, we also need to insert the subvector back.
2846    if (LT.second.getSizeInBits() > 128) {
2847      assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
2848      unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
2849      SubNumElts = NumElts / NumSubVecs;
2850      if (SubNumElts <= Index) {
2851        RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
2852        Index %= SubNumElts;
2853      }
2854    }
2855
2856    if (Index == 0) {
2857      // Floating point scalars are already located in index #0.
2858      // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
2859      // true for all.
2860      if (ScalarType->isFloatingPointTy())
2861        return RegisterFileMoveCost;
2862
2863      // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
2864      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
2865        return 1 + RegisterFileMoveCost;
2866    }
2867
2868    int ISD = TLI->InstructionOpcodeToISD(Opcode);
2869    assert(ISD && "Unexpected vector opcode");
2870    MVT MScalarTy = LT.second.getScalarType();
2871    if (ST->isSLM())
2872      if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
2873        return Entry->Cost + RegisterFileMoveCost;
2874
2875    // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
2876    if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
2877        (MScalarTy.isInteger() && ST->hasSSE41()))
2878      return 1 + RegisterFileMoveCost;
2879
2880    // Assume insertps is relatively cheap on all targets.
2881    if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
2882        Opcode == Instruction::InsertElement)
2883      return 1 + RegisterFileMoveCost;
2884
2885    // For extractions we just need to shuffle the element to index 0, which
2886    // should be very cheap (assume cost = 1). For insertions we need to shuffle
2887    // the elements to its destination. In both cases we must handle the
2888    // subvector move(s).
2889    // If the vector type is already less than 128-bits then don't reduce it.
2890    // TODO: Under what circumstances should we shuffle using the full width?
2891    int ShuffleCost = 1;
2892    if (Opcode == Instruction::InsertElement) {
2893      auto *SubTy = cast<VectorType>(Val);
2894      EVT VT = TLI->getValueType(DL, Val);
2895      if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
2896        SubTy = FixedVectorType::get(ScalarType, SubNumElts);
2897      ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
2898    }
2899    int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
2900    return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
2901  }
2902
2903  // Add to the base cost if we know that the extracted element of a vector is
2904  // destined to be moved to and used in the integer register file.
2905  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2906    RegisterFileMoveCost += 1;
2907
2908  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
2909}
2910
2911unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
2912                                              const APInt &DemandedElts,
2913                                              bool Insert, bool Extract) {
2914  unsigned Cost = 0;
2915
2916  // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
2917  // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
2918  if (Insert) {
2919    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
2920    MVT MScalarTy = LT.second.getScalarType();
2921
2922    if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
2923        (MScalarTy.isInteger() && ST->hasSSE41()) ||
2924        (MScalarTy == MVT::f32 && ST->hasSSE41())) {
2925      // For types we can insert directly, insertion into 128-bit sub vectors is
2926      // cheap, followed by a cheap chain of concatenations.
2927      if (LT.second.getSizeInBits() <= 128) {
2928        Cost +=
2929            BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
2930      } else {
2931        unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
2932        Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
2933        Cost += DemandedElts.countPopulation();
2934
2935        // For vXf32 cases, insertion into the 0'th index in each v4f32
2936        // 128-bit vector is free.
2937        // NOTE: This assumes legalization widens vXf32 vectors.
2938        if (MScalarTy == MVT::f32)
2939          for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
2940               i < e; i += 4)
2941            if (DemandedElts[i])
2942              Cost--;
2943      }
2944    } else if (LT.second.isVector()) {
2945      // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
2946      // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
2947      // series of UNPCK followed by CONCAT_VECTORS - all of these can be
2948      // considered cheap.
2949      if (Ty->isIntOrIntVectorTy())
2950        Cost += DemandedElts.countPopulation();
2951
2952      // Get the smaller of the legalized or original pow2-extended number of
2953      // vector elements, which represents the number of unpacks we'll end up
2954      // performing.
2955      unsigned NumElts = LT.second.getVectorNumElements();
2956      unsigned Pow2Elts =
2957          PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
2958      Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
2959    }
2960  }
2961
2962  // TODO: Use default extraction for now, but we should investigate extending this
2963  // to handle repeated subvector extraction.
2964  if (Extract)
2965    Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
2966
2967  return Cost;
2968}
2969
2970int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2971                                MaybeAlign Alignment, unsigned AddressSpace,
2972                                TTI::TargetCostKind CostKind,
2973                                const Instruction *I) {
2974  // TODO: Handle other cost kinds.
2975  if (CostKind != TTI::TCK_RecipThroughput) {
2976    if (isa_and_nonnull<StoreInst>(I)) {
2977      Value *Ptr = I->getOperand(1);
2978      // Store instruction with index and scale costs 2 Uops.
2979      // Check the preceding GEP to identify non-const indices.
2980      if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2981        if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2982          return TTI::TCC_Basic * 2;
2983      }
2984    }
2985    return TTI::TCC_Basic;
2986  }
2987
2988  // Handle non-power-of-two vectors such as <3 x float>
2989  if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
2990    unsigned NumElem = VTy->getNumElements();
2991
2992    // Handle a few common cases:
2993    // <3 x float>
2994    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
2995      // Cost = 64 bit store + extract + 32 bit store.
2996      return 3;
2997
2998    // <3 x double>
2999    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
3000      // Cost = 128 bit store + unpack + 64 bit store.
3001      return 3;
3002
3003    // Assume that all other non-power-of-two numbers are scalarized.
3004    if (!isPowerOf2_32(NumElem)) {
3005      APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3006      int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
3007                                        AddressSpace, CostKind);
3008      int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
3009                                               Opcode == Instruction::Load,
3010                                               Opcode == Instruction::Store);
3011      return NumElem * Cost + SplitCost;
3012    }
3013  }
3014
3015  // Type legalization can't handle structs
3016  if (TLI->getValueType(DL, Src,  true) == MVT::Other)
3017    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3018                                  CostKind);
3019
3020  // Legalize the type.
3021  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
3022  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3023         "Invalid Opcode");
3024
3025  // Each load/store unit costs 1.
3026  int Cost = LT.first * 1;
3027
3028  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
3029  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
3030  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
3031    Cost *= 2;
3032
3033  return Cost;
3034}
3035
3036int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
3037                                      Align Alignment, unsigned AddressSpace,
3038                                      TTI::TargetCostKind CostKind) {
3039  bool IsLoad = (Instruction::Load == Opcode);
3040  bool IsStore = (Instruction::Store == Opcode);
3041
3042  auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
3043  if (!SrcVTy)
3044    // To calculate scalar take the regular cost, without mask
3045    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
3046
3047  unsigned NumElem = SrcVTy->getNumElements();
3048  auto *MaskTy =
3049      FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
3050  if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
3051      (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
3052      !isPowerOf2_32(NumElem)) {
3053    // Scalarization
3054    APInt DemandedElts = APInt::getAllOnesValue(NumElem);
3055    int MaskSplitCost =
3056        getScalarizationOverhead(MaskTy, DemandedElts, false, true);
3057    int ScalarCompareCost = getCmpSelInstrCost(
3058        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
3059        CostKind);
3060    int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
3061    int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
3062    int ValueSplitCost =
3063        getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
3064    int MemopCost =
3065        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3066                                         Alignment, AddressSpace, CostKind);
3067    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
3068  }
3069
3070  // Legalize the type.
3071  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3072  auto VT = TLI->getValueType(DL, SrcVTy);
3073  int Cost = 0;
3074  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
3075      LT.second.getVectorNumElements() == NumElem)
3076    // Promotion requires expand/truncate for data and a shuffle for mask.
3077    Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
3078            getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
3079
3080  else if (LT.second.getVectorNumElements() > NumElem) {
3081    auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
3082                                           LT.second.getVectorNumElements());
3083    // Expanding requires fill mask with zeroes
3084    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
3085  }
3086
3087  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
3088  if (!ST->hasAVX512())
3089    return Cost + LT.first * (IsLoad ? 2 : 8);
3090
3091  // AVX-512 masked load/store is cheapper
3092  return Cost + LT.first;
3093}
3094
3095int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
3096                                          const SCEV *Ptr) {
3097  // Address computations in vectorized code with non-consecutive addresses will
3098  // likely result in more instructions compared to scalar code where the
3099  // computation can more often be merged into the index mode. The resulting
3100  // extra micro-ops can significantly decrease throughput.
3101  const unsigned NumVectorInstToHideOverhead = 10;
3102
3103  // Cost modeling of Strided Access Computation is hidden by the indexing
3104  // modes of X86 regardless of the stride value. We dont believe that there
3105  // is a difference between constant strided access in gerenal and constant
3106  // strided value which is less than or equal to 64.
3107  // Even in the case of (loop invariant) stride whose value is not known at
3108  // compile time, the address computation will not incur more than one extra
3109  // ADD instruction.
3110  if (Ty->isVectorTy() && SE) {
3111    if (!BaseT::isStridedAccess(Ptr))
3112      return NumVectorInstToHideOverhead;
3113    if (!BaseT::getConstantStrideStep(SE, Ptr))
3114      return 1;
3115  }
3116
3117  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
3118}
3119
3120int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3121                                           bool IsPairwise,
3122                                           TTI::TargetCostKind CostKind) {
3123  // Just use the default implementation for pair reductions.
3124  if (IsPairwise)
3125    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
3126
3127  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
3128  // and make it as the cost.
3129
3130  static const CostTblEntry SLMCostTblNoPairWise[] = {
3131    { ISD::FADD,  MVT::v2f64,   3 },
3132    { ISD::ADD,   MVT::v2i64,   5 },
3133  };
3134
3135  static const CostTblEntry SSE2CostTblNoPairWise[] = {
3136    { ISD::FADD,  MVT::v2f64,   2 },
3137    { ISD::FADD,  MVT::v4f32,   4 },
3138    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
3139    { ISD::ADD,   MVT::v2i32,   2 }, // FIXME: chosen to be less than v4i32
3140    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
3141    { ISD::ADD,   MVT::v2i16,   2 },      // The data reported by the IACA tool is "4.3".
3142    { ISD::ADD,   MVT::v4i16,   3 },      // The data reported by the IACA tool is "4.3".
3143    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
3144    { ISD::ADD,   MVT::v2i8,    2 },
3145    { ISD::ADD,   MVT::v4i8,    2 },
3146    { ISD::ADD,   MVT::v8i8,    2 },
3147    { ISD::ADD,   MVT::v16i8,   3 },
3148  };
3149
3150  static const CostTblEntry AVX1CostTblNoPairWise[] = {
3151    { ISD::FADD,  MVT::v4f64,   3 },
3152    { ISD::FADD,  MVT::v4f32,   3 },
3153    { ISD::FADD,  MVT::v8f32,   4 },
3154    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
3155    { ISD::ADD,   MVT::v4i64,   3 },
3156    { ISD::ADD,   MVT::v8i32,   5 },
3157    { ISD::ADD,   MVT::v16i16,  5 },
3158    { ISD::ADD,   MVT::v32i8,   4 },
3159  };
3160
3161  int ISD = TLI->InstructionOpcodeToISD(Opcode);
3162  assert(ISD && "Invalid opcode");
3163
3164  // Before legalizing the type, give a chance to look up illegal narrow types
3165  // in the table.
3166  // FIXME: Is there a better way to do this?
3167  EVT VT = TLI->getValueType(DL, ValTy);
3168  if (VT.isSimple()) {
3169    MVT MTy = VT.getSimpleVT();
3170    if (ST->isSLM())
3171      if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3172        return Entry->Cost;
3173
3174    if (ST->hasAVX())
3175      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3176        return Entry->Cost;
3177
3178    if (ST->hasSSE2())
3179      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3180        return Entry->Cost;
3181  }
3182
3183  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3184
3185  MVT MTy = LT.second;
3186
3187  auto *ValVTy = cast<FixedVectorType>(ValTy);
3188
3189  unsigned ArithmeticCost = 0;
3190  if (LT.first != 1 && MTy.isVector() &&
3191      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3192    // Type needs to be split. We need LT.first - 1 arithmetic ops.
3193    auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3194                                            MTy.getVectorNumElements());
3195    ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3196    ArithmeticCost *= LT.first - 1;
3197  }
3198
3199  if (ST->isSLM())
3200    if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
3201      return ArithmeticCost + Entry->Cost;
3202
3203  if (ST->hasAVX())
3204    if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3205      return ArithmeticCost + Entry->Cost;
3206
3207  if (ST->hasSSE2())
3208    if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3209      return ArithmeticCost + Entry->Cost;
3210
3211  // FIXME: These assume a naive kshift+binop lowering, which is probably
3212  // conservative in most cases.
3213  static const CostTblEntry AVX512BoolReduction[] = {
3214    { ISD::AND,  MVT::v2i1,   3 },
3215    { ISD::AND,  MVT::v4i1,   5 },
3216    { ISD::AND,  MVT::v8i1,   7 },
3217    { ISD::AND,  MVT::v16i1,  9 },
3218    { ISD::AND,  MVT::v32i1, 11 },
3219    { ISD::AND,  MVT::v64i1, 13 },
3220    { ISD::OR,   MVT::v2i1,   3 },
3221    { ISD::OR,   MVT::v4i1,   5 },
3222    { ISD::OR,   MVT::v8i1,   7 },
3223    { ISD::OR,   MVT::v16i1,  9 },
3224    { ISD::OR,   MVT::v32i1, 11 },
3225    { ISD::OR,   MVT::v64i1, 13 },
3226  };
3227
3228  static const CostTblEntry AVX2BoolReduction[] = {
3229    { ISD::AND,  MVT::v16i16,  2 }, // vpmovmskb + cmp
3230    { ISD::AND,  MVT::v32i8,   2 }, // vpmovmskb + cmp
3231    { ISD::OR,   MVT::v16i16,  2 }, // vpmovmskb + cmp
3232    { ISD::OR,   MVT::v32i8,   2 }, // vpmovmskb + cmp
3233  };
3234
3235  static const CostTblEntry AVX1BoolReduction[] = {
3236    { ISD::AND,  MVT::v4i64,   2 }, // vmovmskpd + cmp
3237    { ISD::AND,  MVT::v8i32,   2 }, // vmovmskps + cmp
3238    { ISD::AND,  MVT::v16i16,  4 }, // vextractf128 + vpand + vpmovmskb + cmp
3239    { ISD::AND,  MVT::v32i8,   4 }, // vextractf128 + vpand + vpmovmskb + cmp
3240    { ISD::OR,   MVT::v4i64,   2 }, // vmovmskpd + cmp
3241    { ISD::OR,   MVT::v8i32,   2 }, // vmovmskps + cmp
3242    { ISD::OR,   MVT::v16i16,  4 }, // vextractf128 + vpor + vpmovmskb + cmp
3243    { ISD::OR,   MVT::v32i8,   4 }, // vextractf128 + vpor + vpmovmskb + cmp
3244  };
3245
3246  static const CostTblEntry SSE2BoolReduction[] = {
3247    { ISD::AND,  MVT::v2i64,   2 }, // movmskpd + cmp
3248    { ISD::AND,  MVT::v4i32,   2 }, // movmskps + cmp
3249    { ISD::AND,  MVT::v8i16,   2 }, // pmovmskb + cmp
3250    { ISD::AND,  MVT::v16i8,   2 }, // pmovmskb + cmp
3251    { ISD::OR,   MVT::v2i64,   2 }, // movmskpd + cmp
3252    { ISD::OR,   MVT::v4i32,   2 }, // movmskps + cmp
3253    { ISD::OR,   MVT::v8i16,   2 }, // pmovmskb + cmp
3254    { ISD::OR,   MVT::v16i8,   2 }, // pmovmskb + cmp
3255  };
3256
3257  // Handle bool allof/anyof patterns.
3258  if (ValVTy->getElementType()->isIntegerTy(1)) {
3259    unsigned ArithmeticCost = 0;
3260    if (LT.first != 1 && MTy.isVector() &&
3261        MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3262      // Type needs to be split. We need LT.first - 1 arithmetic ops.
3263      auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
3264                                              MTy.getVectorNumElements());
3265      ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
3266      ArithmeticCost *= LT.first - 1;
3267    }
3268
3269    if (ST->hasAVX512())
3270      if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
3271        return ArithmeticCost + Entry->Cost;
3272    if (ST->hasAVX2())
3273      if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
3274        return ArithmeticCost + Entry->Cost;
3275    if (ST->hasAVX())
3276      if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
3277        return ArithmeticCost + Entry->Cost;
3278    if (ST->hasSSE2())
3279      if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
3280        return ArithmeticCost + Entry->Cost;
3281
3282    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
3283                                             CostKind);
3284  }
3285
3286  unsigned NumVecElts = ValVTy->getNumElements();
3287  unsigned ScalarSize = ValVTy->getScalarSizeInBits();
3288
3289  // Special case power of 2 reductions where the scalar type isn't changed
3290  // by type legalization.
3291  if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
3292    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
3293                                             CostKind);
3294
3295  unsigned ReductionCost = 0;
3296
3297  auto *Ty = ValVTy;
3298  if (LT.first != 1 && MTy.isVector() &&
3299      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3300    // Type needs to be split. We need LT.first - 1 arithmetic ops.
3301    Ty = FixedVectorType::get(ValVTy->getElementType(),
3302                              MTy.getVectorNumElements());
3303    ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3304    ReductionCost *= LT.first - 1;
3305    NumVecElts = MTy.getVectorNumElements();
3306  }
3307
3308  // Now handle reduction with the legal type, taking into account size changes
3309  // at each level.
3310  while (NumVecElts > 1) {
3311    // Determine the size of the remaining vector we need to reduce.
3312    unsigned Size = NumVecElts * ScalarSize;
3313    NumVecElts /= 2;
3314    // If we're reducing from 256/512 bits, use an extract_subvector.
3315    if (Size > 128) {
3316      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
3317      ReductionCost +=
3318          getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
3319      Ty = SubTy;
3320    } else if (Size == 128) {
3321      // Reducing from 128 bits is a permute of v2f64/v2i64.
3322      FixedVectorType *ShufTy;
3323      if (ValVTy->isFloatingPointTy())
3324        ShufTy =
3325            FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
3326      else
3327        ShufTy =
3328            FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
3329      ReductionCost +=
3330          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3331    } else if (Size == 64) {
3332      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
3333      FixedVectorType *ShufTy;
3334      if (ValVTy->isFloatingPointTy())
3335        ShufTy =
3336            FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
3337      else
3338        ShufTy =
3339            FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
3340      ReductionCost +=
3341          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3342    } else {
3343      // Reducing from smaller size is a shift by immediate.
3344      auto *ShiftTy = FixedVectorType::get(
3345          Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
3346      ReductionCost += getArithmeticInstrCost(
3347          Instruction::LShr, ShiftTy, CostKind,
3348          TargetTransformInfo::OK_AnyValue,
3349          TargetTransformInfo::OK_UniformConstantValue,
3350          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
3351    }
3352
3353    // Add the arithmetic op for this level.
3354    ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
3355  }
3356
3357  // Add the final extract element to the cost.
3358  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
3359}
3360
3361int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
3362  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3363
3364  MVT MTy = LT.second;
3365
3366  int ISD;
3367  if (Ty->isIntOrIntVectorTy()) {
3368    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
3369  } else {
3370    assert(Ty->isFPOrFPVectorTy() &&
3371           "Expected float point or integer vector type.");
3372    ISD = ISD::FMINNUM;
3373  }
3374
3375  static const CostTblEntry SSE1CostTbl[] = {
3376    {ISD::FMINNUM, MVT::v4f32, 1},
3377  };
3378
3379  static const CostTblEntry SSE2CostTbl[] = {
3380    {ISD::FMINNUM, MVT::v2f64, 1},
3381    {ISD::SMIN,    MVT::v8i16, 1},
3382    {ISD::UMIN,    MVT::v16i8, 1},
3383  };
3384
3385  static const CostTblEntry SSE41CostTbl[] = {
3386    {ISD::SMIN,    MVT::v4i32, 1},
3387    {ISD::UMIN,    MVT::v4i32, 1},
3388    {ISD::UMIN,    MVT::v8i16, 1},
3389    {ISD::SMIN,    MVT::v16i8, 1},
3390  };
3391
3392  static const CostTblEntry SSE42CostTbl[] = {
3393    {ISD::UMIN,    MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
3394  };
3395
3396  static const CostTblEntry AVX1CostTbl[] = {
3397    {ISD::FMINNUM, MVT::v8f32,  1},
3398    {ISD::FMINNUM, MVT::v4f64,  1},
3399    {ISD::SMIN,    MVT::v8i32,  3},
3400    {ISD::UMIN,    MVT::v8i32,  3},
3401    {ISD::SMIN,    MVT::v16i16, 3},
3402    {ISD::UMIN,    MVT::v16i16, 3},
3403    {ISD::SMIN,    MVT::v32i8,  3},
3404    {ISD::UMIN,    MVT::v32i8,  3},
3405  };
3406
3407  static const CostTblEntry AVX2CostTbl[] = {
3408    {ISD::SMIN,    MVT::v8i32,  1},
3409    {ISD::UMIN,    MVT::v8i32,  1},
3410    {ISD::SMIN,    MVT::v16i16, 1},
3411    {ISD::UMIN,    MVT::v16i16, 1},
3412    {ISD::SMIN,    MVT::v32i8,  1},
3413    {ISD::UMIN,    MVT::v32i8,  1},
3414  };
3415
3416  static const CostTblEntry AVX512CostTbl[] = {
3417    {ISD::FMINNUM, MVT::v16f32, 1},
3418    {ISD::FMINNUM, MVT::v8f64,  1},
3419    {ISD::SMIN,    MVT::v2i64,  1},
3420    {ISD::UMIN,    MVT::v2i64,  1},
3421    {ISD::SMIN,    MVT::v4i64,  1},
3422    {ISD::UMIN,    MVT::v4i64,  1},
3423    {ISD::SMIN,    MVT::v8i64,  1},
3424    {ISD::UMIN,    MVT::v8i64,  1},
3425    {ISD::SMIN,    MVT::v16i32, 1},
3426    {ISD::UMIN,    MVT::v16i32, 1},
3427  };
3428
3429  static const CostTblEntry AVX512BWCostTbl[] = {
3430    {ISD::SMIN,    MVT::v32i16, 1},
3431    {ISD::UMIN,    MVT::v32i16, 1},
3432    {ISD::SMIN,    MVT::v64i8,  1},
3433    {ISD::UMIN,    MVT::v64i8,  1},
3434  };
3435
3436  // If we have a native MIN/MAX instruction for this type, use it.
3437  if (ST->hasBWI())
3438    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3439      return LT.first * Entry->Cost;
3440
3441  if (ST->hasAVX512())
3442    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3443      return LT.first * Entry->Cost;
3444
3445  if (ST->hasAVX2())
3446    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3447      return LT.first * Entry->Cost;
3448
3449  if (ST->hasAVX())
3450    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3451      return LT.first * Entry->Cost;
3452
3453  if (ST->hasSSE42())
3454    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3455      return LT.first * Entry->Cost;
3456
3457  if (ST->hasSSE41())
3458    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3459      return LT.first * Entry->Cost;
3460
3461  if (ST->hasSSE2())
3462    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3463      return LT.first * Entry->Cost;
3464
3465  if (ST->hasSSE1())
3466    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3467      return LT.first * Entry->Cost;
3468
3469  unsigned CmpOpcode;
3470  if (Ty->isFPOrFPVectorTy()) {
3471    CmpOpcode = Instruction::FCmp;
3472  } else {
3473    assert(Ty->isIntOrIntVectorTy() &&
3474           "expecting floating point or integer type for min/max reduction");
3475    CmpOpcode = Instruction::ICmp;
3476  }
3477
3478  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3479  // Otherwise fall back to cmp+select.
3480  return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CostKind) +
3481         getCmpSelInstrCost(Instruction::Select, Ty, CondTy, CostKind);
3482}
3483
3484int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
3485                                       bool IsPairwise, bool IsUnsigned,
3486                                       TTI::TargetCostKind CostKind) {
3487  // Just use the default implementation for pair reductions.
3488  if (IsPairwise)
3489    return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
3490                                         CostKind);
3491
3492  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
3493
3494  MVT MTy = LT.second;
3495
3496  int ISD;
3497  if (ValTy->isIntOrIntVectorTy()) {
3498    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
3499  } else {
3500    assert(ValTy->isFPOrFPVectorTy() &&
3501           "Expected float point or integer vector type.");
3502    ISD = ISD::FMINNUM;
3503  }
3504
3505  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
3506  // and make it as the cost.
3507
3508  static const CostTblEntry SSE2CostTblNoPairWise[] = {
3509      {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
3510      {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
3511      {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
3512  };
3513
3514  static const CostTblEntry SSE41CostTblNoPairWise[] = {
3515      {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
3516      {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
3517      {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
3518      {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
3519      {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
3520      {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
3521      {ISD::SMIN, MVT::v2i8,  3}, // pminsb
3522      {ISD::SMIN, MVT::v4i8,  5}, // pminsb
3523      {ISD::SMIN, MVT::v8i8,  7}, // pminsb
3524      {ISD::SMIN, MVT::v16i8, 6},
3525      {ISD::UMIN, MVT::v2i8,  3}, // same as sse2
3526      {ISD::UMIN, MVT::v4i8,  5}, // same as sse2
3527      {ISD::UMIN, MVT::v8i8,  7}, // same as sse2
3528      {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
3529  };
3530
3531  static const CostTblEntry AVX1CostTblNoPairWise[] = {
3532      {ISD::SMIN, MVT::v16i16, 6},
3533      {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
3534      {ISD::SMIN, MVT::v32i8, 8},
3535      {ISD::UMIN, MVT::v32i8, 8},
3536  };
3537
3538  static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
3539      {ISD::SMIN, MVT::v32i16, 8},
3540      {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
3541      {ISD::SMIN, MVT::v64i8, 10},
3542      {ISD::UMIN, MVT::v64i8, 10},
3543  };
3544
3545  // Before legalizing the type, give a chance to look up illegal narrow types
3546  // in the table.
3547  // FIXME: Is there a better way to do this?
3548  EVT VT = TLI->getValueType(DL, ValTy);
3549  if (VT.isSimple()) {
3550    MVT MTy = VT.getSimpleVT();
3551    if (ST->hasBWI())
3552      if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
3553        return Entry->Cost;
3554
3555    if (ST->hasAVX())
3556      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3557        return Entry->Cost;
3558
3559    if (ST->hasSSE41())
3560      if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
3561        return Entry->Cost;
3562
3563    if (ST->hasSSE2())
3564      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3565        return Entry->Cost;
3566  }
3567
3568  auto *ValVTy = cast<FixedVectorType>(ValTy);
3569  unsigned NumVecElts = ValVTy->getNumElements();
3570
3571  auto *Ty = ValVTy;
3572  unsigned MinMaxCost = 0;
3573  if (LT.first != 1 && MTy.isVector() &&
3574      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
3575    // Type needs to be split. We need LT.first - 1 operations ops.
3576    Ty = FixedVectorType::get(ValVTy->getElementType(),
3577                              MTy.getVectorNumElements());
3578    auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
3579                                           MTy.getVectorNumElements());
3580    MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
3581    MinMaxCost *= LT.first - 1;
3582    NumVecElts = MTy.getVectorNumElements();
3583  }
3584
3585  if (ST->hasBWI())
3586    if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
3587      return MinMaxCost + Entry->Cost;
3588
3589  if (ST->hasAVX())
3590    if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
3591      return MinMaxCost + Entry->Cost;
3592
3593  if (ST->hasSSE41())
3594    if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
3595      return MinMaxCost + Entry->Cost;
3596
3597  if (ST->hasSSE2())
3598    if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
3599      return MinMaxCost + Entry->Cost;
3600
3601  unsigned ScalarSize = ValTy->getScalarSizeInBits();
3602
3603  // Special case power of 2 reductions where the scalar type isn't changed
3604  // by type legalization.
3605  if (!isPowerOf2_32(ValVTy->getNumElements()) ||
3606      ScalarSize != MTy.getScalarSizeInBits())
3607    return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
3608                                         CostKind);
3609
3610  // Now handle reduction with the legal type, taking into account size changes
3611  // at each level.
3612  while (NumVecElts > 1) {
3613    // Determine the size of the remaining vector we need to reduce.
3614    unsigned Size = NumVecElts * ScalarSize;
3615    NumVecElts /= 2;
3616    // If we're reducing from 256/512 bits, use an extract_subvector.
3617    if (Size > 128) {
3618      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
3619      MinMaxCost +=
3620          getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
3621      Ty = SubTy;
3622    } else if (Size == 128) {
3623      // Reducing from 128 bits is a permute of v2f64/v2i64.
3624      VectorType *ShufTy;
3625      if (ValTy->isFloatingPointTy())
3626        ShufTy =
3627            FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
3628      else
3629        ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
3630      MinMaxCost +=
3631          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3632    } else if (Size == 64) {
3633      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
3634      FixedVectorType *ShufTy;
3635      if (ValTy->isFloatingPointTy())
3636        ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
3637      else
3638        ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
3639      MinMaxCost +=
3640          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
3641    } else {
3642      // Reducing from smaller size is a shift by immediate.
3643      auto *ShiftTy = FixedVectorType::get(
3644          Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
3645      MinMaxCost += getArithmeticInstrCost(
3646          Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
3647          TargetTransformInfo::OK_AnyValue,
3648          TargetTransformInfo::OK_UniformConstantValue,
3649          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
3650    }
3651
3652    // Add the arithmetic op for this level.
3653    auto *SubCondTy =
3654        FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
3655    MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
3656  }
3657
3658  // Add the final extract element to the cost.
3659  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
3660}
3661
3662/// Calculate the cost of materializing a 64-bit value. This helper
3663/// method might only calculate a fraction of a larger immediate. Therefore it
3664/// is valid to return a cost of ZERO.
3665int X86TTIImpl::getIntImmCost(int64_t Val) {
3666  if (Val == 0)
3667    return TTI::TCC_Free;
3668
3669  if (isInt<32>(Val))
3670    return TTI::TCC_Basic;
3671
3672  return 2 * TTI::TCC_Basic;
3673}
3674
3675int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
3676                              TTI::TargetCostKind CostKind) {
3677  assert(Ty->isIntegerTy());
3678
3679  unsigned BitSize = Ty->getPrimitiveSizeInBits();
3680  if (BitSize == 0)
3681    return ~0U;
3682
3683  // Never hoist constants larger than 128bit, because this might lead to
3684  // incorrect code generation or assertions in codegen.
3685  // Fixme: Create a cost model for types larger than i128 once the codegen
3686  // issues have been fixed.
3687  if (BitSize > 128)
3688    return TTI::TCC_Free;
3689
3690  if (Imm == 0)
3691    return TTI::TCC_Free;
3692
3693  // Sign-extend all constants to a multiple of 64-bit.
3694  APInt ImmVal = Imm;
3695  if (BitSize % 64 != 0)
3696    ImmVal = Imm.sext(alignTo(BitSize, 64));
3697
3698  // Split the constant into 64-bit chunks and calculate the cost for each
3699  // chunk.
3700  int Cost = 0;
3701  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
3702    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
3703    int64_t Val = Tmp.getSExtValue();
3704    Cost += getIntImmCost(Val);
3705  }
3706  // We need at least one instruction to materialize the constant.
3707  return std::max(1, Cost);
3708}
3709
3710int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
3711                                  Type *Ty, TTI::TargetCostKind CostKind) {
3712  assert(Ty->isIntegerTy());
3713
3714  unsigned BitSize = Ty->getPrimitiveSizeInBits();
3715  // There is no cost model for constants with a bit size of 0. Return TCC_Free
3716  // here, so that constant hoisting will ignore this constant.
3717  if (BitSize == 0)
3718    return TTI::TCC_Free;
3719
3720  unsigned ImmIdx = ~0U;
3721  switch (Opcode) {
3722  default:
3723    return TTI::TCC_Free;
3724  case Instruction::GetElementPtr:
3725    // Always hoist the base address of a GetElementPtr. This prevents the
3726    // creation of new constants for every base constant that gets constant
3727    // folded with the offset.
3728    if (Idx == 0)
3729      return 2 * TTI::TCC_Basic;
3730    return TTI::TCC_Free;
3731  case Instruction::Store:
3732    ImmIdx = 0;
3733    break;
3734  case Instruction::ICmp:
3735    // This is an imperfect hack to prevent constant hoisting of
3736    // compares that might be trying to check if a 64-bit value fits in
3737    // 32-bits. The backend can optimize these cases using a right shift by 32.
3738    // Ideally we would check the compare predicate here. There also other
3739    // similar immediates the backend can use shifts for.
3740    if (Idx == 1 && Imm.getBitWidth() == 64) {
3741      uint64_t ImmVal = Imm.getZExtValue();
3742      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
3743        return TTI::TCC_Free;
3744    }
3745    ImmIdx = 1;
3746    break;
3747  case Instruction::And:
3748    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
3749    // by using a 32-bit operation with implicit zero extension. Detect such
3750    // immediates here as the normal path expects bit 31 to be sign extended.
3751    if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
3752      return TTI::TCC_Free;
3753    ImmIdx = 1;
3754    break;
3755  case Instruction::Add:
3756  case Instruction::Sub:
3757    // For add/sub, we can use the opposite instruction for INT32_MIN.
3758    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
3759      return TTI::TCC_Free;
3760    ImmIdx = 1;
3761    break;
3762  case Instruction::UDiv:
3763  case Instruction::SDiv:
3764  case Instruction::URem:
3765  case Instruction::SRem:
3766    // Division by constant is typically expanded later into a different
3767    // instruction sequence. This completely changes the constants.
3768    // Report them as "free" to stop ConstantHoist from marking them as opaque.
3769    return TTI::TCC_Free;
3770  case Instruction::Mul:
3771  case Instruction::Or:
3772  case Instruction::Xor:
3773    ImmIdx = 1;
3774    break;
3775  // Always return TCC_Free for the shift value of a shift instruction.
3776  case Instruction::Shl:
3777  case Instruction::LShr:
3778  case Instruction::AShr:
3779    if (Idx == 1)
3780      return TTI::TCC_Free;
3781    break;
3782  case Instruction::Trunc:
3783  case Instruction::ZExt:
3784  case Instruction::SExt:
3785  case Instruction::IntToPtr:
3786  case Instruction::PtrToInt:
3787  case Instruction::BitCast:
3788  case Instruction::PHI:
3789  case Instruction::Call:
3790  case Instruction::Select:
3791  case Instruction::Ret:
3792  case Instruction::Load:
3793    break;
3794  }
3795
3796  if (Idx == ImmIdx) {
3797    int NumConstants = divideCeil(BitSize, 64);
3798    int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
3799    return (Cost <= NumConstants * TTI::TCC_Basic)
3800               ? static_cast<int>(TTI::TCC_Free)
3801               : Cost;
3802  }
3803
3804  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
3805}
3806
3807int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
3808                                    const APInt &Imm, Type *Ty,
3809                                    TTI::TargetCostKind CostKind) {
3810  assert(Ty->isIntegerTy());
3811
3812  unsigned BitSize = Ty->getPrimitiveSizeInBits();
3813  // There is no cost model for constants with a bit size of 0. Return TCC_Free
3814  // here, so that constant hoisting will ignore this constant.
3815  if (BitSize == 0)
3816    return TTI::TCC_Free;
3817
3818  switch (IID) {
3819  default:
3820    return TTI::TCC_Free;
3821  case Intrinsic::sadd_with_overflow:
3822  case Intrinsic::uadd_with_overflow:
3823  case Intrinsic::ssub_with_overflow:
3824  case Intrinsic::usub_with_overflow:
3825  case Intrinsic::smul_with_overflow:
3826  case Intrinsic::umul_with_overflow:
3827    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
3828      return TTI::TCC_Free;
3829    break;
3830  case Intrinsic::experimental_stackmap:
3831    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3832      return TTI::TCC_Free;
3833    break;
3834  case Intrinsic::experimental_patchpoint_void:
3835  case Intrinsic::experimental_patchpoint_i64:
3836    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
3837      return TTI::TCC_Free;
3838    break;
3839  }
3840  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
3841}
3842
3843unsigned
3844X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
3845  if (CostKind != TTI::TCK_RecipThroughput)
3846    return Opcode == Instruction::PHI ? 0 : 1;
3847  // Branches are assumed to be predicted.
3848  return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
3849}
3850
3851// Return an average cost of Gather / Scatter instruction, maybe improved later
3852int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
3853                                Align Alignment, unsigned AddressSpace) {
3854
3855  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
3856  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
3857
3858  // Try to reduce index size from 64 bit (default for GEP)
3859  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
3860  // operation will use 16 x 64 indices which do not fit in a zmm and needs
3861  // to split. Also check that the base pointer is the same for all lanes,
3862  // and that there's at most one variable index.
3863  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
3864    unsigned IndexSize = DL.getPointerSizeInBits();
3865    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
3866    if (IndexSize < 64 || !GEP)
3867      return IndexSize;
3868
3869    unsigned NumOfVarIndices = 0;
3870    const Value *Ptrs = GEP->getPointerOperand();
3871    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
3872      return IndexSize;
3873    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
3874      if (isa<Constant>(GEP->getOperand(i)))
3875        continue;
3876      Type *IndxTy = GEP->getOperand(i)->getType();
3877      if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
3878        IndxTy = IndexVTy->getElementType();
3879      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
3880          !isa<SExtInst>(GEP->getOperand(i))) ||
3881         ++NumOfVarIndices > 1)
3882        return IndexSize; // 64
3883    }
3884    return (unsigned)32;
3885  };
3886
3887  // Trying to reduce IndexSize to 32 bits for vector 16.
3888  // By default the IndexSize is equal to pointer size.
3889  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
3890                           ? getIndexSizeInBits(Ptr, DL)
3891                           : DL.getPointerSizeInBits();
3892
3893  auto *IndexVTy = FixedVectorType::get(
3894      IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
3895  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
3896  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3897  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
3898  if (SplitFactor > 1) {
3899    // Handle splitting of vector of pointers
3900    auto *SplitSrcTy =
3901        FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
3902    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
3903                                         AddressSpace);
3904  }
3905
3906  // The gather / scatter cost is given by Intel architects. It is a rough
3907  // number since we are looking at one instruction in a time.
3908  const int GSOverhead = (Opcode == Instruction::Load)
3909                             ? ST->getGatherOverhead()
3910                             : ST->getScatterOverhead();
3911  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3912                                           MaybeAlign(Alignment), AddressSpace,
3913                                           TTI::TCK_RecipThroughput);
3914}
3915
3916/// Return the cost of full scalarization of gather / scatter operation.
3917///
3918/// Opcode - Load or Store instruction.
3919/// SrcVTy - The type of the data vector that should be gathered or scattered.
3920/// VariableMask - The mask is non-constant at compile time.
3921/// Alignment - Alignment for one element.
3922/// AddressSpace - pointer[s] address space.
3923///
3924int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
3925                                bool VariableMask, Align Alignment,
3926                                unsigned AddressSpace) {
3927  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
3928  APInt DemandedElts = APInt::getAllOnesValue(VF);
3929  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3930
3931  int MaskUnpackCost = 0;
3932  if (VariableMask) {
3933    auto *MaskTy =
3934        FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
3935    MaskUnpackCost =
3936        getScalarizationOverhead(MaskTy, DemandedElts, false, true);
3937    int ScalarCompareCost =
3938      getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
3939                         nullptr, CostKind);
3940    int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
3941    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
3942  }
3943
3944  // The cost of the scalar loads/stores.
3945  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3946                                          MaybeAlign(Alignment), AddressSpace,
3947                                          CostKind);
3948
3949  int InsertExtractCost = 0;
3950  if (Opcode == Instruction::Load)
3951    for (unsigned i = 0; i < VF; ++i)
3952      // Add the cost of inserting each scalar load into the vector
3953      InsertExtractCost +=
3954        getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
3955  else
3956    for (unsigned i = 0; i < VF; ++i)
3957      // Add the cost of extracting each element out of the data vector
3958      InsertExtractCost +=
3959        getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
3960
3961  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
3962}
3963
3964/// Calculate the cost of Gather / Scatter operation
3965int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
3966                                       const Value *Ptr, bool VariableMask,
3967                                       Align Alignment,
3968                                       TTI::TargetCostKind CostKind,
3969                                       const Instruction *I = nullptr) {
3970
3971  if (CostKind != TTI::TCK_RecipThroughput)
3972    return 1;
3973
3974  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
3975  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
3976  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
3977  if (!PtrTy && Ptr->getType()->isVectorTy())
3978    PtrTy = dyn_cast<PointerType>(
3979        cast<VectorType>(Ptr->getType())->getElementType());
3980  assert(PtrTy && "Unexpected type for Ptr argument");
3981  unsigned AddressSpace = PtrTy->getAddressSpace();
3982
3983  bool Scalarize = false;
3984  if ((Opcode == Instruction::Load &&
3985       !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
3986      (Opcode == Instruction::Store &&
3987       !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
3988    Scalarize = true;
3989  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
3990  // Vector-4 of gather/scatter instruction does not exist on KNL.
3991  // We can extend it to 8 elements, but zeroing upper bits of
3992  // the mask vector will add more instructions. Right now we give the scalar
3993  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
3994  // is better in the VariableMask case.
3995  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
3996    Scalarize = true;
3997
3998  if (Scalarize)
3999    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
4000                           AddressSpace);
4001
4002  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
4003}
4004
4005bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
4006                               TargetTransformInfo::LSRCost &C2) {
4007    // X86 specific here are "instruction number 1st priority".
4008    return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
4009                    C1.NumIVMuls, C1.NumBaseAdds,
4010                    C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4011           std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
4012                    C2.NumIVMuls, C2.NumBaseAdds,
4013                    C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4014}
4015
4016bool X86TTIImpl::canMacroFuseCmp() {
4017  return ST->hasMacroFusion() || ST->hasBranchFusion();
4018}
4019
4020bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
4021  if (!ST->hasAVX())
4022    return false;
4023
4024  // The backend can't handle a single element vector.
4025  if (isa<VectorType>(DataTy) &&
4026      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4027    return false;
4028  Type *ScalarTy = DataTy->getScalarType();
4029
4030  if (ScalarTy->isPointerTy())
4031    return true;
4032
4033  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4034    return true;
4035
4036  if (!ScalarTy->isIntegerTy())
4037    return false;
4038
4039  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4040  return IntWidth == 32 || IntWidth == 64 ||
4041         ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
4042}
4043
4044bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
4045  return isLegalMaskedLoad(DataType, Alignment);
4046}
4047
4048bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
4049  unsigned DataSize = DL.getTypeStoreSize(DataType);
4050  // The only supported nontemporal loads are for aligned vectors of 16 or 32
4051  // bytes.  Note that 32-byte nontemporal vector loads are supported by AVX2
4052  // (the equivalent stores only require AVX).
4053  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
4054    return DataSize == 16 ?  ST->hasSSE1() : ST->hasAVX2();
4055
4056  return false;
4057}
4058
4059bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
4060  unsigned DataSize = DL.getTypeStoreSize(DataType);
4061
4062  // SSE4A supports nontemporal stores of float and double at arbitrary
4063  // alignment.
4064  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
4065    return true;
4066
4067  // Besides the SSE4A subtarget exception above, only aligned stores are
4068  // available nontemporaly on any other subtarget.  And only stores with a size
4069  // of 4..32 bytes (powers of 2, only) are permitted.
4070  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
4071      !isPowerOf2_32(DataSize))
4072    return false;
4073
4074  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
4075  // loads require AVX2).
4076  if (DataSize == 32)
4077    return ST->hasAVX();
4078  else if (DataSize == 16)
4079    return ST->hasSSE1();
4080  return true;
4081}
4082
4083bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
4084  if (!isa<VectorType>(DataTy))
4085    return false;
4086
4087  if (!ST->hasAVX512())
4088    return false;
4089
4090  // The backend can't handle a single element vector.
4091  if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
4092    return false;
4093
4094  Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
4095
4096  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4097    return true;
4098
4099  if (!ScalarTy->isIntegerTy())
4100    return false;
4101
4102  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4103  return IntWidth == 32 || IntWidth == 64 ||
4104         ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
4105}
4106
4107bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
4108  return isLegalMaskedExpandLoad(DataTy);
4109}
4110
4111bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
4112  // Some CPUs have better gather performance than others.
4113  // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
4114  // enable gather with a -march.
4115  if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
4116    return false;
4117
4118  // This function is called now in two cases: from the Loop Vectorizer
4119  // and from the Scalarizer.
4120  // When the Loop Vectorizer asks about legality of the feature,
4121  // the vectorization factor is not calculated yet. The Loop Vectorizer
4122  // sends a scalar type and the decision is based on the width of the
4123  // scalar element.
4124  // Later on, the cost model will estimate usage this intrinsic based on
4125  // the vector type.
4126  // The Scalarizer asks again about legality. It sends a vector type.
4127  // In this case we can reject non-power-of-2 vectors.
4128  // We also reject single element vectors as the type legalizer can't
4129  // scalarize it.
4130  if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
4131    unsigned NumElts = DataVTy->getNumElements();
4132    if (NumElts == 1 || !isPowerOf2_32(NumElts))
4133      return false;
4134  }
4135  Type *ScalarTy = DataTy->getScalarType();
4136  if (ScalarTy->isPointerTy())
4137    return true;
4138
4139  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
4140    return true;
4141
4142  if (!ScalarTy->isIntegerTy())
4143    return false;
4144
4145  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
4146  return IntWidth == 32 || IntWidth == 64;
4147}
4148
4149bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
4150  // AVX2 doesn't support scatter
4151  if (!ST->hasAVX512())
4152    return false;
4153  return isLegalMaskedGather(DataType, Alignment);
4154}
4155
4156bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
4157  EVT VT = TLI->getValueType(DL, DataType);
4158  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
4159}
4160
4161bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
4162  return false;
4163}
4164
4165bool X86TTIImpl::areInlineCompatible(const Function *Caller,
4166                                     const Function *Callee) const {
4167  const TargetMachine &TM = getTLI()->getTargetMachine();
4168
4169  // Work this as a subsetting of subtarget features.
4170  const FeatureBitset &CallerBits =
4171      TM.getSubtargetImpl(*Caller)->getFeatureBits();
4172  const FeatureBitset &CalleeBits =
4173      TM.getSubtargetImpl(*Callee)->getFeatureBits();
4174
4175  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
4176  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
4177  return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
4178}
4179
4180bool X86TTIImpl::areFunctionArgsABICompatible(
4181    const Function *Caller, const Function *Callee,
4182    SmallPtrSetImpl<Argument *> &Args) const {
4183  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
4184    return false;
4185
4186  // If we get here, we know the target features match. If one function
4187  // considers 512-bit vectors legal and the other does not, consider them
4188  // incompatible.
4189  const TargetMachine &TM = getTLI()->getTargetMachine();
4190
4191  if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
4192      TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
4193    return true;
4194
4195  // Consider the arguments compatible if they aren't vectors or aggregates.
4196  // FIXME: Look at the size of vectors.
4197  // FIXME: Look at the element types of aggregates to see if there are vectors.
4198  // FIXME: The API of this function seems intended to allow arguments
4199  // to be removed from the set, but the caller doesn't check if the set
4200  // becomes empty so that may not work in practice.
4201  return llvm::none_of(Args, [](Argument *A) {
4202    auto *EltTy = cast<PointerType>(A->getType())->getElementType();
4203    return EltTy->isVectorTy() || EltTy->isAggregateType();
4204  });
4205}
4206
4207X86TTIImpl::TTI::MemCmpExpansionOptions
4208X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4209  TTI::MemCmpExpansionOptions Options;
4210  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4211  Options.NumLoadsPerBlock = 2;
4212  // All GPR and vector loads can be unaligned.
4213  Options.AllowOverlappingLoads = true;
4214  if (IsZeroCmp) {
4215    // Only enable vector loads for equality comparison. Right now the vector
4216    // version is not as fast for three way compare (see #33329).
4217    const unsigned PreferredWidth = ST->getPreferVectorWidth();
4218    if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
4219    if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
4220    if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
4221  }
4222  if (ST->is64Bit()) {
4223    Options.LoadSizes.push_back(8);
4224  }
4225  Options.LoadSizes.push_back(4);
4226  Options.LoadSizes.push_back(2);
4227  Options.LoadSizes.push_back(1);
4228  return Options;
4229}
4230
4231bool X86TTIImpl::enableInterleavedAccessVectorization() {
4232  // TODO: We expect this to be beneficial regardless of arch,
4233  // but there are currently some unexplained performance artifacts on Atom.
4234  // As a temporary solution, disable on Atom.
4235  return !(ST->isAtom());
4236}
4237
4238// Get estimation for interleaved load/store operations for AVX2.
4239// \p Factor is the interleaved-access factor (stride) - number of
4240// (interleaved) elements in the group.
4241// \p Indices contains the indices for a strided load: when the
4242// interleaved load has gaps they indicate which elements are used.
4243// If Indices is empty (or if the number of indices is equal to the size
4244// of the interleaved-access as given in \p Factor) the access has no gaps.
4245//
4246// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
4247// computing the cost using a generic formula as a function of generic
4248// shuffles. We therefore use a lookup table instead, filled according to
4249// the instruction sequences that codegen currently generates.
4250int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
4251    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
4252    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
4253    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
4254
4255  if (UseMaskForCond || UseMaskForGaps)
4256    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4257                                             Alignment, AddressSpace, CostKind,
4258                                             UseMaskForCond, UseMaskForGaps);
4259
4260  // We currently Support only fully-interleaved groups, with no gaps.
4261  // TODO: Support also strided loads (interleaved-groups with gaps).
4262  if (Indices.size() && Indices.size() != Factor)
4263    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4264                                             Alignment, AddressSpace,
4265                                             CostKind);
4266
4267  // VecTy for interleave memop is <VF*Factor x Elt>.
4268  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
4269  // VecTy = <12 x i32>.
4270  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
4271
4272  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
4273  // the VF=2, while v2i128 is an unsupported MVT vector type
4274  // (see MachineValueType.h::getVectorVT()).
4275  if (!LegalVT.isVector())
4276    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4277                                             Alignment, AddressSpace,
4278                                             CostKind);
4279
4280  unsigned VF = VecTy->getNumElements() / Factor;
4281  Type *ScalarTy = VecTy->getElementType();
4282
4283  // Calculate the number of memory operations (NumOfMemOps), required
4284  // for load/store the VecTy.
4285  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
4286  unsigned LegalVTSize = LegalVT.getStoreSize();
4287  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
4288
4289  // Get the cost of one memory operation.
4290  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
4291                                             LegalVT.getVectorNumElements());
4292  unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
4293                                       MaybeAlign(Alignment), AddressSpace,
4294                                       CostKind);
4295
4296  auto *VT = FixedVectorType::get(ScalarTy, VF);
4297  EVT ETy = TLI->getValueType(DL, VT);
4298  if (!ETy.isSimple())
4299    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4300                                             Alignment, AddressSpace,
4301                                             CostKind);
4302
4303  // TODO: Complete for other data-types and strides.
4304  // Each combination of Stride, ElementTy and VF results in a different
4305  // sequence; The cost tables are therefore accessed with:
4306  // Factor (stride) and VectorType=VFxElemType.
4307  // The Cost accounts only for the shuffle sequence;
4308  // The cost of the loads/stores is accounted for separately.
4309  //
4310  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
4311    { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
4312    { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
4313
4314    { 3, MVT::v2i8,  10 }, //(load 6i8 and)  deinterleave into 3 x 2i8
4315    { 3, MVT::v4i8,  4 },  //(load 12i8 and) deinterleave into 3 x 4i8
4316    { 3, MVT::v8i8,  9 },  //(load 24i8 and) deinterleave into 3 x 8i8
4317    { 3, MVT::v16i8, 11},  //(load 48i8 and) deinterleave into 3 x 16i8
4318    { 3, MVT::v32i8, 13},  //(load 96i8 and) deinterleave into 3 x 32i8
4319    { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
4320
4321    { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
4322    { 4, MVT::v4i8,  4 },  //(load 16i8 and)  deinterleave into 4 x 4i8
4323    { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
4324    { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
4325    { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
4326
4327    { 8, MVT::v8f32, 40 }  //(load 64f32 and)deinterleave into 8 x 8f32
4328  };
4329
4330  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
4331    { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
4332    { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
4333
4334    { 3, MVT::v2i8,  7 },  //interleave 3 x 2i8  into 6i8 (and store)
4335    { 3, MVT::v4i8,  8 },  //interleave 3 x 4i8  into 12i8 (and store)
4336    { 3, MVT::v8i8,  11 }, //interleave 3 x 8i8  into 24i8 (and store)
4337    { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
4338    { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
4339
4340    { 4, MVT::v2i8,  12 }, //interleave 4 x 2i8  into 8i8 (and store)
4341    { 4, MVT::v4i8,  9 },  //interleave 4 x 4i8  into 16i8 (and store)
4342    { 4, MVT::v8i8,  10 }, //interleave 4 x 8i8  into 32i8 (and store)
4343    { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
4344    { 4, MVT::v32i8, 12 }  //interleave 4 x 32i8 into 128i8 (and store)
4345  };
4346
4347  if (Opcode == Instruction::Load) {
4348    if (const auto *Entry =
4349            CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
4350      return NumOfMemOps * MemOpCost + Entry->Cost;
4351  } else {
4352    assert(Opcode == Instruction::Store &&
4353           "Expected Store Instruction at this  point");
4354    if (const auto *Entry =
4355            CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
4356      return NumOfMemOps * MemOpCost + Entry->Cost;
4357  }
4358
4359  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4360                                           Alignment, AddressSpace, CostKind);
4361}
4362
4363// Get estimation for interleaved load/store operations and strided load.
4364// \p Indices contains indices for strided load.
4365// \p Factor - the factor of interleaving.
4366// AVX-512 provides 3-src shuffles that significantly reduces the cost.
4367int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
4368    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
4369    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
4370    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
4371
4372  if (UseMaskForCond || UseMaskForGaps)
4373    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4374                                             Alignment, AddressSpace, CostKind,
4375                                             UseMaskForCond, UseMaskForGaps);
4376
4377  // VecTy for interleave memop is <VF*Factor x Elt>.
4378  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
4379  // VecTy = <12 x i32>.
4380
4381  // Calculate the number of memory operations (NumOfMemOps), required
4382  // for load/store the VecTy.
4383  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
4384  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
4385  unsigned LegalVTSize = LegalVT.getStoreSize();
4386  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
4387
4388  // Get the cost of one memory operation.
4389  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
4390                                             LegalVT.getVectorNumElements());
4391  unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
4392                                       MaybeAlign(Alignment), AddressSpace,
4393                                       CostKind);
4394
4395  unsigned VF = VecTy->getNumElements() / Factor;
4396  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
4397
4398  if (Opcode == Instruction::Load) {
4399    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
4400    // contain the cost of the optimized shuffle sequence that the
4401    // X86InterleavedAccess pass will generate.
4402    // The cost of loads and stores are computed separately from the table.
4403
4404    // X86InterleavedAccess support only the following interleaved-access group.
4405    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
4406        {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
4407        {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
4408        {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
4409    };
4410
4411    if (const auto *Entry =
4412            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
4413      return NumOfMemOps * MemOpCost + Entry->Cost;
4414    //If an entry does not exist, fallback to the default implementation.
4415
4416    // Kind of shuffle depends on number of loaded values.
4417    // If we load the entire data in one register, we can use a 1-src shuffle.
4418    // Otherwise, we'll merge 2 sources in each operation.
4419    TTI::ShuffleKind ShuffleKind =
4420        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
4421
4422    unsigned ShuffleCost =
4423        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
4424
4425    unsigned NumOfLoadsInInterleaveGrp =
4426        Indices.size() ? Indices.size() : Factor;
4427    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
4428                                          VecTy->getNumElements() / Factor);
4429    unsigned NumOfResults =
4430        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
4431        NumOfLoadsInInterleaveGrp;
4432
4433    // About a half of the loads may be folded in shuffles when we have only
4434    // one result. If we have more than one result, we do not fold loads at all.
4435    unsigned NumOfUnfoldedLoads =
4436        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
4437
4438    // Get a number of shuffle operations per result.
4439    unsigned NumOfShufflesPerResult =
4440        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
4441
4442    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
4443    // When we have more than one destination, we need additional instructions
4444    // to keep sources.
4445    unsigned NumOfMoves = 0;
4446    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
4447      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
4448
4449    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
4450               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
4451
4452    return Cost;
4453  }
4454
4455  // Store.
4456  assert(Opcode == Instruction::Store &&
4457         "Expected Store Instruction at this  point");
4458  // X86InterleavedAccess support only the following interleaved-access group.
4459  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
4460      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
4461      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
4462      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
4463
4464      {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
4465      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
4466      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
4467      {4, MVT::v64i8, 24}  // interleave 4 x 32i8 into 256i8 (and store)
4468  };
4469
4470  if (const auto *Entry =
4471          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
4472    return NumOfMemOps * MemOpCost + Entry->Cost;
4473  //If an entry does not exist, fallback to the default implementation.
4474
4475  // There is no strided stores meanwhile. And store can't be folded in
4476  // shuffle.
4477  unsigned NumOfSources = Factor; // The number of values to be merged.
4478  unsigned ShuffleCost =
4479      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
4480  unsigned NumOfShufflesPerStore = NumOfSources - 1;
4481
4482  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
4483  // We need additional instructions to keep sources.
4484  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
4485  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
4486             NumOfMoves;
4487  return Cost;
4488}
4489
4490int X86TTIImpl::getInterleavedMemoryOpCost(
4491    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4492    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4493    bool UseMaskForCond, bool UseMaskForGaps) {
4494  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
4495    Type *EltTy = cast<VectorType>(VecTy)->getElementType();
4496    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
4497        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
4498      return true;
4499    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
4500      return HasBW;
4501    return false;
4502  };
4503  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
4504    return getInterleavedMemoryOpCostAVX512(
4505        Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
4506        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
4507  if (ST->hasAVX2())
4508    return getInterleavedMemoryOpCostAVX2(
4509        Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
4510        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
4511
4512  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4513                                           Alignment, AddressSpace, CostKind,
4514                                           UseMaskForCond, UseMaskForGaps);
4515}
4516