//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);

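// Returns true if the branch condition \p Cond transitively (up to a small
// recursion depth) uses a PHI node that is defined in \p L itself rather than
// in one of its subloops. Such conditions are good unrolling candidates, since
// unrolling may fold away both the branch and the PHI.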
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
                  return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

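// Tune loop unrolling for AMDGPU. The base threshold comes from the
// "amdgpu-unroll-threshold" function attribute (default 300); it is boosted
// for loops whose conditional branches depend on a PHI of the loop, and for
// loops that address private (scratch) or local (LDS) memory through
// loop-varying GEPs, since full unrolling lets SROA eliminate private allocas
// and helps combine DS instructions with different offsets.
//
// Illustrative IR sketch (not from this file): a loop containing
//   %gep = getelementptr [64 x float], [64 x float] addrspace(5)* %buf,
//                        i32 0, i32 %iv
// with %iv varying in the loop raises the threshold to UnrollThresholdPrivate.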
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
        continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing not to
        // a variable; most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return 256;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

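// Clamp the load vectorization factor so that chains of sub-32-bit elements do
// not form vectors wider than 128 bits.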
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                            unsigned ChainSizeInBytes,
                                            VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

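// Unlike loads, store chains are clamped to 128 bits regardless of the element
// size.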
unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                             unsigned ChainSizeInBytes,
                                             VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

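// Widest vector width (in bits) the load/store vectorizer should form for each
// address space. Private memory is bounded by the subtarget's maximum scratch
// element size in bytes, hence the multiply by 8.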
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
      AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 128;

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  llvm_unreachable("unhandled address space");
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                                unsigned Alignment,
                                                unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                                 unsigned Alignment,
                                                 unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

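// Describe the target's DS atomic intrinsics as memory operations so that
// generic passes querying TTI can treat them like ordinary loads/stores. The
// ordering and volatile operands must be compile-time constants for the
// description to be valid.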
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                       MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

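// Cost model for VALU arithmetic after type legalization. Costs are expressed
// in multiples of the full-rate instruction cost, with 64-bit operations
// charged at half or quarter rate depending on the subtarget. For example, a
// scalar i64 mul is modeled as 4 quarter-rate plus 4 full-rate instructions,
// and an f32 fdiv with denormals disabled as 7 full-rate + 1 quarter-rate
// instructions plus 2 full-rate mode switches.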
int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::OperandValueKind Opd1Info,
                                       TTI::OperandValueKind Opd2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

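// Intrinsic cost model. Only llvm.fma is special-cased here: f64 fma uses the
// 64-bit rate, f16 fma is treated as packed when 16-bit instructions are
// available, and f32 fma is half rate on subtargets with fast FMA and quarter
// rate otherwise. Everything else falls back to the base implementation.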
template <typename T>
int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                      ArrayRef<T *> Args,
                                      FastMathFlags FMF, unsigned VF) {
  if (ID != Intrinsic::fma)
    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);

  EVT OrigTy = TLI->getValueType(DL, RetTy);
  if (!OrigTy.isSimple()) {
    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost();

  if (ST->has16BitInsts() && SLT == MVT::f16)
    NElts = (NElts + 1) / 2;

  return LT.first * NElts * (ST->hasFastFMAF32() ? getHalfRateInstrCost()
                                                 : getQuarterRateInstrCost());
}

int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                      ArrayRef<Value*> Args, FastMathFlags FMF,
                                      unsigned VF) {
  return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF);
}

int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
                                      unsigned ScalarizationCostPassed) {
  return getIntrinsicInstrCost<Type>(ID, RetTy, Tys, FMF,
                                     ScalarizationCostPassed);
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                              bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                          bool IsPairwise,
                                          bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or
    // byval. Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // its original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}

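// Values produced by these intrinsics live in scalar registers and are
// therefore uniform across the wavefront regardless of their operands.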
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
      return true;
    }
  }
  return false;
}

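// Report which pointer operands of these intrinsics InferAddressSpaces is
// allowed to rewrite to a more specific address space (operand 0 for all of
// them).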
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

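// Rewrite an intrinsic call once InferAddressSpaces has found a more specific
// address space for its pointer operand. DS atomics are remangled to the new
// pointer type unless they are volatile; amdgcn.is.shared / amdgcn.is.private
// fold to a constant once the address space is known. Illustrative IR sketch
// (not from this file):
//   %b = call i1 @llvm.amdgcn.is.shared(i8* %p)
// folds to true if %p is proven to be an LDS pointer, and to false otherwise.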
bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
  IntrinsicInst *II, Value *OldV, Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return false;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return true;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    II->replaceAllUsesWith(NewVal);
    II->eraseFromParent();
    return true;
  }
  default:
    return false;
  }
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                       Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access either the low or
      // the high half of a register, so any swizzle of two 16-bit elements is
      // free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

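// Inlining is allowed only when the callee's subtarget features (ignoring the
// ones on InlineFeatureIgnoreList) are a subset of the caller's, and when the
// two functions' FP mode register defaults are compatible.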
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  return CallerMode.isInlineCompatible(CalleeMode);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

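// Dispatch user-cost queries to the more precise per-opcode hooks above
// (vector insert/extract, intrinsic calls, shuffles, casts and arithmetic);
// anything else falls back to the base implementation.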
unsigned GCNTTIImpl::getUserCost(const User *U,
                                 ArrayRef<const Value *> Operands) {
  const Instruction *I = dyn_cast<Instruction>(U);
  if (!I)
    return BaseT::getUserCost(U, Operands);

  // Estimate different operations to be optimized out
  switch (I->getOpcode()) {
  case Instruction::ExtractElement: {
    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
    unsigned Idx = -1;
    if (CI)
      Idx = CI->getZExtValue();
    return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx);
  }
  case Instruction::InsertElement: {
    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
    unsigned Idx = -1;
    if (CI)
      Idx = CI->getZExtValue();
    return getVectorInstrCost(I->getOpcode(), I->getType(), Idx);
  }
  case Instruction::Call: {
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
      SmallVector<Value *, 4> Args(II->arg_operands());
      FastMathFlags FMF;
      if (auto *FPMO = dyn_cast<FPMathOperator>(II))
        FMF = FPMO->getFastMathFlags();
      return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
                                   FMF);
    } else {
      return BaseT::getUserCost(U, Operands);
    }
  }
  case Instruction::ShuffleVector: {
    const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
    Type *Ty = Shuffle->getType();
    Type *SrcTy = Shuffle->getOperand(0)->getType();

    // TODO: Identify and add costs for insert subvector, etc.
    int SubIndex;
    if (Shuffle->isExtractSubvectorMask(SubIndex))
      return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty);

    if (Shuffle->changesLength())
      return BaseT::getUserCost(U, Operands);

    if (Shuffle->isIdentity())
      return 0;

    if (Shuffle->isReverse())
      return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr);

    if (Shuffle->isSelect())
      return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr);

    if (Shuffle->isTranspose())
      return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr);

    if (Shuffle->isZeroEltSplat())
      return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr);

    if (Shuffle->isSingleSource())
      return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr);

    return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast: {
    return getCastInstrCost(I->getOpcode(), I->getType(),
                            I->getOperand(0)->getType(), I);
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::FNeg: {
    return getArithmeticInstrCost(I->getOpcode(), I->getType(),
                                  TTI::OK_AnyValue, TTI::OK_AnyValue,
                                  TTI::OP_None, TTI::OP_None, Operands, I);
  }
  default:
    break;
  }

  return BaseT::getUserCost(U, Operands);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
      AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
      (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
      AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}