//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
  "amdgpu-unroll-runtime-local",
  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
  cl::init(true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers, we gain nothing by aggressively inlining functions for
// that heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

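// Returns true if \p Cond is (transitively) computed inside \p L from a PHI
// that does not belong to any of \p L's subloops. Recursion depth is capped
// to bound compile time.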
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
                  return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
  UP.BEInsns += 3;

  // We want to run unroll even for loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata, we will use
  // the provided threshold value as the default for Threshold.
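  // For reference, such a threshold is typically attached as llvm.loop
  // metadata along these lines (illustrative IR sketch, not taken from this
  // file):
  //
  //   br i1 %cond, label %header, label %exit, !llvm.loop !0
  //   ...
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}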
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not
        // based on a variable; most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher-than-normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop then increase the max
    // trip count to analyze for better cost estimation in the unroller.
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Properties of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be that ECC is enabled, but no
    // directly exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features.
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};

GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  SIModeRegisterDefaults Mode(F, *ST);
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}

bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
  return !F || !ST->isSingleLaneExecution(*F);
}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
  // registers. See getRegisterClassForType for the implementation.
  // In this case vector registers are not vector in terms of
  // VGPRs, but those which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}

TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

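// Returns the maximum vectorization factor for the given scalar element
// width. Loads and stores may fill a 128-bit register; other operations only
// get a VF of 2 with packed 16-bit instructions (or packed FP32 on subtargets
// that support it), and are otherwise left scalar.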
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 32 * 4 / ElemWidth;
  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
       : 1;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

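// Returns the widest total access width, in bits, that the load/store
// vectorizer should try to form for the given address space.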
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, local and region. Also assumed for unknown address spaces.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat accesses, even though we may need to
  // decompose them later if they may access private memory. We don't have
  // enough context here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {

  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}

void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicCpySize) const {
  assert(RemainingBytes < 16);
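  // As an illustration of the non-atomic path below: for RemainingBytes == 15
  // with MinAlign != 2 this appends i64 + i32 + i16 + i8 chunks, while with
  // MinAlign == 2 it appends seven i16 chunks followed by one i8.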

  if (AtomicCpySize)
    BaseT::getMemcpyLoopResidualLoweringType(
        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
        DestAlign, AtomicCpySize);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  if (MinAlign != 2) {
    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // Disable interleaving (the vectorizer's notion of unrolling) if the loop is
  // not vectorized.
  // TODO: Enable this again.
  if (VF.isScalar())
    return 1;

  return 8;
}

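// Describes the memory behavior of the DS ordered-add/swap and DS floating
// point atomic intrinsics (pointer operand, atomic ordering, volatility) so
// they can be treated like target memory intrinsics.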
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal >
        static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}

InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only legal vector
  // types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return a
    // zero cost for the fmul(b, c), assuming the fadd|fsub will be charged
    // the estimated cost of the whole fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Assume any type may be fused when contract/unsafe flags are set.
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
                            TLI->getTargetMachine().Options.UnsafeFPMath)) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts are needed when f16 insts are not supported.
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma: // TODO: fmuladd
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    return true;
  default:
    return false;
  }
}

InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost(CostKind);

  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                   : getQuarterRateInstrCost(CostKind);
    break;
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }

  return LT.first * NElts * InstRate;
}

InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // A branch instruction takes about 4 slots on gfx900.
    auto BI = dyn_cast_or_null<BranchInst>(I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Assume a conditional branch takes 3 additional exec manipulation
    // instructions on average.
    return CBrCost;
  }
  case Instruction::Switch: {
    auto SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including the default) takes 1 cmp + 1 cbr instruction on
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getFullRateInstrCost();
}

InstructionCost
GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}

InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
                                       Op1);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT).second;

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}

bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
    const IntrinsicInst *ReadReg) const {
  Metadata *MD =
      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
  StringRef RegName =
      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();

  // Special case registers that look like VCC.
  MVT VT = MVT::getVT(ReadReg->getType());
  if (VT == MVT::i1)
    return true;

  // Special case scalar registers that start with 'v'.
  if (RegName.starts_with("vcc") || RegName.empty())
    return false;

  // VGPR or AGPR is divergent. There aren't any specially named vector
  // registers.
  return RegName[0] == 'v' || RegName[0] == 'a';
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !AMDGPU::isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
      return isReadRegisterSourceOfDivergence(Intrinsic);

    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
  }

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions, the value of workitem-id-x
  // divided by the wavefront size can be non-uniform. For example, dimensions
  // (65, 2) will have workitems with addresses (64, 0) and (0, 1) packed into
  // the same wave, which gives 1 and 0 after the division by 64, respectively.
  //
  // FIXME: limit it to 1D kernels only, although it should be possible to
  // perform this optimization if the size of the X dimension is a power of 2;
  // we just do not currently have the infrastructure to query it.
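  // The patterns matched below correspond to IR along these lines
  // (illustrative sketch):
  //
  //   %id   = call i32 @llvm.amdgcn.workitem.id.x()
  //   %wave = lshr i32 %id, 6
  //
  // which is uniform when the shift amount is at least log2(wavefront size)
  // and the Y/Z dimensions are known to be 1.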
  using namespace llvm::PatternMatch;
  uint64_t C;
  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C))) ||
      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    return C >= ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  Value *Mask;
  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_Value(Mask)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    const DataLayout &DL = F->getParent()->getDataLayout();
    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}

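// Lists the operand indexes of \p IID whose flat pointer argument the
// infer-address-spaces pass may rewrite to a more specific address space.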
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

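// Rewrites intrinsic call \p II to operate on \p NewV, a pointer with a known
// (non-flat) address space, instead of \p OldV. Returns the replacement value,
// or nullptr if the intrinsic cannot be rewritten.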
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return nullptr;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(1);
    Type *MaskTy = MaskOp->getType();

    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the new
      // address space.
      if (DL.getPointerSizeInBits(OldAS) != 64 ||
          DL.getPointerSizeInBits(NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
    }

    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    unsigned NewAS = SrcTy->getPointerAddressSpace();
    if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
      return nullptr;
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
                                                  {DestTy, SrcTy, DestTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    return nullptr;
  }
}

InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *VT, ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);

  if (ST->hasVOP3PInsts()) {
    if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access either the low or
      // the high half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // A single-BB callee does not increase the total BB count.
    if (Callee->size() == 1)
      return true;
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}

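// Estimates an inlining threshold bonus for a call site whose arguments would
// not all fit in the SGPRs/VGPRs available for argument passing and would
// therefore have to go through the stack if the call were not inlined.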
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
                                                   const SITargetLowering *TLI,
                                                   const GCNTTIImpl *TTIImpl) {
  const int NrOfSGPRUntilSpill = 26;
  const int NrOfVGPRUntilSpill = 32;

  const DataLayout &DL = TTIImpl->getDataLayout();

  unsigned adjustThreshold = 0;
  int SGPRsInUse = 0;
  int VGPRsInUse = 0;
  for (const Use &A : CB->args()) {
    SmallVector<EVT, 4> ValueVTs;
    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
    for (auto ArgVT : ValueVTs) {
      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
          CB->getContext(), CB->getCallingConv(), ArgVT);
      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
        SGPRsInUse += CCRegNum;
      else
        VGPRsInUse += CCRegNum;
    }
  }

  // The cost of passing function arguments through the stack:
  //  1 instruction to put a function argument on the stack in the caller.
  //  1 instruction to take a function argument from the stack in the callee.
  //  1 instruction to explicitly take care of data dependencies in the callee
  //  function.
  InstructionCost ArgStackCost(1);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);

  // The penalty cost is computed relative to the cost of instructions and does
  // not model any storage costs.
  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
  return adjustThreshold;
}

static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
                                           const DataLayout &DL) {
  // If we have a pointer to a private array passed into a function
  // it will not be optimized out, leaving scratch usage.
  // This function calculates the total size in bytes of the memory that would
  // end up in scratch if the call were not inlined.
  unsigned AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CB->args()) {
    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
    if (!Ty)
      continue;

    unsigned AddrSpace = Ty->getAddressSpace();
    if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
        AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
      continue;

    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
      continue;

    AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
  }
  return AllocaSize;
}

unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);

  // Private objects passed as arguments may end up in scratch usage if the
  // call is not inlined. Increase the inline threshold to promote inlining.
  unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize > 0)
    Threshold += ArgAllocaCost;
  return Threshold;
}

unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                         const AllocaInst *AI) const {

  // Below the cutoff, assume that the private memory objects would be
  // optimized out.
  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize <= ArgAllocaCutoff)
    return 0;

  // Above the cutoff, we give a cost to each private memory object depending
  // on its size. If the array can be optimized by SROA this cost is not
  // added to the total cost in the inliner cost analysis.
  //
  // We choose the cost of each alloca such that their sum cancels the bonus
  // given in the threshold (ArgAllocaCost).
  //
  //   Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
  //
  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
  // the single-bb bonus and the vector-bonus.
  //
  // We compensate for the first two multipliers by repeating the logic from
  // the inliner cost in here. The vector bonus is 0 on AMDGPU.
  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();

  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
    return BB.getTerminator()->getNumSuccessors() > 1;
  });
  if (SingleBB) {
    Threshold += Threshold / 2;
  }

  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());

  // Attribute the bonus proportionally to the alloca size.
  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;

  return AllocaThresholdBonus;
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}

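// \returns the cost of a generic 64-bit ALU operation, depending on whether
// the subtarget has full-, half-, or quarter-rate 64-bit instructions.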
int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
  return ST->hasFullRate64Ops()
             ? getFullRateInstrCost()
             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                      : getQuarterRateInstrCost(CostKind);
}

std::pair<InstructionCost, MVT>
GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
  auto Size = DL.getTypeSizeInBits(Ty);
  // The maximum load or store can handle 8 dwords for the scalar ALU and 4
  // for the vector ALU. Let's assume anything above 8 dwords is expensive
  // even if legal.
  if (Size <= 256)
    return Cost;

  Cost.first += (Size + 255) / 256;
  return Cost;
}

unsigned GCNTTIImpl::getPrefetchDistance() const {
  return ST->hasPrefetch() ? 128 : 0;
}

bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}
