//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning
  // somewhere if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning
  // somewhere if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

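// The constant bus limits how many scalar (SGPR, literal, or inline constant)
// operands a single VALU instruction may read. Before GFX10 the limit is one;
// GFX10 raises it to two, except for the 64-bit shift opcodes listed below,
// which remain restricted to one.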
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

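// Return the maximum number of bytes of LDS a workgroup may use while still
// allowing NWaves waves to be resident, based on the total LDS size, the
// maximum waves per EU, and how many workgroups of F's size fit on a CU.
// Illustrative example (hypothetical numbers, not a specific target): with
// 64 KiB of LDS, 10 waves per EU, 5 workgroups per CU, and NWaves = 2, the
// budget is 65536 * 10 / 5 / 2 = 65536 bytes.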
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

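// Graphics shader calling conventions default to a single wavefront per
// workgroup; all other calling conventions default to the subtarget's
// maximum flat workgroup size.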
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

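// The "amdgpu-flat-work-group-size" attribute encodes a "min,max" pair, e.g.
// "amdgpu-flat-work-group-size"="128,256". Requested values that are
// inconsistent or outside the subtarget's limits fall back to the defaults.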
362
363std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
364  const Function &F) const {
365  // Default minimum/maximum flat work group sizes.
366  std::pair<unsigned, unsigned> Default =
367    getDefaultFlatWorkGroupSize(F.getCallingConv());
368
369  // Requested minimum/maximum flat work group sizes.
370  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
371    F, "amdgpu-flat-work-group-size", Default);
372
373  // Make sure requested minimum is less than requested maximum.
374  if (Requested.first > Requested.second)
375    return Default;
376
377  // Make sure requested values do not violate subtarget's specifications.
378  if (Requested.first < getMinFlatWorkGroupSize())
379    return Default;
380  if (Requested.second > getMaxFlatWorkGroupSize())
381    return Default;
382
383  return Requested;
384}
385
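// The "amdgpu-waves-per-eu" attribute encodes "min" or "min,max", e.g.
// "amdgpu-waves-per-eu"="2,4". Values that conflict with the subtarget's
// limits or with an explicitly requested flat workgroup size fall back to
// the defaults.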
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

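// Attach !range metadata to a local-ID or local-size query so later passes
// can reason about its bounds. For example, with
// !reqd_work_group_size = {256, 1, 1}, a workitem-id.x query is given the
// range [0, 256) and a local-size.x query the range [256, 257).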
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

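// Sum the ABI-aligned sizes of the kernel's explicit arguments and record the
// largest alignment seen. As a hedged illustration, assuming the usual 4-byte
// alignment for i32 and 8-byte alignment for double, a kernel taking
// (i32, double) occupies bytes [0,4) and [8,16), giving a size of 16 and a
// maximum alignment of 8.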
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align::None();

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

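// Use VGPR indexing mode when movrel is unavailable, or when the user
// explicitly requested it via -amdgpu-vgpr-index-mode and the subtarget
// supports it.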
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

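// Round the VGPR count up to the allocation granule, then see how many times
// it fits into the VGPR file. As a hedged illustration (numbers are
// hypothetical, not a specific target): with a granule of 4 and 256 total
// VGPRs, 50 VGPRs round up to 52 and yield min(256 / 52, MaxWaves) = 4 waves.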
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

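// Compute the expected occupancy (waves per EU) for a function given its LDS
// usage and, when non-zero, its SGPR and VGPR counts; the result is the
// minimum of the individual limits.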
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

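// When either end of a data dependence is a bundle, adjust the edge latency
// based on where inside the bundle the register is actually defined or read,
// rather than using the latency recorded for the bundle header.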
void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
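// DAG mutation that adds artificial edges between consecutive memory
// operations of the same kind (VMEM, FLAT, SMRD, or DS) so the scheduler
// keeps them together.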
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

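// DAG mutation that fills the latency shadow of long-latency MFMA
// instructions with independent SALU work to smooth out power consumption
// (see the comment in apply() below).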
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

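  // Return true if an artificial edge from Pred to Succ can be added without
  // creating a cycle, i.e. Pred is not transitively reachable from Succ.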
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the number
  // of linked instructions. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies
    // on available SALU instructions so they can fill the MFMA shadow.
    // Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}