1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
17#include "llvm/IR/CallingConv.h"
18#include "llvm/Support/Alignment.h"
19#include "llvm/TargetParser/Triple.h"
20
21namespace llvm {
22
23enum AMDGPUDwarfFlavour : unsigned;
24class Function;
25class Instruction;
26class MachineFunction;
27class TargetMachine;
28
29class AMDGPUSubtarget {
30public:
31  enum Generation {
32    INVALID = 0,
33    R600 = 1,
34    R700 = 2,
35    EVERGREEN = 3,
36    NORTHERN_ISLANDS = 4,
37    SOUTHERN_ISLANDS = 5,
38    SEA_ISLANDS = 6,
39    VOLCANIC_ISLANDS = 7,
40    GFX9 = 8,
41    GFX10 = 9,
42    GFX11 = 10,
43    GFX12 = 11,
44  };
45
46private:
47  Triple TargetTriple;
48
49protected:
50  bool GCN3Encoding = false;
51  bool Has16BitInsts = false;
52  bool HasTrue16BitInsts = false;
53  bool EnableRealTrue16Insts = false;
54  bool HasMadMixInsts = false;
55  bool HasMadMacF32Insts = false;
56  bool HasDsSrc2Insts = false;
57  bool HasSDWA = false;
58  bool HasVOP3PInsts = false;
59  bool HasMulI24 = true;
60  bool HasMulU24 = true;
61  bool HasSMulHi = false;
62  bool HasInv2PiInlineImm = false;
63  bool HasFminFmaxLegacy = true;
64  bool EnablePromoteAlloca = false;
65  bool HasTrigReducedRange = false;
66  bool FastFMAF32 = false;
67  unsigned EUsPerCU = 4;
68  unsigned MaxWavesPerEU = 10;
69  unsigned LocalMemorySize = 0;
70  unsigned AddressableLocalMemorySize = 0;
71  char WavefrontSizeLog2 = 0;
72
73public:
74  AMDGPUSubtarget(const Triple &TT);
75
76  static const AMDGPUSubtarget &get(const MachineFunction &MF);
77  static const AMDGPUSubtarget &get(const TargetMachine &TM,
78                                    const Function &F);
79
80  /// \returns Default range flat work group size for a calling convention.
81  std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
82
83  /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
84  /// for function \p F, or minimum/maximum flat work group sizes explicitly
85  /// requested using "amdgpu-flat-work-group-size" attribute attached to
86  /// function \p F.
87  ///
88  /// \returns Subtarget's default values if explicitly requested values cannot
89  /// be converted to integer, or violate subtarget's specifications.
90  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
91
92  /// \returns Subtarget's default pair of minimum/maximum number of waves per
93  /// execution unit for function \p F, or minimum/maximum number of waves per
94  /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
95  /// attached to function \p F.
96  ///
97  /// \returns Subtarget's default values if explicitly requested values cannot
98  /// be converted to integer, violate subtarget's specifications, or are not
99  /// compatible with minimum/maximum number of waves limited by flat work group
100  /// size, register usage, and/or lds usage.
101  std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
102    // Default/requested minimum/maximum flat work group sizes.
103    std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
104    return getWavesPerEU(F, FlatWorkGroupSizes);
105  }
106
107  /// Overload which uses the specified values for the flat work group sizes,
108  /// rather than querying the function itself. \p FlatWorkGroupSizes Should
109  /// correspond to the function's value for getFlatWorkGroupSizes.
110  std::pair<unsigned, unsigned>
111  getWavesPerEU(const Function &F,
112                std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
113  std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
114      std::pair<unsigned, unsigned> WavesPerEU,
115      std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
116
117  /// Return the amount of LDS that can be used that will not restrict the
118  /// occupancy lower than WaveCount.
119  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
120                                           const Function &) const;
121
122  /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
123  /// the given LDS memory size is the only constraint.
124  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
125
126  unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
127
128  bool isAmdHsaOS() const {
129    return TargetTriple.getOS() == Triple::AMDHSA;
130  }
131
132  bool isAmdPalOS() const {
133    return TargetTriple.getOS() == Triple::AMDPAL;
134  }
135
136  bool isMesa3DOS() const {
137    return TargetTriple.getOS() == Triple::Mesa3D;
138  }
139
140  bool isMesaKernel(const Function &F) const;
141
142  bool isAmdHsaOrMesa(const Function &F) const {
143    return isAmdHsaOS() || isMesaKernel(F);
144  }
145
146  bool isGCN() const {
147    return TargetTriple.getArch() == Triple::amdgcn;
148  }
149
150  bool isGCN3Encoding() const {
151    return GCN3Encoding;
152  }
153
154  bool has16BitInsts() const {
155    return Has16BitInsts;
156  }
157
158  /// Return true if the subtarget supports True16 instructions.
159  bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
160
161  /// Return true if real (non-fake) variants of True16 instructions using
162  /// 16-bit registers should be code-generated. Fake True16 instructions are
163  /// identical to non-fake ones except that they take 32-bit registers as
164  /// operands and always use their low halves.
165  // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
166  // supported and the support for fake True16 instructions is removed.
167  bool useRealTrue16Insts() const;
168
169  bool hasMadMixInsts() const {
170    return HasMadMixInsts;
171  }
172
173  bool hasMadMacF32Insts() const {
174    return HasMadMacF32Insts || !isGCN();
175  }
176
177  bool hasDsSrc2Insts() const {
178    return HasDsSrc2Insts;
179  }
180
181  bool hasSDWA() const {
182    return HasSDWA;
183  }
184
185  bool hasVOP3PInsts() const {
186    return HasVOP3PInsts;
187  }
188
189  bool hasMulI24() const {
190    return HasMulI24;
191  }
192
193  bool hasMulU24() const {
194    return HasMulU24;
195  }
196
197  bool hasSMulHi() const {
198    return HasSMulHi;
199  }
200
201  bool hasInv2PiInlineImm() const {
202    return HasInv2PiInlineImm;
203  }
204
205  bool hasFminFmaxLegacy() const {
206    return HasFminFmaxLegacy;
207  }
208
209  bool hasTrigReducedRange() const {
210    return HasTrigReducedRange;
211  }
212
213  bool hasFastFMAF32() const {
214    return FastFMAF32;
215  }
216
217  bool isPromoteAllocaEnabled() const {
218    return EnablePromoteAlloca;
219  }
220
221  unsigned getWavefrontSize() const {
222    return 1 << WavefrontSizeLog2;
223  }
224
225  unsigned getWavefrontSizeLog2() const {
226    return WavefrontSizeLog2;
227  }
228
229  unsigned getLocalMemorySize() const {
230    return LocalMemorySize;
231  }
232
233  unsigned getAddressableLocalMemorySize() const {
234    return AddressableLocalMemorySize;
235  }
236
237  /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
238  /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
239  /// CU mode into account.
240  unsigned getEUsPerCU() const { return EUsPerCU; }
241
242  Align getAlignmentForImplicitArgPtr() const {
243    return isAmdHsaOS() ? Align(8) : Align(4);
244  }
245
246  /// Returns the offset in bytes from the start of the input buffer
247  ///        of the first explicit kernel argument.
248  unsigned getExplicitKernelArgOffset() const {
249    switch (TargetTriple.getOS()) {
250    case Triple::AMDHSA:
251    case Triple::AMDPAL:
252    case Triple::Mesa3D:
253      return 0;
254    case Triple::UnknownOS:
255    default:
256      // For legacy reasons unknown/other is treated as a different version of
257      // mesa.
258      return 36;
259    }
260
261    llvm_unreachable("invalid triple OS");
262  }
263
264  /// \returns Maximum number of work groups per compute unit supported by the
265  /// subtarget and limited by given \p FlatWorkGroupSize.
266  virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
267
268  /// \returns Minimum flat work group size supported by the subtarget.
269  virtual unsigned getMinFlatWorkGroupSize() const = 0;
270
271  /// \returns Maximum flat work group size supported by the subtarget.
272  virtual unsigned getMaxFlatWorkGroupSize() const = 0;
273
274  /// \returns Number of waves per execution unit required to support the given
275  /// \p FlatWorkGroupSize.
276  virtual unsigned
277  getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
278
279  /// \returns Minimum number of waves per execution unit supported by the
280  /// subtarget.
281  virtual unsigned getMinWavesPerEU() const = 0;
282
283  /// \returns Maximum number of waves per execution unit supported by the
284  /// subtarget without any kind of limitation.
285  unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
286
287  /// Return the maximum workitem ID value in the function, for the given (0, 1,
288  /// 2) dimension.
289  unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
290
291  /// Return true if only a single workitem can be active in a wave.
292  bool isSingleLaneExecution(const Function &Kernel) const;
293
294  /// Creates value range metadata on an workitemid.* intrinsic call or load.
295  bool makeLIDRangeMetadata(Instruction *I) const;
296
297  /// \returns Number of bytes of arguments that are passed to a shader or
298  /// kernel in addition to the explicit ones declared for the function.
299  unsigned getImplicitArgNumBytes(const Function &F) const;
300  uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
301  unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
302
303  /// \returns Corresponding DWARF register number mapping flavour for the
304  /// \p WavefrontSize.
305  AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
306
307  virtual ~AMDGPUSubtarget() = default;
308};
309
310} // end namespace llvm
311
312#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
313