AMDGPUTargetTransformInfo.h revision 360784
//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file describes a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target-independent and default TTI implementations handle the
/// rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

namespace llvm {

class AMDGPUTargetLowering;
class Loop;
class ScalarEvolution;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const GCNSubtarget *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        TargetTriple(TM->getTargetTriple()),
        ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()) {}

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);
};
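
// AMDGPUTTIImpl holds the pieces of the cost model that are common to both
// generations of the target. The GCN and R600 implementations below each
// embed one as their CommonTTI member and are expected to forward the shared
// queries (such as the unrolling preferences) to it.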

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const AMDGPUTargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphicsShader;
  bool HasFP32Denormals;

  const FeatureBitset InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt,
    AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
    AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca,
    AMDGPU::FeatureUnalignedBufferAccess,
    AMDGPU::FeatureUnalignedScratchAccess,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Properties of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug,
    AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,
    AMDGPU::FeatureCodeObjectV3,

    // The default assumption needs to be that ECC is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32,
    AMDGPU::HalfRate64Ops
  };
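  // The features above are ignored when deciding whether a callee compiled
  // for one feature set may be inlined into a caller compiled for another
  // (see areInlineCompatible() below). A minimal sketch of how such a check
  // can use the list, assuming CallerST/CalleeST are the two subtargets; not
  // necessarily the exact implementation:
  //
  //   FeatureBitset CallerBits =
  //       CallerST->getFeatureBits() & ~InlineFeatureIgnoreList;
  //   FeatureBitset CalleeBits =
  //       CalleeST->getFeatureBits() & ~InlineFeatureIgnoreList;
  //   return (CallerBits & CalleeBits) == CalleeBits;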

  const GCNSubtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost() {
    return 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but it takes 4x as many cycles. Maybe
  // this should be 2 or 4.
  static inline int getQuarterRateInstrCost() {
    return 3 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and on others
  // quarter rate. This also applies to some integer operations.
  inline int get64BitInstrCost() const {
    return ST->hasHalfRate64Ops() ?
      getHalfRateInstrCost() : getQuarterRateInstrCost();
  }
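  // Example: with the rates above, a subtarget that has HalfRate64Ops costs a
  // normal fp64 operation at 2 * TCC_Basic, while other subtargets cost it at
  // 3 * TCC_Basic.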

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()),
      CommonTTI(TM, F),
      IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
      HasFP32Denormals(ST->hasFP32Denormals(F)) { }

  bool hasBranchDivergence() { return true; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getHardwareNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(bool Vector) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                  unsigned Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                   unsigned Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                    unsigned Alignment,
                                    unsigned AddrSpace) const;

  unsigned getMaxInterleaveFactor(unsigned VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  int getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr);

  unsigned getCFInstrCost(unsigned Opcode);

  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

  unsigned getFlatAddressSpace() const {
    // Don't bother running the InferAddressSpaces pass on graphics shaders,
    // which don't use flat addressing: returning -1 reports that there is no
    // flat address space.
    if (IsGraphicsShader)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;
  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                        Value *OldV, Value *NewV) const;

  unsigned getVectorSplitCost() { return 0; }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                          Type *SubTp);

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  // Boost the base inlining threshold aggressively; function calls are
  // comparatively expensive on AMDGPU, so inlining is strongly preferred.
  unsigned getInliningThresholdMultiplier() { return 11; }

  int getInlinerVectorBonusPercent() { return 0; }

  int getArithmeticReductionCost(unsigned Opcode,
                                 Type *Ty,
                                 bool IsPairwise);
  template <typename T>
  int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<T *> Args, FastMathFlags FMF,
                            unsigned VF);
  int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<Type *> Tys, FastMathFlags FMF,
                            unsigned ScalarizationCostPassed = UINT_MAX);
  int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<Value *> Args, FastMathFlags FMF,
                            unsigned VF = 1);
  int getMinMaxReductionCost(Type *Ty, Type *CondTy,
                             bool IsPairwiseForm,
                             bool IsUnsigned);
  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
};
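
// A sketch of how these implementations typically reach the optimizer: the
// target machine's getTargetTransformInfo() hook (in AMDGPUTargetMachine.cpp)
// wraps one in a TargetTransformInfo per function, along the lines of:
//
//   TargetTransformInfo
//   GCNTargetMachine::getTargetTransformInfo(const Function &F) {
//     return TargetTransformInfo(GCNTTIImpl(this, F));
//   }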

class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
  using BaseT = BasicTTIImplBase<R600TTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const R600Subtarget *ST;
  const AMDGPUTargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;

public:
  explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()),
      CommonTTI(TM, F) {}

  const R600Subtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);
  unsigned getHardwareNumberOfRegisters(bool Vec) const;
  unsigned getNumberOfRegisters(bool Vec) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                   unsigned Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                    unsigned Alignment,
                                    unsigned AddrSpace) const;
  unsigned getMaxInterleaveFactor(unsigned VF);
  unsigned getCFInstrCost(unsigned Opcode);
  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H