1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/IR/BasicBlock.h"
19#include "llvm/IR/CallSite.h"
20#include "llvm/IR/DataLayout.h"
21#include "llvm/IR/DerivedTypes.h"
22#include "llvm/IR/Instruction.h"
23#include "llvm/IR/Instructions.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/PatternMatch.h"
26#include "llvm/IR/Type.h"
27#include "llvm/MC/SubtargetFeature.h"
28#include "llvm/Support/Casting.h"
29#include "llvm/Support/MachineValueType.h"
30#include "llvm/Target/TargetMachine.h"
31#include <algorithm>
32#include <cassert>
33#include <cstdint>
34#include <utility>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "armtti"
39
40static cl::opt<bool> EnableMaskedLoadStores(
41  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
42  cl::desc("Enable the generation of masked loads and stores"));
43
44static cl::opt<bool> DisableLowOverheadLoops(
45  "disable-arm-loloops", cl::Hidden, cl::init(false),
46  cl::desc("Disable the generation of low-overhead loops"));
47
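// These flags are defined elsewhere in the ARM backend and shared here via
// extern declarations.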
48extern cl::opt<bool> DisableTailPredication;
49
50extern cl::opt<bool> EnableMaskedGatherScatters;
51
52bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
53                                     const Function *Callee) const {
54  const TargetMachine &TM = getTLI()->getTargetMachine();
55  const FeatureBitset &CallerBits =
56      TM.getSubtargetImpl(*Caller)->getFeatureBits();
57  const FeatureBitset &CalleeBits =
58      TM.getSubtargetImpl(*Callee)->getFeatureBits();
59
60  // To inline a callee, all features not in the whitelist must match exactly.
61  bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
62                    (CalleeBits & ~InlineFeatureWhitelist);
  // For features in the whitelist, the callee's features must be a subset of
  // the caller's.
65  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
66                     (CalleeBits & InlineFeatureWhitelist);
67  return MatchExact && MatchSubset;
68}
69
70int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
71  assert(Ty->isIntegerTy());
72
  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;
76
77  int64_t SImmVal = Imm.getSExtValue();
78  uint64_t ZImmVal = Imm.getZExtValue();
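  // Check both the sign- and zero-extended values: the inverted form is also
  // tested below, since it can be materialized with an MVN-style encoding.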
79  if (!ST->isThumb()) {
80    if ((SImmVal >= 0 && SImmVal < 65536) ||
81        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
82        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
83      return 1;
84    return ST->hasV6T2Ops() ? 2 : 3;
85  }
86  if (ST->isThumb2()) {
87    if ((SImmVal >= 0 && SImmVal < 65536) ||
88        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
89        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
90      return 1;
91    return ST->hasV6T2Ops() ? 2 : 3;
92  }
  // Thumb1: any i8 immediate costs 1.
94  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
95    return 1;
96  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
97    return 2;
98  // Load from constantpool.
99  return 3;
100}
101
// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
104int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
105                                      const APInt &Imm, Type *Ty) {
106  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
107    return 0;
108
109  return 1;
110}
111
int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                  const APInt &Imm, Type *Ty) {
114  // Division by a constant can be turned into multiplication, but only if we
115  // know it's constant. So it's not so much that the immediate is cheap (it's
116  // not), but that the alternative is worse.
117  // FIXME: this is probably unneeded with GlobalISel.
118  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
119       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
120      Idx == 1)
121    return 0;
122
123  if (Opcode == Instruction::And) {
124    // UXTB/UXTH
125    if (Imm == 255 || Imm == 65535)
126      return 0;
127    // Conversion to BIC is free, and means we can use ~Imm instead.
128    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
129  }
130
131  if (Opcode == Instruction::Add)
132    // Conversion to SUB is free, and means we can use -Imm instead.
133    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));
134
135  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
136      Ty->getIntegerBitWidth() == 32) {
137    int64_t NegImm = -Imm.getSExtValue();
138    if (ST->isThumb2() && NegImm < 1<<12)
139      // icmp X, #-C -> cmn X, #C
140      return 0;
141    if (ST->isThumb() && NegImm < 1<<8)
142      // icmp X, #-C -> adds X, #C
143      return 0;
144  }
145
146  // xor a, -1 can always be folded to MVN
147  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
148    return 0;
149
150  return getIntImmCost(Imm, Ty);
151}
152
153int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
154                                 const Instruction *I) {
155  int ISD = TLI->InstructionOpcodeToISD(Opcode);
156  assert(ISD && "Invalid opcode");
157
158  // Single to/from double precision conversions.
159  static const CostTblEntry NEONFltDblTbl[] = {
160    // Vector fptrunc/fpext conversions.
161    { ISD::FP_ROUND,   MVT::v2f64, 2 },
162    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
163    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
164  };
165
  if (Src->isVectorTy() && ST->hasNEON() &&
      (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND)) {
168    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
169    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
170      return LT.first * Entry->Cost;
171  }
172
173  EVT SrcTy = TLI->getValueType(DL, Src);
174  EVT DstTy = TLI->getValueType(DL, Dst);
175
176  if (!SrcTy.isSimple() || !DstTy.isSimple())
177    return BaseT::getCastInstrCost(Opcode, Dst, Src);
178
179  // The extend of a load is free
180  if (I && isa<LoadInst>(I->getOperand(0))) {
181    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
182        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
183        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
184        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
185        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
186        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
187        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
188        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
189        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
190        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
191        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
192        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
193        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
194    };
195    if (const auto *Entry = ConvertCostTableLookup(
196            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
197      return Entry->Cost;
198
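    // MVE has extending vector loads (e.g. VLDRB/VLDRH with widening), so
    // these extends of a load are treated as free as well.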
199    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
200        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
201        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
202        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
203        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
204        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
205        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
206    };
207    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
208      if (const auto *Entry =
209              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
210                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
211        return Entry->Cost;
212    }
213  }
214
215  // Some arithmetic, load and store operations have specific instructions
216  // to cast up/down their types automatically at no extra cost.
217  // TODO: Get these tables to know at least what the related operations are.
218  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
219    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
220    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
221    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
222    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
223    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
224    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
225
226    // The number of vmovl instructions for the extension.
227    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
228    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
229    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
230    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
231    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
232    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
233    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
234    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
235    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
236    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
237
238    // Operations that we legalize using splitting.
239    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
240    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
241
242    // Vector float <-> i32 conversions.
243    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
244    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
245
246    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
247    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
248    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
249    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
250    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
251    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
252    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
253    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
254    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
255    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
256    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
257    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
258    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
259    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
260    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
261    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
262    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
263    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
264    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
265    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
266
267    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
268    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
269    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
270    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
271    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
272    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
273
274    // Vector double <-> i32 conversions.
275    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
276    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
277
278    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
279    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
280    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
281    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
282    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
283    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
284
285    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
286    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
287    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
288    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
289    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
290    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
291  };
292
293  if (SrcTy.isVector() && ST->hasNEON()) {
294    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
295                                                   DstTy.getSimpleVT(),
296                                                   SrcTy.getSimpleVT()))
297      return Entry->Cost;
298  }
299
300  // Scalar float to integer conversions.
301  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
302    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
303    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
304    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
305    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
306    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
307    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
308    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
309    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
310    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
311    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
312    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
313    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
314    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
315    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
316    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
317    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
318    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
319    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
320    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
321    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
322  };
323  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
324    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
325                                                   DstTy.getSimpleVT(),
326                                                   SrcTy.getSimpleVT()))
327      return Entry->Cost;
328  }
329
330  // Scalar integer to float conversions.
331  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
332    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
333    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
334    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
335    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
336    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
337    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
338    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
339    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
340    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
341    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
342    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
343    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
344    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
345    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
346    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
347    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
348    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
349    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
350    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
351    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
352  };
353
354  if (SrcTy.isInteger() && ST->hasNEON()) {
355    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
356                                                   ISD, DstTy.getSimpleVT(),
357                                                   SrcTy.getSimpleVT()))
358      return Entry->Cost;
359  }
360
  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised and so cost more.
364  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
365    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
366    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
367    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
368    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
369    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
370    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
371    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
372    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
373    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
374    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
375    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
376    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
377  };
378
379  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
380    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
381                                                   ISD, DstTy.getSimpleVT(),
382                                                   SrcTy.getSimpleVT()))
383      return Entry->Cost * ST->getMVEVectorCostFactor();
384  }
385
386  // Scalar integer conversion costs.
387  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
388    // i16 -> i64 requires two dependent operations.
389    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
390
391    // Truncates on i64 are assumed to be free.
392    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
393    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
394    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
395    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
396  };
397
398  if (SrcTy.isInteger()) {
399    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
400                                                   DstTy.getSimpleVT(),
401                                                   SrcTy.getSimpleVT()))
402      return Entry->Cost;
403  }
404
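  // Scale the base cost by the subtarget's MVE vector cost factor for vector
  // casts, reflecting that MVE vector operations are modelled as more
  // expensive than scalar ones.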
405  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
406                     ? ST->getMVEVectorCostFactor()
407                     : 1;
408  return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
409}
410
411int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
412                                   unsigned Index) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
415  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
416      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
417    return 3;
418
419  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
420                        Opcode == Instruction::ExtractElement)) {
421    // Cross-class copies are expensive on many microarchitectures,
422    // so assume they are expensive by default.
423    if (ValTy->getVectorElementType()->isIntegerTy())
424      return 3;
425
426    // Even if it's not a cross class copy, this likely leads to mixing
427    // of NEON and VFP code and should be therefore penalized.
428    if (ValTy->isVectorTy() &&
429        ValTy->getScalarSizeInBits() <= 32)
430      return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
431  }
432
433  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
434                                 Opcode == Instruction::ExtractElement)) {
    // We say MVE moves cost at least the MVEVectorCostFactor, even though
    // they are scalar instructions. This helps prevent mixing scalar and
    // vector code, so that we do not vectorise where we would end up just
    // scalarising the result anyway.
439    return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
440                    ST->getMVEVectorCostFactor()) *
441           ValTy->getVectorNumElements() / 2;
442  }
443
444  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
445}
446
447int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
448                                   const Instruction *I) {
449  int ISD = TLI->InstructionOpcodeToISD(Opcode);
450  // On NEON a vector select gets lowered to vbsl.
451  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
452    // Lowering of some vector selects is currently far from perfect.
453    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
454      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
455      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
456      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
457    };
458
459    EVT SelCondTy = TLI->getValueType(DL, CondTy);
460    EVT SelValTy = TLI->getValueType(DL, ValTy);
461    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
462      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
463                                                     SelCondTy.getSimpleVT(),
464                                                     SelValTy.getSimpleVT()))
465        return Entry->Cost;
466    }
467
468    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
469    return LT.first;
470  }
471
472  int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
473                     ? ST->getMVEVectorCostFactor()
474                     : 1;
475  return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
476}
477
478int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
479                                          const SCEV *Ptr) {
480  // Address computations in vectorized code with non-consecutive addresses will
481  // likely result in more instructions compared to scalar code where the
482  // computation can more often be merged into the index mode. The resulting
483  // extra micro-ops can significantly decrease throughput.
484  unsigned NumVectorInstToHideOverhead = 10;
485  int MaxMergeDistance = 64;
486
487  if (ST->hasNEON()) {
488    if (Ty->isVectorTy() && SE &&
489        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
490      return NumVectorInstToHideOverhead;
491
492    // In many cases the address computation is not merged into the instruction
493    // addressing mode.
494    return 1;
495  }
496  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
497}
498
499bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
500  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
501    return false;
502
503  if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
504    // Don't support v2i1 yet.
505    if (VecTy->getNumElements() == 2)
506      return false;
507
508    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
510    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
511      return false;
512  }
513
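  // 32-bit elements need 4-byte alignment and 16-bit elements 2-byte
  // alignment, while 8-bit elements are always legal. An unknown alignment is
  // also accepted.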
514  unsigned EltWidth = DataTy->getScalarSizeInBits();
515  return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
516         (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
517         (EltWidth == 8);
518}
519
520bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
521  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
522    return false;
523
524  // This method is called in 2 places:
525  //  - from the vectorizer with a scalar type, in which case we need to get
526  //  this as good as we can with the limited info we have (and rely on the cost
527  //  model for the rest).
528  //  - from the masked intrinsic lowering pass with the actual vector type.
529  // For MVE, we have a custom lowering pass that will already have custom
530  // legalised any gathers that we can to MVE intrinsics, and want to expand all
531  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
532  // are here, we know we want to expand.
533  if (isa<VectorType>(Ty))
534    return false;
535
536  unsigned EltWidth = Ty->getScalarSizeInBits();
537  return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
538          (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
539}
540
541int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
542  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
543  assert(MI && "MemcpyInst expected");
544  ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
545
546  // To model the cost of a library call, we assume 1 for the call, and
547  // 3 for the argument setup.
548  const unsigned LibCallCost = 4;
549
550  // If 'size' is not a constant, a library call will be generated.
551  if (!C)
552    return LibCallCost;
553
554  const unsigned Size = C->getValue().getZExtValue();
555  const unsigned DstAlign = MI->getDestAlignment();
556  const unsigned SrcAlign = MI->getSourceAlignment();
557  const Function *F = I->getParent()->getParent();
558  const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
559  std::vector<EVT> MemOps;
560
  // MemOps will be populated with a list of data types that need to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
  // get the cost for this memcpy.
564  if (getTLI()->findOptimalMemOpLowering(
565          MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/,
566          false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/,
567          MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
568          F->getAttributes()))
569    return MemOps.size() * 2;
570
571  // If we can't find an optimal memop lowering, return the default cost
572  return LibCallCost;
573}
574
575int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
576                               Type *SubTp) {
577  if (ST->hasNEON()) {
578    if (Kind == TTI::SK_Broadcast) {
579      static const CostTblEntry NEONDupTbl[] = {
580          // VDUP handles these cases.
581          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
582          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
583          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
584          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
585          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
586          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
587
588          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
589          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
590          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
591          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
592
593      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
594
595      if (const auto *Entry =
596              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
597        return LT.first * Entry->Cost;
598    }
599    if (Kind == TTI::SK_Reverse) {
600      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffles cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
603          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
604          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
605          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
606          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
607          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
608          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
609
610          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
611          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
612          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
613          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
614
615      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
616
617      if (const auto *Entry =
618              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
619        return LT.first * Entry->Cost;
620    }
621    if (Kind == TTI::SK_Select) {
622      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.
626
627          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
628          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
629          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
630          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
631
632          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
633          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
634          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
635
636          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
637
638          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
639
640      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
641      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
642                                              ISD::VECTOR_SHUFFLE, LT.second))
643        return LT.first * Entry->Cost;
644    }
645  }
646  if (ST->hasMVEIntegerOps()) {
647    if (Kind == TTI::SK_Broadcast) {
648      static const CostTblEntry MVEDupTbl[] = {
649          // VDUP handles these cases.
650          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
651          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
652          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
653          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
654          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
655
656      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
657
658      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
659                                              LT.second))
660        return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
661    }
662  }
663  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
664                     ? ST->getMVEVectorCostFactor()
665                     : 1;
666  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
667}
668
669int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
670                                       TTI::OperandValueKind Op1Info,
671                                       TTI::OperandValueKind Op2Info,
672                                       TTI::OperandValueProperties Opd1PropInfo,
673                                       TTI::OperandValueProperties Opd2PropInfo,
674                                       ArrayRef<const Value *> Args,
675                                       const Instruction *CxtI) {
676  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
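  // LT.first is the number of legal-typed operations this type splits into,
  // and LT.second is the legalized type itself.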
677  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
678
679  if (ST->hasNEON()) {
680    const unsigned FunctionCallDivCost = 20;
681    const unsigned ReciprocalDivCost = 10;
682    static const CostTblEntry CostTbl[] = {
683      // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very
      // expensive.
      // Double register types.
687      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
688      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
689      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
690      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
691      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
692      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
693      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
694      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
695      { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
696      { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
697      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
698      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
699      { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
700      { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
701      { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
702      { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
703      // Quad register types.
704      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
705      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
706      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
707      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
708      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
709      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
710      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
711      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
712      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
713      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
714      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
715      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
716      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
717      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
718      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
719      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
720      // Multiplication.
721    };
722
723    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
724      return LT.first * Entry->Cost;
725
726    int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
727                                             Opd1PropInfo, Opd2PropInfo);
728
729    // This is somewhat of a hack. The problem that we are facing is that SROA
730    // creates a sequence of shift, and, or instructions to construct values.
731    // These sequences are recognized by the ISel and have zero-cost. Not so for
732    // the vectorized code. Because we have support for v2i64 but not i64 those
733    // sequences look particularly beneficial to vectorize.
734    // To work around this we increase the cost of v2i64 operations to make them
735    // seem less beneficial.
736    if (LT.second == MVT::v2i64 &&
737        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
738      Cost += 4;
739
740    return Cost;
741  }
742
743  // If this operation is a shift on arm/thumb2, it might well be folded into
744  // the following instruction, hence having a cost of 0.
745  auto LooksLikeAFreeShift = [&]() {
746    if (ST->isThumb1Only() || Ty->isVectorTy())
747      return false;
748
749    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
750      return false;
751    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
752      return false;
753
    // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
755    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
756    case Instruction::Add:
757    case Instruction::Sub:
758    case Instruction::And:
759    case Instruction::Xor:
760    case Instruction::Or:
761    case Instruction::ICmp:
762      return true;
763    default:
764      return false;
765    }
766  };
767  if (LooksLikeAFreeShift())
768    return 0;
769
770  int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
771                     ? ST->getMVEVectorCostFactor()
772                     : 1;
773
  // The rest of this mostly follows what is done in
  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
  // than scalars or increasing the costs for custom operations. The result is
  // also multiplied by the MVEVectorCostFactor where appropriate.
778  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
779    return LT.first * BaseCost;
780
781  // Else this is expand, assume that we need to scalarize this op.
782  if (Ty->isVectorTy()) {
783    unsigned Num = Ty->getVectorNumElements();
784    unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
785    // Return the cost of multiple scalar invocation plus the cost of
786    // inserting and extracting the values.
787    return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
788  }
789
790  return BaseCost;
791}
792
793int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
794                                MaybeAlign Alignment, unsigned AddressSpace,
795                                const Instruction *I) {
796  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
797
798  if (ST->hasNEON() && Src->isVectorTy() &&
799      (Alignment && *Alignment != Align(16)) &&
800      Src->getVectorElementType()->isDoubleTy()) {
801    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
803    return LT.first * 4;
804  }
805  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
806                     ? ST->getMVEVectorCostFactor()
807                     : 1;
808  return BaseCost * LT.first;
809}
810
811int ARMTTIImpl::getInterleavedMemoryOpCost(
812    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
813    unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
814    bool UseMaskForGaps) {
815  assert(Factor >= 2 && "Invalid interleave factor");
816  assert(isa<VectorType>(VecTy) && "Expect a vector type");
817
  // vldN/vstN don't support vector types with i64/f64 elements.
819  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
820
821  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
822      !UseMaskForCond && !UseMaskForGaps) {
823    unsigned NumElts = VecTy->getVectorNumElements();
824    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
825
826    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
827    // Accesses having vector types that are a multiple of 128 bits can be
828    // matched to more than one vldN/vstN instruction.
829    int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
830    if (NumElts % Factor == 0 &&
831        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
832      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
833
    // Some smaller-than-legal interleaved patterns are cheap, as we can make
    // use of the vmovn or vrev patterns to interleave a standard load. This is
    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16, as it is
    // promoted differently). The cost of 2 here is then a load plus a vrev or
    // vmovn.
839    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
840        VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
841      return 2 * BaseCost;
842  }
843
844  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
845                                           Alignment, AddressSpace,
846                                           UseMaskForCond, UseMaskForGaps);
847}
848
849bool ARMTTIImpl::isLoweredToCall(const Function *F) {
  if (!F->isIntrinsic())
    return BaseT::isLoweredToCall(F);
852
853  // Assume all Arm-specific intrinsics map to an instruction.
854  if (F->getName().startswith("llvm.arm"))
855    return false;
856
857  switch (F->getIntrinsicID()) {
858  default: break;
859  case Intrinsic::powi:
860  case Intrinsic::sin:
861  case Intrinsic::cos:
862  case Intrinsic::pow:
863  case Intrinsic::log:
864  case Intrinsic::log10:
865  case Intrinsic::log2:
866  case Intrinsic::exp:
867  case Intrinsic::exp2:
868    return true;
869  case Intrinsic::sqrt:
870  case Intrinsic::fabs:
871  case Intrinsic::copysign:
872  case Intrinsic::floor:
873  case Intrinsic::ceil:
874  case Intrinsic::trunc:
875  case Intrinsic::rint:
876  case Intrinsic::nearbyint:
877  case Intrinsic::round:
878  case Intrinsic::canonicalize:
879  case Intrinsic::lround:
880  case Intrinsic::llround:
881  case Intrinsic::lrint:
882  case Intrinsic::llrint:
883    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
884      return true;
885    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
886      return true;
    // Some operations can be handled by vector instructions; assume that
    // unsupported vectors will be expanded into supported scalar ones.
    // TODO: Handle scalar operations properly.
890    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
891  case Intrinsic::masked_store:
892  case Intrinsic::masked_load:
893  case Intrinsic::masked_gather:
894  case Intrinsic::masked_scatter:
895    return !ST->hasMVEIntegerOps();
896  case Intrinsic::sadd_with_overflow:
897  case Intrinsic::uadd_with_overflow:
898  case Intrinsic::ssub_with_overflow:
899  case Intrinsic::usub_with_overflow:
900  case Intrinsic::sadd_sat:
901  case Intrinsic::uadd_sat:
902  case Intrinsic::ssub_sat:
903  case Intrinsic::usub_sat:
904    return false;
905  }
906
907  return BaseT::isLoweredToCall(F);
908}
909
910bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
911                                          AssumptionCache &AC,
912                                          TargetLibraryInfo *LibInfo,
913                                          HardwareLoopInfo &HWLoopInfo) {
914  // Low-overhead branches are only supported in the 'low-overhead branch'
915  // extension of v8.1-m.
916  if (!ST->hasLOB() || DisableLowOverheadLoops)
917    return false;
918
919  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
920    return false;
921
922  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
923  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
924    return false;
925
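  // The trip count is the backedge-taken count plus one.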
926  const SCEV *TripCountSCEV =
927    SE.getAddExpr(BackedgeTakenCount,
928                  SE.getOne(BackedgeTakenCount->getType()));
929
930  // We need to store the trip count in LR, a 32-bit register.
931  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
932    return false;
933
934  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
935  // point in generating a hardware loop if that's going to happen.
936  auto MaybeCall = [this](Instruction &I) {
937    const ARMTargetLowering *TLI = getTLI();
938    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
939    EVT VT = TLI->getValueType(DL, I.getType(), true);
940    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
941      return true;
942
943    // Check if an intrinsic will be lowered to a call and assume that any
944    // other CallInst will generate a bl.
945    if (auto *Call = dyn_cast<CallInst>(&I)) {
946      if (isa<IntrinsicInst>(Call)) {
947        if (const Function *F = Call->getCalledFunction())
948          return isLoweredToCall(F);
949      }
950      return true;
951    }
952
953    // FPv5 provides conversions between integer, double-precision,
954    // single-precision, and half-precision formats.
955    switch (I.getOpcode()) {
956    default:
957      break;
958    case Instruction::FPToSI:
959    case Instruction::FPToUI:
960    case Instruction::SIToFP:
961    case Instruction::UIToFP:
962    case Instruction::FPTrunc:
963    case Instruction::FPExt:
964      return !ST->hasFPARMv8Base();
965    }
966
    // FIXME: Unfortunately the approach of checking the Operation Action does
    // not catch all cases of Legalization that use library calls. Our
    // Legalization step categorizes some transformations that turn into
    // library calls as Custom, Expand or even Legal when doing type
    // legalization. So for now we have to special-case, for instance, the
    // SDIV of 64-bit integers and the use of floating-point emulation.
973    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
974      switch (ISD) {
975      default:
976        break;
977      case ISD::SDIV:
978      case ISD::UDIV:
979      case ISD::SREM:
980      case ISD::UREM:
981      case ISD::SDIVREM:
982      case ISD::UDIVREM:
983        return true;
984      }
985    }
986
987    // Assume all other non-float operations are supported.
988    if (!VT.isFloatingPoint())
989      return false;
990
    // We'll need a library call to handle most floats when using soft-float.
992    if (TLI->useSoftFloat()) {
993      switch (I.getOpcode()) {
994      default:
995        return true;
996      case Instruction::Alloca:
997      case Instruction::Load:
998      case Instruction::Store:
999      case Instruction::Select:
1000      case Instruction::PHI:
1001        return false;
1002      }
1003    }
1004
1005    // We'll need a libcall to perform double precision operations on a single
1006    // precision only FPU.
1007    if (I.getType()->isDoubleTy() && !ST->hasFP64())
1008      return true;
1009
1010    // Likewise for half precision arithmetic.
1011    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1012      return true;
1013
1014    return false;
1015  };
1016
1017  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1018    if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1019      switch (Call->getIntrinsicID()) {
1020      default:
1021        break;
1022      case Intrinsic::set_loop_iterations:
1023      case Intrinsic::test_set_loop_iterations:
1024      case Intrinsic::loop_decrement:
1025      case Intrinsic::loop_decrement_reg:
1026        return true;
1027      }
1028    }
1029    return false;
1030  };
1031
1032  // Scan the instructions to see if there's any that we know will turn into a
1033  // call or if this loop is already a low-overhead loop.
1034  auto ScanLoop = [&](Loop *L) {
1035    for (auto *BB : L->getBlocks()) {
1036      for (auto &I : *BB) {
1037        if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
1038          return false;
1039      }
1040    }
1041    return true;
1042  };
1043
1044  // Visit inner loops.
1045  for (auto Inner : *L)
1046    if (!ScanLoop(Inner))
1047      return false;
1048
1049  if (!ScanLoop(L))
1050    return false;
1051
1052  // TODO: Check whether the trip count calculation is expensive. If L is the
1053  // inner loop but we know it has a low trip count, calculating that trip
1054  // count (in the parent loop) may be detrimental.
1055
1056  LLVMContext &C = L->getHeader()->getContext();
1057  HWLoopInfo.CounterInReg = true;
1058  HWLoopInfo.IsNestingLegal = false;
1059  HWLoopInfo.PerformEntryTest = true;
1060  HWLoopInfo.CountType = Type::getInt32Ty(C);
1061  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1062  return true;
1063}
1064
1065static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
  // We don't allow extra icmps and, because we only look at single-block
  // loops, we simply count the icmps: there should only be 1, for the backedge.
1068  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1069    return false;
1070
1071  if (isa<FCmpInst>(&I))
1072    return false;
1073
1074  // We could allow extending/narrowing FP loads/stores, but codegen is
1075  // too inefficient so reject this for now.
1076  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1077    return false;
1078
1079  // Extends have to be extending-loads
  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
1081    if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1082      return false;
1083
1084  // Truncs have to be narrowing-stores
  if (isa<TruncInst>(&I))
1086    if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1087      return false;
1088
1089  return true;
1090}
1091
1092// To set up a tail-predicated loop, we need to know the total number of
1093// elements processed by that loop. Thus, we need to determine the element
1094// size and:
1095// 1) it should be uniform for all operations in the vector loop, so we
1096//    e.g. don't want any widening/narrowing operations.
1097// 2) it should be smaller than i64s because we don't have vector operations
1098//    that work on i64s.
1099// 3) we don't want elements to be reversed or shuffled, to make sure the
1100//    tail-predication masks/predicates the right lanes.
1101//
1102static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1103                                 const DataLayout &DL,
1104                                 const LoopAccessInfo *LAI) {
1105  PredicatedScalarEvolution PSE = LAI->getPSE();
1106  int ICmpCount = 0;
1107  int Stride = 0;
1108
1109  LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
1110  SmallVector<Instruction *, 16> LoadStores;
1111  for (BasicBlock *BB : L->blocks()) {
1112    for (Instruction &I : BB->instructionsWithoutDebug()) {
1113      if (isa<PHINode>(&I))
1114        continue;
1115      if (!canTailPredicateInstruction(I, ICmpCount)) {
1116        LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
1117        return false;
1118      }
1119
      Type *T = I.getType();
1121      if (T->isPointerTy())
1122        T = T->getPointerElementType();
1123
1124      if (T->getScalarSizeInBits() > 32) {
1125        LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
1126        return false;
1127      }
1128
1129      if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
1130        Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
1131        int64_t NextStride = getPtrStride(PSE, Ptr, L);
1132        // TODO: for now only allow consecutive strides of 1. We could support
1133        // other strides as long as it is uniform, but let's keep it simple for
1134        // now.
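        // Record the first unit stride we see; any later access with a
        // different stride disables tail-predication.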
1135        if (Stride == 0 && NextStride == 1) {
1136          Stride = NextStride;
1137          continue;
1138        }
1139        if (Stride != NextStride) {
          LLVM_DEBUG(dbgs() << "Different strides found, can't "
                               "tail-predicate.\n");
1142          return false;
1143        }
1144      }
1145    }
1146  }
1147
1148  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
1149  return true;
1150}
1151
1152bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
1153                                             ScalarEvolution &SE,
1154                                             AssumptionCache &AC,
1155                                             TargetLibraryInfo *TLI,
1156                                             DominatorTree *DT,
1157                                             const LoopAccessInfo *LAI) {
1158  if (DisableTailPredication)
1159    return false;
1160
  // Creating a predicated vector loop is the first step for generating a
  // tail-predicated hardware loop, for which we need the MVE masked
  // load/store instructions:
1164  if (!ST->hasMVEIntegerOps())
1165    return false;
1166
1167  // For now, restrict this to single block loops.
1168  if (L->getNumBlocks() > 1) {
1169    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
1170                         "loop.\n");
1171    return false;
1172  }
1173
1174  assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
1175
1176  HardwareLoopInfo HWLoopInfo(L);
1177  if (!HWLoopInfo.canAnalyze(*LI)) {
1178    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
1179                         "analyzable.\n");
1180    return false;
1181  }
1182
1183  // This checks if we have the low-overhead branch architecture
1184  // extension, and if we will create a hardware-loop:
1185  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
1186    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
1187                         "profitable.\n");
1188    return false;
1189  }
1190
1191  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
1192    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
1193                         "a candidate.\n");
1194    return false;
1195  }
1196
1197  return canTailPredicateLoop(L, LI, SE, DL, LAI);
1198}
1199
1201void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1202                                         TTI::UnrollingPreferences &UP) {
1203  // Only currently enable these preferences for M-Class cores.
1204  if (!ST->isMClass())
1205    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
1206
1207  // Disable loop unrolling for Oz and Os.
1208  UP.OptSizeThreshold = 0;
1209  UP.PartialOptSizeThreshold = 0;
1210  if (L->getHeader()->getParent()->hasOptSize())
1211    return;
1212
1213  // Only enable on Thumb-2 targets.
1214  if (!ST->isThumb2())
1215    return;
1216
1217  SmallVector<BasicBlock*, 4> ExitingBlocks;
1218  L->getExitingBlocks(ExitingBlocks);
1219  LLVM_DEBUG(dbgs() << "Loop has:\n"
1220                    << "Blocks: " << L->getNumBlocks() << "\n"
1221                    << "Exit blocks: " << ExitingBlocks.size() << "\n");
1222
1223  // Only allow another exit other than the latch. This acts as an early exit
1224  // as it mirrors the profitability calculation of the runtime unroller.
1225  if (ExitingBlocks.size() > 2)
1226    return;
1227
1228  // Limit the CFG of the loop body for targets with a branch predictor.
1229  // Allowing 4 blocks permits if-then-else diamonds in the body.
1230  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
1231    return;
1232
1233  // Scan the loop: don't unroll loops with calls as this could prevent
1234  // inlining.
1235  unsigned Cost = 0;
1236  for (auto *BB : L->getBlocks()) {
1237    for (auto &I : *BB) {
      // Don't unroll vectorised loops. MVE does not benefit from unrolling as
      // much as scalar code does.
1240      if (I.getType()->isVectorTy())
1241        return;
1242
1243      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1244        ImmutableCallSite CS(&I);
1245        if (const Function *F = CS.getCalledFunction()) {
1246          if (!isLoweredToCall(F))
1247            continue;
1248        }
1249        return;
1250      }
1251
1252      SmallVector<const Value*, 4> Operands(I.value_op_begin(),
1253                                            I.value_op_end());
1254      Cost += getUserCost(&I, Operands);
1255    }
1256  }
1257
1258  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1259
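  // Enable partial and runtime unrolling, with a default runtime count of 4,
  // and also enable unroll-and-jam.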
1260  UP.Partial = true;
1261  UP.Runtime = true;
1262  UP.UpperBound = true;
1263  UP.UnrollRemainder = true;
1264  UP.DefaultUnrollRuntimeCount = 4;
1265  UP.UnrollAndJam = true;
1266  UP.UnrollAndJamInnerLoopThreshold = 60;
1267
  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
1270  if (Cost < 12)
1271    UP.Force = true;
1272}
1273
1274bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
1275                                       TTI::ReductionFlags Flags) const {
1276  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
1277  unsigned ScalarBits = Ty->getScalarSizeInBits();
1278  if (!ST->hasMVEIntegerOps())
1279    return false;
1280
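  // Only integer add and compare (min/max) reductions of full 128-bit vectors
  // use the reduction intrinsics; the other reduction kinds do not.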
1281  switch (Opcode) {
1282  case Instruction::FAdd:
1283  case Instruction::FMul:
1284  case Instruction::And:
1285  case Instruction::Or:
1286  case Instruction::Xor:
1287  case Instruction::Mul:
1288  case Instruction::FCmp:
1289    return false;
1290  case Instruction::ICmp:
1291  case Instruction::Add:
1292    return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128;
1293  default:
1294    llvm_unreachable("Unhandled reduction opcode");
1295  }
1296  return false;
1297}
1298