AMDGPUISelLowering.cpp revision 314564
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief This is the parent TargetLowering class for hardware code gen
12/// targets.
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPUISelLowering.h"
17#include "AMDGPU.h"
18#include "AMDGPUFrameLowering.h"
19#include "AMDGPUIntrinsicInfo.h"
20#include "AMDGPURegisterInfo.h"
21#include "AMDGPUSubtarget.h"
22#include "R600MachineFunctionInfo.h"
23#include "SIMachineFunctionInfo.h"
24#include "llvm/CodeGen/CallingConvLower.h"
25#include "llvm/CodeGen/MachineFunction.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/SelectionDAG.h"
28#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
29#include "llvm/IR/DataLayout.h"
30#include "llvm/IR/DiagnosticInfo.h"
31#include "SIInstrInfo.h"
32using namespace llvm;
33
34static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
35                            CCValAssign::LocInfo LocInfo,
36                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
37  MachineFunction &MF = State.getMachineFunction();
38  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
39
40  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
41                                         ArgFlags.getOrigAlign());
42  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
43  return true;
44}
45
46#include "AMDGPUGenCallingConv.inc"
47
48// Find a larger type to do a load / store of a vector with.
49EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
50  unsigned StoreSize = VT.getStoreSizeInBits();
51  if (StoreSize <= 32)
52    return EVT::getIntegerVT(Ctx, StoreSize);
53
54  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
55  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
56}
57
58AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
59                                           const AMDGPUSubtarget &STI)
60    : TargetLowering(TM), Subtarget(&STI) {
61  // Lower floating point store/load to integer store/load to reduce the number
62  // of patterns in tablegen.
63  setOperationAction(ISD::LOAD, MVT::f32, Promote);
64  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
65
66  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
67  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
68
69  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
70  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
71
72  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
73  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
74
75  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
76  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
77
78  setOperationAction(ISD::LOAD, MVT::i64, Promote);
79  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
80
81  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
82  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
83
84  setOperationAction(ISD::LOAD, MVT::f64, Promote);
85  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
86
87  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
88  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
89
90  // There are no 64-bit extloads. These should be done as a 32-bit extload and
91  // an extension to 64-bit.
92  for (MVT VT : MVT::integer_valuetypes()) {
93    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
94    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
95    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
96  }
97
98  for (MVT VT : MVT::integer_valuetypes()) {
99    if (VT == MVT::i64)
100      continue;
101
102    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
103    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
104    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
105    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
106
107    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
108    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
109    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
110    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
111
112    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
113    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
114    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
115    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
116  }
117
118  for (MVT VT : MVT::integer_vector_valuetypes()) {
119    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
120    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
121    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
122    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
123    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
124    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
125    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
126    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
127    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
128    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
129    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
130    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
131  }
132
133  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
134  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
135  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
136  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
137
138  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
139  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
140  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
141  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
142
143  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
144  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
145  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
146  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
147
148  setOperationAction(ISD::STORE, MVT::f32, Promote);
149  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
150
151  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
152  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
153
154  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
155  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
156
157  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
158  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
159
160  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
161  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
162
163  setOperationAction(ISD::STORE, MVT::i64, Promote);
164  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
165
166  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
167  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
168
169  setOperationAction(ISD::STORE, MVT::f64, Promote);
170  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
171
172  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
173  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
174
175  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
176  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
177  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
178  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
179
180  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
181  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
182  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
183  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
184
185  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
186  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
187  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
188  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
189
190  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
191  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
192
193  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
194  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
195
196  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
197  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
198
199  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
200  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
201
202
203  setOperationAction(ISD::Constant, MVT::i32, Legal);
204  setOperationAction(ISD::Constant, MVT::i64, Legal);
205  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
206  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
207
208  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
209  setOperationAction(ISD::BRIND, MVT::Other, Expand);
210
211  // This is totally unsupported, just custom lower to produce an error.
212  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
213
214  // We need to custom lower some of the intrinsics
215  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
216  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
217
218  // Library functions.  These default to Expand, but we have instructions
219  // for them.
220  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
221  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
222  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
223  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
224  setOperationAction(ISD::FABS,   MVT::f32, Legal);
225  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
226  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
227  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
228  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
229  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
230
231  setOperationAction(ISD::FROUND, MVT::f32, Custom);
232  setOperationAction(ISD::FROUND, MVT::f64, Custom);
233
234  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
235  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
236
237  setOperationAction(ISD::FREM, MVT::f32, Custom);
238  setOperationAction(ISD::FREM, MVT::f64, Custom);
239
240  // v_mad_f32 does not support denormals according to some sources.
241  if (!Subtarget->hasFP32Denormals())
242    setOperationAction(ISD::FMAD, MVT::f32, Legal);
243
244  // Expand to fneg + fadd.
245  setOperationAction(ISD::FSUB, MVT::f64, Expand);
246
247  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
248  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
249  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
250  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
251  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
252  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
253  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
254  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
255  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
256  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
257
258  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
259    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
260    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
261    setOperationAction(ISD::FRINT, MVT::f64, Custom);
262    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
263  }
264
265  if (!Subtarget->hasBFI()) {
266    // fcopysign can be done in a single instruction with BFI.
267    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
268    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
269  }
270
271  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
272  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
273
274  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
275  for (MVT VT : ScalarIntVTs) {
276    // These should use [SU]DIVREM, so set them to expand
277    setOperationAction(ISD::SDIV, VT, Expand);
278    setOperationAction(ISD::UDIV, VT, Expand);
279    setOperationAction(ISD::SREM, VT, Expand);
280    setOperationAction(ISD::UREM, VT, Expand);
281
282    // GPU does not have divrem function for signed or unsigned.
283    setOperationAction(ISD::SDIVREM, VT, Custom);
284    setOperationAction(ISD::UDIVREM, VT, Custom);
285
286    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
287    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
288    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
289
290    setOperationAction(ISD::BSWAP, VT, Expand);
291    setOperationAction(ISD::CTTZ, VT, Expand);
292    setOperationAction(ISD::CTLZ, VT, Expand);
293  }
294
295  if (!Subtarget->hasBCNT(32))
296    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
297
298  if (!Subtarget->hasBCNT(64))
299    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
300
301  // The hardware supports 32-bit ROTR, but not ROTL.
302  setOperationAction(ISD::ROTL, MVT::i32, Expand);
303  setOperationAction(ISD::ROTL, MVT::i64, Expand);
304  setOperationAction(ISD::ROTR, MVT::i64, Expand);
305
306  setOperationAction(ISD::MUL, MVT::i64, Expand);
307  setOperationAction(ISD::MULHU, MVT::i64, Expand);
308  setOperationAction(ISD::MULHS, MVT::i64, Expand);
309  setOperationAction(ISD::UDIV, MVT::i32, Expand);
310  setOperationAction(ISD::UREM, MVT::i32, Expand);
311  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
312  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
313  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
314  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
315  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
316
317  setOperationAction(ISD::SMIN, MVT::i32, Legal);
318  setOperationAction(ISD::UMIN, MVT::i32, Legal);
319  setOperationAction(ISD::SMAX, MVT::i32, Legal);
320  setOperationAction(ISD::UMAX, MVT::i32, Legal);
321
322  if (Subtarget->hasFFBH())
323    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
324
325  if (Subtarget->hasFFBL())
326    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
327
328  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
329  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
330
331  // We only really have 32-bit BFE instructions (and 16-bit on VI).
332  //
333  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
334  // effort to match them now. We want this to be false for i64 cases when the
335  // extraction isn't restricted to the upper or lower half. Ideally we would
336  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
337  // span the midpoint are probably relatively rare, so don't worry about them
338  // for now.
339  if (Subtarget->hasBFE())
340    setHasExtractBitsInsn(true);
341
342  static const MVT::SimpleValueType VectorIntTypes[] = {
343    MVT::v2i32, MVT::v4i32
344  };
345
346  for (MVT VT : VectorIntTypes) {
347    // Expand the following operations for the current type by default.
348    setOperationAction(ISD::ADD,  VT, Expand);
349    setOperationAction(ISD::AND,  VT, Expand);
350    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
351    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
352    setOperationAction(ISD::MUL,  VT, Expand);
353    setOperationAction(ISD::MULHU, VT, Expand);
354    setOperationAction(ISD::MULHS, VT, Expand);
355    setOperationAction(ISD::OR,   VT, Expand);
356    setOperationAction(ISD::SHL,  VT, Expand);
357    setOperationAction(ISD::SRA,  VT, Expand);
358    setOperationAction(ISD::SRL,  VT, Expand);
359    setOperationAction(ISD::ROTL, VT, Expand);
360    setOperationAction(ISD::ROTR, VT, Expand);
361    setOperationAction(ISD::SUB,  VT, Expand);
362    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
363    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
364    setOperationAction(ISD::SDIV, VT, Expand);
365    setOperationAction(ISD::UDIV, VT, Expand);
366    setOperationAction(ISD::SREM, VT, Expand);
367    setOperationAction(ISD::UREM, VT, Expand);
368    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
369    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
370    setOperationAction(ISD::SDIVREM, VT, Custom);
371    setOperationAction(ISD::UDIVREM, VT, Expand);
372    setOperationAction(ISD::ADDC, VT, Expand);
373    setOperationAction(ISD::SUBC, VT, Expand);
374    setOperationAction(ISD::ADDE, VT, Expand);
375    setOperationAction(ISD::SUBE, VT, Expand);
376    setOperationAction(ISD::SELECT, VT, Expand);
377    setOperationAction(ISD::VSELECT, VT, Expand);
378    setOperationAction(ISD::SELECT_CC, VT, Expand);
379    setOperationAction(ISD::XOR,  VT, Expand);
380    setOperationAction(ISD::BSWAP, VT, Expand);
381    setOperationAction(ISD::CTPOP, VT, Expand);
382    setOperationAction(ISD::CTTZ, VT, Expand);
383    setOperationAction(ISD::CTLZ, VT, Expand);
384    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
385  }
386
387  static const MVT::SimpleValueType FloatVectorTypes[] = {
388    MVT::v2f32, MVT::v4f32
389  };
390
391  for (MVT VT : FloatVectorTypes) {
392    setOperationAction(ISD::FABS, VT, Expand);
393    setOperationAction(ISD::FMINNUM, VT, Expand);
394    setOperationAction(ISD::FMAXNUM, VT, Expand);
395    setOperationAction(ISD::FADD, VT, Expand);
396    setOperationAction(ISD::FCEIL, VT, Expand);
397    setOperationAction(ISD::FCOS, VT, Expand);
398    setOperationAction(ISD::FDIV, VT, Expand);
399    setOperationAction(ISD::FEXP2, VT, Expand);
400    setOperationAction(ISD::FLOG2, VT, Expand);
401    setOperationAction(ISD::FREM, VT, Expand);
402    setOperationAction(ISD::FPOW, VT, Expand);
403    setOperationAction(ISD::FFLOOR, VT, Expand);
404    setOperationAction(ISD::FTRUNC, VT, Expand);
405    setOperationAction(ISD::FMUL, VT, Expand);
406    setOperationAction(ISD::FMA, VT, Expand);
407    setOperationAction(ISD::FRINT, VT, Expand);
408    setOperationAction(ISD::FNEARBYINT, VT, Expand);
409    setOperationAction(ISD::FSQRT, VT, Expand);
410    setOperationAction(ISD::FSIN, VT, Expand);
411    setOperationAction(ISD::FSUB, VT, Expand);
412    setOperationAction(ISD::FNEG, VT, Expand);
413    setOperationAction(ISD::VSELECT, VT, Expand);
414    setOperationAction(ISD::SELECT_CC, VT, Expand);
415    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
416    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
417  }
418
419  // This causes using an unrolled select operation rather than expansion with
420  // bit operations. This is in general better, but the alternative using BFI
421  // instructions may be better if the select sources are SGPRs.
422  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
423  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
424
425  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
426  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
427
428  // There are no libcalls of any kind.
429  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
430    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
431
432  setBooleanContents(ZeroOrNegativeOneBooleanContent);
433  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
434
435  setSchedulingPreference(Sched::RegPressure);
436  setJumpIsExpensive(true);
437
438  // FIXME: This is only partially true. If we have to do vector compares, any
439  // SGPR pair can be a condition register. If we have a uniform condition, we
440  // are better off doing SALU operations, where there is only one SCC. For now,
441  // we don't have a way of knowing during instruction selection if a condition
442  // will be uniform and we always use vector compares. Assume we are using
443  // vector compares until that is fixed.
444  setHasMultipleConditionRegisters(true);
445
446  // SI at least has hardware support for floating point exceptions, but no way
447  // of using or handling them is implemented. They are also optional in OpenCL
448  // (Section 7.3)
449  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
450
451  PredictableSelectIsExpensive = false;
452
453  // We want to find all load dependencies for long chains of stores to enable
454  // merging into very wide vectors. The problem is with vectors with > 4
455  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
456  // vectors are a legal type, even though we have to split the loads
457  // usually. When we can more precisely specify load legality per address
458  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
459  // smarter so that they can figure out what to do in 2 iterations without all
460  // N > 4 stores on the same chain.
461  GatherAllAliasesMaxDepth = 16;
462
463  // FIXME: Need to really handle these.
464  MaxStoresPerMemcpy  = 4096;
465  MaxStoresPerMemmove = 4096;
466  MaxStoresPerMemset  = 4096;
467
468  setTargetDAGCombine(ISD::BITCAST);
469  setTargetDAGCombine(ISD::SHL);
470  setTargetDAGCombine(ISD::SRA);
471  setTargetDAGCombine(ISD::SRL);
472  setTargetDAGCombine(ISD::MUL);
473  setTargetDAGCombine(ISD::MULHU);
474  setTargetDAGCombine(ISD::MULHS);
475  setTargetDAGCombine(ISD::SELECT);
476  setTargetDAGCombine(ISD::SELECT_CC);
477  setTargetDAGCombine(ISD::STORE);
478  setTargetDAGCombine(ISD::FADD);
479  setTargetDAGCombine(ISD::FSUB);
480  setTargetDAGCombine(ISD::FNEG);
481}
482
483//===----------------------------------------------------------------------===//
484// Target Information
485//===----------------------------------------------------------------------===//
486
487static bool fnegFoldsIntoOp(unsigned Opc) {
488  switch (Opc) {
489  case ISD::FADD:
490  case ISD::FSUB:
491  case ISD::FMUL:
492  case ISD::FMA:
493  case ISD::FMAD:
494  case ISD::FSIN:
495  case AMDGPUISD::RCP:
496  case AMDGPUISD::RCP_LEGACY:
497  case AMDGPUISD::SIN_HW:
498  case AMDGPUISD::FMUL_LEGACY:
499    return true;
500  default:
501    return false;
502  }
503}
504
505MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
506  return MVT::i32;
507}
508
509bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
510  return true;
511}
512
513// The backend supports 32 and 64 bit floating point immediates.
514// FIXME: Why are we reporting vectors of FP immediates as legal?
515bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
516  EVT ScalarVT = VT.getScalarType();
517  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
518         (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
519}
520
521// We don't want to shrink f64 / f32 constants.
522bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
523  EVT ScalarVT = VT.getScalarType();
524  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
525}
526
527bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
528                                                 ISD::LoadExtType,
529                                                 EVT NewVT) const {
530
531  unsigned NewSize = NewVT.getStoreSizeInBits();
532
533  // If we are reducing to a 32-bit load, this is always better.
534  if (NewSize == 32)
535    return true;
536
537  EVT OldVT = N->getValueType(0);
538  unsigned OldSize = OldVT.getStoreSizeInBits();
539
540  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
541  // extloads, so doing one requires using a buffer_load. In cases where we
542  // still couldn't use a scalar load, using the wider load shouldn't really
543  // hurt anything.
544
545  // If the old size already had to be an extload, there's no harm in continuing
546  // to reduce the width.
547  return (OldSize < 32);
548}
549
550bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
551                                                   EVT CastTy) const {
552
553  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
554
555  if (LoadTy.getScalarType() == MVT::i32)
556    return false;
557
558  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
559  unsigned CastScalarSize = CastTy.getScalarSizeInBits();
560
561  return (LScalarSize < CastScalarSize) ||
562         (CastScalarSize >= 32);
563}
564
565// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
566// profitable with the expansion for 64-bit since it's generally good to
567// speculate things.
568// FIXME: These should really have the size as a parameter.
569bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
570  return true;
571}
572
573bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
574  return true;
575}
576
577//===---------------------------------------------------------------------===//
578// Target Properties
579//===---------------------------------------------------------------------===//
580
581bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
582  assert(VT.isFloatingPoint());
583  return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
584                                              VT == MVT::f16);
585}
586
587bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
588  return isFAbsFree(VT);
589}
590
591bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
592                                                         unsigned NumElem,
593                                                         unsigned AS) const {
594  return true;
595}
596
597bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
598  // There are few operations which truly have vector input operands. Any vector
599  // operation is going to involve operations on each component, and a
600  // build_vector will be a copy per element, so it always makes sense to use a
601  // build_vector input in place of the extracted element to avoid a copy into a
602  // super register.
603  //
604  // We should probably only do this if all users are extracts only, but this
605  // should be the common case.
606  return true;
607}
608
609bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
610  // Truncate is just accessing a subregister.
611
612  unsigned SrcSize = Source.getSizeInBits();
613  unsigned DestSize = Dest.getSizeInBits();
614
615  return DestSize < SrcSize && DestSize % 32 == 0 ;
616}
617
618bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
619  // Truncate is just accessing a subregister.
620
621  unsigned SrcSize = Source->getScalarSizeInBits();
622  unsigned DestSize = Dest->getScalarSizeInBits();
623
624  if (DestSize== 16 && Subtarget->has16BitInsts())
625    return SrcSize >= 32;
626
627  return DestSize < SrcSize && DestSize % 32 == 0;
628}
629
630bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
631  unsigned SrcSize = Src->getScalarSizeInBits();
632  unsigned DestSize = Dest->getScalarSizeInBits();
633
634  if (SrcSize == 16 && Subtarget->has16BitInsts())
635    return DestSize >= 32;
636
637  return SrcSize == 32 && DestSize == 64;
638}
639
640bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
641  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
642  // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
643  // this will enable reducing 64-bit operations the 32-bit, which is always
644  // good.
645
646  if (Src == MVT::i16)
647    return Dest == MVT::i32 ||Dest == MVT::i64 ;
648
649  return Src == MVT::i32 && Dest == MVT::i64;
650}
651
652bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
653  return isZExtFree(Val.getValueType(), VT2);
654}
655
656bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
657  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
658  // limited number of native 64-bit operations. Shrinking an operation to fit
659  // in a single 32-bit register should always be helpful. As currently used,
660  // this is much less general than the name suggests, and is only used in
661  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
662  // not profitable, and may actually be harmful.
663  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
664}
665
666//===---------------------------------------------------------------------===//
667// TargetLowering Callbacks
668//===---------------------------------------------------------------------===//
669
670/// The SelectionDAGBuilder will automatically promote function arguments
671/// with illegal types.  However, this does not work for the AMDGPU targets
672/// since the function arguments are stored in memory as these illegal types.
673/// In order to handle this properly we need to get the original types sizes
674/// from the LLVM IR Function and fixup the ISD:InputArg values before
675/// passing them to AnalyzeFormalArguments()
676
677/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
678/// input values across multiple registers.  Each item in the Ins array
679/// represents a single value that will be stored in regsters.  Ins[x].VT is
680/// the value type of the value that will be stored in the register, so
681/// whatever SDNode we lower the argument to needs to be this type.
682///
683/// In order to correctly lower the arguments we need to know the size of each
684/// argument.  Since Ins[x].VT gives us the size of the register that will
685/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
686/// for the orignal function argument so that we can deduce the correct memory
687/// type to use for Ins[x].  In most cases the correct memory type will be
688/// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
689/// we have a kernel argument of type v8i8, this argument will be split into
690/// 8 parts and each part will be represented by its own item in the Ins array.
691/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
692/// the argument before it was split.  From this, we deduce that the memory type
693/// for each individual part is i8.  We pass the memory type as LocVT to the
694/// calling convention analysis function and the register type (Ins[x].VT) as
695/// the ValVT.
696void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
697                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
698  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
699    const ISD::InputArg &In = Ins[i];
700    EVT MemVT;
701
702    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
703
704    if (!Subtarget->isAmdHsaOS() &&
705        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
706      // The ABI says the caller will extend these values to 32-bits.
707      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
708    } else if (NumRegs == 1) {
709      // This argument is not split, so the IR type is the memory type.
710      assert(!In.Flags.isSplit());
711      if (In.ArgVT.isExtended()) {
712        // We have an extended type, like i24, so we should just use the register type
713        MemVT = In.VT;
714      } else {
715        MemVT = In.ArgVT;
716      }
717    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
718               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
719      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
720      // We have a vector value which has been split into a vector with
721      // the same scalar type, but fewer elements.  This should handle
722      // all the floating-point vector types.
723      MemVT = In.VT;
724    } else if (In.ArgVT.isVector() &&
725               In.ArgVT.getVectorNumElements() == NumRegs) {
726      // This arg has been split so that each element is stored in a separate
727      // register.
728      MemVT = In.ArgVT.getScalarType();
729    } else if (In.ArgVT.isExtended()) {
730      // We have an extended type, like i65.
731      MemVT = In.VT;
732    } else {
733      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
734      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
735      if (In.VT.isInteger()) {
736        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
737      } else if (In.VT.isVector()) {
738        assert(!In.VT.getScalarType().isFloatingPoint());
739        unsigned NumElements = In.VT.getVectorNumElements();
740        assert(MemoryBits % NumElements == 0);
741        // This vector type has been split into another vector type with
742        // a different elements size.
743        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
744                                         MemoryBits / NumElements);
745        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
746      } else {
747        llvm_unreachable("cannot deduce memory type.");
748      }
749    }
750
751    // Convert one element vectors to scalar.
752    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
753      MemVT = MemVT.getScalarType();
754
755    if (MemVT.isExtended()) {
756      // This should really only happen if we have vec3 arguments
757      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
758      MemVT = MemVT.getPow2VectorType(State.getContext());
759    }
760
761    assert(MemVT.isSimple());
762    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
763                    State);
764  }
765}
766
767void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
768                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
769  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
770}
771
772void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
773                           const SmallVectorImpl<ISD::OutputArg> &Outs) const {
774
775  State.AnalyzeReturn(Outs, RetCC_SI);
776}
777
778SDValue
779AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
780                                  bool isVarArg,
781                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
782                                  const SmallVectorImpl<SDValue> &OutVals,
783                                  const SDLoc &DL, SelectionDAG &DAG) const {
784  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
785}
786
787//===---------------------------------------------------------------------===//
788// Target specific lowering
789//===---------------------------------------------------------------------===//
790
791SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
792                                        SmallVectorImpl<SDValue> &InVals) const {
793  SDValue Callee = CLI.Callee;
794  SelectionDAG &DAG = CLI.DAG;
795
796  const Function &Fn = *DAG.getMachineFunction().getFunction();
797
798  StringRef FuncName("<unknown>");
799
800  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
801    FuncName = G->getSymbol();
802  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
803    FuncName = G->getGlobal()->getName();
804
805  DiagnosticInfoUnsupported NoCalls(
806      Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
807  DAG.getContext()->diagnose(NoCalls);
808
809  if (!CLI.IsTailCall) {
810    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
811      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
812  }
813
814  return DAG.getEntryNode();
815}
816
817SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
818                                                      SelectionDAG &DAG) const {
819  const Function &Fn = *DAG.getMachineFunction().getFunction();
820
821  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
822                                            SDLoc(Op).getDebugLoc());
823  DAG.getContext()->diagnose(NoDynamicAlloca);
824  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
825  return DAG.getMergeValues(Ops, SDLoc());
826}
827
828SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
829                                             SelectionDAG &DAG) const {
830  switch (Op.getOpcode()) {
831  default:
832    Op->dump(&DAG);
833    llvm_unreachable("Custom lowering code for this"
834                     "instruction is not implemented yet!");
835    break;
836  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
837  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
838  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
839  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
840  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
841  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
842  case ISD::FREM: return LowerFREM(Op, DAG);
843  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
844  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
845  case ISD::FRINT: return LowerFRINT(Op, DAG);
846  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
847  case ISD::FROUND: return LowerFROUND(Op, DAG);
848  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
849  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
850  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
851  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
852  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
853  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
854  case ISD::CTLZ:
855  case ISD::CTLZ_ZERO_UNDEF:
856    return LowerCTLZ(Op, DAG);
857  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
858  }
859  return Op;
860}
861
862void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
863                                              SmallVectorImpl<SDValue> &Results,
864                                              SelectionDAG &DAG) const {
865  switch (N->getOpcode()) {
866  case ISD::SIGN_EXTEND_INREG:
867    // Different parts of legalization seem to interpret which type of
868    // sign_extend_inreg is the one to check for custom lowering. The extended
869    // from type is what really matters, but some places check for custom
870    // lowering of the result type. This results in trying to use
871    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
872    // nothing here and let the illegal result integer be handled normally.
873    return;
874  default:
875    return;
876  }
877}
878
879static bool hasDefinedInitializer(const GlobalValue *GV) {
880  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
881  if (!GVar || !GVar->hasInitializer())
882    return false;
883
884  return !isa<UndefValue>(GVar->getInitializer());
885}
886
887SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
888                                                 SDValue Op,
889                                                 SelectionDAG &DAG) const {
890
891  const DataLayout &DL = DAG.getDataLayout();
892  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
893  const GlobalValue *GV = G->getGlobal();
894
895  switch (G->getAddressSpace()) {
896  case AMDGPUAS::LOCAL_ADDRESS: {
897    // XXX: What does the value of G->getOffset() mean?
898    assert(G->getOffset() == 0 &&
899         "Do not know what to do with an non-zero offset");
900
901    // TODO: We could emit code to handle the initialization somewhere.
902    if (hasDefinedInitializer(GV))
903      break;
904
905    unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
906    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
907  }
908  }
909
910  const Function &Fn = *DAG.getMachineFunction().getFunction();
911  DiagnosticInfoUnsupported BadInit(
912      Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
913  DAG.getContext()->diagnose(BadInit);
914  return SDValue();
915}
916
917SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
918                                                  SelectionDAG &DAG) const {
919  SmallVector<SDValue, 8> Args;
920
921  for (const SDUse &U : Op->ops())
922    DAG.ExtractVectorElements(U.get(), Args);
923
924  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
925}
926
927SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
928                                                     SelectionDAG &DAG) const {
929
930  SmallVector<SDValue, 8> Args;
931  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
932  EVT VT = Op.getValueType();
933  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
934                            VT.getVectorNumElements());
935
936  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
937}
938
939SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
940    SelectionDAG &DAG) const {
941  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
942  SDLoc DL(Op);
943  EVT VT = Op.getValueType();
944
945  switch (IntrinsicID) {
946    default: return Op;
947    case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
948      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
949                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
950
951    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
952      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
953                         Op.getOperand(1),
954                         Op.getOperand(2),
955                         Op.getOperand(3));
956
957    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
958      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
959                         Op.getOperand(1),
960                         Op.getOperand(2),
961                         Op.getOperand(3));
962  }
963}
964
965/// \brief Generate Min/Max node
966SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
967                                                   SDValue LHS, SDValue RHS,
968                                                   SDValue True, SDValue False,
969                                                   SDValue CC,
970                                                   DAGCombinerInfo &DCI) const {
971  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
972    return SDValue();
973
974  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
975    return SDValue();
976
977  SelectionDAG &DAG = DCI.DAG;
978  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
979  switch (CCOpcode) {
980  case ISD::SETOEQ:
981  case ISD::SETONE:
982  case ISD::SETUNE:
983  case ISD::SETNE:
984  case ISD::SETUEQ:
985  case ISD::SETEQ:
986  case ISD::SETFALSE:
987  case ISD::SETFALSE2:
988  case ISD::SETTRUE:
989  case ISD::SETTRUE2:
990  case ISD::SETUO:
991  case ISD::SETO:
992    break;
993  case ISD::SETULE:
994  case ISD::SETULT: {
995    if (LHS == True)
996      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
997    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
998  }
999  case ISD::SETOLE:
1000  case ISD::SETOLT:
1001  case ISD::SETLE:
1002  case ISD::SETLT: {
1003    // Ordered. Assume ordered for undefined.
1004
1005    // Only do this after legalization to avoid interfering with other combines
1006    // which might occur.
1007    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1008        !DCI.isCalledByLegalizer())
1009      return SDValue();
1010
1011    // We need to permute the operands to get the correct NaN behavior. The
1012    // selected operand is the second one based on the failing compare with NaN,
1013    // so permute it based on the compare type the hardware uses.
1014    if (LHS == True)
1015      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1016    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1017  }
1018  case ISD::SETUGE:
1019  case ISD::SETUGT: {
1020    if (LHS == True)
1021      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1022    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1023  }
1024  case ISD::SETGT:
1025  case ISD::SETGE:
1026  case ISD::SETOGE:
1027  case ISD::SETOGT: {
1028    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1029        !DCI.isCalledByLegalizer())
1030      return SDValue();
1031
1032    if (LHS == True)
1033      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1034    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1035  }
1036  case ISD::SETCC_INVALID:
1037    llvm_unreachable("Invalid setcc condcode!");
1038  }
1039  return SDValue();
1040}
1041
1042std::pair<SDValue, SDValue>
1043AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1044  SDLoc SL(Op);
1045
1046  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1047
1048  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1049  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1050
1051  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1052  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1053
1054  return std::make_pair(Lo, Hi);
1055}
1056
1057SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1058  SDLoc SL(Op);
1059
1060  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1061  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1062  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1063}
1064
1065SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1066  SDLoc SL(Op);
1067
1068  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1069  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1070  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1071}
1072
1073SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1074                                              SelectionDAG &DAG) const {
1075  LoadSDNode *Load = cast<LoadSDNode>(Op);
1076  EVT VT = Op.getValueType();
1077
1078
1079  // If this is a 2 element vector, we really want to scalarize and not create
1080  // weird 1 element vectors.
1081  if (VT.getVectorNumElements() == 2)
1082    return scalarizeVectorLoad(Load, DAG);
1083
1084  SDValue BasePtr = Load->getBasePtr();
1085  EVT PtrVT = BasePtr.getValueType();
1086  EVT MemVT = Load->getMemoryVT();
1087  SDLoc SL(Op);
1088
1089  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1090
1091  EVT LoVT, HiVT;
1092  EVT LoMemVT, HiMemVT;
1093  SDValue Lo, Hi;
1094
1095  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1096  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1097  std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
1098
1099  unsigned Size = LoMemVT.getStoreSize();
1100  unsigned BaseAlign = Load->getAlignment();
1101  unsigned HiAlign = MinAlign(BaseAlign, Size);
1102
1103  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1104                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
1105                                  BaseAlign, Load->getMemOperand()->getFlags());
1106  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1107                              DAG.getConstant(Size, SL, PtrVT));
1108  SDValue HiLoad =
1109      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1110                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1111                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1112
1113  SDValue Ops[] = {
1114    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
1115    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1116                LoLoad.getValue(1), HiLoad.getValue(1))
1117  };
1118
1119  return DAG.getMergeValues(Ops, SL);
1120}
1121
1122SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1123                                               SelectionDAG &DAG) const {
1124  StoreSDNode *Store = cast<StoreSDNode>(Op);
1125  SDValue Val = Store->getValue();
1126  EVT VT = Val.getValueType();
1127
1128  // If this is a 2 element vector, we really want to scalarize and not create
1129  // weird 1 element vectors.
1130  if (VT.getVectorNumElements() == 2)
1131    return scalarizeVectorStore(Store, DAG);
1132
1133  EVT MemVT = Store->getMemoryVT();
1134  SDValue Chain = Store->getChain();
1135  SDValue BasePtr = Store->getBasePtr();
1136  SDLoc SL(Op);
1137
1138  EVT LoVT, HiVT;
1139  EVT LoMemVT, HiMemVT;
1140  SDValue Lo, Hi;
1141
1142  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1143  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1144  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
1145
1146  EVT PtrVT = BasePtr.getValueType();
1147  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1148                              DAG.getConstant(LoMemVT.getStoreSize(), SL,
1149                                              PtrVT));
1150
1151  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1152  unsigned BaseAlign = Store->getAlignment();
1153  unsigned Size = LoMemVT.getStoreSize();
1154  unsigned HiAlign = MinAlign(BaseAlign, Size);
1155
1156  SDValue LoStore =
1157      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1158                        Store->getMemOperand()->getFlags());
1159  SDValue HiStore =
1160      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1161                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1162
1163  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1164}
1165
1166// This is a shortcut for integer division because we have fast i32<->f32
1167// conversions, and fast f32 reciprocal instructions. The fractional part of a
1168// float is enough to accurately represent up to a 24-bit signed integer.
1169SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1170                                            bool Sign) const {
1171  SDLoc DL(Op);
1172  EVT VT = Op.getValueType();
1173  SDValue LHS = Op.getOperand(0);
1174  SDValue RHS = Op.getOperand(1);
1175  MVT IntVT = MVT::i32;
1176  MVT FltVT = MVT::f32;
1177
1178  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1179  if (LHSSignBits < 9)
1180    return SDValue();
1181
1182  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1183  if (RHSSignBits < 9)
1184    return SDValue();
1185
1186  unsigned BitSize = VT.getSizeInBits();
1187  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1188  unsigned DivBits = BitSize - SignBits;
1189  if (Sign)
1190    ++DivBits;
1191
1192  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1193  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1194
1195  SDValue jq = DAG.getConstant(1, DL, IntVT);
1196
1197  if (Sign) {
1198    // char|short jq = ia ^ ib;
1199    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1200
1201    // jq = jq >> (bitsize - 2)
1202    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1203                     DAG.getConstant(BitSize - 2, DL, VT));
1204
1205    // jq = jq | 0x1
1206    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1207  }
1208
1209  // int ia = (int)LHS;
1210  SDValue ia = LHS;
1211
1212  // int ib, (int)RHS;
1213  SDValue ib = RHS;
1214
1215  // float fa = (float)ia;
1216  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1217
1218  // float fb = (float)ib;
1219  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1220
1221  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1222                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1223
1224  // fq = trunc(fq);
1225  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1226
1227  // float fqneg = -fq;
1228  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1229
1230  // float fr = mad(fqneg, fb, fa);
1231  SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
1232
1233  // int iq = (int)fq;
1234  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1235
1236  // fr = fabs(fr);
1237  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1238
1239  // fb = fabs(fb);
1240  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1241
1242  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1243
1244  // int cv = fr >= fb;
1245  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1246
1247  // jq = (cv ? jq : 0);
1248  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1249
1250  // dst = iq + jq;
1251  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1252
1253  // Rem needs compensation, it's easier to recompute it
1254  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1255  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1256
1257  // Truncate to number of bits this divide really is.
1258  if (Sign) {
1259    SDValue InRegSize
1260      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1261    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1262    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1263  } else {
1264    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1265    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1266    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1267  }
1268
1269  return DAG.getMergeValues({ Div, Rem }, DL);
1270}
1271
1272void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1273                                      SelectionDAG &DAG,
1274                                      SmallVectorImpl<SDValue> &Results) const {
1275  assert(Op.getValueType() == MVT::i64);
1276
1277  SDLoc DL(Op);
1278  EVT VT = Op.getValueType();
1279  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1280
1281  SDValue one = DAG.getConstant(1, DL, HalfVT);
1282  SDValue zero = DAG.getConstant(0, DL, HalfVT);
1283
1284  //HiLo split
1285  SDValue LHS = Op.getOperand(0);
1286  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
1287  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
1288
1289  SDValue RHS = Op.getOperand(1);
1290  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
1291  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
1292
1293  if (VT == MVT::i64 &&
1294    DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1295    DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1296
1297    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1298                              LHS_Lo, RHS_Lo);
1299
1300    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
1301    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
1302
1303    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1304    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1305    return;
1306  }
1307
1308  // Get Speculative values
1309  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1310  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1311
1312  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
1313  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
1314  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1315
1316  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
1317  SDValue DIV_Lo = zero;
1318
1319  const unsigned halfBitWidth = HalfVT.getSizeInBits();
1320
1321  for (unsigned i = 0; i < halfBitWidth; ++i) {
1322    const unsigned bitPos = halfBitWidth - i - 1;
1323    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1324    // Get value of high bit
1325    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1326    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
1327    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1328
1329    // Shift
1330    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1331    // Add LHS high bit
1332    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1333
1334    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1335    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
1336
1337    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1338
1339    // Update REM
1340    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1341    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1342  }
1343
1344  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1345  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1346  Results.push_back(DIV);
1347  Results.push_back(REM);
1348}
1349
1350SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1351                                           SelectionDAG &DAG) const {
1352  SDLoc DL(Op);
1353  EVT VT = Op.getValueType();
1354
1355  if (VT == MVT::i64) {
1356    SmallVector<SDValue, 2> Results;
1357    LowerUDIVREM64(Op, DAG, Results);
1358    return DAG.getMergeValues(Results, DL);
1359  }
1360
1361  if (VT == MVT::i32) {
1362    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1363      return Res;
1364  }
1365
1366  SDValue Num = Op.getOperand(0);
1367  SDValue Den = Op.getOperand(1);
1368
1369  // RCP =  URECIP(Den) = 2^32 / Den + e
1370  // e is rounding error.
1371  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1372
1373  // RCP_LO = mul(RCP, Den) */
1374  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1375
1376  // RCP_HI = mulhu (RCP, Den) */
1377  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1378
1379  // NEG_RCP_LO = -RCP_LO
1380  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1381                                                     RCP_LO);
1382
1383  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1384  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1385                                           NEG_RCP_LO, RCP_LO,
1386                                           ISD::SETEQ);
1387  // Calculate the rounding error from the URECIP instruction
1388  // E = mulhu(ABS_RCP_LO, RCP)
1389  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1390
1391  // RCP_A_E = RCP + E
1392  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1393
1394  // RCP_S_E = RCP - E
1395  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1396
1397  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1398  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1399                                     RCP_A_E, RCP_S_E,
1400                                     ISD::SETEQ);
1401  // Quotient = mulhu(Tmp0, Num)
1402  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1403
1404  // Num_S_Remainder = Quotient * Den
1405  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1406
1407  // Remainder = Num - Num_S_Remainder
1408  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1409
1410  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1411  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1412                                                 DAG.getConstant(-1, DL, VT),
1413                                                 DAG.getConstant(0, DL, VT),
1414                                                 ISD::SETUGE);
1415  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1416  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1417                                                  Num_S_Remainder,
1418                                                  DAG.getConstant(-1, DL, VT),
1419                                                  DAG.getConstant(0, DL, VT),
1420                                                  ISD::SETUGE);
1421  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1422  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1423                                               Remainder_GE_Zero);
1424
1425  // Calculate Division result:
1426
1427  // Quotient_A_One = Quotient + 1
1428  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1429                                       DAG.getConstant(1, DL, VT));
1430
1431  // Quotient_S_One = Quotient - 1
1432  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1433                                       DAG.getConstant(1, DL, VT));
1434
1435  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1436  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1437                                     Quotient, Quotient_A_One, ISD::SETEQ);
1438
1439  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1440  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1441                            Quotient_S_One, Div, ISD::SETEQ);
1442
1443  // Calculate Rem result:
1444
1445  // Remainder_S_Den = Remainder - Den
1446  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1447
1448  // Remainder_A_Den = Remainder + Den
1449  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1450
1451  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1452  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1453                                    Remainder, Remainder_S_Den, ISD::SETEQ);
1454
1455  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1456  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1457                            Remainder_A_Den, Rem, ISD::SETEQ);
1458  SDValue Ops[2] = {
1459    Div,
1460    Rem
1461  };
1462  return DAG.getMergeValues(Ops, DL);
1463}
1464
1465SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1466                                           SelectionDAG &DAG) const {
1467  SDLoc DL(Op);
1468  EVT VT = Op.getValueType();
1469
1470  SDValue LHS = Op.getOperand(0);
1471  SDValue RHS = Op.getOperand(1);
1472
1473  SDValue Zero = DAG.getConstant(0, DL, VT);
1474  SDValue NegOne = DAG.getConstant(-1, DL, VT);
1475
1476  if (VT == MVT::i32) {
1477    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1478      return Res;
1479  }
1480
1481  if (VT == MVT::i64 &&
1482      DAG.ComputeNumSignBits(LHS) > 32 &&
1483      DAG.ComputeNumSignBits(RHS) > 32) {
1484    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1485
1486    //HiLo split
1487    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1488    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1489    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1490                                 LHS_Lo, RHS_Lo);
1491    SDValue Res[2] = {
1492      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1493      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1494    };
1495    return DAG.getMergeValues(Res, DL);
1496  }
1497
1498  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1499  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1500  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1501  SDValue RSign = LHSign; // Remainder sign is the same as LHS
1502
1503  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1504  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1505
1506  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1507  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1508
1509  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1510  SDValue Rem = Div.getValue(1);
1511
1512  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1513  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1514
1515  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1516  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1517
1518  SDValue Res[2] = {
1519    Div,
1520    Rem
1521  };
1522  return DAG.getMergeValues(Res, DL);
1523}
1524
1525// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
1526SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
1527  SDLoc SL(Op);
1528  EVT VT = Op.getValueType();
1529  SDValue X = Op.getOperand(0);
1530  SDValue Y = Op.getOperand(1);
1531
1532  // TODO: Should this propagate fast-math-flags?
1533
1534  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
1535  SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
1536  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
1537
1538  return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
1539}
1540
1541SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
1542  SDLoc SL(Op);
1543  SDValue Src = Op.getOperand(0);
1544
1545  // result = trunc(src)
1546  // if (src > 0.0 && src != result)
1547  //   result += 1.0
1548
1549  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1550
1551  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1552  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1553
1554  EVT SetCCVT =
1555      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1556
1557  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1558  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1559  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1560
1561  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1562  // TODO: Should this propagate fast-math-flags?
1563  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1564}
1565
1566static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
1567                                  SelectionDAG &DAG) {
1568  const unsigned FractBits = 52;
1569  const unsigned ExpBits = 11;
1570
1571  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
1572                                Hi,
1573                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
1574                                DAG.getConstant(ExpBits, SL, MVT::i32));
1575  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1576                            DAG.getConstant(1023, SL, MVT::i32));
1577
1578  return Exp;
1579}
1580
1581SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1582  SDLoc SL(Op);
1583  SDValue Src = Op.getOperand(0);
1584
1585  assert(Op.getValueType() == MVT::f64);
1586
1587  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1588  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1589
1590  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1591
1592  // Extract the upper half, since this is where we will find the sign and
1593  // exponent.
1594  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1595
1596  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1597
1598  const unsigned FractBits = 52;
1599
1600  // Extract the sign bit.
1601  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
1602  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1603
1604  // Extend back to to 64-bits.
1605  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
1606  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1607
1608  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1609  const SDValue FractMask
1610    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
1611
1612  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1613  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1614  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1615
1616  EVT SetCCVT =
1617      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1618
1619  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
1620
1621  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1622  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1623
1624  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
1625  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
1626
1627  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
1628}
1629
1630SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
1631  SDLoc SL(Op);
1632  SDValue Src = Op.getOperand(0);
1633
1634  assert(Op.getValueType() == MVT::f64);
1635
1636  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1637  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
1638  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
1639
1640  // TODO: Should this propagate fast-math-flags?
1641
1642  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
1643  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
1644
1645  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
1646
1647  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1648  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
1649
1650  EVT SetCCVT =
1651      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1652  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
1653
1654  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
1655}
1656
1657SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
1658  // FNEARBYINT and FRINT are the same, except in their handling of FP
1659  // exceptions. Those aren't really meaningful for us, and OpenCL only has
1660  // rint, so just treat them as equivalent.
1661  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
1662}
1663
1664// XXX - May require not supporting f32 denormals?
1665SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
1666  SDLoc SL(Op);
1667  SDValue X = Op.getOperand(0);
1668
1669  SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
1670
1671  // TODO: Should this propagate fast-math-flags?
1672
1673  SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
1674
1675  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
1676
1677  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32);
1678  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
1679  const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32);
1680
1681  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
1682
1683  EVT SetCCVT =
1684      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
1685
1686  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
1687
1688  SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
1689
1690  return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
1691}
1692
1693SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
1694  SDLoc SL(Op);
1695  SDValue X = Op.getOperand(0);
1696
1697  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
1698
1699  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1700  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1701  const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
1702  const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
1703  EVT SetCCVT =
1704      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1705
1706  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
1707
1708  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
1709
1710  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1711
1712  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
1713                                       MVT::i64);
1714
1715  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
1716  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
1717                          DAG.getConstant(INT64_C(0x0008000000000000), SL,
1718                                          MVT::i64),
1719                          Exp);
1720
1721  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
1722  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
1723                              DAG.getConstant(0, SL, MVT::i64), Tmp0,
1724                              ISD::SETNE);
1725
1726  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
1727                             D, DAG.getConstant(0, SL, MVT::i64));
1728  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
1729
1730  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
1731  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
1732
1733  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1734  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1735  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
1736
1737  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
1738                            ExpEqNegOne,
1739                            DAG.getConstantFP(1.0, SL, MVT::f64),
1740                            DAG.getConstantFP(0.0, SL, MVT::f64));
1741
1742  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
1743
1744  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
1745  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
1746
1747  return K;
1748}
1749
1750SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
1751  EVT VT = Op.getValueType();
1752
1753  if (VT == MVT::f32)
1754    return LowerFROUND32(Op, DAG);
1755
1756  if (VT == MVT::f64)
1757    return LowerFROUND64(Op, DAG);
1758
1759  llvm_unreachable("unhandled type");
1760}
1761
1762SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
1763  SDLoc SL(Op);
1764  SDValue Src = Op.getOperand(0);
1765
1766  // result = trunc(src);
1767  // if (src < 0.0 && src != result)
1768  //   result += -1.0.
1769
1770  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1771
1772  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1773  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
1774
1775  EVT SetCCVT =
1776      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1777
1778  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
1779  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1780  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1781
1782  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
1783  // TODO: Should this propagate fast-math-flags?
1784  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1785}
1786
1787SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
1788  SDLoc SL(Op);
1789  SDValue Src = Op.getOperand(0);
1790  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
1791
1792  if (ZeroUndef && Src.getValueType() == MVT::i32)
1793    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
1794
1795  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1796
1797  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1798  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1799
1800  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1801  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1802
1803  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1804                                   *DAG.getContext(), MVT::i32);
1805
1806  SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
1807
1808  SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
1809  SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
1810
1811  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
1812  SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
1813
1814  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
1815  SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
1816
1817  if (!ZeroUndef) {
1818    // Test if the full 64-bit input is zero.
1819
1820    // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
1821    // which we probably don't want.
1822    SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
1823    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
1824
1825    // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
1826    // with the same cycles, otherwise it is slower.
1827    // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
1828    // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
1829
1830    const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
1831
1832    // The instruction returns -1 for 0 input, but the defined intrinsic
1833    // behavior is to return the number of bits.
1834    NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
1835                          SrcIsZero, Bits32, NewCtlz);
1836  }
1837
1838  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
1839}
1840
1841SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
1842                                               bool Signed) const {
1843  // Unsigned
1844  // cul2f(ulong u)
1845  //{
1846  //  uint lz = clz(u);
1847  //  uint e = (u != 0) ? 127U + 63U - lz : 0;
1848  //  u = (u << lz) & 0x7fffffffffffffffUL;
1849  //  ulong t = u & 0xffffffffffUL;
1850  //  uint v = (e << 23) | (uint)(u >> 40);
1851  //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
1852  //  return as_float(v + r);
1853  //}
1854  // Signed
1855  // cl2f(long l)
1856  //{
1857  //  long s = l >> 63;
1858  //  float r = cul2f((l + s) ^ s);
1859  //  return s ? -r : r;
1860  //}
1861
1862  SDLoc SL(Op);
1863  SDValue Src = Op.getOperand(0);
1864  SDValue L = Src;
1865
1866  SDValue S;
1867  if (Signed) {
1868    const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
1869    S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
1870
1871    SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
1872    L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
1873  }
1874
1875  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1876                                   *DAG.getContext(), MVT::f32);
1877
1878
1879  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
1880  SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
1881  SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
1882  LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
1883
1884  SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
1885  SDValue E = DAG.getSelect(SL, MVT::i32,
1886    DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
1887    DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
1888    ZeroI32);
1889
1890  SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
1891    DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
1892    DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
1893
1894  SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
1895                          DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
1896
1897  SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
1898                             U, DAG.getConstant(40, SL, MVT::i64));
1899
1900  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
1901    DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
1902    DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
1903
1904  SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
1905  SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
1906  SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
1907
1908  SDValue One = DAG.getConstant(1, SL, MVT::i32);
1909
1910  SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
1911
1912  SDValue R = DAG.getSelect(SL, MVT::i32,
1913    RCmp,
1914    One,
1915    DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
1916  R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
1917  R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
1918
1919  if (!Signed)
1920    return R;
1921
1922  SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
1923  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
1924}
1925
1926SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
1927                                               bool Signed) const {
1928  SDLoc SL(Op);
1929  SDValue Src = Op.getOperand(0);
1930
1931  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1932
1933  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
1934                           DAG.getConstant(0, SL, MVT::i32));
1935  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
1936                           DAG.getConstant(1, SL, MVT::i32));
1937
1938  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
1939                              SL, MVT::f64, Hi);
1940
1941  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
1942
1943  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
1944                              DAG.getConstant(32, SL, MVT::i32));
1945  // TODO: Should this propagate fast-math-flags?
1946  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
1947}
1948
1949SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
1950                                               SelectionDAG &DAG) const {
1951  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
1952         "operation should be legal");
1953
1954  // TODO: Factor out code common with LowerSINT_TO_FP.
1955
1956  EVT DestVT = Op.getValueType();
1957  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
1958    SDLoc DL(Op);
1959    SDValue Src = Op.getOperand(0);
1960
1961    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
1962    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
1963    SDValue FPRound =
1964        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
1965
1966    return FPRound;
1967  }
1968
1969  if (DestVT == MVT::f32)
1970    return LowerINT_TO_FP32(Op, DAG, false);
1971
1972  assert(DestVT == MVT::f64);
1973  return LowerINT_TO_FP64(Op, DAG, false);
1974}
1975
1976SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
1977                                              SelectionDAG &DAG) const {
1978  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
1979         "operation should be legal");
1980
1981  // TODO: Factor out code common with LowerUINT_TO_FP.
1982
1983  EVT DestVT = Op.getValueType();
1984  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
1985    SDLoc DL(Op);
1986    SDValue Src = Op.getOperand(0);
1987
1988    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
1989    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
1990    SDValue FPRound =
1991        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
1992
1993    return FPRound;
1994  }
1995
1996  if (DestVT == MVT::f32)
1997    return LowerINT_TO_FP32(Op, DAG, true);
1998
1999  assert(DestVT == MVT::f64);
2000  return LowerINT_TO_FP64(Op, DAG, true);
2001}
2002
2003SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2004                                               bool Signed) const {
2005  SDLoc SL(Op);
2006
2007  SDValue Src = Op.getOperand(0);
2008
2009  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2010
2011  SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2012                                 MVT::f64);
2013  SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2014                                 MVT::f64);
2015  // TODO: Should this propagate fast-math-flags?
2016  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2017
2018  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2019
2020
2021  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2022
2023  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2024                           MVT::i32, FloorMul);
2025  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2026
2027  SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2028
2029  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2030}
2031
2032SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2033
2034  if (getTargetMachine().Options.UnsafeFPMath) {
2035    // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2036    return SDValue();
2037  }
2038
2039  SDLoc DL(Op);
2040  SDValue N0 = Op.getOperand(0);
2041  assert (N0.getSimpleValueType() == MVT::f64);
2042
2043  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2044  const unsigned ExpMask = 0x7ff;
2045  const unsigned ExpBiasf64 = 1023;
2046  const unsigned ExpBiasf16 = 15;
2047  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2048  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2049  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2050  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2051                           DAG.getConstant(32, DL, MVT::i64));
2052  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2053  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2054  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2055                          DAG.getConstant(20, DL, MVT::i64));
2056  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2057                  DAG.getConstant(ExpMask, DL, MVT::i32));
2058  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2059  // add the f16 bias (15) to get the biased exponent for the f16 format.
2060  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2061                  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2062
2063  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2064                          DAG.getConstant(8, DL, MVT::i32));
2065  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2066                  DAG.getConstant(0xffe, DL, MVT::i32));
2067
2068  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2069                                  DAG.getConstant(0x1ff, DL, MVT::i32));
2070  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2071
2072  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2073  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2074
2075  // (M != 0 ? 0x0200 : 0) | 0x7c00;
2076  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2077      DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2078                      Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2079
2080  // N = M | (E << 12);
2081  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2082      DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2083                  DAG.getConstant(12, DL, MVT::i32)));
2084
2085  // B = clamp(1-E, 0, 13);
2086  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2087                                  One, E);
2088  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2089  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2090                  DAG.getConstant(13, DL, MVT::i32));
2091
2092  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2093                                   DAG.getConstant(0x1000, DL, MVT::i32));
2094
2095  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2096  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2097  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2098  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2099
2100  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2101  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2102                              DAG.getConstant(0x7, DL, MVT::i32));
2103  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2104                  DAG.getConstant(2, DL, MVT::i32));
2105  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2106                               One, Zero, ISD::SETEQ);
2107  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2108                               One, Zero, ISD::SETGT);
2109  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2110  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2111
2112  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2113                      DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2114  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2115                      I, V, ISD::SETEQ);
2116
2117  // Extract the sign bit.
2118  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2119                            DAG.getConstant(16, DL, MVT::i32));
2120  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2121                     DAG.getConstant(0x8000, DL, MVT::i32));
2122
2123  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2124  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2125}
2126
2127SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2128                                              SelectionDAG &DAG) const {
2129  SDValue Src = Op.getOperand(0);
2130
2131  // TODO: Factor out code common with LowerFP_TO_UINT.
2132
2133  EVT SrcVT = Src.getValueType();
2134  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2135    SDLoc DL(Op);
2136
2137    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2138    SDValue FpToInt32 =
2139        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2140
2141    return FpToInt32;
2142  }
2143
2144  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2145    return LowerFP64_TO_INT(Op, DAG, true);
2146
2147  return SDValue();
2148}
2149
2150SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2151                                              SelectionDAG &DAG) const {
2152  SDValue Src = Op.getOperand(0);
2153
2154  // TODO: Factor out code common with LowerFP_TO_SINT.
2155
2156  EVT SrcVT = Src.getValueType();
2157  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2158    SDLoc DL(Op);
2159
2160    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2161    SDValue FpToInt32 =
2162        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2163
2164    return FpToInt32;
2165  }
2166
2167  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2168    return LowerFP64_TO_INT(Op, DAG, false);
2169
2170  return SDValue();
2171}
2172
2173SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2174                                                     SelectionDAG &DAG) const {
2175  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2176  MVT VT = Op.getSimpleValueType();
2177  MVT ScalarVT = VT.getScalarType();
2178
2179  assert(VT.isVector());
2180
2181  SDValue Src = Op.getOperand(0);
2182  SDLoc DL(Op);
2183
2184  // TODO: Don't scalarize on Evergreen?
2185  unsigned NElts = VT.getVectorNumElements();
2186  SmallVector<SDValue, 8> Args;
2187  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2188
2189  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2190  for (unsigned I = 0; I < NElts; ++I)
2191    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2192
2193  return DAG.getBuildVector(VT, DL, Args);
2194}
2195
2196//===----------------------------------------------------------------------===//
2197// Custom DAG optimizations
2198//===----------------------------------------------------------------------===//
2199
2200static bool isU24(SDValue Op, SelectionDAG &DAG) {
2201  APInt KnownZero, KnownOne;
2202  EVT VT = Op.getValueType();
2203  DAG.computeKnownBits(Op, KnownZero, KnownOne);
2204
2205  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
2206}
2207
2208static bool isI24(SDValue Op, SelectionDAG &DAG) {
2209  EVT VT = Op.getValueType();
2210
2211  // In order for this to be a signed 24-bit value, bit 23, must
2212  // be a sign bit.
2213  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2214                                     // as unsigned 24-bit values.
2215         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
2216}
2217
2218static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
2219                        TargetLowering::DAGCombinerInfo &DCI) {
2220
2221  SelectionDAG &DAG = DCI.DAG;
2222  SDValue Op = Node24->getOperand(OpIdx);
2223  EVT VT = Op.getValueType();
2224
2225  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
2226  APInt KnownZero, KnownOne;
2227  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
2228  if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI))
2229    return true;
2230
2231  return false;
2232}
2233
2234template <typename IntTy>
2235static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2236                               uint32_t Width, const SDLoc &DL) {
2237  if (Width + Offset < 32) {
2238    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2239    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2240    return DAG.getConstant(Result, DL, MVT::i32);
2241  }
2242
2243  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2244}
2245
2246static bool hasVolatileUser(SDNode *Val) {
2247  for (SDNode *U : Val->uses()) {
2248    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2249      if (M->isVolatile())
2250        return true;
2251    }
2252  }
2253
2254  return false;
2255}
2256
2257bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2258  // i32 vectors are the canonical memory type.
2259  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2260    return false;
2261
2262  if (!VT.isByteSized())
2263    return false;
2264
2265  unsigned Size = VT.getStoreSize();
2266
2267  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2268    return false;
2269
2270  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2271    return false;
2272
2273  return true;
2274}
2275
2276// Replace load of an illegal type with a store of a bitcast to a friendlier
2277// type.
2278SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2279                                                 DAGCombinerInfo &DCI) const {
2280  if (!DCI.isBeforeLegalize())
2281    return SDValue();
2282
2283  LoadSDNode *LN = cast<LoadSDNode>(N);
2284  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2285    return SDValue();
2286
2287  SDLoc SL(N);
2288  SelectionDAG &DAG = DCI.DAG;
2289  EVT VT = LN->getMemoryVT();
2290
2291  unsigned Size = VT.getStoreSize();
2292  unsigned Align = LN->getAlignment();
2293  if (Align < Size && isTypeLegal(VT)) {
2294    bool IsFast;
2295    unsigned AS = LN->getAddressSpace();
2296
2297    // Expand unaligned loads earlier than legalization. Due to visitation order
2298    // problems during legalization, the emitted instructions to pack and unpack
2299    // the bytes again are not eliminated in the case of an unaligned copy.
2300    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2301      if (VT.isVector())
2302        return scalarizeVectorLoad(LN, DAG);
2303
2304      SDValue Ops[2];
2305      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2306      return DAG.getMergeValues(Ops, SDLoc(N));
2307    }
2308
2309    if (!IsFast)
2310      return SDValue();
2311  }
2312
2313  if (!shouldCombineMemoryType(VT))
2314    return SDValue();
2315
2316  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2317
2318  SDValue NewLoad
2319    = DAG.getLoad(NewVT, SL, LN->getChain(),
2320                  LN->getBasePtr(), LN->getMemOperand());
2321
2322  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2323  DCI.CombineTo(N, BC, NewLoad.getValue(1));
2324  return SDValue(N, 0);
2325}
2326
2327// Replace store of an illegal type with a store of a bitcast to a friendlier
2328// type.
2329SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2330                                                  DAGCombinerInfo &DCI) const {
2331  if (!DCI.isBeforeLegalize())
2332    return SDValue();
2333
2334  StoreSDNode *SN = cast<StoreSDNode>(N);
2335  if (SN->isVolatile() || !ISD::isNormalStore(SN))
2336    return SDValue();
2337
2338  EVT VT = SN->getMemoryVT();
2339  unsigned Size = VT.getStoreSize();
2340
2341  SDLoc SL(N);
2342  SelectionDAG &DAG = DCI.DAG;
2343  unsigned Align = SN->getAlignment();
2344  if (Align < Size && isTypeLegal(VT)) {
2345    bool IsFast;
2346    unsigned AS = SN->getAddressSpace();
2347
2348    // Expand unaligned stores earlier than legalization. Due to visitation
2349    // order problems during legalization, the emitted instructions to pack and
2350    // unpack the bytes again are not eliminated in the case of an unaligned
2351    // copy.
2352    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2353      if (VT.isVector())
2354        return scalarizeVectorStore(SN, DAG);
2355
2356      return expandUnalignedStore(SN, DAG);
2357    }
2358
2359    if (!IsFast)
2360      return SDValue();
2361  }
2362
2363  if (!shouldCombineMemoryType(VT))
2364    return SDValue();
2365
2366  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2367  SDValue Val = SN->getValue();
2368
2369  //DCI.AddToWorklist(Val.getNode());
2370
2371  bool OtherUses = !Val.hasOneUse();
2372  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2373  if (OtherUses) {
2374    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2375    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2376  }
2377
2378  return DAG.getStore(SN->getChain(), SL, CastVal,
2379                      SN->getBasePtr(), SN->getMemOperand());
2380}
2381
2382/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2383/// binary operation \p Opc to it with the corresponding constant operands.
2384SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2385  DAGCombinerInfo &DCI, const SDLoc &SL,
2386  unsigned Opc, SDValue LHS,
2387  uint32_t ValLo, uint32_t ValHi) const {
2388  SelectionDAG &DAG = DCI.DAG;
2389  SDValue Lo, Hi;
2390  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2391
2392  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2393  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
2394
2395  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
2396  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
2397
2398  // Re-visit the ands. It's possible we eliminated one of them and it could
2399  // simplify the vector.
2400  DCI.AddToWorklist(Lo.getNode());
2401  DCI.AddToWorklist(Hi.getNode());
2402
2403  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2404  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2405}
2406
2407SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
2408                                                DAGCombinerInfo &DCI) const {
2409  if (N->getValueType(0) != MVT::i64)
2410    return SDValue();
2411
2412  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2413
2414  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
2415  // common case, splitting this into a move and a 32-bit shift is faster and
2416  // the same code size.
2417  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2418  if (!RHS)
2419    return SDValue();
2420
2421  unsigned RHSVal = RHS->getZExtValue();
2422  if (RHSVal < 32)
2423    return SDValue();
2424
2425  SDValue LHS = N->getOperand(0);
2426
2427  SDLoc SL(N);
2428  SelectionDAG &DAG = DCI.DAG;
2429
2430  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
2431
2432  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
2433  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
2434
2435  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2436
2437  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
2438  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2439}
2440
2441SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
2442                                                DAGCombinerInfo &DCI) const {
2443  if (N->getValueType(0) != MVT::i64)
2444    return SDValue();
2445
2446  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2447  if (!RHS)
2448    return SDValue();
2449
2450  SelectionDAG &DAG = DCI.DAG;
2451  SDLoc SL(N);
2452  unsigned RHSVal = RHS->getZExtValue();
2453
2454  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
2455  if (RHSVal == 32) {
2456    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2457    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2458                                   DAG.getConstant(31, SL, MVT::i32));
2459
2460    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
2461    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2462  }
2463
2464  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
2465  if (RHSVal == 63) {
2466    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2467    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2468                                   DAG.getConstant(31, SL, MVT::i32));
2469    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
2470    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2471  }
2472
2473  return SDValue();
2474}
2475
2476SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
2477                                                DAGCombinerInfo &DCI) const {
2478  if (N->getValueType(0) != MVT::i64)
2479    return SDValue();
2480
2481  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2482  if (!RHS)
2483    return SDValue();
2484
2485  unsigned ShiftAmt = RHS->getZExtValue();
2486  if (ShiftAmt < 32)
2487    return SDValue();
2488
2489  // srl i64:x, C for C >= 32
2490  // =>
2491  //   build_pair (srl hi_32(x), C - 32), 0
2492
2493  SelectionDAG &DAG = DCI.DAG;
2494  SDLoc SL(N);
2495
2496  SDValue One = DAG.getConstant(1, SL, MVT::i32);
2497  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2498
2499  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
2500  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
2501                           VecOp, One);
2502
2503  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
2504  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
2505
2506  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
2507
2508  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
2509}
2510
2511// We need to specifically handle i64 mul here to avoid unnecessary conversion
2512// instructions. If we only match on the legalized i64 mul expansion,
2513// SimplifyDemandedBits will be unable to remove them because there will be
2514// multiple uses due to the separate mul + mulh[su].
2515static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
2516                        SDValue N0, SDValue N1, unsigned Size, bool Signed) {
2517  if (Size <= 32) {
2518    unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2519    return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
2520  }
2521
2522  // Because we want to eliminate extension instructions before the
2523  // operation, we need to create a single user here (i.e. not the separate
2524  // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
2525
2526  unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
2527
2528  SDValue Mul = DAG.getNode(MulOpc, SL,
2529                            DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
2530
2531  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
2532                     Mul.getValue(0), Mul.getValue(1));
2533}
2534
2535SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
2536                                                DAGCombinerInfo &DCI) const {
2537  EVT VT = N->getValueType(0);
2538
2539  unsigned Size = VT.getSizeInBits();
2540  if (VT.isVector() || Size > 64)
2541    return SDValue();
2542
2543  // There are i16 integer mul/mad.
2544  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
2545    return SDValue();
2546
2547  SelectionDAG &DAG = DCI.DAG;
2548  SDLoc DL(N);
2549
2550  SDValue N0 = N->getOperand(0);
2551  SDValue N1 = N->getOperand(1);
2552  SDValue Mul;
2553
2554  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
2555    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2556    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2557    Mul = getMul24(DAG, DL, N0, N1, Size, false);
2558  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
2559    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2560    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2561    Mul = getMul24(DAG, DL, N0, N1, Size, true);
2562  } else {
2563    return SDValue();
2564  }
2565
2566  // We need to use sext even for MUL_U24, because MUL_U24 is used
2567  // for signed multiply of 8 and 16-bit types.
2568  return DAG.getSExtOrTrunc(Mul, DL, VT);
2569}
2570
2571SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
2572                                                  DAGCombinerInfo &DCI) const {
2573  EVT VT = N->getValueType(0);
2574
2575  if (!Subtarget->hasMulI24() || VT.isVector())
2576    return SDValue();
2577
2578  SelectionDAG &DAG = DCI.DAG;
2579  SDLoc DL(N);
2580
2581  SDValue N0 = N->getOperand(0);
2582  SDValue N1 = N->getOperand(1);
2583
2584  if (!isI24(N0, DAG) || !isI24(N1, DAG))
2585    return SDValue();
2586
2587  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2588  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2589
2590  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
2591  DCI.AddToWorklist(Mulhi.getNode());
2592  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
2593}
2594
2595SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
2596                                                  DAGCombinerInfo &DCI) const {
2597  EVT VT = N->getValueType(0);
2598
2599  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
2600    return SDValue();
2601
2602  SelectionDAG &DAG = DCI.DAG;
2603  SDLoc DL(N);
2604
2605  SDValue N0 = N->getOperand(0);
2606  SDValue N1 = N->getOperand(1);
2607
2608  if (!isU24(N0, DAG) || !isU24(N1, DAG))
2609    return SDValue();
2610
2611  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2612  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2613
2614  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
2615  DCI.AddToWorklist(Mulhi.getNode());
2616  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
2617}
2618
2619SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
2620  SDNode *N, DAGCombinerInfo &DCI) const {
2621  SelectionDAG &DAG = DCI.DAG;
2622
2623  // Simplify demanded bits before splitting into multiple users.
2624  if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
2625    return SDValue();
2626
2627  SDValue N0 = N->getOperand(0);
2628  SDValue N1 = N->getOperand(1);
2629
2630  bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
2631
2632  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2633  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
2634
2635  SDLoc SL(N);
2636
2637  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
2638  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
2639  return DAG.getMergeValues({ MulLo, MulHi }, SL);
2640}
2641
2642static bool isNegativeOne(SDValue Val) {
2643  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
2644    return C->isAllOnesValue();
2645  return false;
2646}
2647
2648static bool isCtlzOpc(unsigned Opc) {
2649  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2650}
2651
2652SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
2653                                          SDValue Op,
2654                                          const SDLoc &DL) const {
2655  EVT VT = Op.getValueType();
2656  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
2657  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
2658                              LegalVT != MVT::i16))
2659    return SDValue();
2660
2661  if (VT != MVT::i32)
2662    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
2663
2664  SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
2665  if (VT != MVT::i32)
2666    FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
2667
2668  return FFBH;
2669}
2670
2671// The native instructions return -1 on 0 input. Optimize out a select that
2672// produces -1 on 0.
2673//
2674// TODO: If zero is not undef, we could also do this if the output is compared
2675// against the bitwidth.
2676//
2677// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
2678SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
2679                                                 SDValue LHS, SDValue RHS,
2680                                                 DAGCombinerInfo &DCI) const {
2681  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
2682  if (!CmpRhs || !CmpRhs->isNullValue())
2683    return SDValue();
2684
2685  SelectionDAG &DAG = DCI.DAG;
2686  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2687  SDValue CmpLHS = Cond.getOperand(0);
2688
2689  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
2690  if (CCOpcode == ISD::SETEQ &&
2691      isCtlzOpc(RHS.getOpcode()) &&
2692      RHS.getOperand(0) == CmpLHS &&
2693      isNegativeOne(LHS)) {
2694    return getFFBH_U32(DAG, CmpLHS, SL);
2695  }
2696
2697  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
2698  if (CCOpcode == ISD::SETNE &&
2699      isCtlzOpc(LHS.getOpcode()) &&
2700      LHS.getOperand(0) == CmpLHS &&
2701      isNegativeOne(RHS)) {
2702    return getFFBH_U32(DAG, CmpLHS, SL);
2703  }
2704
2705  return SDValue();
2706}
2707
2708static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
2709                                         unsigned Op,
2710                                         const SDLoc &SL,
2711                                         SDValue Cond,
2712                                         SDValue N1,
2713                                         SDValue N2) {
2714  SelectionDAG &DAG = DCI.DAG;
2715  EVT VT = N1.getValueType();
2716
2717  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
2718                                  N1.getOperand(0), N2.getOperand(0));
2719  DCI.AddToWorklist(NewSelect.getNode());
2720  return DAG.getNode(Op, SL, VT, NewSelect);
2721}
2722
2723// Pull a free FP operation out of a select so it may fold into uses.
2724//
2725// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
2726// select c, (fneg x), k -> fneg (select c, x, (fneg k))
2727//
2728// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
2729// select c, (fabs x), +k -> fabs (select c, x, k)
2730static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
2731                                    SDValue N) {
2732  SelectionDAG &DAG = DCI.DAG;
2733  SDValue Cond = N.getOperand(0);
2734  SDValue LHS = N.getOperand(1);
2735  SDValue RHS = N.getOperand(2);
2736
2737  EVT VT = N.getValueType();
2738  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
2739      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
2740    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
2741                                     SDLoc(N), Cond, LHS, RHS);
2742  }
2743
2744  bool Inv = false;
2745  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
2746    std::swap(LHS, RHS);
2747    Inv = true;
2748  }
2749
2750  // TODO: Support vector constants.
2751  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
2752  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
2753    SDLoc SL(N);
2754    // If one side is an fneg/fabs and the other is a constant, we can push the
2755    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
2756    SDValue NewLHS = LHS.getOperand(0);
2757    SDValue NewRHS = RHS;
2758
2759    // Careful: if the neg can be folded up, don't try to pull it back down.
2760    bool ShouldFoldNeg = true;
2761
2762    if (NewLHS.hasOneUse()) {
2763      unsigned Opc = NewLHS.getOpcode();
2764      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
2765        ShouldFoldNeg = false;
2766      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
2767        ShouldFoldNeg = false;
2768    }
2769
2770    if (ShouldFoldNeg) {
2771      if (LHS.getOpcode() == ISD::FNEG)
2772        NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2773      else if (CRHS->isNegative())
2774        return SDValue();
2775
2776      if (Inv)
2777        std::swap(NewLHS, NewRHS);
2778
2779      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
2780                                      Cond, NewLHS, NewRHS);
2781      DCI.AddToWorklist(NewSelect.getNode());
2782      return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
2783    }
2784  }
2785
2786  return SDValue();
2787}
2788
2789
2790SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
2791                                                   DAGCombinerInfo &DCI) const {
2792  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
2793    return Folded;
2794
2795  SDValue Cond = N->getOperand(0);
2796  if (Cond.getOpcode() != ISD::SETCC)
2797    return SDValue();
2798
2799  EVT VT = N->getValueType(0);
2800  SDValue LHS = Cond.getOperand(0);
2801  SDValue RHS = Cond.getOperand(1);
2802  SDValue CC = Cond.getOperand(2);
2803
2804  SDValue True = N->getOperand(1);
2805  SDValue False = N->getOperand(2);
2806
2807  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
2808    SelectionDAG &DAG = DCI.DAG;
2809    if ((DAG.isConstantValueOfAnyType(True) ||
2810         DAG.isConstantValueOfAnyType(True)) &&
2811        (!DAG.isConstantValueOfAnyType(False) &&
2812         !DAG.isConstantValueOfAnyType(False))) {
2813      // Swap cmp + select pair to move constant to false input.
2814      // This will allow using VOPC cndmasks more often.
2815      // select (setcc x, y), k, x -> select (setcc y, x) x, x
2816
2817      SDLoc SL(N);
2818      ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
2819                                            LHS.getValueType().isInteger());
2820
2821      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
2822      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
2823    }
2824  }
2825
2826  if (VT == MVT::f32 && Cond.hasOneUse()) {
2827    SDValue MinMax
2828      = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
2829    // Revisit this node so we can catch min3/max3/med3 patterns.
2830    //DCI.AddToWorklist(MinMax.getNode());
2831    return MinMax;
2832  }
2833
2834  // There's no reason to not do this if the condition has other uses.
2835  return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
2836}
2837
2838SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
2839                                                 DAGCombinerInfo &DCI) const {
2840  SelectionDAG &DAG = DCI.DAG;
2841  SDValue N0 = N->getOperand(0);
2842  EVT VT = N->getValueType(0);
2843
2844  unsigned Opc = N0.getOpcode();
2845
2846  // If the input has multiple uses and we can either fold the negate down, or
2847  // the other uses cannot, give up. This both prevents unprofitable
2848  // transformations and infinite loops: we won't repeatedly try to fold around
2849  // a negate that has no 'good' form.
2850  //
2851  // TODO: Check users can fold
2852  if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
2853    return SDValue();
2854
2855  SDLoc SL(N);
2856  switch (Opc) {
2857  case ISD::FADD: {
2858    if (!mayIgnoreSignedZero(N0))
2859      return SDValue();
2860
2861    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
2862    SDValue LHS = N0.getOperand(0);
2863    SDValue RHS = N0.getOperand(1);
2864
2865    if (LHS.getOpcode() != ISD::FNEG)
2866      LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
2867    else
2868      LHS = LHS.getOperand(0);
2869
2870    if (RHS.getOpcode() != ISD::FNEG)
2871      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2872    else
2873      RHS = RHS.getOperand(0);
2874
2875    SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS);
2876    if (!N0.hasOneUse())
2877      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
2878    return Res;
2879  }
2880  case ISD::FMUL:
2881  case AMDGPUISD::FMUL_LEGACY: {
2882    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
2883    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
2884    SDValue LHS = N0.getOperand(0);
2885    SDValue RHS = N0.getOperand(1);
2886
2887    if (LHS.getOpcode() == ISD::FNEG)
2888      LHS = LHS.getOperand(0);
2889    else if (RHS.getOpcode() == ISD::FNEG)
2890      RHS = RHS.getOperand(0);
2891    else
2892      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2893
2894    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS);
2895    if (!N0.hasOneUse())
2896      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
2897    return Res;
2898  }
2899  case ISD::FMA:
2900  case ISD::FMAD: {
2901    if (!mayIgnoreSignedZero(N0))
2902      return SDValue();
2903
2904    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
2905    SDValue LHS = N0.getOperand(0);
2906    SDValue MHS = N0.getOperand(1);
2907    SDValue RHS = N0.getOperand(2);
2908
2909    if (LHS.getOpcode() == ISD::FNEG)
2910      LHS = LHS.getOperand(0);
2911    else if (MHS.getOpcode() == ISD::FNEG)
2912      MHS = MHS.getOperand(0);
2913    else
2914      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
2915
2916    if (RHS.getOpcode() != ISD::FNEG)
2917      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2918    else
2919      RHS = RHS.getOperand(0);
2920
2921    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
2922    if (!N0.hasOneUse())
2923      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
2924    return Res;
2925  }
2926  case ISD::FP_EXTEND:
2927  case AMDGPUISD::RCP:
2928  case AMDGPUISD::RCP_LEGACY:
2929  case ISD::FSIN:
2930  case AMDGPUISD::SIN_HW: {
2931    SDValue CvtSrc = N0.getOperand(0);
2932    if (CvtSrc.getOpcode() == ISD::FNEG) {
2933      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
2934      // (fneg (rcp (fneg x))) -> (rcp x)
2935      return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
2936    }
2937
2938    if (!N0.hasOneUse())
2939      return SDValue();
2940
2941    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
2942    // (fneg (rcp x)) -> (rcp (fneg x))
2943    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
2944    return DAG.getNode(Opc, SL, VT, Neg);
2945  }
2946  case ISD::FP_ROUND: {
2947    SDValue CvtSrc = N0.getOperand(0);
2948
2949    if (CvtSrc.getOpcode() == ISD::FNEG) {
2950      // (fneg (fp_round (fneg x))) -> (fp_round x)
2951      return DAG.getNode(ISD::FP_ROUND, SL, VT,
2952                         CvtSrc.getOperand(0), N0.getOperand(1));
2953    }
2954
2955    if (!N0.hasOneUse())
2956      return SDValue();
2957
2958    // (fneg (fp_round x)) -> (fp_round (fneg x))
2959    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
2960    return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
2961  }
2962  default:
2963    return SDValue();
2964  }
2965}
2966
2967SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
2968                                                DAGCombinerInfo &DCI) const {
2969  SelectionDAG &DAG = DCI.DAG;
2970  SDLoc DL(N);
2971
2972  switch(N->getOpcode()) {
2973  default:
2974    break;
2975  case ISD::BITCAST: {
2976    EVT DestVT = N->getValueType(0);
2977
2978    // Push casts through vector builds. This helps avoid emitting a large
2979    // number of copies when materializing floating point vector constants.
2980    //
2981    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
2982    //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
2983    if (DestVT.isVector()) {
2984      SDValue Src = N->getOperand(0);
2985      if (Src.getOpcode() == ISD::BUILD_VECTOR) {
2986        EVT SrcVT = Src.getValueType();
2987        unsigned NElts = DestVT.getVectorNumElements();
2988
2989        if (SrcVT.getVectorNumElements() == NElts) {
2990          EVT DestEltVT = DestVT.getVectorElementType();
2991
2992          SmallVector<SDValue, 8> CastedElts;
2993          SDLoc SL(N);
2994          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
2995            SDValue Elt = Src.getOperand(I);
2996            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
2997          }
2998
2999          return DAG.getBuildVector(DestVT, SL, CastedElts);
3000        }
3001      }
3002    }
3003
3004    if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3005      break;
3006
3007    // Fold bitcasts of constants.
3008    //
3009    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3010    // TODO: Generalize and move to DAGCombiner
3011    SDValue Src = N->getOperand(0);
3012    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3013      assert(Src.getValueType() == MVT::i64);
3014      SDLoc SL(N);
3015      uint64_t CVal = C->getZExtValue();
3016      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3017                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3018                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3019    }
3020
3021    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3022      const APInt &Val = C->getValueAPF().bitcastToAPInt();
3023      SDLoc SL(N);
3024      uint64_t CVal = Val.getZExtValue();
3025      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3026                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3027                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3028
3029      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3030    }
3031
3032    break;
3033  }
3034  case ISD::SHL: {
3035    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3036      break;
3037
3038    return performShlCombine(N, DCI);
3039  }
3040  case ISD::SRL: {
3041    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3042      break;
3043
3044    return performSrlCombine(N, DCI);
3045  }
3046  case ISD::SRA: {
3047    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3048      break;
3049
3050    return performSraCombine(N, DCI);
3051  }
3052  case ISD::MUL:
3053    return performMulCombine(N, DCI);
3054  case ISD::MULHS:
3055    return performMulhsCombine(N, DCI);
3056  case ISD::MULHU:
3057    return performMulhuCombine(N, DCI);
3058  case AMDGPUISD::MUL_I24:
3059  case AMDGPUISD::MUL_U24:
3060  case AMDGPUISD::MULHI_I24:
3061  case AMDGPUISD::MULHI_U24: {
3062    // If the first call to simplify is successfull, then N may end up being
3063    // deleted, so we shouldn't call simplifyI24 again.
3064    simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
3065    return SDValue();
3066  }
3067  case AMDGPUISD::MUL_LOHI_I24:
3068  case AMDGPUISD::MUL_LOHI_U24:
3069    return performMulLoHi24Combine(N, DCI);
3070  case ISD::SELECT:
3071    return performSelectCombine(N, DCI);
3072  case ISD::FNEG:
3073    return performFNegCombine(N, DCI);
3074  case AMDGPUISD::BFE_I32:
3075  case AMDGPUISD::BFE_U32: {
3076    assert(!N->getValueType(0).isVector() &&
3077           "Vector handling of BFE not implemented");
3078    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3079    if (!Width)
3080      break;
3081
3082    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3083    if (WidthVal == 0)
3084      return DAG.getConstant(0, DL, MVT::i32);
3085
3086    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3087    if (!Offset)
3088      break;
3089
3090    SDValue BitsFrom = N->getOperand(0);
3091    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
3092
3093    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
3094
3095    if (OffsetVal == 0) {
3096      // This is already sign / zero extended, so try to fold away extra BFEs.
3097      unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
3098
3099      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
3100      if (OpSignBits >= SignBits)
3101        return BitsFrom;
3102
3103      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
3104      if (Signed) {
3105        // This is a sign_extend_inreg. Replace it to take advantage of existing
3106        // DAG Combines. If not eliminated, we will match back to BFE during
3107        // selection.
3108
3109        // TODO: The sext_inreg of extended types ends, although we can could
3110        // handle them in a single BFE.
3111        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
3112                           DAG.getValueType(SmallVT));
3113      }
3114
3115      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
3116    }
3117
3118    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
3119      if (Signed) {
3120        return constantFoldBFE<int32_t>(DAG,
3121                                        CVal->getSExtValue(),
3122                                        OffsetVal,
3123                                        WidthVal,
3124                                        DL);
3125      }
3126
3127      return constantFoldBFE<uint32_t>(DAG,
3128                                       CVal->getZExtValue(),
3129                                       OffsetVal,
3130                                       WidthVal,
3131                                       DL);
3132    }
3133
3134    if ((OffsetVal + WidthVal) >= 32) {
3135      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
3136      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
3137                         BitsFrom, ShiftVal);
3138    }
3139
3140    if (BitsFrom.hasOneUse()) {
3141      APInt Demanded = APInt::getBitsSet(32,
3142                                         OffsetVal,
3143                                         OffsetVal + WidthVal);
3144
3145      APInt KnownZero, KnownOne;
3146      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
3147                                            !DCI.isBeforeLegalizeOps());
3148      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3149      if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
3150          TLI.SimplifyDemandedBits(BitsFrom, Demanded,
3151                                   KnownZero, KnownOne, TLO)) {
3152        DCI.CommitTargetLoweringOpt(TLO);
3153      }
3154    }
3155
3156    break;
3157  }
3158  case ISD::LOAD:
3159    return performLoadCombine(N, DCI);
3160  case ISD::STORE:
3161    return performStoreCombine(N, DCI);
3162  }
3163  return SDValue();
3164}
3165
3166//===----------------------------------------------------------------------===//
3167// Helper functions
3168//===----------------------------------------------------------------------===//
3169
3170SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3171                                                  const TargetRegisterClass *RC,
3172                                                   unsigned Reg, EVT VT) const {
3173  MachineFunction &MF = DAG.getMachineFunction();
3174  MachineRegisterInfo &MRI = MF.getRegInfo();
3175  unsigned VirtualRegister;
3176  if (!MRI.isLiveIn(Reg)) {
3177    VirtualRegister = MRI.createVirtualRegister(RC);
3178    MRI.addLiveIn(Reg, VirtualRegister);
3179  } else {
3180    VirtualRegister = MRI.getLiveInVirtReg(Reg);
3181  }
3182  return DAG.getRegister(VirtualRegister, VT);
3183}
3184
3185uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
3186    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
3187  unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
3188  uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
3189  switch (Param) {
3190  case GRID_DIM:
3191    return ArgOffset;
3192  case GRID_OFFSET:
3193    return ArgOffset + 4;
3194  }
3195  llvm_unreachable("unexpected implicit parameter type");
3196}
3197
3198#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
3199
3200const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
3201  switch ((AMDGPUISD::NodeType)Opcode) {
3202  case AMDGPUISD::FIRST_NUMBER: break;
3203  // AMDIL DAG nodes
3204  NODE_NAME_CASE(CALL);
3205  NODE_NAME_CASE(UMUL);
3206  NODE_NAME_CASE(BRANCH_COND);
3207
3208  // AMDGPU DAG nodes
3209  NODE_NAME_CASE(ENDPGM)
3210  NODE_NAME_CASE(RETURN)
3211  NODE_NAME_CASE(DWORDADDR)
3212  NODE_NAME_CASE(FRACT)
3213  NODE_NAME_CASE(SETCC)
3214  NODE_NAME_CASE(SETREG)
3215  NODE_NAME_CASE(FMA_W_CHAIN)
3216  NODE_NAME_CASE(FMUL_W_CHAIN)
3217  NODE_NAME_CASE(CLAMP)
3218  NODE_NAME_CASE(COS_HW)
3219  NODE_NAME_CASE(SIN_HW)
3220  NODE_NAME_CASE(FMAX_LEGACY)
3221  NODE_NAME_CASE(FMIN_LEGACY)
3222  NODE_NAME_CASE(FMAX3)
3223  NODE_NAME_CASE(SMAX3)
3224  NODE_NAME_CASE(UMAX3)
3225  NODE_NAME_CASE(FMIN3)
3226  NODE_NAME_CASE(SMIN3)
3227  NODE_NAME_CASE(UMIN3)
3228  NODE_NAME_CASE(FMED3)
3229  NODE_NAME_CASE(SMED3)
3230  NODE_NAME_CASE(UMED3)
3231  NODE_NAME_CASE(URECIP)
3232  NODE_NAME_CASE(DIV_SCALE)
3233  NODE_NAME_CASE(DIV_FMAS)
3234  NODE_NAME_CASE(DIV_FIXUP)
3235  NODE_NAME_CASE(TRIG_PREOP)
3236  NODE_NAME_CASE(RCP)
3237  NODE_NAME_CASE(RSQ)
3238  NODE_NAME_CASE(RCP_LEGACY)
3239  NODE_NAME_CASE(RSQ_LEGACY)
3240  NODE_NAME_CASE(FMUL_LEGACY)
3241  NODE_NAME_CASE(RSQ_CLAMP)
3242  NODE_NAME_CASE(LDEXP)
3243  NODE_NAME_CASE(FP_CLASS)
3244  NODE_NAME_CASE(DOT4)
3245  NODE_NAME_CASE(CARRY)
3246  NODE_NAME_CASE(BORROW)
3247  NODE_NAME_CASE(BFE_U32)
3248  NODE_NAME_CASE(BFE_I32)
3249  NODE_NAME_CASE(BFI)
3250  NODE_NAME_CASE(BFM)
3251  NODE_NAME_CASE(FFBH_U32)
3252  NODE_NAME_CASE(FFBH_I32)
3253  NODE_NAME_CASE(MUL_U24)
3254  NODE_NAME_CASE(MUL_I24)
3255  NODE_NAME_CASE(MULHI_U24)
3256  NODE_NAME_CASE(MULHI_I24)
3257  NODE_NAME_CASE(MUL_LOHI_U24)
3258  NODE_NAME_CASE(MUL_LOHI_I24)
3259  NODE_NAME_CASE(MAD_U24)
3260  NODE_NAME_CASE(MAD_I24)
3261  NODE_NAME_CASE(TEXTURE_FETCH)
3262  NODE_NAME_CASE(EXPORT)
3263  NODE_NAME_CASE(EXPORT_DONE)
3264  NODE_NAME_CASE(R600_EXPORT)
3265  NODE_NAME_CASE(CONST_ADDRESS)
3266  NODE_NAME_CASE(REGISTER_LOAD)
3267  NODE_NAME_CASE(REGISTER_STORE)
3268  NODE_NAME_CASE(LOAD_INPUT)
3269  NODE_NAME_CASE(SAMPLE)
3270  NODE_NAME_CASE(SAMPLEB)
3271  NODE_NAME_CASE(SAMPLED)
3272  NODE_NAME_CASE(SAMPLEL)
3273  NODE_NAME_CASE(CVT_F32_UBYTE0)
3274  NODE_NAME_CASE(CVT_F32_UBYTE1)
3275  NODE_NAME_CASE(CVT_F32_UBYTE2)
3276  NODE_NAME_CASE(CVT_F32_UBYTE3)
3277  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
3278  NODE_NAME_CASE(CONST_DATA_PTR)
3279  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
3280  NODE_NAME_CASE(KILL)
3281  NODE_NAME_CASE(DUMMY_CHAIN)
3282  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
3283  NODE_NAME_CASE(SENDMSG)
3284  NODE_NAME_CASE(SENDMSGHALT)
3285  NODE_NAME_CASE(INTERP_MOV)
3286  NODE_NAME_CASE(INTERP_P1)
3287  NODE_NAME_CASE(INTERP_P2)
3288  NODE_NAME_CASE(STORE_MSKOR)
3289  NODE_NAME_CASE(LOAD_CONSTANT)
3290  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
3291  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
3292  NODE_NAME_CASE(ATOMIC_INC)
3293  NODE_NAME_CASE(ATOMIC_DEC)
3294  NODE_NAME_CASE(BUFFER_LOAD)
3295  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
3296  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
3297  }
3298  return nullptr;
3299}
3300
3301SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
3302                                              SelectionDAG &DAG, int Enabled,
3303                                              int &RefinementSteps,
3304                                              bool &UseOneConstNR,
3305                                              bool Reciprocal) const {
3306  EVT VT = Operand.getValueType();
3307
3308  if (VT == MVT::f32) {
3309    RefinementSteps = 0;
3310    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
3311  }
3312
3313  // TODO: There is also f64 rsq instruction, but the documentation is less
3314  // clear on its precision.
3315
3316  return SDValue();
3317}
3318
3319SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
3320                                               SelectionDAG &DAG, int Enabled,
3321                                               int &RefinementSteps) const {
3322  EVT VT = Operand.getValueType();
3323
3324  if (VT == MVT::f32) {
3325    // Reciprocal, < 1 ulp error.
3326    //
3327    // This reciprocal approximation converges to < 0.5 ulp error with one
3328    // newton rhapson performed with two fused multiple adds (FMAs).
3329
3330    RefinementSteps = 0;
3331    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
3332  }
3333
3334  // TODO: There is also f64 rcp instruction, but the documentation is less
3335  // clear on its precision.
3336
3337  return SDValue();
3338}
3339
3340void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
3341  const SDValue Op,
3342  APInt &KnownZero,
3343  APInt &KnownOne,
3344  const SelectionDAG &DAG,
3345  unsigned Depth) const {
3346
3347  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
3348
3349  APInt KnownZero2;
3350  APInt KnownOne2;
3351  unsigned Opc = Op.getOpcode();
3352
3353  switch (Opc) {
3354  default:
3355    break;
3356  case AMDGPUISD::CARRY:
3357  case AMDGPUISD::BORROW: {
3358    KnownZero = APInt::getHighBitsSet(32, 31);
3359    break;
3360  }
3361
3362  case AMDGPUISD::BFE_I32:
3363  case AMDGPUISD::BFE_U32: {
3364    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3365    if (!CWidth)
3366      return;
3367
3368    unsigned BitWidth = 32;
3369    uint32_t Width = CWidth->getZExtValue() & 0x1f;
3370
3371    if (Opc == AMDGPUISD::BFE_U32)
3372      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
3373
3374    break;
3375  }
3376  }
3377}
3378
3379unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
3380  SDValue Op,
3381  const SelectionDAG &DAG,
3382  unsigned Depth) const {
3383  switch (Op.getOpcode()) {
3384  case AMDGPUISD::BFE_I32: {
3385    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3386    if (!Width)
3387      return 1;
3388
3389    unsigned SignBits = 32 - Width->getZExtValue() + 1;
3390    if (!isNullConstant(Op.getOperand(1)))
3391      return SignBits;
3392
3393    // TODO: Could probably figure something out with non-0 offsets.
3394    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
3395    return std::max(SignBits, Op0SignBits);
3396  }
3397
3398  case AMDGPUISD::BFE_U32: {
3399    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3400    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
3401  }
3402
3403  case AMDGPUISD::CARRY:
3404  case AMDGPUISD::BORROW:
3405    return 31;
3406
3407  default:
3408    return 1;
3409  }
3410}
3411