AMDGPUISelLowering.cpp revision 329410
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief This is the parent TargetLowering class for hardware code gen
12/// targets.
13//
14//===----------------------------------------------------------------------===//
15
16#define AMDGPU_LOG2E_F     1.44269504088896340735992468100189214f
17#define AMDGPU_LN2_F       0.693147180559945309417232121458176568f
18#define AMDGPU_LN10_F      2.30258509299404568401799145468436421f
19
20#include "AMDGPUISelLowering.h"
21#include "AMDGPU.h"
22#include "AMDGPUCallLowering.h"
23#include "AMDGPUFrameLowering.h"
24#include "AMDGPUIntrinsicInfo.h"
25#include "AMDGPURegisterInfo.h"
26#include "AMDGPUSubtarget.h"
27#include "AMDGPUTargetMachine.h"
28#include "R600MachineFunctionInfo.h"
29#include "SIInstrInfo.h"
30#include "SIMachineFunctionInfo.h"
31#include "llvm/CodeGen/CallingConvLower.h"
32#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineRegisterInfo.h"
34#include "llvm/CodeGen/SelectionDAG.h"
35#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
36#include "llvm/IR/DataLayout.h"
37#include "llvm/IR/DiagnosticInfo.h"
38#include "llvm/Support/KnownBits.h"
39using namespace llvm;
40
41static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
42                            CCValAssign::LocInfo LocInfo,
43                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
44  MachineFunction &MF = State.getMachineFunction();
45  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
46
47  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
48                                         ArgFlags.getOrigAlign());
49  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
50  return true;
51}
52
53static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
54                           CCValAssign::LocInfo LocInfo,
55                           ISD::ArgFlagsTy ArgFlags, CCState &State,
56                           const TargetRegisterClass *RC,
57                           unsigned NumRegs) {
58  ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
59  unsigned RegResult = State.AllocateReg(RegList);
60  if (RegResult == AMDGPU::NoRegister)
61    return false;
62
63  State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
64  return true;
65}
66
67static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
68                              CCValAssign::LocInfo LocInfo,
69                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
70  switch (LocVT.SimpleTy) {
71  case MVT::i64:
72  case MVT::f64:
73  case MVT::v2i32:
74  case MVT::v2f32: {
75    // Up to SGPR0-SGPR39
76    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
77                          &AMDGPU::SGPR_64RegClass, 20);
78  }
79  default:
80    return false;
81  }
82}
83
84// Allocate up to VGPR31.
85//
// TODO: Since there are no VGPR alignment requirements, would it be better to
// split into individual scalar registers?
88static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
89                              CCValAssign::LocInfo LocInfo,
90                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
91  switch (LocVT.SimpleTy) {
92  case MVT::i64:
93  case MVT::f64:
94  case MVT::v2i32:
95  case MVT::v2f32: {
96    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
97                          &AMDGPU::VReg_64RegClass, 31);
98  }
99  case MVT::v4i32:
100  case MVT::v4f32:
101  case MVT::v2i64:
102  case MVT::v2f64: {
103    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
104                          &AMDGPU::VReg_128RegClass, 29);
105  }
106  case MVT::v8i32:
107  case MVT::v8f32: {
108    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
109                          &AMDGPU::VReg_256RegClass, 25);
110
111  }
112  case MVT::v16i32:
113  case MVT::v16f32: {
114    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
115                          &AMDGPU::VReg_512RegClass, 17);
116
117  }
118  default:
119    return false;
120  }
121}
122
123#include "AMDGPUGenCallingConv.inc"
124
125// Find a larger type to do a load / store of a vector with.
126EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
127  unsigned StoreSize = VT.getStoreSizeInBits();
128  if (StoreSize <= 32)
129    return EVT::getIntegerVT(Ctx, StoreSize);
130
131  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
132  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
133}
134
135unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
136  KnownBits Known;
137  EVT VT = Op.getValueType();
138  DAG.computeKnownBits(Op, Known);
139
140  return VT.getSizeInBits() - Known.countMinLeadingZeros();
141}
142
143unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
144  EVT VT = Op.getValueType();
145
146  // In order for this to be a signed 24-bit value, bit 23, must
147  // be a sign bit.
148  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
149}
150
151AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
152                                           const AMDGPUSubtarget &STI)
153    : TargetLowering(TM), Subtarget(&STI) {
154  AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
155  // Lower floating point store/load to integer store/load to reduce the number
156  // of patterns in tablegen.
157  setOperationAction(ISD::LOAD, MVT::f32, Promote);
158  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
159
160  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
161  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
162
163  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
164  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
165
166  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
167  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
168
169  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
170  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
171
172  setOperationAction(ISD::LOAD, MVT::i64, Promote);
173  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
174
175  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
176  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
177
178  setOperationAction(ISD::LOAD, MVT::f64, Promote);
179  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
180
181  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
182  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
183
184  // There are no 64-bit extloads. These should be done as a 32-bit extload and
185  // an extension to 64-bit.
186  for (MVT VT : MVT::integer_valuetypes()) {
187    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
188    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
189    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
190  }
191
192  for (MVT VT : MVT::integer_valuetypes()) {
193    if (VT == MVT::i64)
194      continue;
195
196    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
197    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
198    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
199    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
200
201    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
202    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
203    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
204    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
205
206    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
207    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
208    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
209    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
210  }
211
212  for (MVT VT : MVT::integer_vector_valuetypes()) {
213    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
214    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
215    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
216    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
217    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
218    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
219    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
220    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
221    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
222    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
223    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
224    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
225  }
226
227  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
228  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
229  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
230  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
231
232  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
233  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
234  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
235  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
236
237  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
238  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
239  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
240  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
241
242  setOperationAction(ISD::STORE, MVT::f32, Promote);
243  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
245  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
246  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
248  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
249  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
250
251  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
252  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
253
254  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
255  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
256
257  setOperationAction(ISD::STORE, MVT::i64, Promote);
258  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
259
260  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
261  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
262
263  setOperationAction(ISD::STORE, MVT::f64, Promote);
264  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
265
266  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
267  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
268
269  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
270  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
271  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
272  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
273
274  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
275  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
276  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
277  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
278
279  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
280  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
281  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
282  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
283
284  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
285  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
286
287  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
288  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
289
290  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
291  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
292
293  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
294  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
295
296
297  setOperationAction(ISD::Constant, MVT::i32, Legal);
298  setOperationAction(ISD::Constant, MVT::i64, Legal);
299  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
300  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
301
302  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
303  setOperationAction(ISD::BRIND, MVT::Other, Expand);
304
305  // This is totally unsupported, just custom lower to produce an error.
306  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
307
308  // Library functions.  These default to Expand, but we have instructions
309  // for them.
310  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
311  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
312  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
313  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
314  setOperationAction(ISD::FABS,   MVT::f32, Legal);
315  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
316  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
317  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
318  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
319  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
320
321  setOperationAction(ISD::FROUND, MVT::f32, Custom);
322  setOperationAction(ISD::FROUND, MVT::f64, Custom);
323
324  setOperationAction(ISD::FLOG, MVT::f32, Custom);
325  setOperationAction(ISD::FLOG10, MVT::f32, Custom);
326
327  if (Subtarget->has16BitInsts()) {
328    setOperationAction(ISD::FLOG, MVT::f16, Custom);
329    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
330  }
331
332  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
333  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
334
335  setOperationAction(ISD::FREM, MVT::f32, Custom);
336  setOperationAction(ISD::FREM, MVT::f64, Custom);
337
338  // v_mad_f32 does not support denormals according to some sources.
339  if (!Subtarget->hasFP32Denormals())
340    setOperationAction(ISD::FMAD, MVT::f32, Legal);
341
342  // Expand to fneg + fadd.
343  setOperationAction(ISD::FSUB, MVT::f64, Expand);
344
345  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
346  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
347  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
348  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
349  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
350  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
351  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
352  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
353  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
354  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
355
356  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
357    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
358    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
359    setOperationAction(ISD::FRINT, MVT::f64, Custom);
360    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
361  }
362
363  if (!Subtarget->hasBFI()) {
364    // fcopysign can be done in a single instruction with BFI.
365    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
366    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
367  }
368
369  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
370  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
371  setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
372
373  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
374  for (MVT VT : ScalarIntVTs) {
375    // These should use [SU]DIVREM, so set them to expand
376    setOperationAction(ISD::SDIV, VT, Expand);
377    setOperationAction(ISD::UDIV, VT, Expand);
378    setOperationAction(ISD::SREM, VT, Expand);
379    setOperationAction(ISD::UREM, VT, Expand);
380
381    // GPU does not have divrem function for signed or unsigned.
382    setOperationAction(ISD::SDIVREM, VT, Custom);
383    setOperationAction(ISD::UDIVREM, VT, Custom);
384
385    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
386    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
387    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
388
389    setOperationAction(ISD::BSWAP, VT, Expand);
390    setOperationAction(ISD::CTTZ, VT, Expand);
391    setOperationAction(ISD::CTLZ, VT, Expand);
392  }
393
394  if (!Subtarget->hasBCNT(32))
395    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
396
397  if (!Subtarget->hasBCNT(64))
398    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
399
400  // The hardware supports 32-bit ROTR, but not ROTL.
401  setOperationAction(ISD::ROTL, MVT::i32, Expand);
402  setOperationAction(ISD::ROTL, MVT::i64, Expand);
403  setOperationAction(ISD::ROTR, MVT::i64, Expand);
404
405  setOperationAction(ISD::MUL, MVT::i64, Expand);
406  setOperationAction(ISD::MULHU, MVT::i64, Expand);
407  setOperationAction(ISD::MULHS, MVT::i64, Expand);
408  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
409  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
410  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
411  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
412  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
413
414  setOperationAction(ISD::SMIN, MVT::i32, Legal);
415  setOperationAction(ISD::UMIN, MVT::i32, Legal);
416  setOperationAction(ISD::SMAX, MVT::i32, Legal);
417  setOperationAction(ISD::UMAX, MVT::i32, Legal);
418
419  if (Subtarget->hasFFBH())
420    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
421
422  if (Subtarget->hasFFBL())
423    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
424
425  setOperationAction(ISD::CTTZ, MVT::i64, Custom);
426  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
427  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
428  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
429
430  // We only really have 32-bit BFE instructions (and 16-bit on VI).
431  //
432  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
433  // effort to match them now. We want this to be false for i64 cases when the
434  // extraction isn't restricted to the upper or lower half. Ideally we would
435  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
436  // span the midpoint are probably relatively rare, so don't worry about them
437  // for now.
438  if (Subtarget->hasBFE())
439    setHasExtractBitsInsn(true);
440
441  static const MVT::SimpleValueType VectorIntTypes[] = {
442    MVT::v2i32, MVT::v4i32
443  };
444
445  for (MVT VT : VectorIntTypes) {
446    // Expand the following operations for the current type by default.
447    setOperationAction(ISD::ADD,  VT, Expand);
448    setOperationAction(ISD::AND,  VT, Expand);
449    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
450    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
451    setOperationAction(ISD::MUL,  VT, Expand);
452    setOperationAction(ISD::MULHU, VT, Expand);
453    setOperationAction(ISD::MULHS, VT, Expand);
454    setOperationAction(ISD::OR,   VT, Expand);
455    setOperationAction(ISD::SHL,  VT, Expand);
456    setOperationAction(ISD::SRA,  VT, Expand);
457    setOperationAction(ISD::SRL,  VT, Expand);
458    setOperationAction(ISD::ROTL, VT, Expand);
459    setOperationAction(ISD::ROTR, VT, Expand);
460    setOperationAction(ISD::SUB,  VT, Expand);
461    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
462    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
463    setOperationAction(ISD::SDIV, VT, Expand);
464    setOperationAction(ISD::UDIV, VT, Expand);
465    setOperationAction(ISD::SREM, VT, Expand);
466    setOperationAction(ISD::UREM, VT, Expand);
467    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
468    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
469    setOperationAction(ISD::SDIVREM, VT, Custom);
470    setOperationAction(ISD::UDIVREM, VT, Expand);
471    setOperationAction(ISD::ADDC, VT, Expand);
472    setOperationAction(ISD::SUBC, VT, Expand);
473    setOperationAction(ISD::ADDE, VT, Expand);
474    setOperationAction(ISD::SUBE, VT, Expand);
475    setOperationAction(ISD::SELECT, VT, Expand);
476    setOperationAction(ISD::VSELECT, VT, Expand);
477    setOperationAction(ISD::SELECT_CC, VT, Expand);
478    setOperationAction(ISD::XOR,  VT, Expand);
479    setOperationAction(ISD::BSWAP, VT, Expand);
480    setOperationAction(ISD::CTPOP, VT, Expand);
481    setOperationAction(ISD::CTTZ, VT, Expand);
482    setOperationAction(ISD::CTLZ, VT, Expand);
483    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
484    setOperationAction(ISD::SETCC, VT, Expand);
485  }
486
487  static const MVT::SimpleValueType FloatVectorTypes[] = {
488    MVT::v2f32, MVT::v4f32
489  };
490
491  for (MVT VT : FloatVectorTypes) {
492    setOperationAction(ISD::FABS, VT, Expand);
493    setOperationAction(ISD::FMINNUM, VT, Expand);
494    setOperationAction(ISD::FMAXNUM, VT, Expand);
495    setOperationAction(ISD::FADD, VT, Expand);
496    setOperationAction(ISD::FCEIL, VT, Expand);
497    setOperationAction(ISD::FCOS, VT, Expand);
498    setOperationAction(ISD::FDIV, VT, Expand);
499    setOperationAction(ISD::FEXP2, VT, Expand);
500    setOperationAction(ISD::FLOG2, VT, Expand);
501    setOperationAction(ISD::FREM, VT, Expand);
502    setOperationAction(ISD::FLOG, VT, Expand);
503    setOperationAction(ISD::FLOG10, VT, Expand);
504    setOperationAction(ISD::FPOW, VT, Expand);
505    setOperationAction(ISD::FFLOOR, VT, Expand);
506    setOperationAction(ISD::FTRUNC, VT, Expand);
507    setOperationAction(ISD::FMUL, VT, Expand);
508    setOperationAction(ISD::FMA, VT, Expand);
509    setOperationAction(ISD::FRINT, VT, Expand);
510    setOperationAction(ISD::FNEARBYINT, VT, Expand);
511    setOperationAction(ISD::FSQRT, VT, Expand);
512    setOperationAction(ISD::FSIN, VT, Expand);
513    setOperationAction(ISD::FSUB, VT, Expand);
514    setOperationAction(ISD::FNEG, VT, Expand);
515    setOperationAction(ISD::VSELECT, VT, Expand);
516    setOperationAction(ISD::SELECT_CC, VT, Expand);
517    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
518    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
519    setOperationAction(ISD::SETCC, VT, Expand);
520  }
521
522  // This causes using an unrolled select operation rather than expansion with
523  // bit operations. This is in general better, but the alternative using BFI
524  // instructions may be better if the select sources are SGPRs.
525  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
526  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
527
528  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
529  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
530
531  // There are no libcalls of any kind.
532  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
533    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
534
535  setBooleanContents(ZeroOrNegativeOneBooleanContent);
536  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
537
538  setSchedulingPreference(Sched::RegPressure);
539  setJumpIsExpensive(true);
540
541  // FIXME: This is only partially true. If we have to do vector compares, any
542  // SGPR pair can be a condition register. If we have a uniform condition, we
543  // are better off doing SALU operations, where there is only one SCC. For now,
544  // we don't have a way of knowing during instruction selection if a condition
545  // will be uniform and we always use vector compares. Assume we are using
546  // vector compares until that is fixed.
547  setHasMultipleConditionRegisters(true);
548
549  // SI at least has hardware support for floating point exceptions, but no way
550  // of using or handling them is implemented. They are also optional in OpenCL
551  // (Section 7.3)
552  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
553
554  PredictableSelectIsExpensive = false;
555
556  // We want to find all load dependencies for long chains of stores to enable
557  // merging into very wide vectors. The problem is with vectors with > 4
558  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
559  // vectors are a legal type, even though we have to split the loads
560  // usually. When we can more precisely specify load legality per address
561  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
562  // smarter so that they can figure out what to do in 2 iterations without all
563  // N > 4 stores on the same chain.
564  GatherAllAliasesMaxDepth = 16;
565
566  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
567  // about these during lowering.
568  MaxStoresPerMemcpy  = 0xffffffff;
569  MaxStoresPerMemmove = 0xffffffff;
570  MaxStoresPerMemset  = 0xffffffff;
571
572  setTargetDAGCombine(ISD::BITCAST);
573  setTargetDAGCombine(ISD::SHL);
574  setTargetDAGCombine(ISD::SRA);
575  setTargetDAGCombine(ISD::SRL);
576  setTargetDAGCombine(ISD::MUL);
577  setTargetDAGCombine(ISD::MULHU);
578  setTargetDAGCombine(ISD::MULHS);
579  setTargetDAGCombine(ISD::SELECT);
580  setTargetDAGCombine(ISD::SELECT_CC);
581  setTargetDAGCombine(ISD::STORE);
582  setTargetDAGCombine(ISD::FADD);
583  setTargetDAGCombine(ISD::FSUB);
584  setTargetDAGCombine(ISD::FNEG);
585  setTargetDAGCombine(ISD::FABS);
586  setTargetDAGCombine(ISD::AssertZext);
587  setTargetDAGCombine(ISD::AssertSext);
588}
589
590//===----------------------------------------------------------------------===//
591// Target Information
592//===----------------------------------------------------------------------===//
593
594LLVM_READNONE
595static bool fnegFoldsIntoOp(unsigned Opc) {
596  switch (Opc) {
597  case ISD::FADD:
598  case ISD::FSUB:
599  case ISD::FMUL:
600  case ISD::FMA:
601  case ISD::FMAD:
602  case ISD::FMINNUM:
603  case ISD::FMAXNUM:
604  case ISD::FSIN:
605  case ISD::FTRUNC:
606  case ISD::FRINT:
607  case ISD::FNEARBYINT:
608  case AMDGPUISD::RCP:
609  case AMDGPUISD::RCP_LEGACY:
610  case AMDGPUISD::SIN_HW:
611  case AMDGPUISD::FMUL_LEGACY:
612  case AMDGPUISD::FMIN_LEGACY:
613  case AMDGPUISD::FMAX_LEGACY:
614    return true;
615  default:
616    return false;
617  }
618}
619
620/// \p returns true if the operation will definitely need to use a 64-bit
621/// encoding, and thus will use a VOP3 encoding regardless of the source
622/// modifiers.
623LLVM_READONLY
624static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
625  return N->getNumOperands() > 2 || VT == MVT::f64;
626}
627
628// Most FP instructions support source modifiers, but this could be refined
629// slightly.
630LLVM_READONLY
631static bool hasSourceMods(const SDNode *N) {
632  if (isa<MemSDNode>(N))
633    return false;
634
635  switch (N->getOpcode()) {
636  case ISD::CopyToReg:
637  case ISD::SELECT:
638  case ISD::FDIV:
639  case ISD::FREM:
640  case ISD::INLINEASM:
641  case AMDGPUISD::INTERP_P1:
642  case AMDGPUISD::INTERP_P2:
643  case AMDGPUISD::DIV_SCALE:
644
645  // TODO: Should really be looking at the users of the bitcast. These are
646  // problematic because bitcasts are used to legalize all stores to integer
647  // types.
648  case ISD::BITCAST:
649    return false;
650  default:
651    return true;
652  }
653}
654
655bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
656                                                 unsigned CostThreshold) {
657  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
658  // it is truly free to use a source modifier in all cases. If there are
659  // multiple users but for each one will necessitate using VOP3, there will be
660  // a code size increase. Try to avoid increasing code size unless we know it
661  // will save on the instruction count.
662  unsigned NumMayIncreaseSize = 0;
663  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
664
665  // XXX - Should this limit number of uses to check?
666  for (const SDNode *U : N->uses()) {
667    if (!hasSourceMods(U))
668      return false;
669
670    if (!opMustUseVOP3Encoding(U, VT)) {
671      if (++NumMayIncreaseSize > CostThreshold)
672        return false;
673    }
674  }
675
676  return true;
677}
678
679MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
680  return MVT::i32;
681}
682
683bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
684  return true;
685}
686
687// The backend supports 32 and 64 bit floating point immediates.
688// FIXME: Why are we reporting vectors of FP immediates as legal?
689bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
690  EVT ScalarVT = VT.getScalarType();
691  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
692         (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
693}
694
695// We don't want to shrink f64 / f32 constants.
696bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
697  EVT ScalarVT = VT.getScalarType();
698  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
699}
700
701bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
702                                                 ISD::LoadExtType,
703                                                 EVT NewVT) const {
704
705  unsigned NewSize = NewVT.getStoreSizeInBits();
706
707  // If we are reducing to a 32-bit load, this is always better.
708  if (NewSize == 32)
709    return true;
710
711  EVT OldVT = N->getValueType(0);
712  unsigned OldSize = OldVT.getStoreSizeInBits();
713
714  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
715  // extloads, so doing one requires using a buffer_load. In cases where we
716  // still couldn't use a scalar load, using the wider load shouldn't really
717  // hurt anything.
718
719  // If the old size already had to be an extload, there's no harm in continuing
720  // to reduce the width.
721  return (OldSize < 32);
722}
723
724bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
725                                                   EVT CastTy) const {
726
727  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
728
729  if (LoadTy.getScalarType() == MVT::i32)
730    return false;
731
732  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
733  unsigned CastScalarSize = CastTy.getScalarSizeInBits();
734
735  return (LScalarSize < CastScalarSize) ||
736         (CastScalarSize >= 32);
737}
738
739// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
740// profitable with the expansion for 64-bit since it's generally good to
741// speculate things.
742// FIXME: These should really have the size as a parameter.
743bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
744  return true;
745}
746
747bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
748  return true;
749}
750
751//===---------------------------------------------------------------------===//
752// Target Properties
753//===---------------------------------------------------------------------===//
754
755bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
756  assert(VT.isFloatingPoint());
757
758  // Packed operations do not have a fabs modifier.
759  return VT == MVT::f32 || VT == MVT::f64 ||
760         (Subtarget->has16BitInsts() && VT == MVT::f16);
761}
762
763bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
764  assert(VT.isFloatingPoint());
765  return VT == MVT::f32 || VT == MVT::f64 ||
766         (Subtarget->has16BitInsts() && VT == MVT::f16) ||
767         (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
768}
769
770bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
771                                                         unsigned NumElem,
772                                                         unsigned AS) const {
773  return true;
774}
775
776bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
777  // There are few operations which truly have vector input operands. Any vector
778  // operation is going to involve operations on each component, and a
779  // build_vector will be a copy per element, so it always makes sense to use a
780  // build_vector input in place of the extracted element to avoid a copy into a
781  // super register.
782  //
783  // We should probably only do this if all users are extracts only, but this
784  // should be the common case.
785  return true;
786}
787
788bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
789  // Truncate is just accessing a subregister.
790
791  unsigned SrcSize = Source.getSizeInBits();
792  unsigned DestSize = Dest.getSizeInBits();
793
794  return DestSize < SrcSize && DestSize % 32 == 0 ;
795}
796
797bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
798  // Truncate is just accessing a subregister.
799
800  unsigned SrcSize = Source->getScalarSizeInBits();
801  unsigned DestSize = Dest->getScalarSizeInBits();
802
803  if (DestSize== 16 && Subtarget->has16BitInsts())
804    return SrcSize >= 32;
805
806  return DestSize < SrcSize && DestSize % 32 == 0;
807}
808
809bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
810  unsigned SrcSize = Src->getScalarSizeInBits();
811  unsigned DestSize = Dest->getScalarSizeInBits();
812
813  if (SrcSize == 16 && Subtarget->has16BitInsts())
814    return DestSize >= 32;
815
816  return SrcSize == 32 && DestSize == 64;
817}
818
819bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
820  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
821  // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
822  // this will enable reducing 64-bit operations the 32-bit, which is always
823  // good.
824
825  if (Src == MVT::i16)
826    return Dest == MVT::i32 ||Dest == MVT::i64 ;
827
828  return Src == MVT::i32 && Dest == MVT::i64;
829}
830
831bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
832  return isZExtFree(Val.getValueType(), VT2);
833}
834
835// v_mad_mix* support a conversion from f16 to f32.
836//
837// There is only one special case when denormals are enabled, which we don't
838// currently handle, where this is OK to use.
839bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
840                                           EVT DestVT, EVT SrcVT) const {
841  return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
842         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
843         SrcVT.getScalarType() == MVT::f16;
844}
845
846bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
847  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
848  // limited number of native 64-bit operations. Shrinking an operation to fit
849  // in a single 32-bit register should always be helpful. As currently used,
850  // this is much less general than the name suggests, and is only used in
851  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
852  // not profitable, and may actually be harmful.
853  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
854}
855
856//===---------------------------------------------------------------------===//
857// TargetLowering Callbacks
858//===---------------------------------------------------------------------===//
859
860CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
861                                                  bool IsVarArg) {
862  switch (CC) {
863  case CallingConv::AMDGPU_KERNEL:
864  case CallingConv::SPIR_KERNEL:
865    return CC_AMDGPU_Kernel;
866  case CallingConv::AMDGPU_VS:
867  case CallingConv::AMDGPU_GS:
868  case CallingConv::AMDGPU_PS:
869  case CallingConv::AMDGPU_CS:
870  case CallingConv::AMDGPU_HS:
871  case CallingConv::AMDGPU_ES:
872  case CallingConv::AMDGPU_LS:
873    return CC_AMDGPU;
874  case CallingConv::C:
875  case CallingConv::Fast:
876  case CallingConv::Cold:
877    return CC_AMDGPU_Func;
878  default:
879    report_fatal_error("Unsupported calling convention.");
880  }
881}
882
883CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
884                                                    bool IsVarArg) {
885  switch (CC) {
886  case CallingConv::AMDGPU_KERNEL:
887  case CallingConv::SPIR_KERNEL:
888    return CC_AMDGPU_Kernel;
889  case CallingConv::AMDGPU_VS:
890  case CallingConv::AMDGPU_GS:
891  case CallingConv::AMDGPU_PS:
892  case CallingConv::AMDGPU_CS:
893  case CallingConv::AMDGPU_HS:
894  case CallingConv::AMDGPU_ES:
895  case CallingConv::AMDGPU_LS:
896    return RetCC_SI_Shader;
897  case CallingConv::C:
898  case CallingConv::Fast:
899  case CallingConv::Cold:
900    return RetCC_AMDGPU_Func;
901  default:
902    report_fatal_error("Unsupported calling convention.");
903  }
904}
905
906/// The SelectionDAGBuilder will automatically promote function arguments
907/// with illegal types.  However, this does not work for the AMDGPU targets
908/// since the function arguments are stored in memory as these illegal types.
909/// In order to handle this properly we need to get the original types sizes
910/// from the LLVM IR Function and fixup the ISD::InputArg values before
911/// passing them to AnalyzeFormalArguments()
912
913/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
914/// input values across multiple registers.  Each item in the Ins array
915/// represents a single value that will be stored in registers.  Ins[x].VT is
916/// the value type of the value that will be stored in the register, so
917/// whatever SDNode we lower the argument to needs to be this type.
918///
919/// In order to correctly lower the arguments we need to know the size of each
920/// argument.  Since Ins[x].VT gives us the size of the register that will
921/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
922/// for the original function argument so that we can deduce the correct memory
923/// type to use for Ins[x].  In most cases the correct memory type will be
924/// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
925/// we have a kernel argument of type v8i8, this argument will be split into
926/// 8 parts and each part will be represented by its own item in the Ins array.
927/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
928/// the argument before it was split.  From this, we deduce that the memory type
929/// for each individual part is i8.  We pass the memory type as LocVT to the
930/// calling convention analysis function and the register type (Ins[x].VT) as
931/// the ValVT.
932void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
933                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
934  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
935    const ISD::InputArg &In = Ins[i];
936    EVT MemVT;
937
938    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
939
940    if (!Subtarget->isAmdHsaOS() &&
941        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
942      // The ABI says the caller will extend these values to 32-bits.
943      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
944    } else if (NumRegs == 1) {
945      // This argument is not split, so the IR type is the memory type.
946      assert(!In.Flags.isSplit());
947      if (In.ArgVT.isExtended()) {
948        // We have an extended type, like i24, so we should just use the register type
949        MemVT = In.VT;
950      } else {
951        MemVT = In.ArgVT;
952      }
953    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
954               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
955      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
956      // We have a vector value which has been split into a vector with
957      // the same scalar type, but fewer elements.  This should handle
958      // all the floating-point vector types.
959      MemVT = In.VT;
960    } else if (In.ArgVT.isVector() &&
961               In.ArgVT.getVectorNumElements() == NumRegs) {
962      // This arg has been split so that each element is stored in a separate
963      // register.
964      MemVT = In.ArgVT.getScalarType();
965    } else if (In.ArgVT.isExtended()) {
966      // We have an extended type, like i65.
967      MemVT = In.VT;
968    } else {
969      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
970      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
971      if (In.VT.isInteger()) {
972        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
973      } else if (In.VT.isVector()) {
974        assert(!In.VT.getScalarType().isFloatingPoint());
975        unsigned NumElements = In.VT.getVectorNumElements();
976        assert(MemoryBits % NumElements == 0);
977        // This vector type has been split into another vector type with
978        // a different elements size.
979        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
980                                         MemoryBits / NumElements);
981        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
982      } else {
983        llvm_unreachable("cannot deduce memory type.");
984      }
985    }
986
987    // Convert one element vectors to scalar.
988    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
989      MemVT = MemVT.getScalarType();
990
991    if (MemVT.isExtended()) {
992      // This should really only happen if we have vec3 arguments
993      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
994      MemVT = MemVT.getPow2VectorType(State.getContext());
995    }
996
997    assert(MemVT.isSimple());
998    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
999                    State);
1000  }
1001}
1002
1003SDValue AMDGPUTargetLowering::LowerReturn(
1004  SDValue Chain, CallingConv::ID CallConv,
1005  bool isVarArg,
1006  const SmallVectorImpl<ISD::OutputArg> &Outs,
1007  const SmallVectorImpl<SDValue> &OutVals,
1008  const SDLoc &DL, SelectionDAG &DAG) const {
1009  // FIXME: Fails for r600 tests
1010  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1011  // "wave terminate should not have return values");
1012  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1013}
1014
1015//===---------------------------------------------------------------------===//
1016// Target specific lowering
1017//===---------------------------------------------------------------------===//
1018
1019/// Selects the correct CCAssignFn for a given CallingConvention value.
1020CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1021                                                    bool IsVarArg) {
1022  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1023}
1024
1025CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1026                                                      bool IsVarArg) {
1027  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1028}
1029
1030SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1031                                                  SelectionDAG &DAG,
1032                                                  MachineFrameInfo &MFI,
1033                                                  int ClobberedFI) const {
1034  SmallVector<SDValue, 8> ArgChains;
1035  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1036  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1037
1038  // Include the original chain at the beginning of the list. When this is
1039  // used by target LowerCall hooks, this helps legalize find the
1040  // CALLSEQ_BEGIN node.
1041  ArgChains.push_back(Chain);
1042
1043  // Add a chain value for each stack argument corresponding
1044  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1045                            UE = DAG.getEntryNode().getNode()->use_end();
1046       U != UE; ++U) {
1047    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1048      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1049        if (FI->getIndex() < 0) {
1050          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1051          int64_t InLastByte = InFirstByte;
1052          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1053
1054          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1055              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1056            ArgChains.push_back(SDValue(L, 1));
1057        }
1058      }
1059    }
1060  }
1061
1062  // Build a tokenfactor for all the chains.
1063  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1064}
1065
1066SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1067                                                 SmallVectorImpl<SDValue> &InVals,
1068                                                 StringRef Reason) const {
1069  SDValue Callee = CLI.Callee;
1070  SelectionDAG &DAG = CLI.DAG;
1071
1072  const Function &Fn = DAG.getMachineFunction().getFunction();
1073
1074  StringRef FuncName("<unknown>");
1075
1076  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1077    FuncName = G->getSymbol();
1078  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1079    FuncName = G->getGlobal()->getName();
1080
1081  DiagnosticInfoUnsupported NoCalls(
1082    Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1083  DAG.getContext()->diagnose(NoCalls);
1084
1085  if (!CLI.IsTailCall) {
1086    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1087      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1088  }
1089
1090  return DAG.getEntryNode();
1091}
1092
1093SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1094                                        SmallVectorImpl<SDValue> &InVals) const {
1095  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1096}
1097
1098SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1099                                                      SelectionDAG &DAG) const {
1100  const Function &Fn = DAG.getMachineFunction().getFunction();
1101
1102  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1103                                            SDLoc(Op).getDebugLoc());
1104  DAG.getContext()->diagnose(NoDynamicAlloca);
1105  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1106  return DAG.getMergeValues(Ops, SDLoc());
1107}
1108
1109SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1110                                             SelectionDAG &DAG) const {
1111  switch (Op.getOpcode()) {
1112  default:
1113    Op->print(errs(), &DAG);
1114    llvm_unreachable("Custom lowering code for this"
1115                     "instruction is not implemented yet!");
1116    break;
1117  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1118  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1119  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1120  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1121  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1122  case ISD::FREM: return LowerFREM(Op, DAG);
1123  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1124  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1125  case ISD::FRINT: return LowerFRINT(Op, DAG);
1126  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1127  case ISD::FROUND: return LowerFROUND(Op, DAG);
1128  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1129  case ISD::FLOG:
1130    return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
1131  case ISD::FLOG10:
1132    return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
1133  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1134  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1135  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1136  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1137  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1138  case ISD::CTTZ:
1139  case ISD::CTTZ_ZERO_UNDEF:
1140  case ISD::CTLZ:
1141  case ISD::CTLZ_ZERO_UNDEF:
1142    return LowerCTLZ_CTTZ(Op, DAG);
1143  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1144  }
1145  return Op;
1146}
1147
1148void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1149                                              SmallVectorImpl<SDValue> &Results,
1150                                              SelectionDAG &DAG) const {
1151  switch (N->getOpcode()) {
1152  case ISD::SIGN_EXTEND_INREG:
1153    // Different parts of legalization seem to interpret which type of
1154    // sign_extend_inreg is the one to check for custom lowering. The extended
1155    // from type is what really matters, but some places check for custom
1156    // lowering of the result type. This results in trying to use
1157    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1158    // nothing here and let the illegal result integer be handled normally.
1159    return;
1160  default:
1161    return;
1162  }
1163}
1164
1165static bool hasDefinedInitializer(const GlobalValue *GV) {
1166  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1167  if (!GVar || !GVar->hasInitializer())
1168    return false;
1169
1170  return !isa<UndefValue>(GVar->getInitializer());
1171}
1172
1173SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1174                                                 SDValue Op,
1175                                                 SelectionDAG &DAG) const {
1176
1177  const DataLayout &DL = DAG.getDataLayout();
1178  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1179  const GlobalValue *GV = G->getGlobal();
1180
1181  if  (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
1182    // XXX: What does the value of G->getOffset() mean?
1183    assert(G->getOffset() == 0 &&
1184         "Do not know what to do with an non-zero offset");
1185
1186    // TODO: We could emit code to handle the initialization somewhere.
1187    if (!hasDefinedInitializer(GV)) {
1188      unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1189      return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1190    }
1191  }
1192
1193  const Function &Fn = DAG.getMachineFunction().getFunction();
1194  DiagnosticInfoUnsupported BadInit(
1195      Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1196  DAG.getContext()->diagnose(BadInit);
1197  return SDValue();
1198}
1199
1200SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1201                                                  SelectionDAG &DAG) const {
1202  SmallVector<SDValue, 8> Args;
1203
1204  for (const SDUse &U : Op->ops())
1205    DAG.ExtractVectorElements(U.get(), Args);
1206
1207  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1208}
1209
1210SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1211                                                     SelectionDAG &DAG) const {
1212
1213  SmallVector<SDValue, 8> Args;
1214  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1215  EVT VT = Op.getValueType();
1216  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1217                            VT.getVectorNumElements());
1218
1219  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1220}
1221
1222/// \brief Generate Min/Max node
1223SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1224                                                   SDValue LHS, SDValue RHS,
1225                                                   SDValue True, SDValue False,
1226                                                   SDValue CC,
1227                                                   DAGCombinerInfo &DCI) const {
1228  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1229    return SDValue();
1230
1231  SelectionDAG &DAG = DCI.DAG;
1232  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1233  switch (CCOpcode) {
1234  case ISD::SETOEQ:
1235  case ISD::SETONE:
1236  case ISD::SETUNE:
1237  case ISD::SETNE:
1238  case ISD::SETUEQ:
1239  case ISD::SETEQ:
1240  case ISD::SETFALSE:
1241  case ISD::SETFALSE2:
1242  case ISD::SETTRUE:
1243  case ISD::SETTRUE2:
1244  case ISD::SETUO:
1245  case ISD::SETO:
1246    break;
1247  case ISD::SETULE:
1248  case ISD::SETULT: {
1249    if (LHS == True)
1250      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1251    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1252  }
1253  case ISD::SETOLE:
1254  case ISD::SETOLT:
1255  case ISD::SETLE:
1256  case ISD::SETLT: {
1257    // Ordered. Assume ordered for undefined.
1258
1259    // Only do this after legalization to avoid interfering with other combines
1260    // which might occur.
1261    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1262        !DCI.isCalledByLegalizer())
1263      return SDValue();
1264
1265    // We need to permute the operands to get the correct NaN behavior. The
1266    // selected operand is the second one based on the failing compare with NaN,
1267    // so permute it based on the compare type the hardware uses.
1268    if (LHS == True)
1269      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1270    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1271  }
1272  case ISD::SETUGE:
1273  case ISD::SETUGT: {
1274    if (LHS == True)
1275      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1276    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1277  }
1278  case ISD::SETGT:
1279  case ISD::SETGE:
1280  case ISD::SETOGE:
1281  case ISD::SETOGT: {
1282    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1283        !DCI.isCalledByLegalizer())
1284      return SDValue();
1285
1286    if (LHS == True)
1287      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1288    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1289  }
1290  case ISD::SETCC_INVALID:
1291    llvm_unreachable("Invalid setcc condcode!");
1292  }
1293  return SDValue();
1294}
1295
1296std::pair<SDValue, SDValue>
1297AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1298  SDLoc SL(Op);
1299
1300  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1301
1302  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1303  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1304
1305  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1306  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1307
1308  return std::make_pair(Lo, Hi);
1309}
1310
1311SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1312  SDLoc SL(Op);
1313
1314  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1315  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1316  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1317}
1318
1319SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1320  SDLoc SL(Op);
1321
1322  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1323  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1324  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1325}
1326
1327SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1328                                              SelectionDAG &DAG) const {
1329  LoadSDNode *Load = cast<LoadSDNode>(Op);
1330  EVT VT = Op.getValueType();
1331
1332
1333  // If this is a 2 element vector, we really want to scalarize and not create
1334  // weird 1 element vectors.
1335  if (VT.getVectorNumElements() == 2)
1336    return scalarizeVectorLoad(Load, DAG);
1337
1338  SDValue BasePtr = Load->getBasePtr();
1339  EVT MemVT = Load->getMemoryVT();
1340  SDLoc SL(Op);
1341
1342  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1343
1344  EVT LoVT, HiVT;
1345  EVT LoMemVT, HiMemVT;
1346  SDValue Lo, Hi;
1347
1348  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1349  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1350  std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
1351
1352  unsigned Size = LoMemVT.getStoreSize();
1353  unsigned BaseAlign = Load->getAlignment();
1354  unsigned HiAlign = MinAlign(BaseAlign, Size);
1355
1356  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1357                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
1358                                  BaseAlign, Load->getMemOperand()->getFlags());
1359  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
1360  SDValue HiLoad =
1361      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1362                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1363                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1364
1365  SDValue Ops[] = {
1366    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
1367    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1368                LoLoad.getValue(1), HiLoad.getValue(1))
1369  };
1370
1371  return DAG.getMergeValues(Ops, SL);
1372}
1373
1374SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1375                                               SelectionDAG &DAG) const {
1376  StoreSDNode *Store = cast<StoreSDNode>(Op);
1377  SDValue Val = Store->getValue();
1378  EVT VT = Val.getValueType();
1379
1380  // If this is a 2 element vector, we really want to scalarize and not create
1381  // weird 1 element vectors.
1382  if (VT.getVectorNumElements() == 2)
1383    return scalarizeVectorStore(Store, DAG);
1384
1385  EVT MemVT = Store->getMemoryVT();
1386  SDValue Chain = Store->getChain();
1387  SDValue BasePtr = Store->getBasePtr();
1388  SDLoc SL(Op);
1389
1390  EVT LoVT, HiVT;
1391  EVT LoMemVT, HiMemVT;
1392  SDValue Lo, Hi;
1393
1394  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1395  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1396  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
1397
1398  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1399
1400  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1401  unsigned BaseAlign = Store->getAlignment();
1402  unsigned Size = LoMemVT.getStoreSize();
1403  unsigned HiAlign = MinAlign(BaseAlign, Size);
1404
1405  SDValue LoStore =
1406      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1407                        Store->getMemOperand()->getFlags());
1408  SDValue HiStore =
1409      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1410                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1411
1412  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1413}
1414
1415// This is a shortcut for integer division because we have fast i32<->f32
1416// conversions, and fast f32 reciprocal instructions. The fractional part of a
1417// float is enough to accurately represent up to a 24-bit signed integer.
1418SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1419                                            bool Sign) const {
1420  SDLoc DL(Op);
1421  EVT VT = Op.getValueType();
1422  SDValue LHS = Op.getOperand(0);
1423  SDValue RHS = Op.getOperand(1);
1424  MVT IntVT = MVT::i32;
1425  MVT FltVT = MVT::f32;
1426
1427  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1428  if (LHSSignBits < 9)
1429    return SDValue();
1430
1431  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1432  if (RHSSignBits < 9)
1433    return SDValue();
1434
1435  unsigned BitSize = VT.getSizeInBits();
1436  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1437  unsigned DivBits = BitSize - SignBits;
1438  if (Sign)
1439    ++DivBits;
1440
1441  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1442  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1443
1444  SDValue jq = DAG.getConstant(1, DL, IntVT);
1445
1446  if (Sign) {
1447    // char|short jq = ia ^ ib;
1448    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1449
1450    // jq = jq >> (bitsize - 2)
1451    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1452                     DAG.getConstant(BitSize - 2, DL, VT));
1453
1454    // jq = jq | 0x1
1455    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1456  }
1457
1458  // int ia = (int)LHS;
1459  SDValue ia = LHS;
1460
1461  // int ib, (int)RHS;
1462  SDValue ib = RHS;
1463
1464  // float fa = (float)ia;
1465  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1466
1467  // float fb = (float)ib;
1468  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1469
1470  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1471                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1472
1473  // fq = trunc(fq);
1474  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1475
1476  // float fqneg = -fq;
1477  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1478
1479  // float fr = mad(fqneg, fb, fa);
1480  unsigned OpCode = Subtarget->hasFP32Denormals() ?
1481                    (unsigned)AMDGPUISD::FMAD_FTZ :
1482                    (unsigned)ISD::FMAD;
1483  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1484
1485  // int iq = (int)fq;
1486  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1487
1488  // fr = fabs(fr);
1489  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1490
1491  // fb = fabs(fb);
1492  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1493
1494  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1495
1496  // int cv = fr >= fb;
1497  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1498
1499  // jq = (cv ? jq : 0);
1500  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1501
1502  // dst = iq + jq;
1503  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1504
1505  // Rem needs compensation, it's easier to recompute it
1506  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1507  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1508
1509  // Truncate to number of bits this divide really is.
1510  if (Sign) {
1511    SDValue InRegSize
1512      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1513    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1514    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1515  } else {
1516    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1517    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1518    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1519  }
1520
1521  return DAG.getMergeValues({ Div, Rem }, DL);
1522}
1523
1524void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1525                                      SelectionDAG &DAG,
1526                                      SmallVectorImpl<SDValue> &Results) const {
1527  SDLoc DL(Op);
1528  EVT VT = Op.getValueType();
1529
1530  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1531
1532  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1533
1534  SDValue One = DAG.getConstant(1, DL, HalfVT);
1535  SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1536
1537  //HiLo split
1538  SDValue LHS = Op.getOperand(0);
1539  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1540  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1541
1542  SDValue RHS = Op.getOperand(1);
1543  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1544  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1545
1546  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1547      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1548
1549    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1550                              LHS_Lo, RHS_Lo);
1551
1552    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1553    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1554
1555    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1556    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1557    return;
1558  }
1559
1560  if (isTypeLegal(MVT::i64)) {
1561    // Compute denominator reciprocal.
1562    unsigned FMAD = Subtarget->hasFP32Denormals() ?
1563                    (unsigned)AMDGPUISD::FMAD_FTZ :
1564                    (unsigned)ISD::FMAD;
1565
1566    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1567    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1568    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1569      DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1570      Cvt_Lo);
1571    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1572    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1573      DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1574    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1575      DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1576    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1577    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1578      DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1579      Mul1);
1580    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1581    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1582    SDValue Rcp64 = DAG.getBitcast(VT,
1583                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1584
1585    SDValue Zero64 = DAG.getConstant(0, DL, VT);
1586    SDValue One64  = DAG.getConstant(1, DL, VT);
1587    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1588    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1589
1590    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1591    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1592    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1593    SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1594                                    Zero);
1595    SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1596                                    One);
1597
1598    SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1599                                  Mulhi1_Lo, Zero1);
1600    SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1601                                  Mulhi1_Hi, Add1_Lo.getValue(1));
1602    SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1603    SDValue Add1 = DAG.getBitcast(VT,
1604                        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1605
1606    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1607    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1608    SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1609                                    Zero);
1610    SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1611                                    One);
1612
1613    SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1614                                  Mulhi2_Lo, Zero1);
1615    SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1616                                   Mulhi2_Hi, Add1_Lo.getValue(1));
1617    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1618                                  Zero, Add2_Lo.getValue(1));
1619    SDValue Add2 = DAG.getBitcast(VT,
1620                        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1621    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1622
1623    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1624
1625    SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1626    SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1627    SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1628                                  Mul3_Lo, Zero1);
1629    SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1630                                  Mul3_Hi, Sub1_Lo.getValue(1));
1631    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1632    SDValue Sub1 = DAG.getBitcast(VT,
1633                        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1634
1635    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1636    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1637                                 ISD::SETUGE);
1638    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1639                                 ISD::SETUGE);
1640    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1641
1642    // TODO: Here and below portions of the code can be enclosed into if/endif.
1643    // Currently control flow is unconditional and we have 4 selects after
1644    // potential endif to substitute PHIs.
1645
1646    // if C3 != 0 ...
1647    SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1648                                  RHS_Lo, Zero1);
1649    SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1650                                  RHS_Hi, Sub1_Lo.getValue(1));
1651    SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1652                                  Zero, Sub2_Lo.getValue(1));
1653    SDValue Sub2 = DAG.getBitcast(VT,
1654                        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1655
1656    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1657
1658    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1659                                 ISD::SETUGE);
1660    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1661                                 ISD::SETUGE);
1662    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1663
1664    // if (C6 != 0)
1665    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1666
1667    SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1668                                  RHS_Lo, Zero1);
1669    SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1670                                  RHS_Hi, Sub2_Lo.getValue(1));
1671    SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1672                                  Zero, Sub3_Lo.getValue(1));
1673    SDValue Sub3 = DAG.getBitcast(VT,
1674                        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1675
1676    // endif C6
1677    // endif C3
1678
1679    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1680    SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1681
1682    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1683    SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1684
1685    Results.push_back(Div);
1686    Results.push_back(Rem);
1687
1688    return;
1689  }
1690
1691  // r600 expandion.
1692  // Get Speculative values
1693  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1694  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1695
1696  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1697  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1698  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1699
1700  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1701  SDValue DIV_Lo = Zero;
1702
1703  const unsigned halfBitWidth = HalfVT.getSizeInBits();
1704
1705  for (unsigned i = 0; i < halfBitWidth; ++i) {
1706    const unsigned bitPos = halfBitWidth - i - 1;
1707    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1708    // Get value of high bit
1709    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1710    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1711    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1712
1713    // Shift
1714    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1715    // Add LHS high bit
1716    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1717
1718    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1719    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1720
1721    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1722
1723    // Update REM
1724    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1725    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1726  }
1727
1728  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1729  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1730  Results.push_back(DIV);
1731  Results.push_back(REM);
1732}
1733
1734SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1735                                           SelectionDAG &DAG) const {
1736  SDLoc DL(Op);
1737  EVT VT = Op.getValueType();
1738
1739  if (VT == MVT::i64) {
1740    SmallVector<SDValue, 2> Results;
1741    LowerUDIVREM64(Op, DAG, Results);
1742    return DAG.getMergeValues(Results, DL);
1743  }
1744
1745  if (VT == MVT::i32) {
1746    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1747      return Res;
1748  }
1749
1750  SDValue Num = Op.getOperand(0);
1751  SDValue Den = Op.getOperand(1);
1752
1753  // RCP =  URECIP(Den) = 2^32 / Den + e
1754  // e is rounding error.
1755  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1756
1757  // RCP_LO = mul(RCP, Den) */
1758  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1759
1760  // RCP_HI = mulhu (RCP, Den) */
1761  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1762
1763  // NEG_RCP_LO = -RCP_LO
1764  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1765                                                     RCP_LO);
1766
1767  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1768  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1769                                           NEG_RCP_LO, RCP_LO,
1770                                           ISD::SETEQ);
1771  // Calculate the rounding error from the URECIP instruction
1772  // E = mulhu(ABS_RCP_LO, RCP)
1773  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1774
1775  // RCP_A_E = RCP + E
1776  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1777
1778  // RCP_S_E = RCP - E
1779  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1780
1781  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1782  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1783                                     RCP_A_E, RCP_S_E,
1784                                     ISD::SETEQ);
1785  // Quotient = mulhu(Tmp0, Num)
1786  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1787
1788  // Num_S_Remainder = Quotient * Den
1789  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1790
1791  // Remainder = Num - Num_S_Remainder
1792  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1793
1794  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1795  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1796                                                 DAG.getConstant(-1, DL, VT),
1797                                                 DAG.getConstant(0, DL, VT),
1798                                                 ISD::SETUGE);
1799  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1800  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1801                                                  Num_S_Remainder,
1802                                                  DAG.getConstant(-1, DL, VT),
1803                                                  DAG.getConstant(0, DL, VT),
1804                                                  ISD::SETUGE);
1805  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1806  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1807                                               Remainder_GE_Zero);
1808
1809  // Calculate Division result:
1810
1811  // Quotient_A_One = Quotient + 1
1812  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1813                                       DAG.getConstant(1, DL, VT));
1814
1815  // Quotient_S_One = Quotient - 1
1816  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1817                                       DAG.getConstant(1, DL, VT));
1818
1819  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1820  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1821                                     Quotient, Quotient_A_One, ISD::SETEQ);
1822
1823  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1824  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1825                            Quotient_S_One, Div, ISD::SETEQ);
1826
1827  // Calculate Rem result:
1828
1829  // Remainder_S_Den = Remainder - Den
1830  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1831
1832  // Remainder_A_Den = Remainder + Den
1833  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1834
1835  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1836  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1837                                    Remainder, Remainder_S_Den, ISD::SETEQ);
1838
1839  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1840  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1841                            Remainder_A_Den, Rem, ISD::SETEQ);
1842  SDValue Ops[2] = {
1843    Div,
1844    Rem
1845  };
1846  return DAG.getMergeValues(Ops, DL);
1847}
1848
1849SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1850                                           SelectionDAG &DAG) const {
1851  SDLoc DL(Op);
1852  EVT VT = Op.getValueType();
1853
1854  SDValue LHS = Op.getOperand(0);
1855  SDValue RHS = Op.getOperand(1);
1856
1857  SDValue Zero = DAG.getConstant(0, DL, VT);
1858  SDValue NegOne = DAG.getConstant(-1, DL, VT);
1859
1860  if (VT == MVT::i32) {
1861    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1862      return Res;
1863  }
1864
1865  if (VT == MVT::i64 &&
1866      DAG.ComputeNumSignBits(LHS) > 32 &&
1867      DAG.ComputeNumSignBits(RHS) > 32) {
1868    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1869
1870    //HiLo split
1871    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1872    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1873    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1874                                 LHS_Lo, RHS_Lo);
1875    SDValue Res[2] = {
1876      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1877      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1878    };
1879    return DAG.getMergeValues(Res, DL);
1880  }
1881
1882  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1883  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1884  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1885  SDValue RSign = LHSign; // Remainder sign is the same as LHS
1886
1887  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1888  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1889
1890  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1891  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1892
1893  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1894  SDValue Rem = Div.getValue(1);
1895
1896  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1897  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1898
1899  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1900  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1901
1902  SDValue Res[2] = {
1903    Div,
1904    Rem
1905  };
1906  return DAG.getMergeValues(Res, DL);
1907}
1908
1909// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
1910SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
1911  SDLoc SL(Op);
1912  EVT VT = Op.getValueType();
1913  SDValue X = Op.getOperand(0);
1914  SDValue Y = Op.getOperand(1);
1915
1916  // TODO: Should this propagate fast-math-flags?
1917
1918  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
1919  SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
1920  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
1921
1922  return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
1923}
1924
1925SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
1926  SDLoc SL(Op);
1927  SDValue Src = Op.getOperand(0);
1928
1929  // result = trunc(src)
1930  // if (src > 0.0 && src != result)
1931  //   result += 1.0
1932
1933  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1934
1935  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1936  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1937
1938  EVT SetCCVT =
1939      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1940
1941  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1942  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1943  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1944
1945  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1946  // TODO: Should this propagate fast-math-flags?
1947  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1948}
1949
1950static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
1951                                  SelectionDAG &DAG) {
1952  const unsigned FractBits = 52;
1953  const unsigned ExpBits = 11;
1954
1955  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
1956                                Hi,
1957                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
1958                                DAG.getConstant(ExpBits, SL, MVT::i32));
1959  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1960                            DAG.getConstant(1023, SL, MVT::i32));
1961
1962  return Exp;
1963}
1964
1965SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1966  SDLoc SL(Op);
1967  SDValue Src = Op.getOperand(0);
1968
1969  assert(Op.getValueType() == MVT::f64);
1970
1971  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1972  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1973
1974  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1975
1976  // Extract the upper half, since this is where we will find the sign and
1977  // exponent.
1978  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1979
1980  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1981
1982  const unsigned FractBits = 52;
1983
1984  // Extract the sign bit.
1985  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
1986  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1987
1988  // Extend back to to 64-bits.
1989  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
1990  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1991
1992  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1993  const SDValue FractMask
1994    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
1995
1996  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1997  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1998  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1999
2000  EVT SetCCVT =
2001      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2002
2003  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2004
2005  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2006  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2007
2008  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2009  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2010
2011  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2012}
2013
2014SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2015  SDLoc SL(Op);
2016  SDValue Src = Op.getOperand(0);
2017
2018  assert(Op.getValueType() == MVT::f64);
2019
2020  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2021  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2022  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2023
2024  // TODO: Should this propagate fast-math-flags?
2025
2026  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2027  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2028
2029  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2030
2031  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2032  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2033
2034  EVT SetCCVT =
2035      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2036  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2037
2038  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2039}
2040
2041SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2042  // FNEARBYINT and FRINT are the same, except in their handling of FP
2043  // exceptions. Those aren't really meaningful for us, and OpenCL only has
2044  // rint, so just treat them as equivalent.
2045  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2046}
2047
2048// XXX - May require not supporting f32 denormals?
2049
2050// Don't handle v2f16. The extra instructions to scalarize and repack around the
2051// compare and vselect end up producing worse code than scalarizing the whole
2052// operation.
2053SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
2054  SDLoc SL(Op);
2055  SDValue X = Op.getOperand(0);
2056  EVT VT = Op.getValueType();
2057
2058  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2059
2060  // TODO: Should this propagate fast-math-flags?
2061
2062  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2063
2064  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2065
2066  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2067  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2068  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2069
2070  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2071
2072  EVT SetCCVT =
2073      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2074
2075  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2076
2077  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2078
2079  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2080}
2081
2082SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
2083  SDLoc SL(Op);
2084  SDValue X = Op.getOperand(0);
2085
2086  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
2087
2088  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2089  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2090  const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
2091  const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
2092  EVT SetCCVT =
2093      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2094
2095  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
2096
2097  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
2098
2099  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2100
2101  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
2102                                       MVT::i64);
2103
2104  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
2105  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
2106                          DAG.getConstant(INT64_C(0x0008000000000000), SL,
2107                                          MVT::i64),
2108                          Exp);
2109
2110  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
2111  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
2112                              DAG.getConstant(0, SL, MVT::i64), Tmp0,
2113                              ISD::SETNE);
2114
2115  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
2116                             D, DAG.getConstant(0, SL, MVT::i64));
2117  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
2118
2119  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
2120  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
2121
2122  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2123  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2124  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
2125
2126  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
2127                            ExpEqNegOne,
2128                            DAG.getConstantFP(1.0, SL, MVT::f64),
2129                            DAG.getConstantFP(0.0, SL, MVT::f64));
2130
2131  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
2132
2133  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
2134  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
2135
2136  return K;
2137}
2138
2139SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2140  EVT VT = Op.getValueType();
2141
2142  if (VT == MVT::f32 || VT == MVT::f16)
2143    return LowerFROUND32_16(Op, DAG);
2144
2145  if (VT == MVT::f64)
2146    return LowerFROUND64(Op, DAG);
2147
2148  llvm_unreachable("unhandled type");
2149}
2150
2151SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2152  SDLoc SL(Op);
2153  SDValue Src = Op.getOperand(0);
2154
2155  // result = trunc(src);
2156  // if (src < 0.0 && src != result)
2157  //   result += -1.0.
2158
2159  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2160
2161  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2162  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2163
2164  EVT SetCCVT =
2165      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2166
2167  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2168  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2169  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2170
2171  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2172  // TODO: Should this propagate fast-math-flags?
2173  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2174}
2175
2176SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2177                                        double Log2BaseInverted) const {
2178  EVT VT = Op.getValueType();
2179
2180  SDLoc SL(Op);
2181  SDValue Operand = Op.getOperand(0);
2182  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2183  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2184
2185  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2186}
2187
2188static bool isCtlzOpc(unsigned Opc) {
2189  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2190}
2191
2192static bool isCttzOpc(unsigned Opc) {
2193  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2194}
2195
2196SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2197  SDLoc SL(Op);
2198  SDValue Src = Op.getOperand(0);
2199  bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2200                   Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2201
2202  unsigned ISDOpc, NewOpc;
2203  if (isCtlzOpc(Op.getOpcode())) {
2204    ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2205    NewOpc = AMDGPUISD::FFBH_U32;
2206  } else if (isCttzOpc(Op.getOpcode())) {
2207    ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2208    NewOpc = AMDGPUISD::FFBL_B32;
2209  } else
2210    llvm_unreachable("Unexpected OPCode!!!");
2211
2212
2213  if (ZeroUndef && Src.getValueType() == MVT::i32)
2214    return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2215
2216  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2217
2218  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2219  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2220
2221  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2222  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2223
2224  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2225                                   *DAG.getContext(), MVT::i32);
2226
2227  SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2228  SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2229
2230  SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2231  SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2232
2233  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2234  SDValue Add, NewOpr;
2235  if (isCtlzOpc(Op.getOpcode())) {
2236    Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2237    // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2238    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2239  } else {
2240    Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2241    // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2242    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2243  }
2244
2245  if (!ZeroUndef) {
2246    // Test if the full 64-bit input is zero.
2247
2248    // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2249    // which we probably don't want.
2250    SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2251    SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2252    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2253
2254    // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2255    // with the same cycles, otherwise it is slower.
2256    // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2257    // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2258
2259    const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2260
2261    // The instruction returns -1 for 0 input, but the defined intrinsic
2262    // behavior is to return the number of bits.
2263    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2264                         SrcIsZero, Bits32, NewOpr);
2265  }
2266
2267  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2268}
2269
2270SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2271                                               bool Signed) const {
2272  // Unsigned
2273  // cul2f(ulong u)
2274  //{
2275  //  uint lz = clz(u);
2276  //  uint e = (u != 0) ? 127U + 63U - lz : 0;
2277  //  u = (u << lz) & 0x7fffffffffffffffUL;
2278  //  ulong t = u & 0xffffffffffUL;
2279  //  uint v = (e << 23) | (uint)(u >> 40);
2280  //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2281  //  return as_float(v + r);
2282  //}
2283  // Signed
2284  // cl2f(long l)
2285  //{
2286  //  long s = l >> 63;
2287  //  float r = cul2f((l + s) ^ s);
2288  //  return s ? -r : r;
2289  //}
2290
2291  SDLoc SL(Op);
2292  SDValue Src = Op.getOperand(0);
2293  SDValue L = Src;
2294
2295  SDValue S;
2296  if (Signed) {
2297    const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2298    S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2299
2300    SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2301    L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2302  }
2303
2304  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2305                                   *DAG.getContext(), MVT::f32);
2306
2307
2308  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2309  SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2310  SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2311  LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2312
2313  SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2314  SDValue E = DAG.getSelect(SL, MVT::i32,
2315    DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2316    DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2317    ZeroI32);
2318
2319  SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2320    DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2321    DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2322
2323  SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2324                          DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2325
2326  SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2327                             U, DAG.getConstant(40, SL, MVT::i64));
2328
2329  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2330    DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2331    DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
2332
2333  SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2334  SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2335  SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2336
2337  SDValue One = DAG.getConstant(1, SL, MVT::i32);
2338
2339  SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2340
2341  SDValue R = DAG.getSelect(SL, MVT::i32,
2342    RCmp,
2343    One,
2344    DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2345  R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2346  R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2347
2348  if (!Signed)
2349    return R;
2350
2351  SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2352  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2353}
2354
2355SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2356                                               bool Signed) const {
2357  SDLoc SL(Op);
2358  SDValue Src = Op.getOperand(0);
2359
2360  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2361
2362  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2363                           DAG.getConstant(0, SL, MVT::i32));
2364  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2365                           DAG.getConstant(1, SL, MVT::i32));
2366
2367  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2368                              SL, MVT::f64, Hi);
2369
2370  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2371
2372  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2373                              DAG.getConstant(32, SL, MVT::i32));
2374  // TODO: Should this propagate fast-math-flags?
2375  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2376}
2377
2378SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2379                                               SelectionDAG &DAG) const {
2380  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2381         "operation should be legal");
2382
2383  // TODO: Factor out code common with LowerSINT_TO_FP.
2384
2385  EVT DestVT = Op.getValueType();
2386  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2387    SDLoc DL(Op);
2388    SDValue Src = Op.getOperand(0);
2389
2390    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2391    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2392    SDValue FPRound =
2393        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2394
2395    return FPRound;
2396  }
2397
2398  if (DestVT == MVT::f32)
2399    return LowerINT_TO_FP32(Op, DAG, false);
2400
2401  assert(DestVT == MVT::f64);
2402  return LowerINT_TO_FP64(Op, DAG, false);
2403}
2404
2405SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2406                                              SelectionDAG &DAG) const {
2407  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2408         "operation should be legal");
2409
2410  // TODO: Factor out code common with LowerUINT_TO_FP.
2411
2412  EVT DestVT = Op.getValueType();
2413  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2414    SDLoc DL(Op);
2415    SDValue Src = Op.getOperand(0);
2416
2417    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2418    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2419    SDValue FPRound =
2420        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2421
2422    return FPRound;
2423  }
2424
2425  if (DestVT == MVT::f32)
2426    return LowerINT_TO_FP32(Op, DAG, true);
2427
2428  assert(DestVT == MVT::f64);
2429  return LowerINT_TO_FP64(Op, DAG, true);
2430}
2431
2432SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2433                                               bool Signed) const {
2434  SDLoc SL(Op);
2435
2436  SDValue Src = Op.getOperand(0);
2437
2438  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2439
2440  SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2441                                 MVT::f64);
2442  SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2443                                 MVT::f64);
2444  // TODO: Should this propagate fast-math-flags?
2445  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2446
2447  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2448
2449
2450  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2451
2452  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2453                           MVT::i32, FloorMul);
2454  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2455
2456  SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2457
2458  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2459}
2460
2461SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2462  SDLoc DL(Op);
2463  SDValue N0 = Op.getOperand(0);
2464
2465  // Convert to target node to get known bits
2466  if (N0.getValueType() == MVT::f32)
2467    return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2468
2469  if (getTargetMachine().Options.UnsafeFPMath) {
2470    // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2471    return SDValue();
2472  }
2473
2474  assert(N0.getSimpleValueType() == MVT::f64);
2475
2476  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2477  const unsigned ExpMask = 0x7ff;
2478  const unsigned ExpBiasf64 = 1023;
2479  const unsigned ExpBiasf16 = 15;
2480  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2481  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2482  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2483  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2484                           DAG.getConstant(32, DL, MVT::i64));
2485  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2486  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2487  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2488                          DAG.getConstant(20, DL, MVT::i64));
2489  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2490                  DAG.getConstant(ExpMask, DL, MVT::i32));
2491  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2492  // add the f16 bias (15) to get the biased exponent for the f16 format.
2493  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2494                  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2495
2496  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2497                          DAG.getConstant(8, DL, MVT::i32));
2498  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2499                  DAG.getConstant(0xffe, DL, MVT::i32));
2500
2501  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2502                                  DAG.getConstant(0x1ff, DL, MVT::i32));
2503  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2504
2505  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2506  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2507
2508  // (M != 0 ? 0x0200 : 0) | 0x7c00;
2509  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2510      DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2511                      Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2512
2513  // N = M | (E << 12);
2514  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2515      DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2516                  DAG.getConstant(12, DL, MVT::i32)));
2517
2518  // B = clamp(1-E, 0, 13);
2519  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2520                                  One, E);
2521  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2522  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2523                  DAG.getConstant(13, DL, MVT::i32));
2524
2525  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2526                                   DAG.getConstant(0x1000, DL, MVT::i32));
2527
2528  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2529  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2530  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2531  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2532
2533  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2534  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2535                              DAG.getConstant(0x7, DL, MVT::i32));
2536  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2537                  DAG.getConstant(2, DL, MVT::i32));
2538  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2539                               One, Zero, ISD::SETEQ);
2540  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2541                               One, Zero, ISD::SETGT);
2542  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2543  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2544
2545  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2546                      DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2547  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2548                      I, V, ISD::SETEQ);
2549
2550  // Extract the sign bit.
2551  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2552                            DAG.getConstant(16, DL, MVT::i32));
2553  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2554                     DAG.getConstant(0x8000, DL, MVT::i32));
2555
2556  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2557  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2558}
2559
2560SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2561                                              SelectionDAG &DAG) const {
2562  SDValue Src = Op.getOperand(0);
2563
2564  // TODO: Factor out code common with LowerFP_TO_UINT.
2565
2566  EVT SrcVT = Src.getValueType();
2567  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2568    SDLoc DL(Op);
2569
2570    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2571    SDValue FpToInt32 =
2572        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2573
2574    return FpToInt32;
2575  }
2576
2577  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2578    return LowerFP64_TO_INT(Op, DAG, true);
2579
2580  return SDValue();
2581}
2582
2583SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2584                                              SelectionDAG &DAG) const {
2585  SDValue Src = Op.getOperand(0);
2586
2587  // TODO: Factor out code common with LowerFP_TO_SINT.
2588
2589  EVT SrcVT = Src.getValueType();
2590  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2591    SDLoc DL(Op);
2592
2593    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2594    SDValue FpToInt32 =
2595        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2596
2597    return FpToInt32;
2598  }
2599
2600  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2601    return LowerFP64_TO_INT(Op, DAG, false);
2602
2603  return SDValue();
2604}
2605
2606SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2607                                                     SelectionDAG &DAG) const {
2608  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2609  MVT VT = Op.getSimpleValueType();
2610  MVT ScalarVT = VT.getScalarType();
2611
2612  assert(VT.isVector());
2613
2614  SDValue Src = Op.getOperand(0);
2615  SDLoc DL(Op);
2616
2617  // TODO: Don't scalarize on Evergreen?
2618  unsigned NElts = VT.getVectorNumElements();
2619  SmallVector<SDValue, 8> Args;
2620  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2621
2622  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2623  for (unsigned I = 0; I < NElts; ++I)
2624    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2625
2626  return DAG.getBuildVector(VT, DL, Args);
2627}
2628
2629//===----------------------------------------------------------------------===//
2630// Custom DAG optimizations
2631//===----------------------------------------------------------------------===//
2632
2633static bool isU24(SDValue Op, SelectionDAG &DAG) {
2634  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2635}
2636
2637static bool isI24(SDValue Op, SelectionDAG &DAG) {
2638  EVT VT = Op.getValueType();
2639  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2640                                     // as unsigned 24-bit values.
2641    AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2642}
2643
2644static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
2645                        TargetLowering::DAGCombinerInfo &DCI) {
2646
2647  SelectionDAG &DAG = DCI.DAG;
2648  SDValue Op = Node24->getOperand(OpIdx);
2649  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2650  EVT VT = Op.getValueType();
2651
2652  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
2653  APInt KnownZero, KnownOne;
2654  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
2655  if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
2656    return true;
2657
2658  return false;
2659}
2660
2661template <typename IntTy>
2662static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2663                               uint32_t Width, const SDLoc &DL) {
2664  if (Width + Offset < 32) {
2665    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2666    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2667    return DAG.getConstant(Result, DL, MVT::i32);
2668  }
2669
2670  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2671}
2672
2673static bool hasVolatileUser(SDNode *Val) {
2674  for (SDNode *U : Val->uses()) {
2675    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2676      if (M->isVolatile())
2677        return true;
2678    }
2679  }
2680
2681  return false;
2682}
2683
2684bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2685  // i32 vectors are the canonical memory type.
2686  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2687    return false;
2688
2689  if (!VT.isByteSized())
2690    return false;
2691
2692  unsigned Size = VT.getStoreSize();
2693
2694  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2695    return false;
2696
2697  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2698    return false;
2699
2700  return true;
2701}
2702
2703// Replace load of an illegal type with a store of a bitcast to a friendlier
2704// type.
2705SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2706                                                 DAGCombinerInfo &DCI) const {
2707  if (!DCI.isBeforeLegalize())
2708    return SDValue();
2709
2710  LoadSDNode *LN = cast<LoadSDNode>(N);
2711  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2712    return SDValue();
2713
2714  SDLoc SL(N);
2715  SelectionDAG &DAG = DCI.DAG;
2716  EVT VT = LN->getMemoryVT();
2717
2718  unsigned Size = VT.getStoreSize();
2719  unsigned Align = LN->getAlignment();
2720  if (Align < Size && isTypeLegal(VT)) {
2721    bool IsFast;
2722    unsigned AS = LN->getAddressSpace();
2723
2724    // Expand unaligned loads earlier than legalization. Due to visitation order
2725    // problems during legalization, the emitted instructions to pack and unpack
2726    // the bytes again are not eliminated in the case of an unaligned copy.
2727    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2728      if (VT.isVector())
2729        return scalarizeVectorLoad(LN, DAG);
2730
2731      SDValue Ops[2];
2732      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2733      return DAG.getMergeValues(Ops, SDLoc(N));
2734    }
2735
2736    if (!IsFast)
2737      return SDValue();
2738  }
2739
2740  if (!shouldCombineMemoryType(VT))
2741    return SDValue();
2742
2743  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2744
2745  SDValue NewLoad
2746    = DAG.getLoad(NewVT, SL, LN->getChain(),
2747                  LN->getBasePtr(), LN->getMemOperand());
2748
2749  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2750  DCI.CombineTo(N, BC, NewLoad.getValue(1));
2751  return SDValue(N, 0);
2752}
2753
2754// Replace store of an illegal type with a store of a bitcast to a friendlier
2755// type.
2756SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2757                                                  DAGCombinerInfo &DCI) const {
2758  if (!DCI.isBeforeLegalize())
2759    return SDValue();
2760
2761  StoreSDNode *SN = cast<StoreSDNode>(N);
2762  if (SN->isVolatile() || !ISD::isNormalStore(SN))
2763    return SDValue();
2764
2765  EVT VT = SN->getMemoryVT();
2766  unsigned Size = VT.getStoreSize();
2767
2768  SDLoc SL(N);
2769  SelectionDAG &DAG = DCI.DAG;
2770  unsigned Align = SN->getAlignment();
2771  if (Align < Size && isTypeLegal(VT)) {
2772    bool IsFast;
2773    unsigned AS = SN->getAddressSpace();
2774
2775    // Expand unaligned stores earlier than legalization. Due to visitation
2776    // order problems during legalization, the emitted instructions to pack and
2777    // unpack the bytes again are not eliminated in the case of an unaligned
2778    // copy.
2779    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2780      if (VT.isVector())
2781        return scalarizeVectorStore(SN, DAG);
2782
2783      return expandUnalignedStore(SN, DAG);
2784    }
2785
2786    if (!IsFast)
2787      return SDValue();
2788  }
2789
2790  if (!shouldCombineMemoryType(VT))
2791    return SDValue();
2792
2793  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2794  SDValue Val = SN->getValue();
2795
2796  //DCI.AddToWorklist(Val.getNode());
2797
2798  bool OtherUses = !Val.hasOneUse();
2799  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2800  if (OtherUses) {
2801    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2802    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2803  }
2804
2805  return DAG.getStore(SN->getChain(), SL, CastVal,
2806                      SN->getBasePtr(), SN->getMemOperand());
2807}
2808
2809SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
2810                                                  DAGCombinerInfo &DCI) const {
2811  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
2812  if (!CSrc)
2813    return SDValue();
2814
2815  const APFloat &F = CSrc->getValueAPF();
2816  APFloat Zero = APFloat::getZero(F.getSemantics());
2817  APFloat::cmpResult Cmp0 = F.compare(Zero);
2818  if (Cmp0 == APFloat::cmpLessThan ||
2819      (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
2820    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
2821  }
2822
2823  APFloat One(F.getSemantics(), "1.0");
2824  APFloat::cmpResult Cmp1 = F.compare(One);
2825  if (Cmp1 == APFloat::cmpGreaterThan)
2826    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
2827
2828  return SDValue(CSrc, 0);
2829}
2830
2831// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2832// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2833// issues.
2834SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2835                                                        DAGCombinerInfo &DCI) const {
2836  SelectionDAG &DAG = DCI.DAG;
2837  SDValue N0 = N->getOperand(0);
2838
2839  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2840  //     (vt2 (truncate (assertzext vt0:x, vt1)))
2841  if (N0.getOpcode() == ISD::TRUNCATE) {
2842    SDValue N1 = N->getOperand(1);
2843    EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2844    SDLoc SL(N);
2845
2846    SDValue Src = N0.getOperand(0);
2847    EVT SrcVT = Src.getValueType();
2848    if (SrcVT.bitsGE(ExtVT)) {
2849      SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2850      return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2851    }
2852  }
2853
2854  return SDValue();
2855}
2856/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2857/// binary operation \p Opc to it with the corresponding constant operands.
2858SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2859  DAGCombinerInfo &DCI, const SDLoc &SL,
2860  unsigned Opc, SDValue LHS,
2861  uint32_t ValLo, uint32_t ValHi) const {
2862  SelectionDAG &DAG = DCI.DAG;
2863  SDValue Lo, Hi;
2864  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2865
2866  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2867  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
2868
2869  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
2870  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
2871
2872  // Re-visit the ands. It's possible we eliminated one of them and it could
2873  // simplify the vector.
2874  DCI.AddToWorklist(Lo.getNode());
2875  DCI.AddToWorklist(Hi.getNode());
2876
2877  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2878  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2879}
2880
2881SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
2882                                                DAGCombinerInfo &DCI) const {
2883  EVT VT = N->getValueType(0);
2884
2885  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2886  if (!RHS)
2887    return SDValue();
2888
2889  SDValue LHS = N->getOperand(0);
2890  unsigned RHSVal = RHS->getZExtValue();
2891  if (!RHSVal)
2892    return LHS;
2893
2894  SDLoc SL(N);
2895  SelectionDAG &DAG = DCI.DAG;
2896
2897  switch (LHS->getOpcode()) {
2898  default:
2899    break;
2900  case ISD::ZERO_EXTEND:
2901  case ISD::SIGN_EXTEND:
2902  case ISD::ANY_EXTEND: {
2903    SDValue X = LHS->getOperand(0);
2904
2905    if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
2906        isTypeLegal(MVT::v2i16)) {
2907      // Prefer build_vector as the canonical form if packed types are legal.
2908      // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
2909      SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
2910       { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
2911      return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
2912    }
2913
2914    // shl (ext x) => zext (shl x), if shift does not overflow int
2915    if (VT != MVT::i64)
2916      break;
2917    KnownBits Known;
2918    DAG.computeKnownBits(X, Known);
2919    unsigned LZ = Known.countMinLeadingZeros();
2920    if (LZ < RHSVal)
2921      break;
2922    EVT XVT = X.getValueType();
2923    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
2924    return DAG.getZExtOrTrunc(Shl, SL, VT);
2925  }
2926  }
2927
2928  if (VT != MVT::i64)
2929    return SDValue();
2930
2931  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2932
2933  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
2934  // common case, splitting this into a move and a 32-bit shift is faster and
2935  // the same code size.
2936  if (RHSVal < 32)
2937    return SDValue();
2938
2939  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
2940
2941  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
2942  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
2943
2944  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2945
2946  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
2947  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2948}
2949
2950SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
2951                                                DAGCombinerInfo &DCI) const {
2952  if (N->getValueType(0) != MVT::i64)
2953    return SDValue();
2954
2955  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2956  if (!RHS)
2957    return SDValue();
2958
2959  SelectionDAG &DAG = DCI.DAG;
2960  SDLoc SL(N);
2961  unsigned RHSVal = RHS->getZExtValue();
2962
2963  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
2964  if (RHSVal == 32) {
2965    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2966    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2967                                   DAG.getConstant(31, SL, MVT::i32));
2968
2969    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
2970    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2971  }
2972
2973  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
2974  if (RHSVal == 63) {
2975    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2976    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2977                                   DAG.getConstant(31, SL, MVT::i32));
2978    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
2979    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2980  }
2981
2982  return SDValue();
2983}
2984
2985SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
2986                                                DAGCombinerInfo &DCI) const {
2987  if (N->getValueType(0) != MVT::i64)
2988    return SDValue();
2989
2990  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2991  if (!RHS)
2992    return SDValue();
2993
2994  unsigned ShiftAmt = RHS->getZExtValue();
2995  if (ShiftAmt < 32)
2996    return SDValue();
2997
2998  // srl i64:x, C for C >= 32
2999  // =>
3000  //   build_pair (srl hi_32(x), C - 32), 0
3001
3002  SelectionDAG &DAG = DCI.DAG;
3003  SDLoc SL(N);
3004
3005  SDValue One = DAG.getConstant(1, SL, MVT::i32);
3006  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3007
3008  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
3009  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
3010                           VecOp, One);
3011
3012  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3013  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3014
3015  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3016
3017  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3018}
3019
3020// We need to specifically handle i64 mul here to avoid unnecessary conversion
3021// instructions. If we only match on the legalized i64 mul expansion,
3022// SimplifyDemandedBits will be unable to remove them because there will be
3023// multiple uses due to the separate mul + mulh[su].
3024static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3025                        SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3026  if (Size <= 32) {
3027    unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3028    return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3029  }
3030
3031  // Because we want to eliminate extension instructions before the
3032  // operation, we need to create a single user here (i.e. not the separate
3033  // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
3034
3035  unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
3036
3037  SDValue Mul = DAG.getNode(MulOpc, SL,
3038                            DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
3039
3040  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
3041                     Mul.getValue(0), Mul.getValue(1));
3042}
3043
3044SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3045                                                DAGCombinerInfo &DCI) const {
3046  EVT VT = N->getValueType(0);
3047
3048  unsigned Size = VT.getSizeInBits();
3049  if (VT.isVector() || Size > 64)
3050    return SDValue();
3051
3052  // There are i16 integer mul/mad.
3053  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3054    return SDValue();
3055
3056  SelectionDAG &DAG = DCI.DAG;
3057  SDLoc DL(N);
3058
3059  SDValue N0 = N->getOperand(0);
3060  SDValue N1 = N->getOperand(1);
3061  SDValue Mul;
3062
3063  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3064    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3065    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3066    Mul = getMul24(DAG, DL, N0, N1, Size, false);
3067  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3068    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3069    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3070    Mul = getMul24(DAG, DL, N0, N1, Size, true);
3071  } else {
3072    return SDValue();
3073  }
3074
3075  // We need to use sext even for MUL_U24, because MUL_U24 is used
3076  // for signed multiply of 8 and 16-bit types.
3077  return DAG.getSExtOrTrunc(Mul, DL, VT);
3078}
3079
3080SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3081                                                  DAGCombinerInfo &DCI) const {
3082  EVT VT = N->getValueType(0);
3083
3084  if (!Subtarget->hasMulI24() || VT.isVector())
3085    return SDValue();
3086
3087  SelectionDAG &DAG = DCI.DAG;
3088  SDLoc DL(N);
3089
3090  SDValue N0 = N->getOperand(0);
3091  SDValue N1 = N->getOperand(1);
3092
3093  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3094    return SDValue();
3095
3096  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3097  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3098
3099  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3100  DCI.AddToWorklist(Mulhi.getNode());
3101  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3102}
3103
3104SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3105                                                  DAGCombinerInfo &DCI) const {
3106  EVT VT = N->getValueType(0);
3107
3108  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3109    return SDValue();
3110
3111  SelectionDAG &DAG = DCI.DAG;
3112  SDLoc DL(N);
3113
3114  SDValue N0 = N->getOperand(0);
3115  SDValue N1 = N->getOperand(1);
3116
3117  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3118    return SDValue();
3119
3120  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3121  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3122
3123  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3124  DCI.AddToWorklist(Mulhi.getNode());
3125  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3126}
3127
3128SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
3129  SDNode *N, DAGCombinerInfo &DCI) const {
3130  SelectionDAG &DAG = DCI.DAG;
3131
3132  // Simplify demanded bits before splitting into multiple users.
3133  if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
3134    return SDValue();
3135
3136  SDValue N0 = N->getOperand(0);
3137  SDValue N1 = N->getOperand(1);
3138
3139  bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
3140
3141  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3142  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3143
3144  SDLoc SL(N);
3145
3146  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3147  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3148  return DAG.getMergeValues({ MulLo, MulHi }, SL);
3149}
3150
3151static bool isNegativeOne(SDValue Val) {
3152  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3153    return C->isAllOnesValue();
3154  return false;
3155}
3156
3157SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3158                                          SDValue Op,
3159                                          const SDLoc &DL,
3160                                          unsigned Opc) const {
3161  EVT VT = Op.getValueType();
3162  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3163  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3164                              LegalVT != MVT::i16))
3165    return SDValue();
3166
3167  if (VT != MVT::i32)
3168    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3169
3170  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3171  if (VT != MVT::i32)
3172    FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3173
3174  return FFBX;
3175}
3176
3177// The native instructions return -1 on 0 input. Optimize out a select that
3178// produces -1 on 0.
3179//
3180// TODO: If zero is not undef, we could also do this if the output is compared
3181// against the bitwidth.
3182//
3183// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3184SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3185                                                 SDValue LHS, SDValue RHS,
3186                                                 DAGCombinerInfo &DCI) const {
3187  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3188  if (!CmpRhs || !CmpRhs->isNullValue())
3189    return SDValue();
3190
3191  SelectionDAG &DAG = DCI.DAG;
3192  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3193  SDValue CmpLHS = Cond.getOperand(0);
3194
3195  unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
3196                                           AMDGPUISD::FFBH_U32;
3197
3198  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3199  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3200  if (CCOpcode == ISD::SETEQ &&
3201      (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3202      RHS.getOperand(0) == CmpLHS &&
3203      isNegativeOne(LHS)) {
3204    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3205  }
3206
3207  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3208  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3209  if (CCOpcode == ISD::SETNE &&
3210      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3211      LHS.getOperand(0) == CmpLHS &&
3212      isNegativeOne(RHS)) {
3213    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3214  }
3215
3216  return SDValue();
3217}
3218
3219static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3220                                         unsigned Op,
3221                                         const SDLoc &SL,
3222                                         SDValue Cond,
3223                                         SDValue N1,
3224                                         SDValue N2) {
3225  SelectionDAG &DAG = DCI.DAG;
3226  EVT VT = N1.getValueType();
3227
3228  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3229                                  N1.getOperand(0), N2.getOperand(0));
3230  DCI.AddToWorklist(NewSelect.getNode());
3231  return DAG.getNode(Op, SL, VT, NewSelect);
3232}
3233
3234// Pull a free FP operation out of a select so it may fold into uses.
3235//
3236// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3237// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3238//
3239// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3240// select c, (fabs x), +k -> fabs (select c, x, k)
3241static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3242                                    SDValue N) {
3243  SelectionDAG &DAG = DCI.DAG;
3244  SDValue Cond = N.getOperand(0);
3245  SDValue LHS = N.getOperand(1);
3246  SDValue RHS = N.getOperand(2);
3247
3248  EVT VT = N.getValueType();
3249  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3250      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3251    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3252                                     SDLoc(N), Cond, LHS, RHS);
3253  }
3254
3255  bool Inv = false;
3256  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3257    std::swap(LHS, RHS);
3258    Inv = true;
3259  }
3260
3261  // TODO: Support vector constants.
3262  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3263  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3264    SDLoc SL(N);
3265    // If one side is an fneg/fabs and the other is a constant, we can push the
3266    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3267    SDValue NewLHS = LHS.getOperand(0);
3268    SDValue NewRHS = RHS;
3269
3270    // Careful: if the neg can be folded up, don't try to pull it back down.
3271    bool ShouldFoldNeg = true;
3272
3273    if (NewLHS.hasOneUse()) {
3274      unsigned Opc = NewLHS.getOpcode();
3275      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3276        ShouldFoldNeg = false;
3277      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3278        ShouldFoldNeg = false;
3279    }
3280
3281    if (ShouldFoldNeg) {
3282      if (LHS.getOpcode() == ISD::FNEG)
3283        NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3284      else if (CRHS->isNegative())
3285        return SDValue();
3286
3287      if (Inv)
3288        std::swap(NewLHS, NewRHS);
3289
3290      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3291                                      Cond, NewLHS, NewRHS);
3292      DCI.AddToWorklist(NewSelect.getNode());
3293      return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3294    }
3295  }
3296
3297  return SDValue();
3298}
3299
3300
3301SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3302                                                   DAGCombinerInfo &DCI) const {
3303  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3304    return Folded;
3305
3306  SDValue Cond = N->getOperand(0);
3307  if (Cond.getOpcode() != ISD::SETCC)
3308    return SDValue();
3309
3310  EVT VT = N->getValueType(0);
3311  SDValue LHS = Cond.getOperand(0);
3312  SDValue RHS = Cond.getOperand(1);
3313  SDValue CC = Cond.getOperand(2);
3314
3315  SDValue True = N->getOperand(1);
3316  SDValue False = N->getOperand(2);
3317
3318  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3319    SelectionDAG &DAG = DCI.DAG;
3320    if ((DAG.isConstantValueOfAnyType(True) ||
3321         DAG.isConstantValueOfAnyType(True)) &&
3322        (!DAG.isConstantValueOfAnyType(False) &&
3323         !DAG.isConstantValueOfAnyType(False))) {
3324      // Swap cmp + select pair to move constant to false input.
3325      // This will allow using VOPC cndmasks more often.
3326      // select (setcc x, y), k, x -> select (setcc y, x) x, x
3327
3328      SDLoc SL(N);
3329      ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3330                                            LHS.getValueType().isInteger());
3331
3332      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3333      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3334    }
3335
3336    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3337      SDValue MinMax
3338        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3339      // Revisit this node so we can catch min3/max3/med3 patterns.
3340      //DCI.AddToWorklist(MinMax.getNode());
3341      return MinMax;
3342    }
3343  }
3344
3345  // There's no reason to not do this if the condition has other uses.
3346  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3347}
3348
3349static bool isConstantFPZero(SDValue N) {
3350  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
3351    return C->isZero() && !C->isNegative();
3352  return false;
3353}
3354
3355static unsigned inverseMinMax(unsigned Opc) {
3356  switch (Opc) {
3357  case ISD::FMAXNUM:
3358    return ISD::FMINNUM;
3359  case ISD::FMINNUM:
3360    return ISD::FMAXNUM;
3361  case AMDGPUISD::FMAX_LEGACY:
3362    return AMDGPUISD::FMIN_LEGACY;
3363  case AMDGPUISD::FMIN_LEGACY:
3364    return  AMDGPUISD::FMAX_LEGACY;
3365  default:
3366    llvm_unreachable("invalid min/max opcode");
3367  }
3368}
3369
3370SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3371                                                 DAGCombinerInfo &DCI) const {
3372  SelectionDAG &DAG = DCI.DAG;
3373  SDValue N0 = N->getOperand(0);
3374  EVT VT = N->getValueType(0);
3375
3376  unsigned Opc = N0.getOpcode();
3377
3378  // If the input has multiple uses and we can either fold the negate down, or
3379  // the other uses cannot, give up. This both prevents unprofitable
3380  // transformations and infinite loops: we won't repeatedly try to fold around
3381  // a negate that has no 'good' form.
3382  if (N0.hasOneUse()) {
3383    // This may be able to fold into the source, but at a code size cost. Don't
3384    // fold if the fold into the user is free.
3385    if (allUsesHaveSourceMods(N, 0))
3386      return SDValue();
3387  } else {
3388    if (fnegFoldsIntoOp(Opc) &&
3389        (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3390      return SDValue();
3391  }
3392
3393  SDLoc SL(N);
3394  switch (Opc) {
3395  case ISD::FADD: {
3396    if (!mayIgnoreSignedZero(N0))
3397      return SDValue();
3398
3399    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3400    SDValue LHS = N0.getOperand(0);
3401    SDValue RHS = N0.getOperand(1);
3402
3403    if (LHS.getOpcode() != ISD::FNEG)
3404      LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3405    else
3406      LHS = LHS.getOperand(0);
3407
3408    if (RHS.getOpcode() != ISD::FNEG)
3409      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3410    else
3411      RHS = RHS.getOperand(0);
3412
3413    SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3414    if (!N0.hasOneUse())
3415      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3416    return Res;
3417  }
3418  case ISD::FMUL:
3419  case AMDGPUISD::FMUL_LEGACY: {
3420    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3421    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3422    SDValue LHS = N0.getOperand(0);
3423    SDValue RHS = N0.getOperand(1);
3424
3425    if (LHS.getOpcode() == ISD::FNEG)
3426      LHS = LHS.getOperand(0);
3427    else if (RHS.getOpcode() == ISD::FNEG)
3428      RHS = RHS.getOperand(0);
3429    else
3430      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3431
3432    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3433    if (!N0.hasOneUse())
3434      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3435    return Res;
3436  }
3437  case ISD::FMA:
3438  case ISD::FMAD: {
3439    if (!mayIgnoreSignedZero(N0))
3440      return SDValue();
3441
3442    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3443    SDValue LHS = N0.getOperand(0);
3444    SDValue MHS = N0.getOperand(1);
3445    SDValue RHS = N0.getOperand(2);
3446
3447    if (LHS.getOpcode() == ISD::FNEG)
3448      LHS = LHS.getOperand(0);
3449    else if (MHS.getOpcode() == ISD::FNEG)
3450      MHS = MHS.getOperand(0);
3451    else
3452      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3453
3454    if (RHS.getOpcode() != ISD::FNEG)
3455      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3456    else
3457      RHS = RHS.getOperand(0);
3458
3459    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3460    if (!N0.hasOneUse())
3461      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3462    return Res;
3463  }
3464  case ISD::FMAXNUM:
3465  case ISD::FMINNUM:
3466  case AMDGPUISD::FMAX_LEGACY:
3467  case AMDGPUISD::FMIN_LEGACY: {
3468    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3469    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3470    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3471    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3472
3473    SDValue LHS = N0.getOperand(0);
3474    SDValue RHS = N0.getOperand(1);
3475
3476    // 0 doesn't have a negated inline immediate.
3477    // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
3478    // operations.
3479    if (isConstantFPZero(RHS))
3480      return SDValue();
3481
3482    SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3483    SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3484    unsigned Opposite = inverseMinMax(Opc);
3485
3486    SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3487    if (!N0.hasOneUse())
3488      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3489    return Res;
3490  }
3491  case ISD::FP_EXTEND:
3492  case ISD::FTRUNC:
3493  case ISD::FRINT:
3494  case ISD::FNEARBYINT: // XXX - Should fround be handled?
3495  case ISD::FSIN:
3496  case AMDGPUISD::RCP:
3497  case AMDGPUISD::RCP_LEGACY:
3498  case AMDGPUISD::SIN_HW: {
3499    SDValue CvtSrc = N0.getOperand(0);
3500    if (CvtSrc.getOpcode() == ISD::FNEG) {
3501      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3502      // (fneg (rcp (fneg x))) -> (rcp x)
3503      return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3504    }
3505
3506    if (!N0.hasOneUse())
3507      return SDValue();
3508
3509    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3510    // (fneg (rcp x)) -> (rcp (fneg x))
3511    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3512    return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3513  }
3514  case ISD::FP_ROUND: {
3515    SDValue CvtSrc = N0.getOperand(0);
3516
3517    if (CvtSrc.getOpcode() == ISD::FNEG) {
3518      // (fneg (fp_round (fneg x))) -> (fp_round x)
3519      return DAG.getNode(ISD::FP_ROUND, SL, VT,
3520                         CvtSrc.getOperand(0), N0.getOperand(1));
3521    }
3522
3523    if (!N0.hasOneUse())
3524      return SDValue();
3525
3526    // (fneg (fp_round x)) -> (fp_round (fneg x))
3527    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3528    return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3529  }
3530  case ISD::FP16_TO_FP: {
3531    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3532    // f16, but legalization of f16 fneg ends up pulling it out of the source.
3533    // Put the fneg back as a legal source operation that can be matched later.
3534    SDLoc SL(N);
3535
3536    SDValue Src = N0.getOperand(0);
3537    EVT SrcVT = Src.getValueType();
3538
3539    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3540    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3541                                  DAG.getConstant(0x8000, SL, SrcVT));
3542    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3543  }
3544  default:
3545    return SDValue();
3546  }
3547}
3548
3549SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3550                                                 DAGCombinerInfo &DCI) const {
3551  SelectionDAG &DAG = DCI.DAG;
3552  SDValue N0 = N->getOperand(0);
3553
3554  if (!N0.hasOneUse())
3555    return SDValue();
3556
3557  switch (N0.getOpcode()) {
3558  case ISD::FP16_TO_FP: {
3559    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3560    SDLoc SL(N);
3561    SDValue Src = N0.getOperand(0);
3562    EVT SrcVT = Src.getValueType();
3563
3564    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3565    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3566                                  DAG.getConstant(0x7fff, SL, SrcVT));
3567    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3568  }
3569  default:
3570    return SDValue();
3571  }
3572}
3573
3574SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3575                                                DAGCombinerInfo &DCI) const {
3576  SelectionDAG &DAG = DCI.DAG;
3577  SDLoc DL(N);
3578
3579  switch(N->getOpcode()) {
3580  default:
3581    break;
3582  case ISD::BITCAST: {
3583    EVT DestVT = N->getValueType(0);
3584
3585    // Push casts through vector builds. This helps avoid emitting a large
3586    // number of copies when materializing floating point vector constants.
3587    //
3588    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3589    //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3590    if (DestVT.isVector()) {
3591      SDValue Src = N->getOperand(0);
3592      if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3593        EVT SrcVT = Src.getValueType();
3594        unsigned NElts = DestVT.getVectorNumElements();
3595
3596        if (SrcVT.getVectorNumElements() == NElts) {
3597          EVT DestEltVT = DestVT.getVectorElementType();
3598
3599          SmallVector<SDValue, 8> CastedElts;
3600          SDLoc SL(N);
3601          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3602            SDValue Elt = Src.getOperand(I);
3603            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3604          }
3605
3606          return DAG.getBuildVector(DestVT, SL, CastedElts);
3607        }
3608      }
3609    }
3610
3611    if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3612      break;
3613
3614    // Fold bitcasts of constants.
3615    //
3616    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3617    // TODO: Generalize and move to DAGCombiner
3618    SDValue Src = N->getOperand(0);
3619    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3620      assert(Src.getValueType() == MVT::i64);
3621      SDLoc SL(N);
3622      uint64_t CVal = C->getZExtValue();
3623      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3624                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3625                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3626    }
3627
3628    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3629      const APInt &Val = C->getValueAPF().bitcastToAPInt();
3630      SDLoc SL(N);
3631      uint64_t CVal = Val.getZExtValue();
3632      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3633                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3634                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3635
3636      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3637    }
3638
3639    break;
3640  }
3641  case ISD::SHL: {
3642    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3643      break;
3644
3645    return performShlCombine(N, DCI);
3646  }
3647  case ISD::SRL: {
3648    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3649      break;
3650
3651    return performSrlCombine(N, DCI);
3652  }
3653  case ISD::SRA: {
3654    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3655      break;
3656
3657    return performSraCombine(N, DCI);
3658  }
3659  case ISD::MUL:
3660    return performMulCombine(N, DCI);
3661  case ISD::MULHS:
3662    return performMulhsCombine(N, DCI);
3663  case ISD::MULHU:
3664    return performMulhuCombine(N, DCI);
3665  case AMDGPUISD::MUL_I24:
3666  case AMDGPUISD::MUL_U24:
3667  case AMDGPUISD::MULHI_I24:
3668  case AMDGPUISD::MULHI_U24: {
3669    // If the first call to simplify is successfull, then N may end up being
3670    // deleted, so we shouldn't call simplifyI24 again.
3671    simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
3672    return SDValue();
3673  }
3674  case AMDGPUISD::MUL_LOHI_I24:
3675  case AMDGPUISD::MUL_LOHI_U24:
3676    return performMulLoHi24Combine(N, DCI);
3677  case ISD::SELECT:
3678    return performSelectCombine(N, DCI);
3679  case ISD::FNEG:
3680    return performFNegCombine(N, DCI);
3681  case ISD::FABS:
3682    return performFAbsCombine(N, DCI);
3683  case AMDGPUISD::BFE_I32:
3684  case AMDGPUISD::BFE_U32: {
3685    assert(!N->getValueType(0).isVector() &&
3686           "Vector handling of BFE not implemented");
3687    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3688    if (!Width)
3689      break;
3690
3691    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3692    if (WidthVal == 0)
3693      return DAG.getConstant(0, DL, MVT::i32);
3694
3695    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3696    if (!Offset)
3697      break;
3698
3699    SDValue BitsFrom = N->getOperand(0);
3700    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
3701
3702    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
3703
3704    if (OffsetVal == 0) {
3705      // This is already sign / zero extended, so try to fold away extra BFEs.
3706      unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
3707
3708      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
3709      if (OpSignBits >= SignBits)
3710        return BitsFrom;
3711
3712      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
3713      if (Signed) {
3714        // This is a sign_extend_inreg. Replace it to take advantage of existing
3715        // DAG Combines. If not eliminated, we will match back to BFE during
3716        // selection.
3717
3718        // TODO: The sext_inreg of extended types ends, although we can could
3719        // handle them in a single BFE.
3720        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
3721                           DAG.getValueType(SmallVT));
3722      }
3723
3724      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
3725    }
3726
3727    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
3728      if (Signed) {
3729        return constantFoldBFE<int32_t>(DAG,
3730                                        CVal->getSExtValue(),
3731                                        OffsetVal,
3732                                        WidthVal,
3733                                        DL);
3734      }
3735
3736      return constantFoldBFE<uint32_t>(DAG,
3737                                       CVal->getZExtValue(),
3738                                       OffsetVal,
3739                                       WidthVal,
3740                                       DL);
3741    }
3742
3743    if ((OffsetVal + WidthVal) >= 32 &&
3744        !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
3745      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
3746      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
3747                         BitsFrom, ShiftVal);
3748    }
3749
3750    if (BitsFrom.hasOneUse()) {
3751      APInt Demanded = APInt::getBitsSet(32,
3752                                         OffsetVal,
3753                                         OffsetVal + WidthVal);
3754
3755      KnownBits Known;
3756      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
3757                                            !DCI.isBeforeLegalizeOps());
3758      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3759      if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
3760          TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
3761        DCI.CommitTargetLoweringOpt(TLO);
3762      }
3763    }
3764
3765    break;
3766  }
3767  case ISD::LOAD:
3768    return performLoadCombine(N, DCI);
3769  case ISD::STORE:
3770    return performStoreCombine(N, DCI);
3771  case AMDGPUISD::CLAMP:
3772    return performClampCombine(N, DCI);
3773  case AMDGPUISD::RCP: {
3774    if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
3775      // XXX - Should this flush denormals?
3776      const APFloat &Val = CFP->getValueAPF();
3777      APFloat One(Val.getSemantics(), "1.0");
3778      return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3779    }
3780
3781    break;
3782  }
3783  case ISD::AssertZext:
3784  case ISD::AssertSext:
3785    return performAssertSZExtCombine(N, DCI);
3786  }
3787  return SDValue();
3788}
3789
3790//===----------------------------------------------------------------------===//
3791// Helper functions
3792//===----------------------------------------------------------------------===//
3793
3794SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3795                                                   const TargetRegisterClass *RC,
3796                                                   unsigned Reg, EVT VT,
3797                                                   const SDLoc &SL,
3798                                                   bool RawReg) const {
3799  MachineFunction &MF = DAG.getMachineFunction();
3800  MachineRegisterInfo &MRI = MF.getRegInfo();
3801  unsigned VReg;
3802
3803  if (!MRI.isLiveIn(Reg)) {
3804    VReg = MRI.createVirtualRegister(RC);
3805    MRI.addLiveIn(Reg, VReg);
3806  } else {
3807    VReg = MRI.getLiveInVirtReg(Reg);
3808  }
3809
3810  if (RawReg)
3811    return DAG.getRegister(VReg, VT);
3812
3813  return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
3814}
3815
3816SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
3817                                                  EVT VT,
3818                                                  const SDLoc &SL,
3819                                                  int64_t Offset) const {
3820  MachineFunction &MF = DAG.getMachineFunction();
3821  MachineFrameInfo &MFI = MF.getFrameInfo();
3822
3823  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
3824  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
3825  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
3826
3827  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
3828                     MachineMemOperand::MODereferenceable |
3829                     MachineMemOperand::MOInvariant);
3830}
3831
3832SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
3833                                                   const SDLoc &SL,
3834                                                   SDValue Chain,
3835                                                   SDValue StackPtr,
3836                                                   SDValue ArgVal,
3837                                                   int64_t Offset) const {
3838  MachineFunction &MF = DAG.getMachineFunction();
3839  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
3840
3841  SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
3842  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
3843                               MachineMemOperand::MODereferenceable);
3844  return Store;
3845}
3846
3847SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
3848                                             const TargetRegisterClass *RC,
3849                                             EVT VT, const SDLoc &SL,
3850                                             const ArgDescriptor &Arg) const {
3851  assert(Arg && "Attempting to load missing argument");
3852
3853  if (Arg.isRegister())
3854    return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
3855  return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
3856}
3857
3858uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
3859    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
3860  unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
3861  uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
3862  switch (Param) {
3863  case GRID_DIM:
3864    return ArgOffset;
3865  case GRID_OFFSET:
3866    return ArgOffset + 4;
3867  }
3868  llvm_unreachable("unexpected implicit parameter type");
3869}
3870
3871#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
3872
3873const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
3874  switch ((AMDGPUISD::NodeType)Opcode) {
3875  case AMDGPUISD::FIRST_NUMBER: break;
3876  // AMDIL DAG nodes
3877  NODE_NAME_CASE(UMUL);
3878  NODE_NAME_CASE(BRANCH_COND);
3879
3880  // AMDGPU DAG nodes
3881  NODE_NAME_CASE(IF)
3882  NODE_NAME_CASE(ELSE)
3883  NODE_NAME_CASE(LOOP)
3884  NODE_NAME_CASE(CALL)
3885  NODE_NAME_CASE(TC_RETURN)
3886  NODE_NAME_CASE(TRAP)
3887  NODE_NAME_CASE(RET_FLAG)
3888  NODE_NAME_CASE(RETURN_TO_EPILOG)
3889  NODE_NAME_CASE(ENDPGM)
3890  NODE_NAME_CASE(DWORDADDR)
3891  NODE_NAME_CASE(FRACT)
3892  NODE_NAME_CASE(SETCC)
3893  NODE_NAME_CASE(SETREG)
3894  NODE_NAME_CASE(FMA_W_CHAIN)
3895  NODE_NAME_CASE(FMUL_W_CHAIN)
3896  NODE_NAME_CASE(CLAMP)
3897  NODE_NAME_CASE(COS_HW)
3898  NODE_NAME_CASE(SIN_HW)
3899  NODE_NAME_CASE(FMAX_LEGACY)
3900  NODE_NAME_CASE(FMIN_LEGACY)
3901  NODE_NAME_CASE(FMAX3)
3902  NODE_NAME_CASE(SMAX3)
3903  NODE_NAME_CASE(UMAX3)
3904  NODE_NAME_CASE(FMIN3)
3905  NODE_NAME_CASE(SMIN3)
3906  NODE_NAME_CASE(UMIN3)
3907  NODE_NAME_CASE(FMED3)
3908  NODE_NAME_CASE(SMED3)
3909  NODE_NAME_CASE(UMED3)
3910  NODE_NAME_CASE(URECIP)
3911  NODE_NAME_CASE(DIV_SCALE)
3912  NODE_NAME_CASE(DIV_FMAS)
3913  NODE_NAME_CASE(DIV_FIXUP)
3914  NODE_NAME_CASE(FMAD_FTZ)
3915  NODE_NAME_CASE(TRIG_PREOP)
3916  NODE_NAME_CASE(RCP)
3917  NODE_NAME_CASE(RSQ)
3918  NODE_NAME_CASE(RCP_LEGACY)
3919  NODE_NAME_CASE(RSQ_LEGACY)
3920  NODE_NAME_CASE(FMUL_LEGACY)
3921  NODE_NAME_CASE(RSQ_CLAMP)
3922  NODE_NAME_CASE(LDEXP)
3923  NODE_NAME_CASE(FP_CLASS)
3924  NODE_NAME_CASE(DOT4)
3925  NODE_NAME_CASE(CARRY)
3926  NODE_NAME_CASE(BORROW)
3927  NODE_NAME_CASE(BFE_U32)
3928  NODE_NAME_CASE(BFE_I32)
3929  NODE_NAME_CASE(BFI)
3930  NODE_NAME_CASE(BFM)
3931  NODE_NAME_CASE(FFBH_U32)
3932  NODE_NAME_CASE(FFBH_I32)
3933  NODE_NAME_CASE(FFBL_B32)
3934  NODE_NAME_CASE(MUL_U24)
3935  NODE_NAME_CASE(MUL_I24)
3936  NODE_NAME_CASE(MULHI_U24)
3937  NODE_NAME_CASE(MULHI_I24)
3938  NODE_NAME_CASE(MUL_LOHI_U24)
3939  NODE_NAME_CASE(MUL_LOHI_I24)
3940  NODE_NAME_CASE(MAD_U24)
3941  NODE_NAME_CASE(MAD_I24)
3942  NODE_NAME_CASE(MAD_I64_I32)
3943  NODE_NAME_CASE(MAD_U64_U32)
3944  NODE_NAME_CASE(TEXTURE_FETCH)
3945  NODE_NAME_CASE(EXPORT)
3946  NODE_NAME_CASE(EXPORT_DONE)
3947  NODE_NAME_CASE(R600_EXPORT)
3948  NODE_NAME_CASE(CONST_ADDRESS)
3949  NODE_NAME_CASE(REGISTER_LOAD)
3950  NODE_NAME_CASE(REGISTER_STORE)
3951  NODE_NAME_CASE(SAMPLE)
3952  NODE_NAME_CASE(SAMPLEB)
3953  NODE_NAME_CASE(SAMPLED)
3954  NODE_NAME_CASE(SAMPLEL)
3955  NODE_NAME_CASE(CVT_F32_UBYTE0)
3956  NODE_NAME_CASE(CVT_F32_UBYTE1)
3957  NODE_NAME_CASE(CVT_F32_UBYTE2)
3958  NODE_NAME_CASE(CVT_F32_UBYTE3)
3959  NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
3960  NODE_NAME_CASE(CVT_PKNORM_I16_F32)
3961  NODE_NAME_CASE(CVT_PKNORM_U16_F32)
3962  NODE_NAME_CASE(CVT_PK_I16_I32)
3963  NODE_NAME_CASE(CVT_PK_U16_U32)
3964  NODE_NAME_CASE(FP_TO_FP16)
3965  NODE_NAME_CASE(FP16_ZEXT)
3966  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
3967  NODE_NAME_CASE(CONST_DATA_PTR)
3968  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
3969  NODE_NAME_CASE(KILL)
3970  NODE_NAME_CASE(DUMMY_CHAIN)
3971  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
3972  NODE_NAME_CASE(INIT_EXEC)
3973  NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
3974  NODE_NAME_CASE(SENDMSG)
3975  NODE_NAME_CASE(SENDMSGHALT)
3976  NODE_NAME_CASE(INTERP_MOV)
3977  NODE_NAME_CASE(INTERP_P1)
3978  NODE_NAME_CASE(INTERP_P2)
3979  NODE_NAME_CASE(STORE_MSKOR)
3980  NODE_NAME_CASE(LOAD_CONSTANT)
3981  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
3982  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
3983  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
3984  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
3985  NODE_NAME_CASE(ATOMIC_INC)
3986  NODE_NAME_CASE(ATOMIC_DEC)
3987  NODE_NAME_CASE(BUFFER_LOAD)
3988  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
3989  NODE_NAME_CASE(BUFFER_STORE)
3990  NODE_NAME_CASE(BUFFER_STORE_FORMAT)
3991  NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
3992  NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
3993  NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
3994  NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
3995  NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
3996  NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
3997  NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
3998  NODE_NAME_CASE(BUFFER_ATOMIC_AND)
3999  NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4000  NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4001  NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4002  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4003  }
4004  return nullptr;
4005}
4006
4007SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4008                                              SelectionDAG &DAG, int Enabled,
4009                                              int &RefinementSteps,
4010                                              bool &UseOneConstNR,
4011                                              bool Reciprocal) const {
4012  EVT VT = Operand.getValueType();
4013
4014  if (VT == MVT::f32) {
4015    RefinementSteps = 0;
4016    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4017  }
4018
4019  // TODO: There is also f64 rsq instruction, but the documentation is less
4020  // clear on its precision.
4021
4022  return SDValue();
4023}
4024
4025SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4026                                               SelectionDAG &DAG, int Enabled,
4027                                               int &RefinementSteps) const {
4028  EVT VT = Operand.getValueType();
4029
4030  if (VT == MVT::f32) {
4031    // Reciprocal, < 1 ulp error.
4032    //
4033    // This reciprocal approximation converges to < 0.5 ulp error with one
4034    // newton rhapson performed with two fused multiple adds (FMAs).
4035
4036    RefinementSteps = 0;
4037    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4038  }
4039
4040  // TODO: There is also f64 rcp instruction, but the documentation is less
4041  // clear on its precision.
4042
4043  return SDValue();
4044}
4045
4046void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4047    const SDValue Op, KnownBits &Known,
4048    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4049
4050  Known.resetAll(); // Don't know anything.
4051
4052  unsigned Opc = Op.getOpcode();
4053
4054  switch (Opc) {
4055  default:
4056    break;
4057  case AMDGPUISD::CARRY:
4058  case AMDGPUISD::BORROW: {
4059    Known.Zero = APInt::getHighBitsSet(32, 31);
4060    break;
4061  }
4062
4063  case AMDGPUISD::BFE_I32:
4064  case AMDGPUISD::BFE_U32: {
4065    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4066    if (!CWidth)
4067      return;
4068
4069    uint32_t Width = CWidth->getZExtValue() & 0x1f;
4070
4071    if (Opc == AMDGPUISD::BFE_U32)
4072      Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4073
4074    break;
4075  }
4076  case AMDGPUISD::FP_TO_FP16:
4077  case AMDGPUISD::FP16_ZEXT: {
4078    unsigned BitWidth = Known.getBitWidth();
4079
4080    // High bits are zero.
4081    Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4082    break;
4083  }
4084  case AMDGPUISD::MUL_U24:
4085  case AMDGPUISD::MUL_I24: {
4086    KnownBits LHSKnown, RHSKnown;
4087    DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
4088    DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
4089
4090    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4091                      RHSKnown.countMinTrailingZeros();
4092    Known.Zero.setLowBits(std::min(TrailZ, 32u));
4093
4094    unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u);
4095    unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u);
4096    unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4097    if (MaxValBits >= 32)
4098      break;
4099    bool Negative = false;
4100    if (Opc == AMDGPUISD::MUL_I24) {
4101      bool LHSNegative = !!(LHSKnown.One  & (1 << 23));
4102      bool LHSPositive = !!(LHSKnown.Zero & (1 << 23));
4103      bool RHSNegative = !!(RHSKnown.One  & (1 << 23));
4104      bool RHSPositive = !!(RHSKnown.Zero & (1 << 23));
4105      if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
4106        break;
4107      Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
4108    }
4109    if (Negative)
4110      Known.One.setHighBits(32 - MaxValBits);
4111    else
4112      Known.Zero.setHighBits(32 - MaxValBits);
4113    break;
4114  }
4115  case ISD::INTRINSIC_WO_CHAIN: {
4116    unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4117    switch (IID) {
4118    case Intrinsic::amdgcn_mbcnt_lo:
4119    case Intrinsic::amdgcn_mbcnt_hi: {
4120      // These return at most the wavefront size - 1.
4121      unsigned Size = Op.getValueType().getSizeInBits();
4122      Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
4123      break;
4124    }
4125    default:
4126      break;
4127    }
4128  }
4129  }
4130}
4131
4132unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4133    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4134    unsigned Depth) const {
4135  switch (Op.getOpcode()) {
4136  case AMDGPUISD::BFE_I32: {
4137    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4138    if (!Width)
4139      return 1;
4140
4141    unsigned SignBits = 32 - Width->getZExtValue() + 1;
4142    if (!isNullConstant(Op.getOperand(1)))
4143      return SignBits;
4144
4145    // TODO: Could probably figure something out with non-0 offsets.
4146    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4147    return std::max(SignBits, Op0SignBits);
4148  }
4149
4150  case AMDGPUISD::BFE_U32: {
4151    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4152    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4153  }
4154
4155  case AMDGPUISD::CARRY:
4156  case AMDGPUISD::BORROW:
4157    return 31;
4158  case AMDGPUISD::FP_TO_FP16:
4159  case AMDGPUISD::FP16_ZEXT:
4160    return 16;
4161  default:
4162    return 1;
4163  }
4164}
4165