// AMDGPUISelLowering.cpp revision 309124
//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//
1554263Sshin
1654263Sshin#include "AMDGPUISelLowering.h"
1754263Sshin#include "AMDGPU.h"
1854263Sshin#include "AMDGPUFrameLowering.h"
1954263Sshin#include "AMDGPUIntrinsicInfo.h"
2054263Sshin#include "AMDGPURegisterInfo.h"
2154263Sshin#include "AMDGPUSubtarget.h"
2254263Sshin#include "R600MachineFunctionInfo.h"
2354263Sshin#include "SIMachineFunctionInfo.h"
2454263Sshin#include "llvm/CodeGen/CallingConvLower.h"
2554263Sshin#include "llvm/CodeGen/MachineFunction.h"
2654263Sshin#include "llvm/CodeGen/MachineRegisterInfo.h"
2754263Sshin#include "llvm/CodeGen/SelectionDAG.h"
2854263Sshin#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
2954263Sshin#include "llvm/IR/DataLayout.h"
3054263Sshin#include "llvm/IR/DiagnosticInfo.h"
3154263Sshin#include "SIInstrInfo.h"
3254263Sshinusing namespace llvm;
3354263Sshin
3454263Sshinstatic bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
35101739Srwatson                            CCValAssign::LocInfo LocInfo,
3654263Sshin                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
3754263Sshin  MachineFunction &MF = State.getMachineFunction();
3854263Sshin  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
3954263Sshin
4054263Sshin  uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(),
4154263Sshin                                         ArgFlags.getOrigAlign());
42129880Sphk  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
4354263Sshin  return true;
4454263Sshin}
4554263Sshin
4654263Sshin#include "AMDGPUGenCallingConv.inc"
4791270Sbrooks
4854263Sshin// Find a larger type to do a load / store of a vector with.
49178888SjulianEVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
5062587Sitojun  unsigned StoreSize = VT.getStoreSizeInBits();
5179106Sbrooks  if (StoreSize <= 32)
52181803Sbz    return EVT::getIntegerVT(Ctx, StoreSize);
5354263Sshin
5454263Sshin  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
5554263Sshin  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
56130933Sbrooks}
5754263Sshin
5854263SshinEVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) {
5954263Sshin  unsigned StoreSize = VT.getStoreSizeInBits();
6054263Sshin  if (StoreSize <= 32)
6154263Sshin    return EVT::getIntegerVT(Ctx, StoreSize);
6254263Sshin
6354263Sshin  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
6478064Sume}
6578064Sume
6654263SshinAMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
6754263Sshin                                           const AMDGPUSubtarget &STI)
6879106Sbrooks    : TargetLowering(TM), Subtarget(&STI) {
6954263Sshin  // Lower floating point store/load to integer store/load to reduce the number
7054263Sshin  // of patterns in tablegen.
7154263Sshin  setOperationAction(ISD::LOAD, MVT::f32, Promote);
7254263Sshin  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
7354263Sshin
7454263Sshin  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
7554263Sshin  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
7654263Sshin
7754263Sshin  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
78148385Sume  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
7954263Sshin
8062587Sitojun  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
8154263Sshin  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
8254263Sshin
8362587Sitojun  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
84153621Sthompsa  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
85153621Sthompsa
8654263Sshin  setOperationAction(ISD::LOAD, MVT::i64, Promote);
8754263Sshin  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
88163606Srwatson
89163606Srwatson  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
9079106Sbrooks  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
9162587Sitojun
92127305Srwatson  setOperationAction(ISD::LOAD, MVT::f64, Promote);
93127898Sru  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
94127305Srwatson
95127305Srwatson  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
9679106Sbrooks  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
97185088Szec
98185895Szec  // There are no 64-bit extloads. These should be done as a 32-bit extload and
99185895Szec  // an extension to 64-bit.
100185895Szec  for (MVT VT : MVT::integer_valuetypes()) {
101185895Szec    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
102185895Szec    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
103185895Szec    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
104185088Szec  }
10589065Smsmith
106185088Szec  for (MVT VT : MVT::integer_valuetypes()) {
107185088Szec    if (VT == MVT::i64)
108185088Szec      continue;
109185088Szec
110185088Szec    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
111185088Szec    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
112185088Szec    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
113185088Szec    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
114185088Szec
11579106Sbrooks    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
11683998Sbrooks    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
11783998Sbrooks    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
11883998Sbrooks    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
11983998Sbrooks
12083998Sbrooks    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
121153621Sthompsa    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
122160195Ssam    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
123128209Sbrooks    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
12479106Sbrooks  }
125130933Sbrooks
12679106Sbrooks  for (MVT VT : MVT::integer_vector_valuetypes()) {
12792725Salfred    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
12879106Sbrooks    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
12991270Sbrooks    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
13091270Sbrooks    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
13191270Sbrooks    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
13262587Sitojun    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
13362587Sitojun    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
13491270Sbrooks    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
13562587Sitojun    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
13662587Sitojun    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
13762587Sitojun    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
13895023Ssuz    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
13962587Sitojun  }
14062587Sitojun
14162587Sitojun  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
14262587Sitojun  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
143183550Szec  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
144183550Szec  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
14562587Sitojun
146183550Szec  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
147183550Szec  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
148183550Szec  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
149183550Szec  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
150183550Szec
151183550Szec  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
15291270Sbrooks  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
15391270Sbrooks  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
15491270Sbrooks  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
15591270Sbrooks
15691270Sbrooks  setOperationAction(ISD::STORE, MVT::f32, Promote);
157183550Szec  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
158183550Szec
15991270Sbrooks  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
160176879Sthompsa  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
161176879Sthompsa
162176879Sthompsa  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
163176879Sthompsa  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
164176879Sthompsa
165176879Sthompsa  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
166176879Sthompsa  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
167176879Sthompsa
168128209Sbrooks  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
169160195Ssam  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
17079106Sbrooks
17192081Smux  setOperationAction(ISD::STORE, MVT::i64, Promote);
172160195Ssam  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
17354263Sshin
174183550Szec  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
17578064Sume  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
17654263Sshin
177131672Sbms  setOperationAction(ISD::STORE, MVT::f64, Promote);
178178888Sjulian  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
179147256Sbrooks
180147256Sbrooks  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
181147256Sbrooks  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
182147256Sbrooks
183147256Sbrooks  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
18479106Sbrooks  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
185155037Sglebius
186155037Sglebius  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
187147256Sbrooks  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
188147256Sbrooks
18979106Sbrooks  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
19079106Sbrooks  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
19162587Sitojun  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
192147256Sbrooks
193147256Sbrooks  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
194147256Sbrooks  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
19578064Sume  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
19679106Sbrooks  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
197147256Sbrooks
19878064Sume  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
199147256Sbrooks  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
200153621Sthompsa  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
201147256Sbrooks  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
202147256Sbrooks
203147256Sbrooks  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
204147611Sdwmalone  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
20583998Sbrooks  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
206147256Sbrooks  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
207155037Sglebius
208155037Sglebius  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
209181803Sbz  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
210155037Sglebius
211155037Sglebius  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
212155037Sglebius  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
21379106Sbrooks
21479106Sbrooks  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
215127305Srwatson  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
216151266Sthompsa
217151266Sthompsa  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
21879106Sbrooks  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
219184678Sbz
22079106Sbrooks
221184678Sbz  setOperationAction(ISD::Constant, MVT::i32, Legal);
222151266Sthompsa  setOperationAction(ISD::Constant, MVT::i64, Legal);
22379106Sbrooks  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
224151266Sthompsa  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
225151266Sthompsa
226151266Sthompsa  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
227151266Sthompsa  setOperationAction(ISD::BRIND, MVT::Other, Expand);
228127305Srwatson
229105293Sume  // This is totally unsupported, just custom lower to produce an error.
230105293Sume  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
231105293Sume
232105293Sume  // We need to custom lower some of the intrinsics
233105293Sume  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
234105293Sume  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
235105293Sume
23679106Sbrooks  // Library functions.  These default to Expand, but we have instructions
23779106Sbrooks  // for them.
23879106Sbrooks  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
23979106Sbrooks  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
240105293Sume  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
24179106Sbrooks  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
24283998Sbrooks  setOperationAction(ISD::FABS,   MVT::f32, Legal);
24383998Sbrooks  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
24479106Sbrooks  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
24579106Sbrooks  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
246147256Sbrooks  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
24779106Sbrooks  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
248155037Sglebius
249155037Sglebius  setOperationAction(ISD::FROUND, MVT::f32, Custom);
25079106Sbrooks  setOperationAction(ISD::FROUND, MVT::f64, Custom);
25179106Sbrooks
25279106Sbrooks  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
25379106Sbrooks  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
25479106Sbrooks
25579106Sbrooks  setOperationAction(ISD::FREM, MVT::f32, Custom);
25679106Sbrooks  setOperationAction(ISD::FREM, MVT::f64, Custom);
25779106Sbrooks
25879106Sbrooks  // v_mad_f32 does not support denormals according to some sources.
25979106Sbrooks  if (!Subtarget->hasFP32Denormals())
26079106Sbrooks    setOperationAction(ISD::FMAD, MVT::f32, Legal);
26179106Sbrooks
262127305Srwatson  // Expand to fneg + fadd.
263185088Szec  setOperationAction(ISD::FSUB, MVT::f64, Expand);
264181803Sbz
265185088Szec  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
266185088Szec  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
267185088Szec  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
268185088Szec  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
269185088Szec  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
270185088Szec  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
271185088Szec  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
272185088Szec  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
273185088Szec  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
27479106Sbrooks  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
275181803Sbz
27662587Sitojun  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
277185088Szec    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
27879106Sbrooks    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
27979106Sbrooks    setOperationAction(ISD::FRINT, MVT::f64, Custom);
28079106Sbrooks    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
28179106Sbrooks  }
282127305Srwatson
28379106Sbrooks  if (!Subtarget->hasBFI()) {
284181803Sbz    // fcopysign can be done in a single instruction with BFI.
28562587Sitojun    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
28679106Sbrooks    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
287132199Sphk  }
288132199Sphk
28954263Sshin  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
29079106Sbrooks
29154263Sshin  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
29254263Sshin  for (MVT VT : ScalarIntVTs) {
29379106Sbrooks    // These should use [SU]DIVREM, so set them to expand
29479106Sbrooks    setOperationAction(ISD::SDIV, VT, Expand);
29579106Sbrooks    setOperationAction(ISD::UDIV, VT, Expand);
29679106Sbrooks    setOperationAction(ISD::SREM, VT, Expand);
29779106Sbrooks    setOperationAction(ISD::UREM, VT, Expand);
29854263Sshin
29979106Sbrooks    // GPU does not have divrem function for signed or unsigned.
30083997Sbrooks    setOperationAction(ISD::SDIVREM, VT, Custom);
30179106Sbrooks    setOperationAction(ISD::UDIVREM, VT, Custom);
302105293Sume
30362587Sitojun    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
30462587Sitojun    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
30562587Sitojun    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
30662587Sitojun
30762587Sitojun    setOperationAction(ISD::BSWAP, VT, Expand);
30862587Sitojun    setOperationAction(ISD::CTTZ, VT, Expand);
30962587Sitojun    setOperationAction(ISD::CTLZ, VT, Expand);
31062587Sitojun  }
31162587Sitojun
31262587Sitojun  if (!Subtarget->hasBCNT(32))
31362587Sitojun    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
31462587Sitojun
31562587Sitojun  if (!Subtarget->hasBCNT(64))
316147256Sbrooks    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
31762587Sitojun
31862587Sitojun  // The hardware supports 32-bit ROTR, but not ROTL.
31962587Sitojun  setOperationAction(ISD::ROTL, MVT::i32, Expand);
32062587Sitojun  setOperationAction(ISD::ROTL, MVT::i64, Expand);
32162587Sitojun  setOperationAction(ISD::ROTR, MVT::i64, Expand);
32262587Sitojun
32362587Sitojun  setOperationAction(ISD::MUL, MVT::i64, Expand);
32462587Sitojun  setOperationAction(ISD::MULHU, MVT::i64, Expand);
32562587Sitojun  setOperationAction(ISD::MULHS, MVT::i64, Expand);
32662587Sitojun  setOperationAction(ISD::UDIV, MVT::i32, Expand);
32762587Sitojun  setOperationAction(ISD::UREM, MVT::i32, Expand);
32862587Sitojun  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
32962587Sitojun  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
33062587Sitojun  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
33162587Sitojun  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
332153621Sthompsa  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
333153621Sthompsa
334153621Sthompsa  setOperationAction(ISD::SMIN, MVT::i32, Legal);
33562587Sitojun  setOperationAction(ISD::UMIN, MVT::i32, Legal);
33662587Sitojun  setOperationAction(ISD::SMAX, MVT::i32, Legal);
33762587Sitojun  setOperationAction(ISD::UMAX, MVT::i32, Legal);
33862587Sitojun
339105339Sume  if (Subtarget->hasFFBH())
340105339Sume    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
341105339Sume
342105339Sume  if (Subtarget->hasFFBL())
34391327Sbrooks    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
34462587Sitojun
34562587Sitojun  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
34662587Sitojun  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
34762587Sitojun
34862587Sitojun  // We only really have 32-bit BFE instructions (and 16-bit on VI).
34962587Sitojun  //
35062587Sitojun  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
35162587Sitojun  // effort to match them now. We want this to be false for i64 cases when the
35262587Sitojun  // extraction isn't restricted to the upper or lower half. Ideally we would
35362587Sitojun  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
35462587Sitojun  // span the midpoint are probably relatively rare, so don't worry about them
355105293Sume  // for now.
356105293Sume  if (Subtarget->hasBFE())
35762587Sitojun    setHasExtractBitsInsn(true);
35862587Sitojun
35962587Sitojun  static const MVT::SimpleValueType VectorIntTypes[] = {
36062587Sitojun    MVT::v2i32, MVT::v4i32
36162587Sitojun  };
36262587Sitojun
36362587Sitojun  for (MVT VT : VectorIntTypes) {
36462587Sitojun    // Expand the following operations for the current type by default.
36562587Sitojun    setOperationAction(ISD::ADD,  VT, Expand);
36662587Sitojun    setOperationAction(ISD::AND,  VT, Expand);
367153621Sthompsa    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
368153621Sthompsa    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
369153621Sthompsa    setOperationAction(ISD::MUL,  VT, Expand);
370153621Sthompsa    setOperationAction(ISD::OR,   VT, Expand);
371153621Sthompsa    setOperationAction(ISD::SHL,  VT, Expand);
372153621Sthompsa    setOperationAction(ISD::SRA,  VT, Expand);
373153621Sthompsa    setOperationAction(ISD::SRL,  VT, Expand);
374153621Sthompsa    setOperationAction(ISD::ROTL, VT, Expand);
375153621Sthompsa    setOperationAction(ISD::ROTR, VT, Expand);
376153621Sthompsa    setOperationAction(ISD::SUB,  VT, Expand);
377153621Sthompsa    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
378153621Sthompsa    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
379153621Sthompsa    setOperationAction(ISD::SDIV, VT, Expand);
380153621Sthompsa    setOperationAction(ISD::UDIV, VT, Expand);
381153621Sthompsa    setOperationAction(ISD::SREM, VT, Expand);
382153621Sthompsa    setOperationAction(ISD::UREM, VT, Expand);
383153621Sthompsa    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
384153621Sthompsa    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
385153621Sthompsa    setOperationAction(ISD::SDIVREM, VT, Custom);
386153621Sthompsa    setOperationAction(ISD::UDIVREM, VT, Expand);
387153621Sthompsa    setOperationAction(ISD::ADDC, VT, Expand);
388153621Sthompsa    setOperationAction(ISD::SUBC, VT, Expand);
38954263Sshin    setOperationAction(ISD::ADDE, VT, Expand);
39054263Sshin    setOperationAction(ISD::SUBE, VT, Expand);
39154263Sshin    setOperationAction(ISD::SELECT, VT, Expand);
39254263Sshin    setOperationAction(ISD::VSELECT, VT, Expand);
39354263Sshin    setOperationAction(ISD::SELECT_CC, VT, Expand);
39454263Sshin    setOperationAction(ISD::XOR,  VT, Expand);
39554263Sshin    setOperationAction(ISD::BSWAP, VT, Expand);
396183550Szec    setOperationAction(ISD::CTPOP, VT, Expand);
397147256Sbrooks    setOperationAction(ISD::CTTZ, VT, Expand);
398127898Sru    setOperationAction(ISD::CTLZ, VT, Expand);
39954263Sshin    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
400127898Sru  }
401147611Sdwmalone
40254263Sshin  static const MVT::SimpleValueType FloatVectorTypes[] = {
403101182Srwatson    MVT::v2f32, MVT::v4f32
404172930Srwatson  };
405101739Srwatson
406101739Srwatson  for (MVT VT : FloatVectorTypes) {
407101739Srwatson    setOperationAction(ISD::FABS, VT, Expand);
408101739Srwatson    setOperationAction(ISD::FMINNUM, VT, Expand);
409101182Srwatson    setOperationAction(ISD::FMAXNUM, VT, Expand);
410101182Srwatson    setOperationAction(ISD::FADD, VT, Expand);
41154263Sshin    setOperationAction(ISD::FCEIL, VT, Expand);
41254263Sshin    setOperationAction(ISD::FCOS, VT, Expand);
413127898Sru    setOperationAction(ISD::FDIV, VT, Expand);
414127898Sru    setOperationAction(ISD::FEXP2, VT, Expand);
415127898Sru    setOperationAction(ISD::FLOG2, VT, Expand);
41654263Sshin    setOperationAction(ISD::FREM, VT, Expand);
41754263Sshin    setOperationAction(ISD::FPOW, VT, Expand);
418127898Sru    setOperationAction(ISD::FFLOOR, VT, Expand);
419127898Sru    setOperationAction(ISD::FTRUNC, VT, Expand);
420127898Sru    setOperationAction(ISD::FMUL, VT, Expand);
421127898Sru    setOperationAction(ISD::FMA, VT, Expand);
422127898Sru    setOperationAction(ISD::FRINT, VT, Expand);
423127898Sru    setOperationAction(ISD::FNEARBYINT, VT, Expand);
424127898Sru    setOperationAction(ISD::FSQRT, VT, Expand);
425127898Sru    setOperationAction(ISD::FSIN, VT, Expand);
426127898Sru    setOperationAction(ISD::FSUB, VT, Expand);
427127898Sru    setOperationAction(ISD::FNEG, VT, Expand);
428127898Sru    setOperationAction(ISD::VSELECT, VT, Expand);
429127898Sru    setOperationAction(ISD::SELECT_CC, VT, Expand);
430127898Sru    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
431127898Sru    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
432181803Sbz  }
43354263Sshin
43454263Sshin  // This causes using an unrolled select operation rather than expansion with
435127303Srwatson  // bit operations. This is in general better, but the alternative using BFI
43654263Sshin  // instructions may be better if the select sources are SGPRs.
43754263Sshin  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
43854263Sshin  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
43954263Sshin
440127898Sru  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
441127898Sru  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
442127898Sru
443127898Sru  setBooleanContents(ZeroOrNegativeOneBooleanContent);
444127898Sru  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
445127898Sru
446127898Sru  setSchedulingPreference(Sched::RegPressure);
447127898Sru  setJumpIsExpensive(true);
448127898Sru
44962587Sitojun  // SI at least has hardware support for floating point exceptions, but no way
45054263Sshin  // of using or handling them is implemented. They are also optional in OpenCL
451155037Sglebius  // (Section 7.3)
452155037Sglebius  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
453155037Sglebius
45454263Sshin  setSelectIsExpensive(false);
45554263Sshin  PredictableSelectIsExpensive = false;
456159174Sglebius
45754263Sshin  setFsqrtIsCheap(true);
45854263Sshin
45954263Sshin  // We want to find all load dependencies for long chains of stores to enable
46054263Sshin  // merging into very wide vectors. The problem is with vectors with > 4
46154263Sshin  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
462147611Sdwmalone  // vectors are a legal type, even though we have to split the loads
463147611Sdwmalone  // usually. When we can more precisely specify load legality per address
464147611Sdwmalone  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
465147611Sdwmalone  // smarter so that they can figure out what to do in 2 iterations without all
466147611Sdwmalone  // N > 4 stores on the same chain.
467147611Sdwmalone  GatherAllAliasesMaxDepth = 16;
468153621Sthompsa
469159180Scsjp  // FIXME: Need to really handle these.
47062587Sitojun  MaxStoresPerMemcpy  = 4096;
47154263Sshin  MaxStoresPerMemmove = 4096;
47254263Sshin  MaxStoresPerMemset  = 4096;
473153621Sthompsa
474153621Sthompsa  setTargetDAGCombine(ISD::BITCAST);
475153621Sthompsa  setTargetDAGCombine(ISD::AND);
476153621Sthompsa  setTargetDAGCombine(ISD::SHL);
477178888Sjulian  setTargetDAGCombine(ISD::SRA);
47878064Sume  setTargetDAGCombine(ISD::SRL);
47978064Sume  setTargetDAGCombine(ISD::MUL);
48062587Sitojun  setTargetDAGCombine(ISD::SELECT);
48162587Sitojun  setTargetDAGCombine(ISD::SELECT_CC);
48278064Sume  setTargetDAGCombine(ISD::STORE);
48354263Sshin  setTargetDAGCombine(ISD::FADD);
48454263Sshin  setTargetDAGCombine(ISD::FSUB);
48554263Sshin}
486153621Sthompsa
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
49054263Sshin
491153621SthompsaMVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
49254263Sshin  return MVT::i32;
49354263Sshin}
49454263Sshin
49562587Sitojunbool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
49654263Sshin  return true;
49754263Sshin}
49854263Sshin
499159174Sglebius// The backend supports 32 and 64 bit floating point immediates.
50054263Sshin// FIXME: Why are we reporting vectors of FP immediates as legal?
50178064Sumebool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
50278064Sume  EVT ScalarVT = VT.getScalarType();
503155037Sglebius  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
50454263Sshin}
50554263Sshin
50654263Sshin// We don't want to shrink f64 / f32 constants.
507105338Sumebool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
50854263Sshin  EVT ScalarVT = VT.getScalarType();
50954263Sshin  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
510105338Sume}
51154263Sshin
512153621Sthompsabool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
513153621Sthompsa                                                 ISD::LoadExtType,
514176879Sthompsa                                                 EVT NewVT) const {
515176879Sthompsa
51654263Sshin  unsigned NewSize = NewVT.getStoreSizeInBits();
517105338Sume
51854263Sshin  // If we are reducing to a 32-bit load, this is always better.
51954263Sshin  if (NewSize == 32)
52054263Sshin    return true;
52154263Sshin
52254263Sshin  EVT OldVT = N->getValueType(0);
523105338Sume  unsigned OldSize = OldVT.getStoreSizeInBits();
524101182Srwatson
525101182Srwatson  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
526172930Srwatson  // extloads, so doing one requires using a buffer_load. In cases where we
527101182Srwatson  // still couldn't use a scalar load, using the wider load shouldn't really
528101182Srwatson  // hurt anything.
529159180Scsjp
53078064Sume  // If the old size already had to be an extload, there's no harm in continuing
531123922Ssam  // to reduce the width.
53254263Sshin  return (OldSize < 32);
53354263Sshin}
53483998Sbrooks
535105338Sumebool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
53683998Sbrooks                                                   EVT CastTy) const {
53783998Sbrooks
53883998Sbrooks  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
53983998Sbrooks
54054263Sshin  if (LoadTy.getScalarType() == MVT::i32)
54154263Sshin    return false;
54254263Sshin
54354263Sshin  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
54495023Ssuz  unsigned CastScalarSize = CastTy.getScalarSizeInBits();
54554263Sshin
54695023Ssuz  return (LScalarSize < CastScalarSize) ||
54754263Sshin         (CastScalarSize >= 32);
54895023Ssuz}
54995023Ssuz
55054263Sshin// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
55154263Sshin// profitable with the expansion for 64-bit since it's generally good to
55254263Sshin// speculate things.
55354263Sshin// FIXME: These should really have the size as a parameter.
55454263Sshinbool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
55554263Sshin  return true;
55654263Sshin}
55754263Sshin
55854263Sshinbool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
55954263Sshin  return true;
56054263Sshin}
56154263Sshin
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
565153621Sthompsa
566153621Sthompsabool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
567153621Sthompsa  assert(VT.isFloatingPoint());
568153621Sthompsa  return VT == MVT::f32 || VT == MVT::f64;
569153621Sthompsa}
570153621Sthompsa
571153621Sthompsabool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
572153621Sthompsa  assert(VT.isFloatingPoint());
573153621Sthompsa  return VT == MVT::f32 || VT == MVT::f64;
574153621Sthompsa}
575153621Sthompsa
576153621Sthompsabool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
577153621Sthompsa                                                         unsigned NumElem,
578153621Sthompsa                                                         unsigned AS) const {
579153621Sthompsa  return true;
580153621Sthompsa}
581153621Sthompsa
582153621Sthompsabool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
583153621Sthompsa  // There are few operations which truly have vector input operands. Any vector
584176879Sthompsa  // operation is going to involve operations on each component, and a
585176879Sthompsa  // build_vector will be a copy per element, so it always makes sense to use a
586176879Sthompsa  // build_vector input in place of the extracted element to avoid a copy into a
587176879Sthompsa  // super register.
588176879Sthompsa  //
589176879Sthompsa  // We should probably only do this if all users are extracts only, but this
590176879Sthompsa  // should be the common case.
591176879Sthompsa  return true;
592176879Sthompsa}
593176879Sthompsa
594153621Sthompsabool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
595176879Sthompsa  // Truncate is just accessing a subregister.
596176879Sthompsa  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
597176879Sthompsa}
598176879Sthompsa
599176879Sthompsabool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
600176879Sthompsa  // Truncate is just accessing a subregister.
601176879Sthompsa  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
602176879Sthompsa         (Dest->getPrimitiveSizeInBits() % 32 == 0);
603176879Sthompsa}
604176879Sthompsa
605153621Sthompsabool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
606153621Sthompsa  unsigned SrcSize = Src->getScalarSizeInBits();
607153621Sthompsa  unsigned DestSize = Dest->getScalarSizeInBits();
608153621Sthompsa
60954263Sshin  return SrcSize == 32 && DestSize == 64;
61083998Sbrooks}
611105338Sume
61283998Sbrooksbool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
61383998Sbrooks  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
61454263Sshin  // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
61554263Sshin  // this will enable reducing 64-bit operations the 32-bit, which is always
61654263Sshin  // good.
617105338Sume  return Src == MVT::i32 && Dest == MVT::i64;
618105338Sume}
619111888Sjlemon
62054263Sshinbool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
62154263Sshin  return isZExtFree(Val.getValueType(), VT2);
62262587Sitojun}
62354263Sshin
62454263Sshinbool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
62554263Sshin  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
62654263Sshin  // limited number of native 64-bit operations. Shrinking an operation to fit
62754263Sshin  // in a single 32-bit register should always be helpful. As currently used,
62854263Sshin  // this is much less general than the name suggests, and is only used in
629147256Sbrooks  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
63054263Sshin  // not profitable, and may actually be harmful.
63154263Sshin  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
63262587Sitojun}
633105339Sume
634105339Sume//===---------------------------------------------------------------------===//
635105339Sume// TargetLowering Callbacks
636105339Sume//===---------------------------------------------------------------------===//
63754263Sshin
63854263Sshinvoid AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
639105293Sume                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
64054263Sshin
64162587Sitojun  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
64254263Sshin}
64354263Sshin
64454263Sshinvoid AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
64554263Sshin                           const SmallVectorImpl<ISD::OutputArg> &Outs) const {
64654263Sshin
64754263Sshin  State.AnalyzeReturn(Outs, RetCC_SI);
64854263Sshin}
64962587Sitojun
65054263SshinSDValue
65154263SshinAMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
65262587Sitojun                                  bool isVarArg,
65354263Sshin                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
654105339Sume                                  const SmallVectorImpl<SDValue> &OutVals,
655105339Sume                                  const SDLoc &DL, SelectionDAG &DAG) const {
656105339Sume  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
657105339Sume}
65854263Sshin
65962587Sitojun//===---------------------------------------------------------------------===//
66054263Sshin// Target specific lowering
661105339Sume//===---------------------------------------------------------------------===//
66254263Sshin
663105339SumeSDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
66454263Sshin                                        SmallVectorImpl<SDValue> &InVals) const {
66554263Sshin  SDValue Callee = CLI.Callee;
66654263Sshin  SelectionDAG &DAG = CLI.DAG;
66778064Sume
66862587Sitojun  const Function &Fn = *DAG.getMachineFunction().getFunction();
66978064Sume
67062587Sitojun  StringRef FuncName("<unknown>");
67154263Sshin
67254263Sshin  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
67354263Sshin    FuncName = G->getSymbol();
67454263Sshin  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
67562587Sitojun    FuncName = G->getGlobal()->getName();
67678064Sume
67762587Sitojun  DiagnosticInfoUnsupported NoCalls(
67862587Sitojun      Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
67962587Sitojun  DAG.getContext()->diagnose(NoCalls);
68062587Sitojun
68162587Sitojun  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
68262587Sitojun    InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
68362587Sitojun
68462587Sitojun  return DAG.getEntryNode();
68578064Sume}
68678064Sume
68778064SumeSDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
68878064Sume                                                      SelectionDAG &DAG) const {
68978064Sume  const Function &Fn = *DAG.getMachineFunction().getFunction();
690105293Sume
69191327Sbrooks  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
692105293Sume                                            SDLoc(Op).getDebugLoc());
69362587Sitojun  DAG.getContext()->diagnose(NoDynamicAlloca);
69454263Sshin  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
69578064Sume  return DAG.getMergeValues(Ops, SDLoc());
69678064Sume}
69778064Sume
69878064SumeSDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
69978064Sume                                             SelectionDAG &DAG) const {
70078064Sume  switch (Op.getOpcode()) {
70178064Sume  default:
70278064Sume    Op->dump(&DAG);
70378064Sume    llvm_unreachable("Custom lowering code for this"
70478064Sume                     "instruction is not implemented yet!");
70578064Sume    break;
70678064Sume  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
70778064Sume  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
70878064Sume  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
70978064Sume  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
71078064Sume  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
71178064Sume  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
71278064Sume  case ISD::FREM: return LowerFREM(Op, DAG);
71378064Sume  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
71478064Sume  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
71578064Sume  case ISD::FRINT: return LowerFRINT(Op, DAG);
71678064Sume  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
71778064Sume  case ISD::FROUND: return LowerFROUND(Op, DAG);
71878064Sume  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
71978064Sume  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
72078064Sume  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
72178064Sume  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
72278064Sume  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
72378064Sume  case ISD::CTLZ:
72478064Sume  case ISD::CTLZ_ZERO_UNDEF:
72578064Sume    return LowerCTLZ(Op, DAG);
72678064Sume  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
72778064Sume  }
72878064Sume  return Op;
72978064Sume}
73078064Sume
73178064Sumevoid AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
73278064Sume                                              SmallVectorImpl<SDValue> &Results,
73378064Sume                                              SelectionDAG &DAG) const {
73478064Sume  switch (N->getOpcode()) {
73578064Sume  case ISD::SIGN_EXTEND_INREG:
73678064Sume    // Different parts of legalization seem to interpret which type of
73778064Sume    // sign_extend_inreg is the one to check for custom lowering. The extended
73878064Sume    // from type is what really matters, but some places check for custom
73978064Sume    // lowering of the result type. This results in trying to use
74078064Sume    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
74178064Sume    // nothing here and let the illegal result integer be handled normally.
74278064Sume    return;
74378064Sume  default:
74478064Sume    return;
74578064Sume  }
74678064Sume}
74778064Sume
74878064Sume// FIXME: This implements accesses to initialized globals in the constant
74978064Sume// address space by copying them to private and accessing that. It does not
750147256Sbrooks// properly handle illegal types or vectors. The private vector loads are not
75162587Sitojun// scalarized, and the illegal scalars hit an assertion. This technique will not
75262587Sitojun// work well with large initializers, and this should eventually be
75362587Sitojun// removed. Initialized globals should be placed into a data section that the
75462587Sitojun// runtime will load into a buffer before the kernel is executed. Uses of the
755147256Sbrooks// global need to be replaced with a pointer loaded from an implicit kernel
75654263Sshin// argument into this buffer holding the copy of the data, which will remove the
75762587Sitojun// need for any of this.
75862587SitojunSDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
75954263Sshin                                                       const GlobalValue *GV,
76054263Sshin                                                       const SDValue &InitPtr,
76154263Sshin                                                       SDValue Chain,
76254263Sshin                                                       SelectionDAG &DAG) const {
76354263Sshin  const DataLayout &TD = DAG.getDataLayout();
76454263Sshin  SDLoc DL(InitPtr);
76554263Sshin  Type *InitTy = Init->getType();
76654263Sshin
76754263Sshin  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
76878064Sume    EVT VT = EVT::getEVT(InitTy);
76954263Sshin    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
77078064Sume    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
77154263Sshin                        MachinePointerInfo(UndefValue::get(PtrTy)),
77278064Sume                        TD.getPrefTypeAlignment(InitTy));
77354263Sshin  }
77454263Sshin
77554263Sshin  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
77678064Sume    EVT VT = EVT::getEVT(CFP->getType());
77754263Sshin    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
77854263Sshin    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
77978064Sume                        MachinePointerInfo(UndefValue::get(PtrTy)),
78054263Sshin                        TD.getPrefTypeAlignment(CFP->getType()));
78154263Sshin  }
78254263Sshin
78354263Sshin  if (StructType *ST = dyn_cast<StructType>(InitTy)) {
78454263Sshin    const StructLayout *SL = TD.getStructLayout(ST);
78554263Sshin
78678064Sume    EVT PtrVT = InitPtr.getValueType();
78778064Sume    SmallVector<SDValue, 8> Chains;
78878064Sume
789148385Sume    for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
790148385Sume      SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT);
791148385Sume      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
792148385Sume
793148385Sume      Constant *Elt = Init->getAggregateElement(I);
794148385Sume      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
795148385Sume    }
79654263Sshin
79762587Sitojun    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
79854263Sshin  }
79954263Sshin
80054263Sshin  if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
80154263Sshin    EVT PtrVT = InitPtr.getValueType();
80254263Sshin
80354263Sshin    unsigned NumElements;
80454263Sshin    if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
80554263Sshin      NumElements = AT->getNumElements();
80654263Sshin    else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
80778064Sume      NumElements = VT->getNumElements();
80854263Sshin    else
80978064Sume      llvm_unreachable("Unexpected type");
81054263Sshin
81178064Sume    unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());
81254263Sshin    SmallVector<SDValue, 8> Chains;
81354263Sshin    for (unsigned i = 0; i < NumElements; ++i) {
81454263Sshin      SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
81578064Sume      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
81654263Sshin
81754263Sshin      Constant *Elt = Init->getAggregateElement(i);
81878064Sume      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
81954263Sshin    }
82054263Sshin
82154263Sshin    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
82254263Sshin  }
82354263Sshin
82454263Sshin  if (isa<UndefValue>(Init)) {
82578064Sume    EVT VT = EVT::getEVT(InitTy);
82678064Sume    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
82778064Sume    return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
828148385Sume                        MachinePointerInfo(UndefValue::get(PtrTy)),
829148385Sume                        TD.getPrefTypeAlignment(InitTy));
830148385Sume  }
831148385Sume
832148385Sume  Init->dump();
833148385Sume  llvm_unreachable("Unhandled constant initializer");
834148385Sume}
83554263Sshin
83654263Sshinstatic bool hasDefinedInitializer(const GlobalValue *GV) {
83778064Sume  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
83878064Sume  if (!GVar || !GVar->hasInitializer())
83978064Sume    return false;
84078064Sume
84178064Sume  return !isa<UndefValue>(GVar->getInitializer());
84278064Sume}
84378064Sume
84478064SumeSDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
84578064Sume                                                 SDValue Op,
84678064Sume                                                 SelectionDAG &DAG) const {
84778064Sume
84878064Sume  const DataLayout &DL = DAG.getDataLayout();
84978064Sume  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
85078064Sume  const GlobalValue *GV = G->getGlobal();
85178064Sume
85278064Sume  switch (G->getAddressSpace()) {
85378064Sume  case AMDGPUAS::CONSTANT_ADDRESS: {
85478064Sume    MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
85578064Sume    SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT);
85678064Sume    return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA);
85778064Sume  }
85878064Sume  case AMDGPUAS::LOCAL_ADDRESS: {
85978064Sume    // XXX: What does the value of G->getOffset() mean?
86078064Sume    assert(G->getOffset() == 0 &&
86178064Sume         "Do not know what to do with an non-zero offset");
86254263Sshin
86362587Sitojun    // TODO: We could emit code to handle the initialization somewhere.
86454263Sshin    if (hasDefinedInitializer(GV))
86554263Sshin      break;
86654263Sshin
86754263Sshin    unsigned Offset;
86854263Sshin    if (MFI->LocalMemoryObjects.count(GV) == 0) {
86954263Sshin      unsigned Align = GV->getAlignment();
87054263Sshin      if (Align == 0)
87154263Sshin        Align = DL.getABITypeAlignment(GV->getValueType());
87254263Sshin
87379106Sbrooks      /// TODO: We should sort these to minimize wasted space due to alignment
874127305Srwatson      /// padding. Currently the padding is decided by the first encountered use
875127305Srwatson      /// during lowering.
876127305Srwatson      Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align);
877127305Srwatson      MFI->LocalMemoryObjects[GV] = Offset;
878127305Srwatson      MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType());
879127305Srwatson    } else {
880127305Srwatson      Offset = MFI->LocalMemoryObjects[GV];
881105293Sume    }
882105293Sume
883105293Sume    return DAG.getConstant(Offset, SDLoc(Op),
884105293Sume                           getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
885105293Sume  }
886105293Sume  }
887183550Szec
888147256Sbrooks  const Function &Fn = *DAG.getMachineFunction().getFunction();
889105293Sume  DiagnosticInfoUnsupported BadInit(
890105293Sume      Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
891105293Sume  DAG.getContext()->diagnose(BadInit);
892105293Sume  return SDValue();
893127305Srwatson}
894181803Sbz
895105293SumeSDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
896105293Sume                                                  SelectionDAG &DAG) const {
897105293Sume  SmallVector<SDValue, 8> Args;
898105293Sume
899105293Sume  for (const SDUse &U : Op->ops())
900105293Sume    DAG.ExtractVectorElements(U.get(), Args);
901105293Sume
902105293Sume  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
903105293Sume}
904105293Sume
905105293SumeSDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
906105293Sume                                                     SelectionDAG &DAG) const {
907105293Sume
908105293Sume  SmallVector<SDValue, 8> Args;
909181803Sbz  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
910105293Sume  EVT VT = Op.getValueType();
911105293Sume  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
912105293Sume                            VT.getVectorNumElements());
913127305Srwatson
914105293Sume  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
915105293Sume}
916105293Sume
917105293SumeSDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
918105293Sume    SelectionDAG &DAG) const {
919127305Srwatson  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
920105293Sume  SDLoc DL(Op);
921105293Sume  EVT VT = Op.getValueType();
922105293Sume
923105293Sume  switch (IntrinsicID) {
924105293Sume    default: return Op;
925105293Sume    case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
926105293Sume      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
927105293Sume                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
928105293Sume
929105293Sume    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
930105293Sume      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
931105293Sume                         Op.getOperand(1),
932105293Sume                         Op.getOperand(2),
933105293Sume                         Op.getOperand(3));
934105293Sume
935105293Sume    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
936105293Sume      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
937111119Simp                         Op.getOperand(1),
938105293Sume                         Op.getOperand(2),
939105293Sume                         Op.getOperand(3));
940105293Sume  }
941105293Sume}
942111119Simp
943105293Sume/// \brief Generate Min/Max node
944105293SumeSDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
945105293Sume                                                   SDValue LHS, SDValue RHS,
946105293Sume                                                   SDValue True, SDValue False,
947105293Sume                                                   SDValue CC,
948105293Sume                                                   DAGCombinerInfo &DCI) const {
949105293Sume  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
950105293Sume    return SDValue();
951105293Sume
952105293Sume  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
953105293Sume    return SDValue();
954148385Sume
955148385Sume  SelectionDAG &DAG = DCI.DAG;
956148385Sume  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
957148385Sume  switch (CCOpcode) {
958148385Sume  case ISD::SETOEQ:
959148385Sume  case ISD::SETONE:
960148385Sume  case ISD::SETUNE:
961148385Sume  case ISD::SETNE:
962148385Sume  case ISD::SETUEQ:
963148385Sume  case ISD::SETEQ:
964105293Sume  case ISD::SETFALSE:
965105293Sume  case ISD::SETFALSE2:
966105293Sume  case ISD::SETTRUE:
967105293Sume  case ISD::SETTRUE2:
968105293Sume  case ISD::SETUO:
969105293Sume  case ISD::SETO:
970105293Sume    break;
971105293Sume  case ISD::SETULE:
972105293Sume  case ISD::SETULT: {
973105293Sume    if (LHS == True)
974105293Sume      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
975105293Sume    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
976105293Sume  }
977105293Sume  case ISD::SETOLE:
978105293Sume  case ISD::SETOLT:
979105293Sume  case ISD::SETLE:
980105293Sume  case ISD::SETLT: {
981105293Sume    // Ordered. Assume ordered for undefined.
982105293Sume
983105293Sume    // Only do this after legalization to avoid interfering with other combines
984148887Srwatson    // which might occur.
985105293Sume    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
986148887Srwatson        !DCI.isCalledByLegalizer())
987105293Sume      return SDValue();
988105293Sume
989105293Sume    // We need to permute the operands to get the correct NaN behavior. The
990105293Sume    // selected operand is the second one based on the failing compare with NaN,
99179106Sbrooks    // so permute it based on the compare type the hardware uses.
992105293Sume    if (LHS == True)
993105293Sume      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
99479106Sbrooks    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
995147256Sbrooks  }
99679106Sbrooks  case ISD::SETUGE:
99779106Sbrooks  case ISD::SETUGT: {
99879106Sbrooks    if (LHS == True)
99979106Sbrooks      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
100079106Sbrooks    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
100179106Sbrooks  }
100279106Sbrooks  case ISD::SETGT:
100379106Sbrooks  case ISD::SETGE:
100479106Sbrooks  case ISD::SETOGE:
1005105293Sume  case ISD::SETOGT: {
1006105293Sume    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1007105293Sume        !DCI.isCalledByLegalizer())
1008105293Sume      return SDValue();
1009105293Sume
1010105293Sume    if (LHS == True)
1011105293Sume      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1012160018Syar    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
101379106Sbrooks  }
1014  case ISD::SETCC_INVALID:
1015    llvm_unreachable("Invalid setcc condcode!");
1016  }
1017  return SDValue();
1018}
1019
1020std::pair<SDValue, SDValue>
1021AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1022  SDLoc SL(Op);
1023
1024  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1025
1026  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1027  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1028
1029  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1030  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1031
1032  return std::make_pair(Lo, Hi);
1033}
1034
1035SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1036  SDLoc SL(Op);
1037
1038  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1039  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1040  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1041}
1042
1043SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1044  SDLoc SL(Op);
1045
1046  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1047  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1048  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1049}
1050
1051SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1052                                              SelectionDAG &DAG) const {
1053  LoadSDNode *Load = cast<LoadSDNode>(Op);
1054  EVT VT = Op.getValueType();
1055
1056
1057  // If this is a 2 element vector, we really want to scalarize and not create
1058  // weird 1 element vectors.
1059  if (VT.getVectorNumElements() == 2)
1060    return scalarizeVectorLoad(Load, DAG);
1061
1062  SDValue BasePtr = Load->getBasePtr();
1063  EVT PtrVT = BasePtr.getValueType();
1064  EVT MemVT = Load->getMemoryVT();
1065  SDLoc SL(Op);
1066
1067  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1068
1069  EVT LoVT, HiVT;
1070  EVT LoMemVT, HiMemVT;
1071  SDValue Lo, Hi;
1072
1073  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1074  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1075  std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
1076
1077  unsigned Size = LoMemVT.getStoreSize();
1078  unsigned BaseAlign = Load->getAlignment();
1079  unsigned HiAlign = MinAlign(BaseAlign, Size);
1080
1081  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1082                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
1083                                  BaseAlign, Load->getMemOperand()->getFlags());
1084  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1085                              DAG.getConstant(Size, SL, PtrVT));
1086  SDValue HiLoad =
1087      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1088                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1089                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1090
1091  SDValue Ops[] = {
1092    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
1093    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1094                LoLoad.getValue(1), HiLoad.getValue(1))
1095  };
1096
1097  return DAG.getMergeValues(Ops, SL);
1098}
1099
1100// FIXME: This isn't doing anything for SI. This should be used in a target
1101// combine during type legalization.
1102SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
1103                                               SelectionDAG &DAG) const {
1104  StoreSDNode *Store = cast<StoreSDNode>(Op);
1105  EVT MemVT = Store->getMemoryVT();
1106  unsigned MemBits = MemVT.getSizeInBits();
1107
1108  // Byte stores are really expensive, so if possible, try to pack 32-bit vector
1109  // truncating store into an i32 store.
1110  // XXX: We could also handle optimize other vector bitwidths.
1111  if (!MemVT.isVector() || MemBits > 32) {
1112    return SDValue();
1113  }
1114
1115  SDLoc DL(Op);
1116  SDValue Value = Store->getValue();
1117  EVT VT = Value.getValueType();
1118  EVT ElemVT = VT.getVectorElementType();
1119  SDValue Ptr = Store->getBasePtr();
1120  EVT MemEltVT = MemVT.getVectorElementType();
1121  unsigned MemEltBits = MemEltVT.getSizeInBits();
1122  unsigned MemNumElements = MemVT.getVectorNumElements();
1123  unsigned PackedSize = MemVT.getStoreSizeInBits();
1124  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32);
1125
1126  assert(Value.getValueType().getScalarSizeInBits() >= 32);
1127
1128  SDValue PackedValue;
1129  for (unsigned i = 0; i < MemNumElements; ++i) {
1130    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
1131                              DAG.getConstant(i, DL, MVT::i32));
1132    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
1133    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
1134
1135    SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32);
1136    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
1137
1138    if (i == 0) {
1139      PackedValue = Elt;
1140    } else {
1141      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
1142    }
1143  }
1144
1145  if (PackedSize < 32) {
1146    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
1147    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
1148                             Store->getMemOperand()->getPointerInfo(), PackedVT,
1149                             Store->getAlignment(),
1150                             Store->getMemOperand()->getFlags());
1151  }
1152
1153  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
1154                      Store->getMemOperand()->getPointerInfo(),
1155                      Store->getAlignment(),
1156                      Store->getMemOperand()->getFlags());
1157}
1158
1159SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1160                                               SelectionDAG &DAG) const {
1161  StoreSDNode *Store = cast<StoreSDNode>(Op);
1162  SDValue Val = Store->getValue();
1163  EVT VT = Val.getValueType();
1164
1165  // If this is a 2 element vector, we really want to scalarize and not create
1166  // weird 1 element vectors.
1167  if (VT.getVectorNumElements() == 2)
1168    return scalarizeVectorStore(Store, DAG);
1169
1170  EVT MemVT = Store->getMemoryVT();
1171  SDValue Chain = Store->getChain();
1172  SDValue BasePtr = Store->getBasePtr();
1173  SDLoc SL(Op);
1174
1175  EVT LoVT, HiVT;
1176  EVT LoMemVT, HiMemVT;
1177  SDValue Lo, Hi;
1178
1179  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1180  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1181  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
1182
1183  EVT PtrVT = BasePtr.getValueType();
1184  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1185                              DAG.getConstant(LoMemVT.getStoreSize(), SL,
1186                                              PtrVT));
1187
1188  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1189  unsigned BaseAlign = Store->getAlignment();
1190  unsigned Size = LoMemVT.getStoreSize();
1191  unsigned HiAlign = MinAlign(BaseAlign, Size);
1192
1193  SDValue LoStore =
1194      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1195                        Store->getMemOperand()->getFlags());
1196  SDValue HiStore =
1197      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1198                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1199
1200  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1201}
1202
1203// This is a shortcut for integer division because we have fast i32<->f32
1204// conversions, and fast f32 reciprocal instructions. The fractional part of a
1205// float is enough to accurately represent up to a 24-bit signed integer.
1206SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1207                                            bool Sign) const {
1208  SDLoc DL(Op);
1209  EVT VT = Op.getValueType();
1210  SDValue LHS = Op.getOperand(0);
1211  SDValue RHS = Op.getOperand(1);
1212  MVT IntVT = MVT::i32;
1213  MVT FltVT = MVT::f32;
1214
1215  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1216  if (LHSSignBits < 9)
1217    return SDValue();
1218
1219  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1220  if (RHSSignBits < 9)
1221    return SDValue();
1222
1223  unsigned BitSize = VT.getSizeInBits();
1224  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1225  unsigned DivBits = BitSize - SignBits;
1226  if (Sign)
1227    ++DivBits;
1228
1229  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1230  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1231
1232  SDValue jq = DAG.getConstant(1, DL, IntVT);
1233
1234  if (Sign) {
1235    // char|short jq = ia ^ ib;
1236    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1237
1238    // jq = jq >> (bitsize - 2)
1239    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1240                     DAG.getConstant(BitSize - 2, DL, VT));
1241
1242    // jq = jq | 0x1
1243    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1244  }
1245
1246  // int ia = (int)LHS;
1247  SDValue ia = LHS;
1248
1249  // int ib, (int)RHS;
1250  SDValue ib = RHS;
1251
1252  // float fa = (float)ia;
1253  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1254
1255  // float fb = (float)ib;
1256  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1257
1258  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1259                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1260
1261  // fq = trunc(fq);
1262  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1263
1264  // float fqneg = -fq;
1265  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1266
1267  // float fr = mad(fqneg, fb, fa);
1268  SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
1269
1270  // int iq = (int)fq;
1271  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1272
1273  // fr = fabs(fr);
1274  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1275
1276  // fb = fabs(fb);
1277  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1278
1279  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1280
1281  // int cv = fr >= fb;
1282  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1283
1284  // jq = (cv ? jq : 0);
1285  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1286
1287  // dst = iq + jq;
1288  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1289
1290  // Rem needs compensation, it's easier to recompute it
1291  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1292  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1293
1294  // Truncate to number of bits this divide really is.
1295  if (Sign) {
1296    SDValue InRegSize
1297      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1298    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1299    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1300  } else {
1301    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1302    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1303    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1304  }
1305
1306  return DAG.getMergeValues({ Div, Rem }, DL);
1307}
1308
1309void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1310                                      SelectionDAG &DAG,
1311                                      SmallVectorImpl<SDValue> &Results) const {
1312  assert(Op.getValueType() == MVT::i64);
1313
1314  SDLoc DL(Op);
1315  EVT VT = Op.getValueType();
1316  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1317
1318  SDValue one = DAG.getConstant(1, DL, HalfVT);
1319  SDValue zero = DAG.getConstant(0, DL, HalfVT);
1320
1321  //HiLo split
1322  SDValue LHS = Op.getOperand(0);
1323  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
1324  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
1325
1326  SDValue RHS = Op.getOperand(1);
1327  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
1328  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
1329
1330  if (VT == MVT::i64 &&
1331    DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1332    DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1333
1334    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1335                              LHS_Lo, RHS_Lo);
1336
1337    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
1338    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
1339
1340    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1341    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1342    return;
1343  }
1344
1345  // Get Speculative values
1346  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1347  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1348
1349  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
1350  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
1351  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1352
1353  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
1354  SDValue DIV_Lo = zero;
1355
1356  const unsigned halfBitWidth = HalfVT.getSizeInBits();
1357
1358  for (unsigned i = 0; i < halfBitWidth; ++i) {
1359    const unsigned bitPos = halfBitWidth - i - 1;
1360    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1361    // Get value of high bit
1362    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1363    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
1364    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1365
1366    // Shift
1367    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1368    // Add LHS high bit
1369    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1370
1371    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1372    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
1373
1374    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1375
1376    // Update REM
1377    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1378    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1379  }
1380
1381  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1382  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1383  Results.push_back(DIV);
1384  Results.push_back(REM);
1385}
1386
1387SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1388                                           SelectionDAG &DAG) const {
1389  SDLoc DL(Op);
1390  EVT VT = Op.getValueType();
1391
1392  if (VT == MVT::i64) {
1393    SmallVector<SDValue, 2> Results;
1394    LowerUDIVREM64(Op, DAG, Results);
1395    return DAG.getMergeValues(Results, DL);
1396  }
1397
1398  if (VT == MVT::i32) {
1399    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1400      return Res;
1401  }
1402
1403  SDValue Num = Op.getOperand(0);
1404  SDValue Den = Op.getOperand(1);
1405
1406  // RCP =  URECIP(Den) = 2^32 / Den + e
1407  // e is rounding error.
1408  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1409
1410  // RCP_LO = mul(RCP, Den) */
1411  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1412
1413  // RCP_HI = mulhu (RCP, Den) */
1414  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1415
1416  // NEG_RCP_LO = -RCP_LO
1417  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1418                                                     RCP_LO);
1419
1420  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1421  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1422                                           NEG_RCP_LO, RCP_LO,
1423                                           ISD::SETEQ);
1424  // Calculate the rounding error from the URECIP instruction
1425  // E = mulhu(ABS_RCP_LO, RCP)
1426  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1427
1428  // RCP_A_E = RCP + E
1429  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1430
1431  // RCP_S_E = RCP - E
1432  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1433
1434  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1435  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1436                                     RCP_A_E, RCP_S_E,
1437                                     ISD::SETEQ);
1438  // Quotient = mulhu(Tmp0, Num)
1439  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1440
1441  // Num_S_Remainder = Quotient * Den
1442  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1443
1444  // Remainder = Num - Num_S_Remainder
1445  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1446
1447  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1448  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1449                                                 DAG.getConstant(-1, DL, VT),
1450                                                 DAG.getConstant(0, DL, VT),
1451                                                 ISD::SETUGE);
1452  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1453  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1454                                                  Num_S_Remainder,
1455                                                  DAG.getConstant(-1, DL, VT),
1456                                                  DAG.getConstant(0, DL, VT),
1457                                                  ISD::SETUGE);
1458  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1459  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1460                                               Remainder_GE_Zero);
1461
1462  // Calculate Division result:
1463
1464  // Quotient_A_One = Quotient + 1
1465  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1466                                       DAG.getConstant(1, DL, VT));
1467
1468  // Quotient_S_One = Quotient - 1
1469  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1470                                       DAG.getConstant(1, DL, VT));
1471
1472  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1473  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1474                                     Quotient, Quotient_A_One, ISD::SETEQ);
1475
1476  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1477  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1478                            Quotient_S_One, Div, ISD::SETEQ);
1479
1480  // Calculate Rem result:
1481
1482  // Remainder_S_Den = Remainder - Den
1483  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1484
1485  // Remainder_A_Den = Remainder + Den
1486  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1487
1488  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1489  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1490                                    Remainder, Remainder_S_Den, ISD::SETEQ);
1491
1492  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1493  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1494                            Remainder_A_Den, Rem, ISD::SETEQ);
1495  SDValue Ops[2] = {
1496    Div,
1497    Rem
1498  };
1499  return DAG.getMergeValues(Ops, DL);
1500}
1501
1502SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1503                                           SelectionDAG &DAG) const {
1504  SDLoc DL(Op);
1505  EVT VT = Op.getValueType();
1506
1507  SDValue LHS = Op.getOperand(0);
1508  SDValue RHS = Op.getOperand(1);
1509
1510  SDValue Zero = DAG.getConstant(0, DL, VT);
1511  SDValue NegOne = DAG.getConstant(-1, DL, VT);
1512
1513  if (VT == MVT::i32) {
1514    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1515      return Res;
1516  }
1517
1518  if (VT == MVT::i64 &&
1519      DAG.ComputeNumSignBits(LHS) > 32 &&
1520      DAG.ComputeNumSignBits(RHS) > 32) {
1521    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1522
1523    //HiLo split
1524    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1525    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1526    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1527                                 LHS_Lo, RHS_Lo);
1528    SDValue Res[2] = {
1529      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1530      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1531    };
1532    return DAG.getMergeValues(Res, DL);
1533  }
1534
1535  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1536  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1537  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1538  SDValue RSign = LHSign; // Remainder sign is the same as LHS
1539
1540  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1541  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1542
1543  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1544  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1545
1546  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1547  SDValue Rem = Div.getValue(1);
1548
1549  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1550  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1551
1552  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1553  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1554
1555  SDValue Res[2] = {
1556    Div,
1557    Rem
1558  };
1559  return DAG.getMergeValues(Res, DL);
1560}
1561
1562// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
1563SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
1564  SDLoc SL(Op);
1565  EVT VT = Op.getValueType();
1566  SDValue X = Op.getOperand(0);
1567  SDValue Y = Op.getOperand(1);
1568
1569  // TODO: Should this propagate fast-math-flags?
1570
1571  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
1572  SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
1573  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
1574
1575  return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
1576}
1577
1578SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
1579  SDLoc SL(Op);
1580  SDValue Src = Op.getOperand(0);
1581
1582  // result = trunc(src)
1583  // if (src > 0.0 && src != result)
1584  //   result += 1.0
1585
1586  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1587
1588  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1589  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1590
1591  EVT SetCCVT =
1592      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1593
1594  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1595  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1596  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1597
1598  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1599  // TODO: Should this propagate fast-math-flags?
1600  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1601}
1602
1603static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
1604                                  SelectionDAG &DAG) {
1605  const unsigned FractBits = 52;
1606  const unsigned ExpBits = 11;
1607
1608  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
1609                                Hi,
1610                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
1611                                DAG.getConstant(ExpBits, SL, MVT::i32));
1612  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1613                            DAG.getConstant(1023, SL, MVT::i32));
1614
1615  return Exp;
1616}
1617
1618SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1619  SDLoc SL(Op);
1620  SDValue Src = Op.getOperand(0);
1621
1622  assert(Op.getValueType() == MVT::f64);
1623
1624  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1625  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1626
1627  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1628
1629  // Extract the upper half, since this is where we will find the sign and
1630  // exponent.
1631  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1632
1633  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1634
1635  const unsigned FractBits = 52;
1636
1637  // Extract the sign bit.
1638  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
1639  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1640
1641  // Extend back to to 64-bits.
1642  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
1643  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1644
1645  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1646  const SDValue FractMask
1647    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
1648
1649  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1650  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1651  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1652
1653  EVT SetCCVT =
1654      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1655
1656  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
1657
1658  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1659  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1660
1661  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
1662  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
1663
1664  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
1665}
1666
1667SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
1668  SDLoc SL(Op);
1669  SDValue Src = Op.getOperand(0);
1670
1671  assert(Op.getValueType() == MVT::f64);
1672
1673  APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
1674  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
1675  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
1676
1677  // TODO: Should this propagate fast-math-flags?
1678
1679  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
1680  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
1681
1682  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
1683
1684  APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
1685  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
1686
1687  EVT SetCCVT =
1688      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1689  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
1690
1691  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
1692}
1693
1694SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
1695  // FNEARBYINT and FRINT are the same, except in their handling of FP
1696  // exceptions. Those aren't really meaningful for us, and OpenCL only has
1697  // rint, so just treat them as equivalent.
1698  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
1699}
1700
1701// XXX - May require not supporting f32 denormals?
1702SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
1703  SDLoc SL(Op);
1704  SDValue X = Op.getOperand(0);
1705
1706  SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
1707
1708  // TODO: Should this propagate fast-math-flags?
1709
1710  SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
1711
1712  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
1713
1714  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32);
1715  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
1716  const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32);
1717
1718  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
1719
1720  EVT SetCCVT =
1721      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
1722
1723  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
1724
1725  SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
1726
1727  return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
1728}
1729
1730SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
1731  SDLoc SL(Op);
1732  SDValue X = Op.getOperand(0);
1733
1734  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
1735
1736  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1737  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1738  const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
1739  const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
1740  EVT SetCCVT =
1741      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1742
1743  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
1744
1745  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
1746
1747  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1748
1749  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
1750                                       MVT::i64);
1751
1752  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
1753  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
1754                          DAG.getConstant(INT64_C(0x0008000000000000), SL,
1755                                          MVT::i64),
1756                          Exp);
1757
1758  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
1759  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
1760                              DAG.getConstant(0, SL, MVT::i64), Tmp0,
1761                              ISD::SETNE);
1762
1763  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
1764                             D, DAG.getConstant(0, SL, MVT::i64));
1765  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
1766
1767  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
1768  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
1769
1770  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1771  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1772  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
1773
1774  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
1775                            ExpEqNegOne,
1776                            DAG.getConstantFP(1.0, SL, MVT::f64),
1777                            DAG.getConstantFP(0.0, SL, MVT::f64));
1778
1779  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
1780
1781  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
1782  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
1783
1784  return K;
1785}
1786
1787SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
1788  EVT VT = Op.getValueType();
1789
1790  if (VT == MVT::f32)
1791    return LowerFROUND32(Op, DAG);
1792
1793  if (VT == MVT::f64)
1794    return LowerFROUND64(Op, DAG);
1795
1796  llvm_unreachable("unhandled type");
1797}
1798
1799SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
1800  SDLoc SL(Op);
1801  SDValue Src = Op.getOperand(0);
1802
1803  // result = trunc(src);
1804  // if (src < 0.0 && src != result)
1805  //   result += -1.0.
1806
1807  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1808
1809  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1810  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
1811
1812  EVT SetCCVT =
1813      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1814
1815  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
1816  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1817  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1818
1819  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
1820  // TODO: Should this propagate fast-math-flags?
1821  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1822}
1823
1824SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
1825  SDLoc SL(Op);
1826  SDValue Src = Op.getOperand(0);
1827  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
1828
1829  if (ZeroUndef && Src.getValueType() == MVT::i32)
1830    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
1831
1832  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1833
1834  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1835  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1836
1837  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1838  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1839
1840  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1841                                   *DAG.getContext(), MVT::i32);
1842
1843  SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
1844
1845  SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
1846  SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
1847
1848  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
1849  SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
1850
1851  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
1852  SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
1853
1854  if (!ZeroUndef) {
1855    // Test if the full 64-bit input is zero.
1856
1857    // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
1858    // which we probably don't want.
1859    SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
1860    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
1861
1862    // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
1863    // with the same cycles, otherwise it is slower.
1864    // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
1865    // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
1866
1867    const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
1868
1869    // The instruction returns -1 for 0 input, but the defined intrinsic
1870    // behavior is to return the number of bits.
1871    NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
1872                          SrcIsZero, Bits32, NewCtlz);
1873  }
1874
1875  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
1876}
1877
1878SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
1879                                               bool Signed) const {
1880  // Unsigned
1881  // cul2f(ulong u)
1882  //{
1883  //  uint lz = clz(u);
1884  //  uint e = (u != 0) ? 127U + 63U - lz : 0;
1885  //  u = (u << lz) & 0x7fffffffffffffffUL;
1886  //  ulong t = u & 0xffffffffffUL;
1887  //  uint v = (e << 23) | (uint)(u >> 40);
1888  //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
1889  //  return as_float(v + r);
1890  //}
1891  // Signed
1892  // cl2f(long l)
1893  //{
1894  //  long s = l >> 63;
1895  //  float r = cul2f((l + s) ^ s);
1896  //  return s ? -r : r;
1897  //}
1898
1899  SDLoc SL(Op);
1900  SDValue Src = Op.getOperand(0);
1901  SDValue L = Src;
1902
1903  SDValue S;
1904  if (Signed) {
1905    const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
1906    S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
1907
1908    SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
1909    L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
1910  }
1911
1912  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1913                                   *DAG.getContext(), MVT::f32);
1914
1915
1916  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
1917  SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
1918  SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
1919  LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
1920
1921  SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
1922  SDValue E = DAG.getSelect(SL, MVT::i32,
1923    DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
1924    DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
1925    ZeroI32);
1926
1927  SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
1928    DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
1929    DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
1930
1931  SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
1932                          DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
1933
1934  SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
1935                             U, DAG.getConstant(40, SL, MVT::i64));
1936
1937  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
1938    DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
1939    DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
1940
1941  SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
1942  SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
1943  SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
1944
1945  SDValue One = DAG.getConstant(1, SL, MVT::i32);
1946
1947  SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
1948
1949  SDValue R = DAG.getSelect(SL, MVT::i32,
1950    RCmp,
1951    One,
1952    DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
1953  R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
1954  R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
1955
1956  if (!Signed)
1957    return R;
1958
1959  SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
1960  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
1961}
1962
1963SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
1964                                               bool Signed) const {
1965  SDLoc SL(Op);
1966  SDValue Src = Op.getOperand(0);
1967
1968  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1969
1970  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
1971                           DAG.getConstant(0, SL, MVT::i32));
1972  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
1973                           DAG.getConstant(1, SL, MVT::i32));
1974
1975  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
1976                              SL, MVT::f64, Hi);
1977
1978  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
1979
1980  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
1981                              DAG.getConstant(32, SL, MVT::i32));
1982  // TODO: Should this propagate fast-math-flags?
1983  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
1984}
1985
1986SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
1987                                               SelectionDAG &DAG) const {
1988  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
1989         "operation should be legal");
1990
1991  EVT DestVT = Op.getValueType();
1992  if (DestVT == MVT::f64)
1993    return LowerINT_TO_FP64(Op, DAG, false);
1994
1995  if (DestVT == MVT::f32)
1996    return LowerINT_TO_FP32(Op, DAG, false);
1997
1998  return SDValue();
1999}
2000
2001SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2002                                              SelectionDAG &DAG) const {
2003  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2004         "operation should be legal");
2005
2006  EVT DestVT = Op.getValueType();
2007  if (DestVT == MVT::f32)
2008    return LowerINT_TO_FP32(Op, DAG, true);
2009
2010  if (DestVT == MVT::f64)
2011    return LowerINT_TO_FP64(Op, DAG, true);
2012
2013  return SDValue();
2014}
2015
2016SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2017                                               bool Signed) const {
2018  SDLoc SL(Op);
2019
2020  SDValue Src = Op.getOperand(0);
2021
2022  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2023
2024  SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2025                                 MVT::f64);
2026  SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2027                                 MVT::f64);
2028  // TODO: Should this propagate fast-math-flags?
2029  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2030
2031  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2032
2033
2034  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2035
2036  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2037                           MVT::i32, FloorMul);
2038  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2039
2040  SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2041
2042  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2043}
2044
2045SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2046                                              SelectionDAG &DAG) const {
2047  SDValue Src = Op.getOperand(0);
2048
2049  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2050    return LowerFP64_TO_INT(Op, DAG, true);
2051
2052  return SDValue();
2053}
2054
2055SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2056                                              SelectionDAG &DAG) const {
2057  SDValue Src = Op.getOperand(0);
2058
2059  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2060    return LowerFP64_TO_INT(Op, DAG, false);
2061
2062  return SDValue();
2063}
2064
2065SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2066                                                     SelectionDAG &DAG) const {
2067  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2068  MVT VT = Op.getSimpleValueType();
2069  MVT ScalarVT = VT.getScalarType();
2070
2071  if (!VT.isVector())
2072    return SDValue();
2073
2074  SDValue Src = Op.getOperand(0);
2075  SDLoc DL(Op);
2076
2077  // TODO: Don't scalarize on Evergreen?
2078  unsigned NElts = VT.getVectorNumElements();
2079  SmallVector<SDValue, 8> Args;
2080  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2081
2082  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2083  for (unsigned I = 0; I < NElts; ++I)
2084    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2085
2086  return DAG.getBuildVector(VT, DL, Args);
2087}
2088
2089//===----------------------------------------------------------------------===//
2090// Custom DAG optimizations
2091//===----------------------------------------------------------------------===//
2092
2093static bool isU24(SDValue Op, SelectionDAG &DAG) {
2094  APInt KnownZero, KnownOne;
2095  EVT VT = Op.getValueType();
2096  DAG.computeKnownBits(Op, KnownZero, KnownOne);
2097
2098  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
2099}
2100
2101static bool isI24(SDValue Op, SelectionDAG &DAG) {
2102  EVT VT = Op.getValueType();
2103
2104  // In order for this to be a signed 24-bit value, bit 23, must
2105  // be a sign bit.
2106  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2107                                     // as unsigned 24-bit values.
2108         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
2109}
2110
2111static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
2112
2113  SelectionDAG &DAG = DCI.DAG;
2114  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2115  EVT VT = Op.getValueType();
2116
2117  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
2118  APInt KnownZero, KnownOne;
2119  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
2120  if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
2121    DCI.CommitTargetLoweringOpt(TLO);
2122}
2123
2124template <typename IntTy>
2125static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2126                               uint32_t Width, const SDLoc &DL) {
2127  if (Width + Offset < 32) {
2128    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2129    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2130    return DAG.getConstant(Result, DL, MVT::i32);
2131  }
2132
2133  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2134}
2135
2136static bool hasVolatileUser(SDNode *Val) {
2137  for (SDNode *U : Val->uses()) {
2138    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2139      if (M->isVolatile())
2140        return true;
2141    }
2142  }
2143
2144  return false;
2145}
2146
2147bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2148  // i32 vectors are the canonical memory type.
2149  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2150    return false;
2151
2152  if (!VT.isByteSized())
2153    return false;
2154
2155  unsigned Size = VT.getStoreSize();
2156
2157  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2158    return false;
2159
2160  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2161    return false;
2162
2163  return true;
2164}
2165
2166// Replace load of an illegal type with a store of a bitcast to a friendlier
2167// type.
2168SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2169                                                 DAGCombinerInfo &DCI) const {
2170  if (!DCI.isBeforeLegalize())
2171    return SDValue();
2172
2173  LoadSDNode *LN = cast<LoadSDNode>(N);
2174  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2175    return SDValue();
2176
2177  SDLoc SL(N);
2178  SelectionDAG &DAG = DCI.DAG;
2179  EVT VT = LN->getMemoryVT();
2180
2181  unsigned Size = VT.getStoreSize();
2182  unsigned Align = LN->getAlignment();
2183  if (Align < Size && isTypeLegal(VT)) {
2184    bool IsFast;
2185    unsigned AS = LN->getAddressSpace();
2186
2187    // Expand unaligned loads earlier than legalization. Due to visitation order
2188    // problems during legalization, the emitted instructions to pack and unpack
2189    // the bytes again are not eliminated in the case of an unaligned copy.
2190    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2191      SDValue Ops[2];
2192      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2193      return DAG.getMergeValues(Ops, SDLoc(N));
2194    }
2195
2196    if (!IsFast)
2197      return SDValue();
2198  }
2199
2200  if (!shouldCombineMemoryType(VT))
2201    return SDValue();
2202
2203  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2204
2205  SDValue NewLoad
2206    = DAG.getLoad(NewVT, SL, LN->getChain(),
2207                  LN->getBasePtr(), LN->getMemOperand());
2208
2209  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2210  DCI.CombineTo(N, BC, NewLoad.getValue(1));
2211  return SDValue(N, 0);
2212}
2213
2214// Replace store of an illegal type with a store of a bitcast to a friendlier
2215// type.
2216SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2217                                                  DAGCombinerInfo &DCI) const {
2218  if (!DCI.isBeforeLegalize())
2219    return SDValue();
2220
2221  StoreSDNode *SN = cast<StoreSDNode>(N);
2222  if (SN->isVolatile() || !ISD::isNormalStore(SN))
2223    return SDValue();
2224
2225  EVT VT = SN->getMemoryVT();
2226  unsigned Size = VT.getStoreSize();
2227
2228  SDLoc SL(N);
2229  SelectionDAG &DAG = DCI.DAG;
2230  unsigned Align = SN->getAlignment();
2231  if (Align < Size && isTypeLegal(VT)) {
2232    bool IsFast;
2233    unsigned AS = SN->getAddressSpace();
2234
2235    // Expand unaligned stores earlier than legalization. Due to visitation
2236    // order problems during legalization, the emitted instructions to pack and
2237    // unpack the bytes again are not eliminated in the case of an unaligned
2238    // copy.
2239    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
2240      return expandUnalignedStore(SN, DAG);
2241
2242    if (!IsFast)
2243      return SDValue();
2244  }
2245
2246  if (!shouldCombineMemoryType(VT))
2247    return SDValue();
2248
2249  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2250  SDValue Val = SN->getValue();
2251
2252  //DCI.AddToWorklist(Val.getNode());
2253
2254  bool OtherUses = !Val.hasOneUse();
2255  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2256  if (OtherUses) {
2257    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2258    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2259  }
2260
2261  return DAG.getStore(SN->getChain(), SL, CastVal,
2262                      SN->getBasePtr(), SN->getMemOperand());
2263}
2264
2265// TODO: Should repeat for other bit ops.
2266SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N,
2267                                                DAGCombinerInfo &DCI) const {
2268  if (N->getValueType(0) != MVT::i64)
2269    return SDValue();
2270
2271  // Break up 64-bit and of a constant into two 32-bit ands. This will typically
2272  // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer
2273  // combine opportunities since most 64-bit operations are decomposed this way.
2274  // TODO: We won't want this for SALU especially if it is an inline immediate.
2275  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2276  if (!RHS)
2277    return SDValue();
2278
2279  uint64_t Val = RHS->getZExtValue();
2280  if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) {
2281    // If either half of the constant is 0, this is really a 32-bit and, so
2282    // split it. If we can re-use the full materialized constant, keep it.
2283    return SDValue();
2284  }
2285
2286  SDLoc SL(N);
2287  SelectionDAG &DAG = DCI.DAG;
2288
2289  SDValue Lo, Hi;
2290  std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG);
2291
2292  SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32);
2293  SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
2294
2295  SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS);
2296  SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS);
2297
2298  // Re-visit the ands. It's possible we eliminated one of them and it could
2299  // simplify the vector.
2300  DCI.AddToWorklist(Lo.getNode());
2301  DCI.AddToWorklist(Hi.getNode());
2302
2303  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2304  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2305}
2306
2307SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
2308                                                DAGCombinerInfo &DCI) const {
2309  if (N->getValueType(0) != MVT::i64)
2310    return SDValue();
2311
2312  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2313
2314  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
2315  // common case, splitting this into a move and a 32-bit shift is faster and
2316  // the same code size.
2317  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2318  if (!RHS)
2319    return SDValue();
2320
2321  unsigned RHSVal = RHS->getZExtValue();
2322  if (RHSVal < 32)
2323    return SDValue();
2324
2325  SDValue LHS = N->getOperand(0);
2326
2327  SDLoc SL(N);
2328  SelectionDAG &DAG = DCI.DAG;
2329
2330  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
2331
2332  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
2333  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
2334
2335  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2336
2337  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
2338  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2339}
2340
2341SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
2342                                                DAGCombinerInfo &DCI) const {
2343  if (N->getValueType(0) != MVT::i64)
2344    return SDValue();
2345
2346  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2347  if (!RHS)
2348    return SDValue();
2349
2350  SelectionDAG &DAG = DCI.DAG;
2351  SDLoc SL(N);
2352  unsigned RHSVal = RHS->getZExtValue();
2353
2354  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
2355  if (RHSVal == 32) {
2356    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2357    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2358                                   DAG.getConstant(31, SL, MVT::i32));
2359
2360    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
2361    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2362  }
2363
2364  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
2365  if (RHSVal == 63) {
2366    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2367    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2368                                   DAG.getConstant(31, SL, MVT::i32));
2369    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
2370    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2371  }
2372
2373  return SDValue();
2374}
2375
2376SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
2377                                                DAGCombinerInfo &DCI) const {
2378  if (N->getValueType(0) != MVT::i64)
2379    return SDValue();
2380
2381  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2382  if (!RHS)
2383    return SDValue();
2384
2385  unsigned ShiftAmt = RHS->getZExtValue();
2386  if (ShiftAmt < 32)
2387    return SDValue();
2388
2389  // srl i64:x, C for C >= 32
2390  // =>
2391  //   build_pair (srl hi_32(x), C - 32), 0
2392
2393  SelectionDAG &DAG = DCI.DAG;
2394  SDLoc SL(N);
2395
2396  SDValue One = DAG.getConstant(1, SL, MVT::i32);
2397  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2398
2399  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
2400  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
2401                           VecOp, One);
2402
2403  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
2404  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
2405
2406  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
2407
2408  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
2409}
2410
2411SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
2412                                                DAGCombinerInfo &DCI) const {
2413  EVT VT = N->getValueType(0);
2414
2415  if (VT.isVector() || VT.getSizeInBits() > 32)
2416    return SDValue();
2417
2418  SelectionDAG &DAG = DCI.DAG;
2419  SDLoc DL(N);
2420
2421  SDValue N0 = N->getOperand(0);
2422  SDValue N1 = N->getOperand(1);
2423  SDValue Mul;
2424
2425  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
2426    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2427    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2428    Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
2429  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
2430    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2431    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2432    Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
2433  } else {
2434    return SDValue();
2435  }
2436
2437  // We need to use sext even for MUL_U24, because MUL_U24 is used
2438  // for signed multiply of 8 and 16-bit types.
2439  return DAG.getSExtOrTrunc(Mul, DL, VT);
2440}
2441
2442static bool isNegativeOne(SDValue Val) {
2443  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
2444    return C->isAllOnesValue();
2445  return false;
2446}
2447
2448static bool isCtlzOpc(unsigned Opc) {
2449  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2450}
2451
2452// Get FFBH node if the incoming op may have been type legalized from a smaller
2453// type VT.
2454// Need to match pre-legalized type because the generic legalization inserts the
2455// add/sub between the select and compare.
2456static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG,
2457                           const SDLoc &SL, SDValue Op) {
2458  EVT VT = Op.getValueType();
2459  EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
2460  if (LegalVT != MVT::i32)
2461    return SDValue();
2462
2463  if (VT != MVT::i32)
2464    Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op);
2465
2466  SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op);
2467  if (VT != MVT::i32)
2468    FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH);
2469
2470  return FFBH;
2471}
2472
2473// The native instructions return -1 on 0 input. Optimize out a select that
2474// produces -1 on 0.
2475//
2476// TODO: If zero is not undef, we could also do this if the output is compared
2477// against the bitwidth.
2478//
2479// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
2480SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
2481                                                 SDValue LHS, SDValue RHS,
2482                                                 DAGCombinerInfo &DCI) const {
2483  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
2484  if (!CmpRhs || !CmpRhs->isNullValue())
2485    return SDValue();
2486
2487  SelectionDAG &DAG = DCI.DAG;
2488  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2489  SDValue CmpLHS = Cond.getOperand(0);
2490
2491  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
2492  if (CCOpcode == ISD::SETEQ &&
2493      isCtlzOpc(RHS.getOpcode()) &&
2494      RHS.getOperand(0) == CmpLHS &&
2495      isNegativeOne(LHS)) {
2496    return getFFBH_U32(*this, DAG, SL, CmpLHS);
2497  }
2498
2499  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
2500  if (CCOpcode == ISD::SETNE &&
2501      isCtlzOpc(LHS.getOpcode()) &&
2502      LHS.getOperand(0) == CmpLHS &&
2503      isNegativeOne(RHS)) {
2504    return getFFBH_U32(*this, DAG, SL, CmpLHS);
2505  }
2506
2507  return SDValue();
2508}
2509
2510SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
2511                                                   DAGCombinerInfo &DCI) const {
2512  SDValue Cond = N->getOperand(0);
2513  if (Cond.getOpcode() != ISD::SETCC)
2514    return SDValue();
2515
2516  EVT VT = N->getValueType(0);
2517  SDValue LHS = Cond.getOperand(0);
2518  SDValue RHS = Cond.getOperand(1);
2519  SDValue CC = Cond.getOperand(2);
2520
2521  SDValue True = N->getOperand(1);
2522  SDValue False = N->getOperand(2);
2523
2524  if (VT == MVT::f32 && Cond.hasOneUse()) {
2525    SDValue MinMax
2526      = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
2527    // Revisit this node so we can catch min3/max3/med3 patterns.
2528    //DCI.AddToWorklist(MinMax.getNode());
2529    return MinMax;
2530  }
2531
2532  // There's no reason to not do this if the condition has other uses.
2533  return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
2534}
2535
2536SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
2537                                                DAGCombinerInfo &DCI) const {
2538  SelectionDAG &DAG = DCI.DAG;
2539  SDLoc DL(N);
2540
2541  switch(N->getOpcode()) {
2542  default:
2543    break;
2544  case ISD::BITCAST: {
2545    EVT DestVT = N->getValueType(0);
2546    if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
2547      break;
2548
2549    // Fold bitcasts of constants.
2550    //
2551    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
2552    // TODO: Generalize and move to DAGCombiner
2553    SDValue Src = N->getOperand(0);
2554    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
2555      assert(Src.getValueType() == MVT::i64);
2556      SDLoc SL(N);
2557      uint64_t CVal = C->getZExtValue();
2558      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
2559                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
2560                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
2561    }
2562
2563    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
2564      const APInt &Val = C->getValueAPF().bitcastToAPInt();
2565      SDLoc SL(N);
2566      uint64_t CVal = Val.getZExtValue();
2567      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
2568                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
2569                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
2570
2571      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
2572    }
2573
2574    break;
2575  }
2576  case ISD::SHL: {
2577    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
2578      break;
2579
2580    return performShlCombine(N, DCI);
2581  }
2582  case ISD::SRL: {
2583    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
2584      break;
2585
2586    return performSrlCombine(N, DCI);
2587  }
2588  case ISD::SRA: {
2589    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
2590      break;
2591
2592    return performSraCombine(N, DCI);
2593  }
2594  case ISD::AND: {
2595    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
2596      break;
2597
2598    return performAndCombine(N, DCI);
2599  }
2600  case ISD::MUL:
2601    return performMulCombine(N, DCI);
2602  case AMDGPUISD::MUL_I24:
2603  case AMDGPUISD::MUL_U24: {
2604    SDValue N0 = N->getOperand(0);
2605    SDValue N1 = N->getOperand(1);
2606    simplifyI24(N0, DCI);
2607    simplifyI24(N1, DCI);
2608    return SDValue();
2609  }
2610  case ISD::SELECT:
2611    return performSelectCombine(N, DCI);
2612  case AMDGPUISD::BFE_I32:
2613  case AMDGPUISD::BFE_U32: {
2614    assert(!N->getValueType(0).isVector() &&
2615           "Vector handling of BFE not implemented");
2616    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
2617    if (!Width)
2618      break;
2619
2620    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
2621    if (WidthVal == 0)
2622      return DAG.getConstant(0, DL, MVT::i32);
2623
2624    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
2625    if (!Offset)
2626      break;
2627
2628    SDValue BitsFrom = N->getOperand(0);
2629    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
2630
2631    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
2632
2633    if (OffsetVal == 0) {
2634      // This is already sign / zero extended, so try to fold away extra BFEs.
2635      unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
2636
2637      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
2638      if (OpSignBits >= SignBits)
2639        return BitsFrom;
2640
2641      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
2642      if (Signed) {
2643        // This is a sign_extend_inreg. Replace it to take advantage of existing
2644        // DAG Combines. If not eliminated, we will match back to BFE during
2645        // selection.
2646
2647        // TODO: The sext_inreg of extended types ends, although we can could
2648        // handle them in a single BFE.
2649        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
2650                           DAG.getValueType(SmallVT));
2651      }
2652
2653      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
2654    }
2655
2656    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
2657      if (Signed) {
2658        return constantFoldBFE<int32_t>(DAG,
2659                                        CVal->getSExtValue(),
2660                                        OffsetVal,
2661                                        WidthVal,
2662                                        DL);
2663      }
2664
2665      return constantFoldBFE<uint32_t>(DAG,
2666                                       CVal->getZExtValue(),
2667                                       OffsetVal,
2668                                       WidthVal,
2669                                       DL);
2670    }
2671
2672    if ((OffsetVal + WidthVal) >= 32) {
2673      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
2674      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
2675                         BitsFrom, ShiftVal);
2676    }
2677
2678    if (BitsFrom.hasOneUse()) {
2679      APInt Demanded = APInt::getBitsSet(32,
2680                                         OffsetVal,
2681                                         OffsetVal + WidthVal);
2682
2683      APInt KnownZero, KnownOne;
2684      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
2685                                            !DCI.isBeforeLegalizeOps());
2686      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2687      if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
2688          TLI.SimplifyDemandedBits(BitsFrom, Demanded,
2689                                   KnownZero, KnownOne, TLO)) {
2690        DCI.CommitTargetLoweringOpt(TLO);
2691      }
2692    }
2693
2694    break;
2695  }
2696  case ISD::LOAD:
2697    return performLoadCombine(N, DCI);
2698  case ISD::STORE:
2699    return performStoreCombine(N, DCI);
2700  }
2701  return SDValue();
2702}
2703
2704//===----------------------------------------------------------------------===//
2705// Helper functions
2706//===----------------------------------------------------------------------===//
2707
2708void AMDGPUTargetLowering::getOriginalFunctionArgs(
2709                               SelectionDAG &DAG,
2710                               const Function *F,
2711                               const SmallVectorImpl<ISD::InputArg> &Ins,
2712                               SmallVectorImpl<ISD::InputArg> &OrigIns) const {
2713
2714  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
2715    if (Ins[i].ArgVT == Ins[i].VT) {
2716      OrigIns.push_back(Ins[i]);
2717      continue;
2718    }
2719
2720    EVT VT;
2721    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
2722      // Vector has been split into scalars.
2723      VT = Ins[i].ArgVT.getVectorElementType();
2724    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
2725               Ins[i].ArgVT.getVectorElementType() !=
2726               Ins[i].VT.getVectorElementType()) {
2727      // Vector elements have been promoted
2728      VT = Ins[i].ArgVT;
2729    } else {
2730      // Vector has been spilt into smaller vectors.
2731      VT = Ins[i].VT;
2732    }
2733
2734    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
2735                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
2736    OrigIns.push_back(Arg);
2737  }
2738}
2739
2740SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
2741                                                  const TargetRegisterClass *RC,
2742                                                   unsigned Reg, EVT VT) const {
2743  MachineFunction &MF = DAG.getMachineFunction();
2744  MachineRegisterInfo &MRI = MF.getRegInfo();
2745  unsigned VirtualRegister;
2746  if (!MRI.isLiveIn(Reg)) {
2747    VirtualRegister = MRI.createVirtualRegister(RC);
2748    MRI.addLiveIn(Reg, VirtualRegister);
2749  } else {
2750    VirtualRegister = MRI.getLiveInVirtReg(Reg);
2751  }
2752  return DAG.getRegister(VirtualRegister, VT);
2753}
2754
2755uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
2756    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
2757  uint64_t ArgOffset = MFI->ABIArgOffset;
2758  switch (Param) {
2759  case GRID_DIM:
2760    return ArgOffset;
2761  case GRID_OFFSET:
2762    return ArgOffset + 4;
2763  }
2764  llvm_unreachable("unexpected implicit parameter type");
2765}
2766
2767#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
2768
2769const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
2770  switch ((AMDGPUISD::NodeType)Opcode) {
2771  case AMDGPUISD::FIRST_NUMBER: break;
2772  // AMDIL DAG nodes
2773  NODE_NAME_CASE(CALL);
2774  NODE_NAME_CASE(UMUL);
2775  NODE_NAME_CASE(BRANCH_COND);
2776
2777  // AMDGPU DAG nodes
2778  NODE_NAME_CASE(ENDPGM)
2779  NODE_NAME_CASE(RETURN)
2780  NODE_NAME_CASE(DWORDADDR)
2781  NODE_NAME_CASE(FRACT)
2782  NODE_NAME_CASE(CLAMP)
2783  NODE_NAME_CASE(COS_HW)
2784  NODE_NAME_CASE(SIN_HW)
2785  NODE_NAME_CASE(FMAX_LEGACY)
2786  NODE_NAME_CASE(FMIN_LEGACY)
2787  NODE_NAME_CASE(FMAX3)
2788  NODE_NAME_CASE(SMAX3)
2789  NODE_NAME_CASE(UMAX3)
2790  NODE_NAME_CASE(FMIN3)
2791  NODE_NAME_CASE(SMIN3)
2792  NODE_NAME_CASE(UMIN3)
2793  NODE_NAME_CASE(FMED3)
2794  NODE_NAME_CASE(SMED3)
2795  NODE_NAME_CASE(UMED3)
2796  NODE_NAME_CASE(URECIP)
2797  NODE_NAME_CASE(DIV_SCALE)
2798  NODE_NAME_CASE(DIV_FMAS)
2799  NODE_NAME_CASE(DIV_FIXUP)
2800  NODE_NAME_CASE(TRIG_PREOP)
2801  NODE_NAME_CASE(RCP)
2802  NODE_NAME_CASE(RSQ)
2803  NODE_NAME_CASE(RSQ_LEGACY)
2804  NODE_NAME_CASE(RSQ_CLAMP)
2805  NODE_NAME_CASE(LDEXP)
2806  NODE_NAME_CASE(FP_CLASS)
2807  NODE_NAME_CASE(DOT4)
2808  NODE_NAME_CASE(CARRY)
2809  NODE_NAME_CASE(BORROW)
2810  NODE_NAME_CASE(BFE_U32)
2811  NODE_NAME_CASE(BFE_I32)
2812  NODE_NAME_CASE(BFI)
2813  NODE_NAME_CASE(BFM)
2814  NODE_NAME_CASE(FFBH_U32)
2815  NODE_NAME_CASE(MUL_U24)
2816  NODE_NAME_CASE(MUL_I24)
2817  NODE_NAME_CASE(MAD_U24)
2818  NODE_NAME_CASE(MAD_I24)
2819  NODE_NAME_CASE(TEXTURE_FETCH)
2820  NODE_NAME_CASE(EXPORT)
2821  NODE_NAME_CASE(CONST_ADDRESS)
2822  NODE_NAME_CASE(REGISTER_LOAD)
2823  NODE_NAME_CASE(REGISTER_STORE)
2824  NODE_NAME_CASE(LOAD_INPUT)
2825  NODE_NAME_CASE(SAMPLE)
2826  NODE_NAME_CASE(SAMPLEB)
2827  NODE_NAME_CASE(SAMPLED)
2828  NODE_NAME_CASE(SAMPLEL)
2829  NODE_NAME_CASE(CVT_F32_UBYTE0)
2830  NODE_NAME_CASE(CVT_F32_UBYTE1)
2831  NODE_NAME_CASE(CVT_F32_UBYTE2)
2832  NODE_NAME_CASE(CVT_F32_UBYTE3)
2833  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
2834  NODE_NAME_CASE(CONST_DATA_PTR)
2835  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
2836  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
2837  NODE_NAME_CASE(SENDMSG)
2838  NODE_NAME_CASE(INTERP_MOV)
2839  NODE_NAME_CASE(INTERP_P1)
2840  NODE_NAME_CASE(INTERP_P2)
2841  NODE_NAME_CASE(STORE_MSKOR)
2842  NODE_NAME_CASE(LOAD_CONSTANT)
2843  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
2844  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
2845  NODE_NAME_CASE(ATOMIC_INC)
2846  NODE_NAME_CASE(ATOMIC_DEC)
2847  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
2848  }
2849  return nullptr;
2850}
2851
2852SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
2853                                               DAGCombinerInfo &DCI,
2854                                               unsigned &RefinementSteps,
2855                                               bool &UseOneConstNR) const {
2856  SelectionDAG &DAG = DCI.DAG;
2857  EVT VT = Operand.getValueType();
2858
2859  if (VT == MVT::f32) {
2860    RefinementSteps = 0;
2861    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
2862  }
2863
2864  // TODO: There is also f64 rsq instruction, but the documentation is less
2865  // clear on its precision.
2866
2867  return SDValue();
2868}
2869
2870SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
2871                                               DAGCombinerInfo &DCI,
2872                                               unsigned &RefinementSteps) const {
2873  SelectionDAG &DAG = DCI.DAG;
2874  EVT VT = Operand.getValueType();
2875
2876  if (VT == MVT::f32) {
2877    // Reciprocal, < 1 ulp error.
2878    //
2879    // This reciprocal approximation converges to < 0.5 ulp error with one
2880    // newton rhapson performed with two fused multiple adds (FMAs).
2881
2882    RefinementSteps = 0;
2883    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
2884  }
2885
2886  // TODO: There is also f64 rcp instruction, but the documentation is less
2887  // clear on its precision.
2888
2889  return SDValue();
2890}
2891
2892void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
2893  const SDValue Op,
2894  APInt &KnownZero,
2895  APInt &KnownOne,
2896  const SelectionDAG &DAG,
2897  unsigned Depth) const {
2898
2899  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
2900
2901  APInt KnownZero2;
2902  APInt KnownOne2;
2903  unsigned Opc = Op.getOpcode();
2904
2905  switch (Opc) {
2906  default:
2907    break;
2908  case AMDGPUISD::CARRY:
2909  case AMDGPUISD::BORROW: {
2910    KnownZero = APInt::getHighBitsSet(32, 31);
2911    break;
2912  }
2913
2914  case AMDGPUISD::BFE_I32:
2915  case AMDGPUISD::BFE_U32: {
2916    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
2917    if (!CWidth)
2918      return;
2919
2920    unsigned BitWidth = 32;
2921    uint32_t Width = CWidth->getZExtValue() & 0x1f;
2922
2923    if (Opc == AMDGPUISD::BFE_U32)
2924      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
2925
2926    break;
2927  }
2928  }
2929}
2930
2931unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
2932  SDValue Op,
2933  const SelectionDAG &DAG,
2934  unsigned Depth) const {
2935  switch (Op.getOpcode()) {
2936  case AMDGPUISD::BFE_I32: {
2937    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
2938    if (!Width)
2939      return 1;
2940
2941    unsigned SignBits = 32 - Width->getZExtValue() + 1;
2942    if (!isNullConstant(Op.getOperand(1)))
2943      return SignBits;
2944
2945    // TODO: Could probably figure something out with non-0 offsets.
2946    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
2947    return std::max(SignBits, Op0SignBits);
2948  }
2949
2950  case AMDGPUISD::BFE_U32: {
2951    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
2952    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
2953  }
2954
2955  case AMDGPUISD::CARRY:
2956  case AMDGPUISD::BORROW:
2957    return 31;
2958
2959  default:
2960    return 1;
2961  }
2962}
2963