//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/InitializePasses.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Dominators.h"
#endif
#include "llvm/IR/Instruction.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <new>
#include <vector>

#define DEBUG_TYPE "isel"

using namespace llvm;

namespace llvm {

class R600InstrInfo;

} // end namespace llvm

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {

static bool isNullConstantOrUndef(SDValue V) {
  if (V.isUndef())
    return true;

  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
  return Const != nullptr && Const->isNullValue();
}

static bool getConstantValue(SDValue N, uint32_t &Out) {
  // This is only used for packed vectors, where using 0 for undef should
  // always be good.
  if (N.isUndef()) {
    Out = 0;
    return true;
  }

  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
    Out = C->getAPIntValue().getSExtValue();
    return true;
  }

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
    Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }

  return false;
}

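// Fold a BUILD_VECTOR of two 16-bit constants (or undefs) into a single
// S_MOV_B32 of the packed 32-bit value, with the first element in the low
// half, e.g. (build_vector 1, 2) -> S_MOV_B32 0x00020001. If Negate is set,
// both elements are negated before packing.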
// TODO: Handle undef as zero
static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
                                 bool Negate = false) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
  uint32_t LHSVal, RHSVal;
  if (getConstantValue(N->getOperand(0), LHSVal) &&
      getConstantValue(N->getOperand(1), RHSVal)) {
    SDLoc SL(N);
    uint32_t K = Negate ?
      (-LHSVal & 0xffff) | (-RHSVal << 16) :
      (LHSVal & 0xffff) | (RHSVal << 16);
    return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
                              DAG.getTargetConstant(K, SL, MVT::i32));
  }

  return nullptr;
}

static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
  return packConstantV2I16(N, DAG, true);
}

/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
  // make the right decision when generating code for different targets.
  const GCNSubtarget *Subtarget;

  // Default FP mode for the current function.
  AMDGPU::SIModeRegisterDefaults Mode;

  bool EnableLateStructurizeCFG;

public:
  explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
                              CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
    : SelectionDAGISel(*TM, OptLevel) {
    EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
  }
  ~AMDGPUDAGToDAGISel() override = default;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AMDGPUArgumentUsageInfo>();
    AU.addRequired<LegacyDivergenceAnalysis>();
#ifdef EXPENSIVE_CHECKS
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
#endif
    SelectionDAGISel::getAnalysisUsage(AU);
  }

  bool matchLoadD16FromBuildVector(SDNode *N) const;

  bool runOnMachineFunction(MachineFunction &MF) override;
  void PreprocessISelDAG() override;
  void Select(SDNode *N) override;
  StringRef getPassName() const override;
  void PostprocessISelDAG() override;

protected:
  void SelectBuildVector(SDNode *N, unsigned RegClassID);

private:
  std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
  bool isNoNanSrc(SDValue N) const;
  bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
  bool isNegInlineImmediate(const SDNode *N) const {
    return isInlineImmediate(N, true);
  }

  bool isInlineImmediate16(int64_t Imm) const {
    return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
  }

  bool isInlineImmediate32(int64_t Imm) const {
    return AMDGPU::isInlinableLiteral32(Imm, Subtarget->hasInv2PiInlineImm());
  }

  bool isInlineImmediate64(int64_t Imm) const {
    return AMDGPU::isInlinableLiteral64(Imm, Subtarget->hasInv2PiInlineImm());
  }

  bool isInlineImmediate(const APFloat &Imm) const {
    return Subtarget->getInstrInfo()->isInlineConstant(Imm);
  }

  bool isVGPRImm(const SDNode *N) const;
  bool isUniformLoad(const SDNode *N) const;
  bool isUniformBr(const SDNode *N) const;

  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;

  SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
  SDNode *glueCopyToM0LDSInit(SDNode *N) const;

  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
  virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
  virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                       unsigned OffsetBits) const;
  bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                 SDValue &Offset1) const;
  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                   SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                   SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                         SDValue &SOffset, SDValue &Offset, SDValue &GLC,
                         SDValue &SLC, SDValue &TFE, SDValue &DLC,
                         SDValue &SWZ) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                         SDValue &SLC) const;
  bool SelectMUBUFScratchOffen(SDNode *Parent,
                               SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                               SDValue &SOffset, SDValue &ImmOffset) const;
  bool SelectMUBUFScratchOffset(SDNode *Parent,
                                SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                                SDValue &Offset) const;

  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
                         SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset, SDValue &SLC) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset) const;

  template <bool IsSigned>
  bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;
  bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
                        SDValue &Offset, SDValue &SLC) const;
  bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
                              SDValue &Offset, SDValue &SLC) const;

  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                        bool &Imm) const;
  SDValue Expand32BitAddress(SDValue Addr) const;
  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                  bool &Imm) const;
  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
  bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;

  bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                       SDValue &Clamp, SDValue &Omod) const;
  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                         SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3OMods(SDValue In, SDValue &Src,
                       SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp) const;

  bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                            SDValue &Clamp) const;
  bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
  bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

  SDValue getHi16Elt(SDValue In) const;

  SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;

  void SelectADD_SUB_I64(SDNode *N);
  void SelectAddcSubb(SDNode *N);
  void SelectUADDO_USUBO(SDNode *N);
  void SelectDIV_SCALE(SDNode *N);
  void SelectDIV_FMAS(SDNode *N);
  void SelectMAD_64_32(SDNode *N);
  void SelectFMA_W_CHAIN(SDNode *N);
  void SelectFMUL_W_CHAIN(SDNode *N);

  SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                   uint32_t Offset, uint32_t Width);
  void SelectS_BFEFromShifts(SDNode *N);
  void SelectS_BFE(SDNode *N);
  bool isCBranchSCC(const SDNode *N) const;
  void SelectBRCOND(SDNode *N);
  void SelectFMAD_FMA(SDNode *N);
  void SelectATOMIC_CMP_SWAP(SDNode *N);
  void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
  void SelectDS_GWS(SDNode *N, unsigned IntrID);
  void SelectINTRINSIC_W_CHAIN(SDNode *N);
  void SelectINTRINSIC_WO_CHAIN(SDNode *N);
  void SelectINTRINSIC_VOID(SDNode *N);

protected:
  // Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
};

class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
  const R600Subtarget *Subtarget;

  bool isConstantLoad(const MemSDNode *N, int cbID) const;
  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
                                       SDValue& Offset);
public:
  explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
      AMDGPUDAGToDAGISel(TM, OptLevel) {}

  void Select(SDNode *N) override;

  bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;
  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;

  bool runOnMachineFunction(MachineFunction &MF) override;

  void PreprocessISelDAG() override {}

protected:
  // Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};

static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);
  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure an access to just the low 16 bits of
// the same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

}  // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
                                        CodeGenOpt::Level OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}

/// This pass converts a legalized DAG into an R600-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
                                      CodeGenOpt::Level OptLevel) {
  return new R600DAGToDAGISel(TM, OptLevel);
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

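// Pre-isel DAG fixup: on subtargets where d16 loads preserve the unused half
// of the destination register, fold 16-bit element loads feeding a
// BUILD_VECTOR into d16 hi/lo loads before selection runs.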
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
  if (TM.Options.NoNaNsFPMath)
    return true;

  // TODO: Move into isKnownNeverNaN
  if (N->getFlags().isDefined())
    return N->getFlags().hasNoNaNs();

  return CurDAG->isKnownNeverNaN(N);
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
                                           bool Negated) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (Negated) {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(-C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());

  } else {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
  }

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                          unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Register::isVirtualRegister(Reg)) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.OpInfo[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                              SubRegIdx);
  }
  }
}

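// Rebuild node N in place with its chain operand replaced by NewChain and
// Glue appended as a final glued input.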
SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector <SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

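// Emit a copy of Val into M0 and glue it to N so that the M0 write stays
// immediately before the node that reads it.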
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
    *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

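// Initialize M0 as needed for a DS access: all-ones for LDS on subtargets
// that require an M0 init, and the GDS size for the region (GDS) address
// space. Other address spaces leave N unchanged.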
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

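// Materialize a 64-bit scalar immediate as two S_MOV_B32s combined into an
// SReg_64 with a REG_SEQUENCE.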
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

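// Map a vector element count to the narrowest SGPR register class that can
// hold it. Element counts with no matching class (e.g. 6 or 7) are invalid
// here.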
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
  switch (NumVectorElts) {
  case 1:
    return AMDGPU::SReg_32RegClassID;
  case 2:
    return AMDGPU::SReg_64RegClassID;
  case 3:
    return AMDGPU::SGPR_96RegClassID;
  case 4:
    return AMDGPU::SGPR_128RegClassID;
  case 5:
    return AMDGPU::SGPR_160RegClassID;
  case 8:
    return AMDGPU::SReg_256RegClassID;
  case 16:
    return AMDGPU::SReg_512RegClassID;
  case 32:
    return AMDGPU::SReg_1024RegClassID;
  }

  llvm_unreachable("invalid vector size");
}

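// Lower a BUILD_VECTOR or SCALAR_TO_VECTOR to a REG_SEQUENCE over the given
// register class. For SCALAR_TO_VECTOR, the missing trailing elements are
// filled with a shared IMPLICIT_DEF.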
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                  "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
       Opc == ISD::ATOMIC_LOAD_FADD ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom-lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
    else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
                            SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::DIV_FMAS: {
    SelectDIV_FMAS(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FMAD:
  case ISD::FMA:
    SelectFMAD_FMA(N);
    return;
  case AMDGPUISD::ATOMIC_CMP_SWAP:
    SelectATOMIC_CMP_SWAP(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  }

  SelectCode(N);
}

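// A branch is treated as uniform if an earlier IR pass marked the block
// terminator with "amdgpu.uniform" or "structurizecfg.uniform" metadata.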
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
    AMDGPU::S_MOV_B32, DL, MVT::i32,
    CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

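  // Split both operands into 32-bit halves; the 64-bit result is formed by a
  // low S_ADD_U32/S_SUB_U32 followed by a high S_ADDC_U32/S_SUBB_U32 that
  // consumes the carry.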
  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

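// Select 32-bit ADDCARRY/SUBCARRY to the VALU carry-in forms V_ADDC_U32_e64
// and V_SUBB_U32_e64, with the clamp bit cleared.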
void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
                                                 : AMDGPU::V_SUBB_U32_e64;
  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
      {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  unsigned Opc = N->getOpcode() == ISD::UADDO ?
    AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;

  CurDAG->SelectNodeTo(
      N, Opc, N->getVTList(),
      {N->getOperand(0), N->getOperand(1),
       CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}

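// Select the chained FMA pseudo to V_FMA_F32, filling in the source modifier,
// clamp and omod operands and passing the incoming chain and glue through.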
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  //    src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;

  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;

  SDValue CarryIn = N->getOperand(3);
  // V_DIV_FMAS implicitly reads VCC.
  SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
                                     TRI->getVCC(), CarryIn, SDValue());

  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);

  Ops[8] = VCC;
  Ops[9] = VCC.getValue(1);

  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

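// Check whether Offset fits in an unsigned DS offset field of OffsetBits bits
// and whether folding it is safe on subtargets that range-check the
// base + offset computation.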
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                         unsigned OffsetBits) const {
  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
      (OffsetBits == 8 && !isUInt<8>(Offset)))
    return false;

  if (Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

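// Match a DS address as (base, uimm16 offset): handles (add n0, c0),
// (sub C, x) rewritten as (add (sub 0, x), C), and bare constant addresses;
// anything else falls back to a zero offset.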
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isUInt<16>(ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_I32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isUInt<16>(CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned DWordOffset0 = C1->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    // (add n0, c0)
    if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned DWordOffset0 = C->getZExtValue() / 4;
      unsigned DWordOffset1 = DWordOffset0 + 1;

      if (isUInt<8>(DWordOffset0)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_I32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub
            = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
    unsigned DWordOffset1 = DWordOffset0 + 1;
    assert(4 * DWordOffset0 == CAddr->getZExtValue());

    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero
        = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                     SDValue &VAddr, SDValue &SOffset,
                                     SDValue &Offset, SDValue &Offen,
                                     SDValue &Idxen, SDValue &Addr64,
                                     SDValue &GLC, SDValue &SLC,
                                     SDValue &TFE, SDValue &DLC,
                                     SDValue &SWZ) const {
  // Subtarget prefers to use flat instructions.
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  if (!GLC.getNode())
    GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  if (!SLC.getNode())
    SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
  DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset, SDValue &GLC,
                                           SDValue &SLC, SDValue &TFE,
                                           SDValue &DLC, SDValue &SWZ) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // The addr64 bit was removed for Volcanic Islands.
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
              GLC, SLC, TFE, DLC, SWZ))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset,
                                           SDValue &SLC) const {
  SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
  SDValue GLC, TFE, DLC, SWZ;

  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC,
                           TFE, DLC, SWZ);
}

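// Return true if this access is known to address a stack object; such
// accesses are addressed relative to the stack pointer rather than the
// scratch wave offset.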
1472static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
1473  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
1474  return PSV && PSV->isStack();
1475}
1476
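// Fold a private address into the (VAddr, SOffset) pair used by scratch MUBUF
// accesses: a frame index becomes a TargetFrameIndex paired with the stack
// pointer SGPR, while anything else is paired with the scratch wave offset
// register.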
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));

    // If we can resolve this to a frame index access, this will be relative to
    // either the stack or frame pointer SGPR.
    return std::make_pair(
        TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
  }

  // If we don't know this private access is a local stack object, it needs to
  // be relative to the entry point's scratch wave offset register.
  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
                                               MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned Imm = CAddr->getZExtValue();

    SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
    MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                        DL, MVT::i32, HighBits);
    VAddr = SDValue(MovHighBits, 0);

    // In a call sequence, stores to the argument stack area are relative to the
    // stack pointer.
    const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
    unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

    SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
    ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
    return true;
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow.  If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // still be valid, but the range check fails first, and out-of-bounds
    // MUBUF loads return 0.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets, which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
  if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
    return false;

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
    Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();

  // FIXME: Get from MachinePointerInfo? We should only be using the frame
  // offset if we know this is in a call sequence.
  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset,
                                           SDValue &GLC, SDValue &SLC,
                                           SDValue &TFE, SDValue &DLC,
                                           SDValue &SWZ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
              GLC, SLC, TFE, DLC, SWZ))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnesValue(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset, SDValue &Offset
                                           ) const {
  SDValue GLC, SLC, TFE, DLC, SWZ;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &Soffset, SDValue &Offset,
                                           SDValue &SLC) const {
  SDValue GLC, TFE, DLC, SWZ;

  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}

// Find a load or store from the corresponding pattern root.
// Roots may be build_vector, bitconvert, or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
          dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

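// Select a FLAT address into (VAddr, Offset, SLC). When the subtarget supports
// instruction offsets, a (base + constant) address is split so that a legal
// constant lands in the immediate offset field; a constant that does not fit
// is split into a legal immediate plus a remainder added back into the base.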
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
                                          SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  int64_t OffsetVal = 0;

  if (Subtarget->hasFlatInstOffsets() &&
      (!Subtarget->hasFlatSegmentOffsetBug() ||
       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
      CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    unsigned AS = findMemSDNode(N)->getAddressSpace();
    if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
      Addr = N0;
      OffsetVal = COffsetVal;
    } else {
      // If the offset doesn't fit, put the low bits into the offset field and
      // add the rest.
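      // As a worked example (hypothetical field width): with a 12-bit
      // unsigned offset field, a requested offset of 0x1234 splits into an
      // immediate field of 0x234 and a remainder of 0x1000 added to the base.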

      SDLoc DL(N);
      uint64_t ImmField;
      const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
      if (IsSigned) {
        ImmField = SignExtend64(COffsetVal, NumBits);

        // Don't use a negative offset field if the base offset is positive.
        // Since the scheduler currently relies on the offset field, doing so
        // could result in strange scheduling decisions.

        // TODO: Should we not do this in the opposite direction as well?
        if (static_cast<int64_t>(COffsetVal) > 0) {
          if (static_cast<int64_t>(ImmField) < 0) {
            const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
            ImmField = COffsetVal & OffsetMask;
          }
        }
      } else {
        // TODO: Should we do this for a negative offset?
        const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
        ImmField = COffsetVal & OffsetMask;
      }

      uint64_t RemainderOffset = COffsetVal - ImmField;

      assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
      assert(RemainderOffset + ImmField == COffsetVal);

      OffsetVal = ImmField;

      // TODO: Should this try to use a scalar add pseudo if the base address is
      // uniform and saddr is usable?
      SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

      SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                            DL, MVT::i32, N0, Sub0);
      SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                            DL, MVT::i32, N0, Sub1);

      SDValue AddOffsetLo
        = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
      SDValue AddOffsetHi
        = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
      SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

      SDNode *Add = CurDAG->getMachineNode(
        AMDGPU::V_ADD_I32_e64, DL, VTs,
        {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

      SDNode *Addc = CurDAG->getMachineNode(
        AMDGPU::V_ADDC_U32_e64, DL, VTs,
        {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

      SDValue RegSequenceArgs[] = {
        CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
        SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
      };

      Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                            MVT::i64, RegSequenceArgs), 0);
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
  SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
                                          SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset,
                                          SDValue &SLC) const {
  return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC);
}

bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
                                                SDValue Addr,
                                                SDValue &VAddr,
                                                SDValue &Offset,
                                                SDValue &SLC) const {
  return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC);
}

bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue &Offset, bool &Imm) const {

  // FIXME: Handle non-constant offsets.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C)
    return false;

  SDLoc SL(ByteOffsetNode);
  GCNSubtarget::Generation Gen = Subtarget->getGeneration();
  int64_t ByteOffset = C->getSExtValue();
  int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);

  if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
    Imm = true;
    return true;
  }

  if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
    return false;

  if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
    // 32-bit Immediates are supported on Sea Islands.
    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
  } else {
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
                                            C32Bit), 0);
  }
  Imm = false;
  return true;
}

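// Widen a 32-bit SMRD base address to 64 bits: the function's known high
// address bits are materialized with S_MOV_B32 and combined with the low half
// via a REG_SEQUENCE. A base that is already 64-bit passes through unchanged.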
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  const SDValue Ops[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
    Addr,
    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
            0),
    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}

bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                     SDValue &Offset, bool &Imm) const {
  SDLoc SL(Addr);

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if ((Addr.getValueType() != MVT::i32 ||
       Addr->getFlags().hasNoUnsignedWrap()) &&
      CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    if (SelectSMRDOffset(N1, Offset, Imm)) {
      SBase = Expand32BitAddress(N0);
      return true;
    }
  }
  SBase = Expand32BitAddress(Addr);
  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
  Imm = true;
  return true;
}

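// Match an SMRD load whose offset folds to a legal immediate.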
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  bool Imm;
  return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {

  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
    return false;

  bool Imm;
  if (!SelectSMRD(Addr, SBase, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
                                        SDValue &Offset) const {
  bool Imm;
  return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
         !isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
                                             SDValue &Offset) const {
  bool Imm;
  return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
                                               SDValue &Offset) const {
  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
    return false;

  bool Imm;
  if (!SelectSMRDOffset(Addr, Offset, Imm))
    return false;

  return !Imm && isa<ConstantSDNode>(Offset);
}

bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
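  // For example, Offset = 8 and Width = 8 pack to 0x00080008.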
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}

void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "((a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)"
  // "((a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)"
  // Predicate: 0 < b <= c < 32
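  // For example, with b = 4 and c = 8, "(a << 4) srl 8" becomes
  // "BFE_U32 a, 4, 24", extracting bits [27:4] of a.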

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      bool Signed = N->getOpcode() == ISD::SRA;
      unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

      ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
                              32 - CVal));
      return;
    }
  }
  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  Srl.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "((a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  And.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
  assert(N->getOpcode() == ISD::BRCOND);
  if (!N->hasOneUse())
    return false;

  SDValue Cond = N->getOperand(1);
  if (Cond.getOpcode() == ISD::CopyToReg)
    Cond = Cond.getOperand(2);

  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
    return false;

  MVT VT = Cond.getOperand(0).getSimpleValueType();
  if (VT == MVT::i32)
    return true;

  if (VT == MVT::i64) {
    auto ST = static_cast<const GCNSubtarget *>(Subtarget);

    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
  }

  return false;
}

void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
  unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (!UseSCCBr) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0.  Thus we need to mask out bits for
    // disabled lanes.
    //
    // (For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND.)
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND removal,
    // so it catches both cases.
    Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
                                                         : AMDGPU::S_AND_B64,
                     SL, MVT::i1,
                     CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
                                                        : AMDGPU::EXEC,
                                         MVT::i1),
                    Cond),
                   0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}

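// Select FMAD/FMA to the mixed-precision V_MAD_MIX_F32/V_FMA_MIX_F32 forms.
// This only applies when the subtarget has the mix instruction matching the
// opcode and at least one source is converted from f16; otherwise fall back
// to normal selection.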
void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
  MVT VT = N->getSimpleValueType(0);
  bool IsFMA = N->getOpcode() == ISD::FMA;
  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
                         !Subtarget->hasFmaMixInsts()) ||
      ((IsFMA && Subtarget->hasMadMixInsts()) ||
       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
    SelectCode(N);
    return;
  }

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);
  unsigned Src0Mods, Src1Mods, Src2Mods;

  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
  // using the conversion from f16.
  bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
  bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
  bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);

  assert((IsFMA || !Mode.FP32Denormals) &&
         "fmad selected with denormals enabled");
  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).

  if (Sel0 || Sel1 || Sel2) {
    // For dummy operands.
    SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
    SDValue Ops[] = {
      CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
      CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
      CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
      CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
      Zero, Zero
    };

    CurDAG->SelectNodeTo(N,
                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
                         MVT::f32, Ops);
  } else {
    SelectCode(N);
  }
}

// This is here because there isn't a way to use the generated sub0_sub1 as the
// subreg index to EXTRACT_SUBREG in tablegen.
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
  MemSDNode *Mem = cast<MemSDNode>(N);
  unsigned AS = Mem->getAddressSpace();
  if (AS == AMDGPUAS::FLAT_ADDRESS) {
    SelectCode(N);
    return;
  }

  MVT VT = N->getSimpleValueType(0);
  bool Is32 = (VT == MVT::i32);
  SDLoc SL(N);

  MachineSDNode *CmpSwap = nullptr;
  if (Subtarget->hasAddr64()) {
    SDValue SRsrc, VAddr, SOffset, Offset, SLC;

    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
      SDValue CmpVal = Mem->getOperand(2);

      // XXX - Do we care about glue operands?

      SDValue Ops[] = {
        CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SDValue SRsrc, SOffset, Offset, SLC;
    if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;

      SDValue CmpVal = Mem->getOperand(2);
      SDValue Ops[] = {
        CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
      };

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  if (!CmpSwap) {
    SelectCode(N);
    return;
  }

  MachineMemOperand *MMO = Mem->getMemOperand();
  CurDAG->setNodeMemRefs(CmpSwap, {MMO});

  unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  SDValue Extract
    = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));

  ReplaceUses(SDValue(N, 0), Extract);
  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
  CurDAG->RemoveDeadNode(N);
}

void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !Subtarget->hasGWSSemaReleaseAll()) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have an
  // effect, so SIFixSGPRCopies can validly insert a readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  // TODO: Can this just be removed from the instruction?
  SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  SmallVector<SDValue, 5> Ops;
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(GDS);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    if (N->getValueType(0) != MVT::i32)
      break;
    SelectDSAppendConsume(N, IntrID);
    return;
  }
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  unsigned Opcode;
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
    Opcode = AMDGPU::WWM;
    break;
  default:
    SelectCode(N);
    return;
  }

  SDValue Src = N->getOperand(1);
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    SelectDS_GWS(N, IntrID);
    return;
  default:
    break;
  }

  SelectCode(N);
}

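// Peel neg/abs source modifiers off of In. Note that fneg is matched before
// fabs, so fneg(fabs(x)) folds into both NEG and ABS, while the fneg inside
// fabs(fneg(x)) is left in place.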
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods) const {
  Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(0);
  }

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                        SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  SelectVOP3Mods(In, Src, SrcMods);
  return isNoNanSrc(Src);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
                                            SDValue &SrcMods) const {
  if (In.getValueType() == MVT::f32)
    return SelectVOP3Mods(In, Src, SrcMods);
  Src = In;
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
  if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
    return false;

  Src = In;
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, SDValue &Clamp,
                                         SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
                                         SDValue &Clamp, SDValue &Omod) const {
  Src = In;

  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods = 0;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register to
      // avoid packing.

      Src = Lo;
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp and op_sel
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3PMods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods,
                                          SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSel(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
                                              SDValue &SrcMods,
                                              SDValue &Clamp) const {
  SDLoc SL(In);

  // FIXME: Handle clamp
  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);

  return SelectVOP3OpSelMods(In, Src, SrcMods);
}

// The return value is not whether the match is possible (it always is), but
// whether or not a conversion is actually used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and which half of the source
    // register is used. If the source's op_sel_hi is set, it indicates a
    // conversion from fp16. If the source's op_sel is set, it picks the high
    // half of the source register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

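// Produce a value suitable for the high 16 bits of a packed 32-bit operand:
// undef stays undef, constants are shifted into the high half, and an extract
// of a high element yields its source. Returns a null SDValue otherwise.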
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}

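// Decide whether an immediate should be materialized in a VGPR: true if some
// use requires a VGPR operand and cannot be commuted into an SGPR-accepting
// slot. The scan conservatively gives up (returning false) after ten uses.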
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
    static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo * SII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
    Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be an unknown register class
    // that needs to be an SGPR, e.g. an inline asm constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass) {
      AllUsesAcceptSReg = false;
      SDNode * User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        MCInstrDesc Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If AllUsesAcceptSReg is still false, we have not succeeded in
      // commuting the current user, which means at least one use strictly
      // requires a VGPR. Thus, we will not attempt to commute other user
      // instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}

bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto Ld = cast<LoadSDNode>(N);

  if (Ld->getAlignment() < 4 || N->isDivergent())
    return false;

  if (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
      Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  return Subtarget->getScalarizeGlobalBehavior() &&
         Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
         !Ld->isVolatile() &&
         static_cast<const SITargetLowering *>(getTargetLowering())
             ->isMemOpHasNoClobberedMemOperand(N);
}

void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering& Lowering =
    *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}

bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<R600Subtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
  if (!N->readMem())
    return false;
  if (CbId == -1)
    return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
}

bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
                                                       SDValue& IntPtr) {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
                                       true);
    return true;
  }
  return false;
}

bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
    SDValue& BaseReg, SDValue &Offset) {
  if (!isa<ConstantSDNode>(Addr)) {
    BaseReg = Addr;
    Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
    return true;
  }
  return false;
}

void R600DAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  switch (Opc) {
  default: break;
  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    unsigned RegClassID;
    // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREGs, which
    // adds a 128-bit register copy when going through the
    // TwoAddressInstruction pass. We want to avoid 128-bit copies as much as
    // possible because they can't be bundled by our scheduler.
    switch(NumVectorElts) {
    case 2: RegClassID = R600::R600_Reg64RegClassID; break;
    case 4:
      if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
        RegClassID = R600::R600_Reg128VerticalRegClassID;
      else
        RegClassID = R600::R600_Reg128RegClassID;
      break;
    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
    }
    SelectBuildVector(N, RegClassID);
    return;
  }
  }

  SelectCode(N);
}

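// Match an R600 indirect address as (Base, Offset): a bare constant or a
// DWORDADDR of one is based at INDIRECT_BASE_ADDR, an (add/or base, constant)
// splits into its two parts, and anything else uses a zero offset.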
bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
  ConstantSDNode *IMMOffset;

  if (Addr.getOpcode() == ISD::ADD
      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
      && isInt<16>(IMMOffset->getZExtValue())) {

      Base = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                         MVT::i32);
      return true;
  // If the pointer address is constant, we can move it to the offset field.
  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
             && isInt<16>(IMMOffset->getZExtValue())) {
    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
                                  SDLoc(CurDAG->getEntryNode()),
                                  R600::ZERO, MVT::i32);
    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
                                       MVT::i32);
    return true;
  }

  // Default case, no offset
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
  return true;
}