SIInstrInfo.cpp revision 321369
1//===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief SI Implementation of TargetInstrInfo.
12//
13//===----------------------------------------------------------------------===//
14
15#include "SIInstrInfo.h"
16#include "AMDGPUTargetMachine.h"
17#include "GCNHazardRecognizer.h"
18#include "SIDefines.h"
19#include "SIMachineFunctionInfo.h"
20#include "llvm/CodeGen/MachineFrameInfo.h"
21#include "llvm/CodeGen/MachineInstrBuilder.h"
22#include "llvm/CodeGen/MachineRegisterInfo.h"
23#include "llvm/CodeGen/RegisterScavenging.h"
24#include "llvm/CodeGen/ScheduleDAG.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/Function.h"
27#include "llvm/MC/MCInstrDesc.h"
28#include "llvm/Support/Debug.h"
29
30using namespace llvm;
31
32// Must be at least 4 to be able to branch over minimum unconditional branch
33// code. This is only for making it possible to write reasonably small tests for
34// long branches.
35static cl::opt<unsigned>
36BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
37                 cl::desc("Restrict range of branch instructions (DEBUG)"));
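// As an illustration only: branch-relaxation tests can pass a small value such
// as -amdgpu-s-branch-bits=4 to llc so that even short branches fall outside
// the pretend-encodable range and get expanded; the default of 16 matches the
// hardware's signed 16-bit branch immediate.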
38
39SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
40  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
41
42//===----------------------------------------------------------------------===//
43// TargetInstrInfo callbacks
44//===----------------------------------------------------------------------===//
45
46static unsigned getNumOperandsNoGlue(SDNode *Node) {
47  unsigned N = Node->getNumOperands();
48  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
49    --N;
50  return N;
51}
52
53static SDValue findChainOperand(SDNode *Load) {
54  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
55  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
56  return LastOp;
57}
58
59/// \brief Returns true if both nodes have the same value for the given
60///        operand \p OpName, or if both nodes do not have this operand.
61static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
62  unsigned Opc0 = N0->getMachineOpcode();
63  unsigned Opc1 = N1->getMachineOpcode();
64
65  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
66  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
67
68  if (Op0Idx == -1 && Op1Idx == -1)
69    return true;
70
71
72  if ((Op0Idx == -1 && Op1Idx != -1) ||
73      (Op1Idx == -1 && Op0Idx != -1))
74    return false;
75
76  // getNamedOperandIdx returns the index for the MachineInstr's operands,
77  // which includes the result as the first operand. We are indexing into the
78  // MachineSDNode's operands, so we need to skip the result operand to get
79  // the real index.
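  // Illustrative example: if getNamedOperandIdx returns 2 for some operand,
  // the corresponding input of the MachineSDNode is at index 1 once the
  // result operand has been skipped.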
80  --Op0Idx;
81  --Op1Idx;
82
83  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
84}
85
86bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
87                                                    AliasAnalysis *AA) const {
88  // TODO: The generic check fails for VALU instructions that should be
89  // rematerializable due to implicit reads of exec. We really want all of the
90  // generic logic for this except for this.
91  switch (MI.getOpcode()) {
92  case AMDGPU::V_MOV_B32_e32:
93  case AMDGPU::V_MOV_B32_e64:
94  case AMDGPU::V_MOV_B64_PSEUDO:
95    return true;
96  default:
97    return false;
98  }
99}
100
101bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
102                                          int64_t &Offset0,
103                                          int64_t &Offset1) const {
104  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
105    return false;
106
107  unsigned Opc0 = Load0->getMachineOpcode();
108  unsigned Opc1 = Load1->getMachineOpcode();
109
110  // Make sure both are actually loads.
111  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
112    return false;
113
114  if (isDS(Opc0) && isDS(Opc1)) {
115
116    // FIXME: Handle this case:
117    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
118      return false;
119
120    // Check base reg.
121    if (Load0->getOperand(1) != Load1->getOperand(1))
122      return false;
123
124    // Check chain.
125    if (findChainOperand(Load0) != findChainOperand(Load1))
126      return false;
127
128    // Skip read2 / write2 variants for simplicity.
129    // TODO: We should report true if the used offsets are adjacent (excluding
130    // the st64 versions).
131    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
132        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
133      return false;
134
135    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
136    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
137    return true;
138  }
139
140  if (isSMRD(Opc0) && isSMRD(Opc1)) {
141    // Skip time and cache invalidation instructions.
142    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
143        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
144      return false;
145
146    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
147
148    // Check base reg.
149    if (Load0->getOperand(0) != Load1->getOperand(0))
150      return false;
151
152    const ConstantSDNode *Load0Offset =
153        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
154    const ConstantSDNode *Load1Offset =
155        dyn_cast<ConstantSDNode>(Load1->getOperand(1));
156
157    if (!Load0Offset || !Load1Offset)
158      return false;
159
160    // Check chain.
161    if (findChainOperand(Load0) != findChainOperand(Load1))
162      return false;
163
164    Offset0 = Load0Offset->getZExtValue();
165    Offset1 = Load1Offset->getZExtValue();
166    return true;
167  }
168
169  // MUBUF and MTBUF can access the same addresses.
170  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
171
172    // MUBUF and MTBUF have vaddr at different indices.
173    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
174        findChainOperand(Load0) != findChainOperand(Load1) ||
175        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
176        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
177      return false;
178
179    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
180    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
181
182    if (OffIdx0 == -1 || OffIdx1 == -1)
183      return false;
184
185    // getNamedOperandIdx returns the index for MachineInstrs. Since they
186    // include the output in the operand list, but SDNodes don't, we need to
187    // subtract one from the index.
188    --OffIdx0;
189    --OffIdx1;
190
191    SDValue Off0 = Load0->getOperand(OffIdx0);
192    SDValue Off1 = Load1->getOperand(OffIdx1);
193
194    // The offset might be a FrameIndexSDNode.
195    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
196      return false;
197
198    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
199    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
200    return true;
201  }
202
203  return false;
204}
205
206static bool isStride64(unsigned Opc) {
207  switch (Opc) {
208  case AMDGPU::DS_READ2ST64_B32:
209  case AMDGPU::DS_READ2ST64_B64:
210  case AMDGPU::DS_WRITE2ST64_B32:
211  case AMDGPU::DS_WRITE2ST64_B64:
212    return true;
213  default:
214    return false;
215  }
216}
217
218bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
219                                        int64_t &Offset,
220                                        const TargetRegisterInfo *TRI) const {
221  unsigned Opc = LdSt.getOpcode();
222
223  if (isDS(LdSt)) {
224    const MachineOperand *OffsetImm =
225        getNamedOperand(LdSt, AMDGPU::OpName::offset);
226    if (OffsetImm) {
227      // Normal, single offset LDS instruction.
228      const MachineOperand *AddrReg =
229          getNamedOperand(LdSt, AMDGPU::OpName::addr);
230
231      BaseReg = AddrReg->getReg();
232      Offset = OffsetImm->getImm();
233      return true;
234    }
235
236    // The two-offset instructions use offset0 and offset1 instead. We can treat
237    // these as a load with a single offset if the two offsets are consecutive.
238    // We will use this for some partially aligned loads.
239    const MachineOperand *Offset0Imm =
240        getNamedOperand(LdSt, AMDGPU::OpName::offset0);
241    const MachineOperand *Offset1Imm =
242        getNamedOperand(LdSt, AMDGPU::OpName::offset1);
243
244    uint8_t Offset0 = Offset0Imm->getImm();
245    uint8_t Offset1 = Offset1Imm->getImm();
246
247    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
248      // Each of these offsets is in element-sized units, so we need to convert
249      // them to the byte offsets of the individual accesses.
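      // Rough example: for a ds_read2_b32 with offset0 = 2 and offset1 = 3,
      // EltSize is 4, so the pair is reported as one access at byte offset 8
      // from the base; the st64 variants scale the element size by another 64.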
250
251      unsigned EltSize;
252      if (LdSt.mayLoad())
253        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
254      else {
255        assert(LdSt.mayStore());
256        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
257        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
258      }
259
260      if (isStride64(Opc))
261        EltSize *= 64;
262
263      const MachineOperand *AddrReg =
264          getNamedOperand(LdSt, AMDGPU::OpName::addr);
265      BaseReg = AddrReg->getReg();
266      Offset = EltSize * Offset0;
267      return true;
268    }
269
270    return false;
271  }
272
273  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
274    const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
275    if (SOffset && SOffset->isReg())
276      return false;
277
278    const MachineOperand *AddrReg =
279        getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
280    if (!AddrReg)
281      return false;
282
283    const MachineOperand *OffsetImm =
284        getNamedOperand(LdSt, AMDGPU::OpName::offset);
285    BaseReg = AddrReg->getReg();
286    Offset = OffsetImm->getImm();
287
288    if (SOffset) // soffset can be an inline immediate.
289      Offset += SOffset->getImm();
290
291    return true;
292  }
293
294  if (isSMRD(LdSt)) {
295    const MachineOperand *OffsetImm =
296        getNamedOperand(LdSt, AMDGPU::OpName::offset);
297    if (!OffsetImm)
298      return false;
299
300    const MachineOperand *SBaseReg =
301        getNamedOperand(LdSt, AMDGPU::OpName::sbase);
302    BaseReg = SBaseReg->getReg();
303    Offset = OffsetImm->getImm();
304    return true;
305  }
306
307  if (isFLAT(LdSt)) {
308    const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
309    BaseReg = AddrReg->getReg();
310    Offset = 0;
311    return true;
312  }
313
314  return false;
315}
316
317bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
318                                      MachineInstr &SecondLdSt,
319                                      unsigned NumLoads) const {
320  const MachineOperand *FirstDst = nullptr;
321  const MachineOperand *SecondDst = nullptr;
322
323  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
324      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
325      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
326    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
327    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
328  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
329    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
330    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
331  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
332    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
333    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
334  }
335
336  if (!FirstDst || !SecondDst)
337    return false;
338
339  // Try to limit clustering based on the total number of bytes loaded
340  // rather than the number of instructions.  This is done to help reduce
341  // register pressure.  The method used is somewhat inexact, though,
342  // because it assumes that all loads in the cluster will load the
343  // same number of bytes as FirstLdSt.
344
345  // The unit of this value is bytes.
346  // FIXME: This needs finer tuning.
347  unsigned LoadClusterThreshold = 16;
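  // With the 16-byte threshold above, for example, two 64-bit loads
  // (2 * 8 bytes) may still be clustered, while two 128-bit loads
  // (2 * 16 bytes) will not be.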
348
349  const MachineRegisterInfo &MRI =
350      FirstLdSt.getParent()->getParent()->getRegInfo();
351  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
352
353  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
354}
355
356static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
357                              MachineBasicBlock::iterator MI,
358                              const DebugLoc &DL, unsigned DestReg,
359                              unsigned SrcReg, bool KillSrc) {
360  MachineFunction *MF = MBB.getParent();
361  DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(),
362                                        "illegal SGPR to VGPR copy",
363                                        DL, DS_Error);
364  LLVMContext &C = MF->getFunction()->getContext();
365  C.diagnose(IllegalCopy);
366
367  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
368    .addReg(SrcReg, getKillRegState(KillSrc));
369}
370
371void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
372                              MachineBasicBlock::iterator MI,
373                              const DebugLoc &DL, unsigned DestReg,
374                              unsigned SrcReg, bool KillSrc) const {
375  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
376
377  if (RC == &AMDGPU::VGPR_32RegClass) {
378    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
379           AMDGPU::SReg_32RegClass.contains(SrcReg));
380    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
381      .addReg(SrcReg, getKillRegState(KillSrc));
382    return;
383  }
384
385  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
386      RC == &AMDGPU::SReg_32RegClass) {
387    if (SrcReg == AMDGPU::SCC) {
388      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
389          .addImm(-1)
390          .addImm(0);
391      return;
392    }
393
394    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
395      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
396      return;
397    }
398
399    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
400            .addReg(SrcReg, getKillRegState(KillSrc));
401    return;
402  }
403
404  if (RC == &AMDGPU::SReg_64RegClass) {
405    if (DestReg == AMDGPU::VCC) {
406      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
407        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
408          .addReg(SrcReg, getKillRegState(KillSrc));
409      } else {
410        // FIXME: Hack until VReg_1 removed.
411        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
412        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
413          .addImm(0)
414          .addReg(SrcReg, getKillRegState(KillSrc));
415      }
416
417      return;
418    }
419
420    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
421      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
422      return;
423    }
424
425    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
426            .addReg(SrcReg, getKillRegState(KillSrc));
427    return;
428  }
429
430  if (DestReg == AMDGPU::SCC) {
431    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
432    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
433      .addReg(SrcReg, getKillRegState(KillSrc))
434      .addImm(0);
435    return;
436  }
437
438  unsigned EltSize = 4;
439  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
440  if (RI.isSGPRClass(RC)) {
441    if (RI.getRegSizeInBits(*RC) > 32) {
442      Opcode = AMDGPU::S_MOV_B64;
443      EltSize = 8;
444    } else {
445      Opcode = AMDGPU::S_MOV_B32;
446      EltSize = 4;
447    }
448
449    if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
450      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
451      return;
452    }
453  }
454
455
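  // Fall back to splitting the copy into per-subregister moves. As a rough
  // illustration, a 128-bit SGPR-to-SGPR copy becomes two s_mov_b64 of the
  // 64-bit halves, while a 128-bit VGPR copy becomes four v_mov_b32.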
456  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
457  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
458
459  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
460    unsigned SubIdx;
461    if (Forward)
462      SubIdx = SubIndices[Idx];
463    else
464      SubIdx = SubIndices[SubIndices.size() - Idx - 1];
465
466    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
467      get(Opcode), RI.getSubReg(DestReg, SubIdx));
468
469    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
470
471    if (Idx == 0)
472      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
473
474    bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
475    Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
476  }
477}
478
479int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
480  int NewOpc;
481
482  // Try to map original to commuted opcode
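  // (For example, sub-style opcodes map to their "rev" counterparts, e.g.
  // V_SUB_F32 <-> V_SUBREV_F32; the exact pairs come from the generated
  // commute tables.)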
483  NewOpc = AMDGPU::getCommuteRev(Opcode);
484  if (NewOpc != -1)
485    // Check if the commuted (REV) opcode exists on the target.
486    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
487
488  // Try to map commuted to original opcode
489  NewOpc = AMDGPU::getCommuteOrig(Opcode);
490  if (NewOpc != -1)
491    // Check if the original (non-REV) opcode exists on the target.
492    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
493
494  return Opcode;
495}
496
497void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
498                                       MachineBasicBlock::iterator MI,
499                                       const DebugLoc &DL, unsigned DestReg,
500                                       int64_t Value) const {
501  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
502  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
503  if (RegClass == &AMDGPU::SReg_32RegClass ||
504      RegClass == &AMDGPU::SGPR_32RegClass ||
505      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
506      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
507    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
508      .addImm(Value);
509    return;
510  }
511
512  if (RegClass == &AMDGPU::SReg_64RegClass ||
513      RegClass == &AMDGPU::SGPR_64RegClass ||
514      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
515    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
516      .addImm(Value);
517    return;
518  }
519
520  if (RegClass == &AMDGPU::VGPR_32RegClass) {
521    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
522      .addImm(Value);
523    return;
524  }
525  if (RegClass == &AMDGPU::VReg_64RegClass) {
526    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
527      .addImm(Value);
528    return;
529  }
530
531  unsigned EltSize = 4;
532  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
533  if (RI.isSGPRClass(RegClass)) {
534    if (RI.getRegSizeInBits(*RegClass) > 32) {
535      Opcode = AMDGPU::S_MOV_B64;
536      EltSize = 8;
537    } else {
538      Opcode = AMDGPU::S_MOV_B32;
539      EltSize = 4;
540    }
541  }
542
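  // Otherwise split the destination into EltSize pieces; only the lowest
  // piece receives Value, the remaining pieces are zeroed.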
543  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
544  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
545    int64_t IdxValue = Idx == 0 ? Value : 0;
546
547    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
548      get(Opcode), RI.getSubReg(DestReg, Idx));
549    Builder.addImm(IdxValue);
550  }
551}
552
553const TargetRegisterClass *
554SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
555  return &AMDGPU::VGPR_32RegClass;
556}
557
558void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
559                                     MachineBasicBlock::iterator I,
560                                     const DebugLoc &DL, unsigned DstReg,
561                                     ArrayRef<MachineOperand> Cond,
562                                     unsigned TrueReg,
563                                     unsigned FalseReg) const {
564  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
565  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
566         "Not a VGPR32 reg");
567
568  if (Cond.size() == 1) {
569    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
570      .addReg(FalseReg)
571      .addReg(TrueReg)
572      .add(Cond[0]);
573  } else if (Cond.size() == 2) {
574    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
575    switch (Cond[0].getImm()) {
576    case SIInstrInfo::SCC_TRUE: {
577      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
578      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
579        .addImm(-1)
580        .addImm(0);
581      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
582        .addReg(FalseReg)
583        .addReg(TrueReg)
584        .addReg(SReg);
585      break;
586    }
587    case SIInstrInfo::SCC_FALSE: {
588      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
589      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
590        .addImm(0)
591        .addImm(-1);
592      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
593        .addReg(FalseReg)
594        .addReg(TrueReg)
595        .addReg(SReg);
596      break;
597    }
598    case SIInstrInfo::VCCNZ: {
599      MachineOperand RegOp = Cond[1];
600      RegOp.setImplicit(false);
601      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
602          .addReg(FalseReg)
603          .addReg(TrueReg)
604          .add(RegOp);
605      break;
606    }
607    case SIInstrInfo::VCCZ: {
608      MachineOperand RegOp = Cond[1];
609      RegOp.setImplicit(false);
610      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
611          .addReg(TrueReg)
612          .addReg(FalseReg)
613          .add(RegOp);
614      break;
615    }
616    case SIInstrInfo::EXECNZ: {
617      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
618      unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
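      // S_OR_SAVEEXEC_B64 with a zero source is used here to set SCC from
      // whether EXEC is non-zero without changing EXEC; the saved copy in
      // SReg2 is otherwise unused.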
619      BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
620        .addImm(0);
621      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
622        .addImm(-1)
623        .addImm(0);
624      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
625        .addReg(FalseReg)
626        .addReg(TrueReg)
627        .addReg(SReg);
628      break;
629    }
630    case SIInstrInfo::EXECZ: {
631      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
632      unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
633      BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
634        .addImm(0);
635      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
636        .addImm(0)
637        .addImm(-1);
638      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
639        .addReg(FalseReg)
640        .addReg(TrueReg)
641        .addReg(SReg);
642      llvm_unreachable("Unhandled branch predicate EXECZ");
643      break;
644    }
645    default:
646      llvm_unreachable("invalid branch predicate");
647    }
648  } else {
649    llvm_unreachable("Can only handle Cond size 1 or 2");
650  }
651}
652
653unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
654                               MachineBasicBlock::iterator I,
655                               const DebugLoc &DL,
656                               unsigned SrcReg, int Value) const {
657  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
658  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
659  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
660    .addImm(Value)
661    .addReg(SrcReg);
662
663  return Reg;
664}
665
666unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
667                               MachineBasicBlock::iterator I,
668                               const DebugLoc &DL,
669                               unsigned SrcReg, int Value) const {
670  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
671  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
672  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
673    .addImm(Value)
674    .addReg(SrcReg);
675
676  return Reg;
677}
678
679unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
680
681  if (RI.getRegSizeInBits(*DstRC) == 32) {
682    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
683  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
684    return AMDGPU::S_MOV_B64;
685  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
686    return AMDGPU::V_MOV_B64_PSEUDO;
687  }
688  return AMDGPU::COPY;
689}
690
691static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
692  switch (Size) {
693  case 4:
694    return AMDGPU::SI_SPILL_S32_SAVE;
695  case 8:
696    return AMDGPU::SI_SPILL_S64_SAVE;
697  case 16:
698    return AMDGPU::SI_SPILL_S128_SAVE;
699  case 32:
700    return AMDGPU::SI_SPILL_S256_SAVE;
701  case 64:
702    return AMDGPU::SI_SPILL_S512_SAVE;
703  default:
704    llvm_unreachable("unknown register size");
705  }
706}
707
708static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
709  switch (Size) {
710  case 4:
711    return AMDGPU::SI_SPILL_V32_SAVE;
712  case 8:
713    return AMDGPU::SI_SPILL_V64_SAVE;
714  case 12:
715    return AMDGPU::SI_SPILL_V96_SAVE;
716  case 16:
717    return AMDGPU::SI_SPILL_V128_SAVE;
718  case 32:
719    return AMDGPU::SI_SPILL_V256_SAVE;
720  case 64:
721    return AMDGPU::SI_SPILL_V512_SAVE;
722  default:
723    llvm_unreachable("unknown register size");
724  }
725}
726
727void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
728                                      MachineBasicBlock::iterator MI,
729                                      unsigned SrcReg, bool isKill,
730                                      int FrameIndex,
731                                      const TargetRegisterClass *RC,
732                                      const TargetRegisterInfo *TRI) const {
733  MachineFunction *MF = MBB.getParent();
734  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
735  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
736  DebugLoc DL = MBB.findDebugLoc(MI);
737
738  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
739  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
740  MachinePointerInfo PtrInfo
741    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
742  MachineMemOperand *MMO
743    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
744                               Size, Align);
745  unsigned SpillSize = TRI->getSpillSize(*RC);
746
747  if (RI.isSGPRClass(RC)) {
748    MFI->setHasSpilledSGPRs();
749
750    // We are only allowed to create one new instruction when spilling
751    // registers, so we need to use a pseudo instruction for spilling SGPRs.
752    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
753
754    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
755    // need to make sure we are using the correct register class.
756    if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
757      MachineRegisterInfo &MRI = MF->getRegInfo();
758      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
759    }
760
761    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
762      .addReg(SrcReg, getKillRegState(isKill)) // data
763      .addFrameIndex(FrameIndex)               // addr
764      .addMemOperand(MMO)
765      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
766      .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
767    // Add the scratch resource registers as implicit uses because we may end up
768    // needing them, and need to ensure that the reserved registers are
769    // correctly handled.
770
771    if (ST.hasScalarStores()) {
772      // m0 is used for offset to scalar stores if used to spill.
773      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
774    }
775
776    return;
777  }
778
779  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
780    LLVMContext &Ctx = MF->getFunction()->getContext();
781    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
782                  " spill register");
783    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
784      .addReg(SrcReg);
785
786    return;
787  }
788
789  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
790
791  unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
792  MFI->setHasSpilledVGPRs();
793  BuildMI(MBB, MI, DL, get(Opcode))
794    .addReg(SrcReg, getKillRegState(isKill)) // data
795    .addFrameIndex(FrameIndex)               // addr
796    .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
797    .addReg(MFI->getFrameOffsetReg())        // scratch_offset
798    .addImm(0)                               // offset
799    .addMemOperand(MMO);
800}
801
802static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
803  switch (Size) {
804  case 4:
805    return AMDGPU::SI_SPILL_S32_RESTORE;
806  case 8:
807    return AMDGPU::SI_SPILL_S64_RESTORE;
808  case 16:
809    return AMDGPU::SI_SPILL_S128_RESTORE;
810  case 32:
811    return AMDGPU::SI_SPILL_S256_RESTORE;
812  case 64:
813    return AMDGPU::SI_SPILL_S512_RESTORE;
814  default:
815    llvm_unreachable("unknown register size");
816  }
817}
818
819static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
820  switch (Size) {
821  case 4:
822    return AMDGPU::SI_SPILL_V32_RESTORE;
823  case 8:
824    return AMDGPU::SI_SPILL_V64_RESTORE;
825  case 12:
826    return AMDGPU::SI_SPILL_V96_RESTORE;
827  case 16:
828    return AMDGPU::SI_SPILL_V128_RESTORE;
829  case 32:
830    return AMDGPU::SI_SPILL_V256_RESTORE;
831  case 64:
832    return AMDGPU::SI_SPILL_V512_RESTORE;
833  default:
834    llvm_unreachable("unknown register size");
835  }
836}
837
838void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
839                                       MachineBasicBlock::iterator MI,
840                                       unsigned DestReg, int FrameIndex,
841                                       const TargetRegisterClass *RC,
842                                       const TargetRegisterInfo *TRI) const {
843  MachineFunction *MF = MBB.getParent();
844  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
845  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
846  DebugLoc DL = MBB.findDebugLoc(MI);
847  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
848  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
849  unsigned SpillSize = TRI->getSpillSize(*RC);
850
851  MachinePointerInfo PtrInfo
852    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
853
854  MachineMemOperand *MMO = MF->getMachineMemOperand(
855    PtrInfo, MachineMemOperand::MOLoad, Size, Align);
856
857  if (RI.isSGPRClass(RC)) {
858    // FIXME: Maybe this should not include a memoperand because it will be
859    // lowered to non-memory instructions.
860    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
861    if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
862      MachineRegisterInfo &MRI = MF->getRegInfo();
863      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
864    }
865
866    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
867      .addFrameIndex(FrameIndex) // addr
868      .addMemOperand(MMO)
869      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
870      .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
871
872    if (ST.hasScalarStores()) {
873      // m0 is used for offset to scalar stores if used to spill.
874      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
875    }
876
877    return;
878  }
879
880  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
881    LLVMContext &Ctx = MF->getFunction()->getContext();
882    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
883                  " restore register");
884    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
885
886    return;
887  }
888
889  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
890
891  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
892  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
893    .addFrameIndex(FrameIndex)        // vaddr
894    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
895    .addReg(MFI->getFrameOffsetReg()) // scratch_offset
896    .addImm(0)                        // offset
897    .addMemOperand(MMO);
898}
899
900/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
901unsigned SIInstrInfo::calculateLDSSpillAddress(
902    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
903    unsigned FrameOffset, unsigned Size) const {
904  MachineFunction *MF = MBB.getParent();
905  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
906  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
907  const SIRegisterInfo *TRI = ST.getRegisterInfo();
908  DebugLoc DL = MBB.findDebugLoc(MI);
909  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
910  unsigned WavefrontSize = ST.getWavefrontSize();
911
912  unsigned TIDReg = MFI->getTIDReg();
913  if (!MFI->hasCalculatedTID()) {
914    MachineBasicBlock &Entry = MBB.getParent()->front();
915    MachineBasicBlock::iterator Insert = Entry.front();
916    DebugLoc DL = Insert->getDebugLoc();
917
918    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
919                                   *MF);
920    if (TIDReg == AMDGPU::NoRegister)
921      return TIDReg;
922
923    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
924        WorkGroupSize > WavefrontSize) {
925
926      unsigned TIDIGXReg
927        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
928      unsigned TIDIGYReg
929        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
930      unsigned TIDIGZReg
931        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
932      unsigned InputPtrReg =
933          TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
934      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
935        if (!Entry.isLiveIn(Reg))
936          Entry.addLiveIn(Reg);
937      }
938
939      RS->enterBasicBlock(Entry);
940      // FIXME: Can we scavenge an SReg_64 and access the subregs?
941      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
942      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
943      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
944              .addReg(InputPtrReg)
945              .addImm(SI::KernelInputOffsets::NGROUPS_Z);
946      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
947              .addReg(InputPtrReg)
948              .addImm(SI::KernelInputOffsets::NGROUPS_Y);
949
950      // NGROUPS.X * NGROUPS.Y
951      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
952              .addReg(STmp1)
953              .addReg(STmp0);
954      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
955      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
956              .addReg(STmp1)
957              .addReg(TIDIGXReg);
958      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
959      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
960              .addReg(STmp0)
961              .addReg(TIDIGYReg)
962              .addReg(TIDReg);
963      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
964      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
965              .addReg(TIDReg)
966              .addReg(TIDIGZReg);
967    } else {
968      // Get the wave id
969      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
970              TIDReg)
971              .addImm(-1)
972              .addImm(0);
973
974      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
975              TIDReg)
976              .addImm(-1)
977              .addReg(TIDReg);
978    }
979
980    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
981            TIDReg)
982            .addImm(2)
983            .addReg(TIDReg);
984    MFI->setTIDReg(TIDReg);
985  }
986
987  // Add FrameIndex to LDS offset
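  // The resulting per-lane address is roughly
  //   LDSSize + FrameOffset * WorkGroupSize + tid * 4,
  // since TIDReg was shifted left by 2 above.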
988  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
989  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
990          .addImm(LDSOffset)
991          .addReg(TIDReg);
992
993  return TmpReg;
994}
995
996void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
997                                   MachineBasicBlock::iterator MI,
998                                   int Count) const {
999  DebugLoc DL = MBB.findDebugLoc(MI);
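  // s_nop N waits for N + 1 wait states, so e.g. Count == 10 is emitted as
  // s_nop 7 followed by s_nop 1.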
1000  while (Count > 0) {
1001    int Arg;
1002    if (Count >= 8)
1003      Arg = 7;
1004    else
1005      Arg = Count - 1;
1006    Count -= 8;
1007    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1008            .addImm(Arg);
1009  }
1010}
1011
1012void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1013                             MachineBasicBlock::iterator MI) const {
1014  insertWaitStates(MBB, MI, 1);
1015}
1016
1017void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1018  auto MF = MBB.getParent();
1019  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1020
1021  assert(Info->isEntryFunction());
1022
1023  if (MBB.succ_empty()) {
1024    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1025    if (HasNoTerminator)
1026      BuildMI(MBB, MBB.end(), DebugLoc(),
1027              get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
1028  }
1029}
1030
1031unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
1032  switch (MI.getOpcode()) {
1033  default: return 1; // FIXME: Do wait states equal cycles?
1034
1035  case AMDGPU::S_NOP:
1036    return MI.getOperand(0).getImm() + 1;
1037  }
1038}
1039
1040bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1041  MachineBasicBlock &MBB = *MI.getParent();
1042  DebugLoc DL = MBB.findDebugLoc(MI);
1043  switch (MI.getOpcode()) {
1044  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
1045  case AMDGPU::S_MOV_B64_term: {
1046    // This is only a terminator to get the correct spill code placement during
1047    // register allocation.
1048    MI.setDesc(get(AMDGPU::S_MOV_B64));
1049    break;
1050  }
1051  case AMDGPU::S_XOR_B64_term: {
1052    // This is only a terminator to get the correct spill code placement during
1053    // register allocation.
1054    MI.setDesc(get(AMDGPU::S_XOR_B64));
1055    break;
1056  }
1057  case AMDGPU::S_ANDN2_B64_term: {
1058    // This is only a terminator to get the correct spill code placement during
1059    // register allocation.
1060    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1061    break;
1062  }
1063  case AMDGPU::V_MOV_B64_PSEUDO: {
1064    unsigned Dst = MI.getOperand(0).getReg();
1065    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1066    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1067
1068    const MachineOperand &SrcOp = MI.getOperand(1);
1069    // FIXME: Will this work for 64-bit floating point immediates?
1070    assert(!SrcOp.isFPImm());
1071    if (SrcOp.isImm()) {
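      // Split the 64-bit immediate into two 32-bit moves; e.g. the bit pattern
      // 0x3FF0000000000000 (double 1.0) becomes a low move of 0x00000000 and a
      // high move of 0x3FF00000.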
1072      APInt Imm(64, SrcOp.getImm());
1073      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1074        .addImm(Imm.getLoBits(32).getZExtValue())
1075        .addReg(Dst, RegState::Implicit | RegState::Define);
1076      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1077        .addImm(Imm.getHiBits(32).getZExtValue())
1078        .addReg(Dst, RegState::Implicit | RegState::Define);
1079    } else {
1080      assert(SrcOp.isReg());
1081      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1082        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1083        .addReg(Dst, RegState::Implicit | RegState::Define);
1084      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1085        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1086        .addReg(Dst, RegState::Implicit | RegState::Define);
1087    }
1088    MI.eraseFromParent();
1089    break;
1090  }
1091  case AMDGPU::V_MOVRELD_B32_V1:
1092  case AMDGPU::V_MOVRELD_B32_V2:
1093  case AMDGPU::V_MOVRELD_B32_V4:
1094  case AMDGPU::V_MOVRELD_B32_V8:
1095  case AMDGPU::V_MOVRELD_B32_V16: {
1096    const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1097    unsigned VecReg = MI.getOperand(0).getReg();
1098    bool IsUndef = MI.getOperand(1).isUndef();
1099    unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1100    assert(VecReg == MI.getOperand(1).getReg());
1101
1102    MachineInstr *MovRel =
1103        BuildMI(MBB, MI, DL, MovRelDesc)
1104            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1105            .add(MI.getOperand(2))
1106            .addReg(VecReg, RegState::ImplicitDefine)
1107            .addReg(VecReg,
1108                    RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1109
1110    const int ImpDefIdx =
1111        MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1112    const int ImpUseIdx = ImpDefIdx + 1;
1113    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1114
1115    MI.eraseFromParent();
1116    break;
1117  }
1118  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1119    MachineFunction &MF = *MBB.getParent();
1120    unsigned Reg = MI.getOperand(0).getReg();
1121    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1122    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1123
1124    // Create a bundle so these instructions won't be re-ordered by the
1125    // post-RA scheduler.
1126    MIBundleBuilder Bundler(MBB, MI);
1127    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1128
1129    // Add 32-bit offset from this instruction to the start of the
1130    // constant data.
1131    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1132                       .addReg(RegLo)
1133                       .add(MI.getOperand(1)));
1134
1135    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1136                                  .addReg(RegHi);
1137    if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
1138      MIB.addImm(0);
1139    else
1140      MIB.add(MI.getOperand(2));
1141
1142    Bundler.append(MIB);
1143    llvm::finalizeBundle(MBB, Bundler.begin());
1144
1145    MI.eraseFromParent();
1146    break;
1147  }
1148  }
1149  return true;
1150}
1151
1152bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1153                                      MachineOperand &Src0,
1154                                      unsigned Src0OpName,
1155                                      MachineOperand &Src1,
1156                                      unsigned Src1OpName) const {
1157  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1158  if (!Src0Mods)
1159    return false;
1160
1161  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1162  assert(Src1Mods &&
1163         "All commutable instructions have both src0 and src1 modifiers");
1164
1165  int Src0ModsVal = Src0Mods->getImm();
1166  int Src1ModsVal = Src1Mods->getImm();
1167
1168  Src1Mods->setImm(Src0ModsVal);
1169  Src0Mods->setImm(Src1ModsVal);
1170  return true;
1171}
1172
1173static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1174                                             MachineOperand &RegOp,
1175                                             MachineOperand &NonRegOp) {
1176  unsigned Reg = RegOp.getReg();
1177  unsigned SubReg = RegOp.getSubReg();
1178  bool IsKill = RegOp.isKill();
1179  bool IsDead = RegOp.isDead();
1180  bool IsUndef = RegOp.isUndef();
1181  bool IsDebug = RegOp.isDebug();
1182
1183  if (NonRegOp.isImm())
1184    RegOp.ChangeToImmediate(NonRegOp.getImm());
1185  else if (NonRegOp.isFI())
1186    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1187  else
1188    return nullptr;
1189
1190  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1191  NonRegOp.setSubReg(SubReg);
1192
1193  return &MI;
1194}
1195
1196MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1197                                                  unsigned Src0Idx,
1198                                                  unsigned Src1Idx) const {
1199  assert(!NewMI && "this should never be used");
1200
1201  unsigned Opc = MI.getOpcode();
1202  int CommutedOpcode = commuteOpcode(Opc);
1203  if (CommutedOpcode == -1)
1204    return nullptr;
1205
1206  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1207           static_cast<int>(Src0Idx) &&
1208         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1209           static_cast<int>(Src1Idx) &&
1210         "inconsistency with findCommutedOpIndices");
1211
1212  MachineOperand &Src0 = MI.getOperand(Src0Idx);
1213  MachineOperand &Src1 = MI.getOperand(Src1Idx);
1214
1215  MachineInstr *CommutedMI = nullptr;
1216  if (Src0.isReg() && Src1.isReg()) {
1217    if (isOperandLegal(MI, Src1Idx, &Src0)) {
1218      // Be sure to copy the source modifiers to the right place.
1219      CommutedMI
1220        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1221    }
1222
1223  } else if (Src0.isReg() && !Src1.isReg()) {
1224    // src0 should always be able to support any operand type, so no need to
1225    // check operand legality.
1226    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1227  } else if (!Src0.isReg() && Src1.isReg()) {
1228    if (isOperandLegal(MI, Src1Idx, &Src0))
1229      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1230  } else {
1231    // FIXME: Found two non-register operands to commute. This does happen.
1232    return nullptr;
1233  }
1234
1235
1236  if (CommutedMI) {
1237    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1238                        Src1, AMDGPU::OpName::src1_modifiers);
1239
1240    CommutedMI->setDesc(get(CommutedOpcode));
1241  }
1242
1243  return CommutedMI;
1244}
1245
1246// This needs to be implemented because the source modifiers may be inserted
1247// between the true commutable operands, and the base
1248// TargetInstrInfo::commuteInstruction uses it.
1249bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1250                                        unsigned &SrcOpIdx1) const {
1251  if (!MI.isCommutable())
1252    return false;
1253
1254  unsigned Opc = MI.getOpcode();
1255  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1256  if (Src0Idx == -1)
1257    return false;
1258
1259  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1260  if (Src1Idx == -1)
1261    return false;
1262
1263  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1264}
1265
1266bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1267                                        int64_t BrOffset) const {
1268  // BranchRelaxation should never have to check s_setpc_b64 because its dest
1269  // block is unanalyzable.
1270  assert(BranchOp != AMDGPU::S_SETPC_B64);
1271
1272  // Convert to dwords.
1273  BrOffset /= 4;
1274
1275  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1276  // from the next instruction.
1277  BrOffset -= 1;
1278
1279  return isIntN(BranchOffsetBits, BrOffset);
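  // With the default 16-bit signed field this allows a displacement of
  // roughly +/-128 KiB (32K dwords).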
1280}
1281
1282MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1283  const MachineInstr &MI) const {
1284  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1285    // This would be a difficult analysis to perform, but the branch can always
1286    // be legal, so there's no need to analyze it.
1287    return nullptr;
1288  }
1289
1290  return MI.getOperand(0).getMBB();
1291}
1292
1293unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1294                                           MachineBasicBlock &DestBB,
1295                                           const DebugLoc &DL,
1296                                           int64_t BrOffset,
1297                                           RegScavenger *RS) const {
1298  assert(RS && "RegScavenger required for long branching");
1299  assert(MBB.empty() &&
1300         "new block should be inserted for expanding unconditional branch");
1301  assert(MBB.pred_size() == 1);
1302
1303  MachineFunction *MF = MBB.getParent();
1304  MachineRegisterInfo &MRI = MF->getRegInfo();
1305
1306  // FIXME: Virtual register workaround for RegScavenger not working with empty
1307  // blocks.
1308  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1309
1310  auto I = MBB.end();
1311
1312  // We need to compute the offset relative to the instruction immediately after
1313  // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
1314  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1315
1316  // TODO: Handle > 32-bit block address.
1317  if (BrOffset >= 0) {
1318    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1319      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1320      .addReg(PCReg, 0, AMDGPU::sub0)
1321      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
1322    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1323      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1324      .addReg(PCReg, 0, AMDGPU::sub1)
1325      .addImm(0);
1326  } else {
1327    // Backwards branch.
1328    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1329      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1330      .addReg(PCReg, 0, AMDGPU::sub0)
1331      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
1332    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1333      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1334      .addReg(PCReg, 0, AMDGPU::sub1)
1335      .addImm(0);
1336  }
1337
1338  // Insert the indirect branch after the other terminator.
1339  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1340    .addReg(PCReg);
1341
1342  // FIXME: If spilling is necessary, this will fail because this scavenger has
1343  // no emergency stack slots. It is non-trivial to spill in this situation,
1344  // because the restore code needs to be specially placed after the
1345  // jump. BranchRelaxation then needs to be made aware of the newly inserted
1346  // block.
1347  //
1348  // If a spill is needed for the pc register pair, we need to insert a spill
1349  // restore block right before the destination block, and insert a short branch
1350  // into the old destination block's fallthrough predecessor.
1351  // e.g.:
1352  //
1353  // s_cbranch_scc0 skip_long_branch:
1354  //
1355  // long_branch_bb:
1356  //   spill s[8:9]
1357  //   s_getpc_b64 s[8:9]
1358  //   s_add_u32 s8, s8, restore_bb
1359  //   s_addc_u32 s9, s9, 0
1360  //   s_setpc_b64 s[8:9]
1361  //
1362  // skip_long_branch:
1363  //   foo;
1364  //
1365  // .....
1366  //
1367  // dest_bb_fallthrough_predecessor:
1368  //   bar;
1369  //   s_branch dest_bb
1370  //
1371  // restore_bb:
1372  //   restore s[8:9]
1373  //   fallthrough dest_bb
1374  //
1375  // dest_bb:
1376  //   buzz;
1377
1378  RS->enterBasicBlockEnd(MBB);
1379  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
1380                                       MachineBasicBlock::iterator(GetPC), 0);
1381  MRI.replaceRegWith(PCReg, Scav);
1382  MRI.clearVirtRegs();
1383  RS->setRegUsed(Scav);
1384
1385  return 4 + 8 + 4 + 4;
1386}
1387
1388unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1389  switch (Cond) {
1390  case SIInstrInfo::SCC_TRUE:
1391    return AMDGPU::S_CBRANCH_SCC1;
1392  case SIInstrInfo::SCC_FALSE:
1393    return AMDGPU::S_CBRANCH_SCC0;
1394  case SIInstrInfo::VCCNZ:
1395    return AMDGPU::S_CBRANCH_VCCNZ;
1396  case SIInstrInfo::VCCZ:
1397    return AMDGPU::S_CBRANCH_VCCZ;
1398  case SIInstrInfo::EXECNZ:
1399    return AMDGPU::S_CBRANCH_EXECNZ;
1400  case SIInstrInfo::EXECZ:
1401    return AMDGPU::S_CBRANCH_EXECZ;
1402  default:
1403    llvm_unreachable("invalid branch predicate");
1404  }
1405}
1406
1407SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1408  switch (Opcode) {
1409  case AMDGPU::S_CBRANCH_SCC0:
1410    return SCC_FALSE;
1411  case AMDGPU::S_CBRANCH_SCC1:
1412    return SCC_TRUE;
1413  case AMDGPU::S_CBRANCH_VCCNZ:
1414    return VCCNZ;
1415  case AMDGPU::S_CBRANCH_VCCZ:
1416    return VCCZ;
1417  case AMDGPU::S_CBRANCH_EXECNZ:
1418    return EXECNZ;
1419  case AMDGPU::S_CBRANCH_EXECZ:
1420    return EXECZ;
1421  default:
1422    return INVALID_BR;
1423  }
1424}
1425
1426bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1427                                    MachineBasicBlock::iterator I,
1428                                    MachineBasicBlock *&TBB,
1429                                    MachineBasicBlock *&FBB,
1430                                    SmallVectorImpl<MachineOperand> &Cond,
1431                                    bool AllowModify) const {
1432  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1433    // Unconditional Branch
1434    TBB = I->getOperand(0).getMBB();
1435    return false;
1436  }
1437
1438  MachineBasicBlock *CondBB = nullptr;
1439
1440  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1441    CondBB = I->getOperand(1).getMBB();
1442    Cond.push_back(I->getOperand(0));
1443  } else {
1444    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1445    if (Pred == INVALID_BR)
1446      return true;
1447
1448    CondBB = I->getOperand(0).getMBB();
1449    Cond.push_back(MachineOperand::CreateImm(Pred));
1450    Cond.push_back(I->getOperand(1)); // Save the branch register.
1451  }
1452  ++I;
1453
1454  if (I == MBB.end()) {
1455    // Conditional branch followed by fall-through.
1456    TBB = CondBB;
1457    return false;
1458  }
1459
1460  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1461    TBB = CondBB;
1462    FBB = I->getOperand(0).getMBB();
1463    return false;
1464  }
1465
1466  return true;
1467}
1468
1469bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1470                                MachineBasicBlock *&FBB,
1471                                SmallVectorImpl<MachineOperand> &Cond,
1472                                bool AllowModify) const {
1473  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1474  if (I == MBB.end())
1475    return false;
1476
1477  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1478    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1479
1480  ++I;
1481
1482  // TODO: Should be able to treat as fallthrough?
1483  if (I == MBB.end())
1484    return true;
1485
1486  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1487    return true;
1488
1489  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1490
1491  // Specifically handle the case where the conditional branch is to the same
1492  // destination as the mask branch. e.g.
1493  //
1494  // si_mask_branch BB8
1495  // s_cbranch_execz BB8
1496  // s_cbranch BB9
1497  //
1498  // This is required to understand divergent loops which may need the branches
1499  // to be relaxed.
1500  if (TBB != MaskBrDest || Cond.empty())
1501    return true;
1502
1503  auto Pred = Cond[0].getImm();
1504  return (Pred != EXECZ && Pred != EXECNZ);
1505}
1506
1507unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1508                                   int *BytesRemoved) const {
1509  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1510
1511  unsigned Count = 0;
1512  unsigned RemovedSize = 0;
1513  while (I != MBB.end()) {
1514    MachineBasicBlock::iterator Next = std::next(I);
1515    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1516      I = Next;
1517      continue;
1518    }
1519
1520    RemovedSize += getInstSizeInBytes(*I);
1521    I->eraseFromParent();
1522    ++Count;
1523    I = Next;
1524  }
1525
1526  if (BytesRemoved)
1527    *BytesRemoved = RemovedSize;
1528
1529  return Count;
1530}
1531
1532// Copy the flags onto the implicit condition register operand.
1533static void preserveCondRegFlags(MachineOperand &CondReg,
1534                                 const MachineOperand &OrigCond) {
1535  CondReg.setIsUndef(OrigCond.isUndef());
1536  CondReg.setIsKill(OrigCond.isKill());
1537}
1538
1539unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1540                                   MachineBasicBlock *TBB,
1541                                   MachineBasicBlock *FBB,
1542                                   ArrayRef<MachineOperand> Cond,
1543                                   const DebugLoc &DL,
1544                                   int *BytesAdded) const {
1545
1546  if (!FBB && Cond.empty()) {
1547    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1548      .addMBB(TBB);
1549    if (BytesAdded)
1550      *BytesAdded = 4;
1551    return 1;
1552  }
1553
1554  if (Cond.size() == 1 && Cond[0].isReg()) {
1555    BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1556      .add(Cond[0])
1557      .addMBB(TBB);
1558    return 1;
1559  }
1560
1561  assert(TBB && Cond[0].isImm());
1562
1563  unsigned Opcode
1564    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1565
1566  if (!FBB) {
1567
1568    MachineInstr *CondBr =
1569      BuildMI(&MBB, DL, get(Opcode))
1570      .addMBB(TBB);
1571
1572    // Copy the flags onto the implicit condition register operand.
1573    preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1574
1575    if (BytesAdded)
1576      *BytesAdded = 4;
1577    return 1;
1578  }
1579
1580  assert(TBB && FBB);
1581
1582  MachineInstr *CondBr =
1583    BuildMI(&MBB, DL, get(Opcode))
1584    .addMBB(TBB);
1585  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1586    .addMBB(FBB);
1587
1588  MachineOperand &CondReg = CondBr->getOperand(1);
1589  CondReg.setIsUndef(Cond[1].isUndef());
1590  CondReg.setIsKill(Cond[1].isKill());
1591
1592  if (BytesAdded)
1593    *BytesAdded = 8;
1594
1595  return 2;
1596}
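// e.g. with both TBB and FBB present, insertBranch emits roughly:
//   s_cbranch_scc1 <TBB>  ; 4 bytes
//   s_branch <FBB>        ; 4 bytes
// which is why *BytesAdded is reported as 8 above.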
1597
1598bool SIInstrInfo::reverseBranchCondition(
1599  SmallVectorImpl<MachineOperand> &Cond) const {
1600  if (Cond.size() != 2) {
1601    return true;
1602  }
1603
1604  if (Cond[0].isImm()) {
1605    Cond[0].setImm(-Cond[0].getImm());
1606    return false;
1607  }
1608
1609  return true;
1610}
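// Negating the immediate works because BranchPredicate encodes each predicate
// and its inverse as +N/-N (defined in SIInstrInfo.h); insertSelect below uses
// the same trick to swap the true and false values.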
1611
1612bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1613                                  ArrayRef<MachineOperand> Cond,
1614                                  unsigned TrueReg, unsigned FalseReg,
1615                                  int &CondCycles,
1616                                  int &TrueCycles, int &FalseCycles) const {
1617  switch (Cond[0].getImm()) {
1618  case VCCNZ:
1619  case VCCZ: {
1620    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1621    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1622    assert(MRI.getRegClass(FalseReg) == RC);
1623
1624    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1625    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1626
1627    // Limit to equal cost for branch vs. N v_cndmask_b32s.
1628    return !RI.isSGPRClass(RC) && NumInsts <= 6;
1629  }
1630  case SCC_TRUE:
1631  case SCC_FALSE: {
1632    // FIXME: We could insert for VGPRs if we could replace the original compare
1633    // with a vector one.
1634    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1635    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1636    assert(MRI.getRegClass(FalseReg) == RC);
1637
1638    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1639
1640    // Widths that are a multiple of 64 bits (8 bytes) can use s_cselect_b64.
1641    if (NumInsts % 2 == 0)
1642      NumInsts /= 2;
1643
1644    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1645    return RI.isSGPRClass(RC);
1646  }
1647  default:
1648    return false;
1649  }
1650}
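// e.g. a 128-bit value costs 4 v_cndmask_b32 when it lives in VGPRs, or
// 2 s_cselect_b64 when it lives in SGPRs and the condition is SCC.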
1651
1652void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1653                               MachineBasicBlock::iterator I, const DebugLoc &DL,
1654                               unsigned DstReg, ArrayRef<MachineOperand> Cond,
1655                               unsigned TrueReg, unsigned FalseReg) const {
1656  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1657  if (Pred == VCCZ || Pred == SCC_FALSE) {
1658    Pred = static_cast<BranchPredicate>(-Pred);
1659    std::swap(TrueReg, FalseReg);
1660  }
1661
1662  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1663  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1664  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1665
1666  if (DstSize == 32) {
1667    unsigned SelOp = Pred == SCC_TRUE ?
1668      AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1669
1670    // Instruction's operands are backwards from what is expected.
1671    MachineInstr *Select =
1672      BuildMI(MBB, I, DL, get(SelOp), DstReg)
1673      .addReg(FalseReg)
1674      .addReg(TrueReg);
1675
1676    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1677    return;
1678  }
1679
1680  if (DstSize == 64 && Pred == SCC_TRUE) {
1681    MachineInstr *Select =
1682      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1683      .addReg(FalseReg)
1684      .addReg(TrueReg);
1685
1686    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1687    return;
1688  }
1689
1690  static const int16_t Sub0_15[] = {
1691    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1692    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1693    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1694    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1695  };
1696
1697  static const int16_t Sub0_15_64[] = {
1698    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1699    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1700    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1701    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1702  };
1703
1704  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1705  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1706  const int16_t *SubIndices = Sub0_15;
1707  int NElts = DstSize / 32;
1708
1709  // 64-bit select is only available for SALU.
1710  if (Pred == SCC_TRUE) {
1711    SelOp = AMDGPU::S_CSELECT_B64;
1712    EltRC = &AMDGPU::SGPR_64RegClass;
1713    SubIndices = Sub0_15_64;
1714
1715    assert(NElts % 2 == 0);
1716    NElts /= 2;
1717  }
1718
1719  MachineInstrBuilder MIB = BuildMI(
1720    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1721
1722  I = MIB->getIterator();
1723
1724  SmallVector<unsigned, 8> Regs;
1725  for (int Idx = 0; Idx != NElts; ++Idx) {
1726    unsigned DstElt = MRI.createVirtualRegister(EltRC);
1727    Regs.push_back(DstElt);
1728
1729    unsigned SubIdx = SubIndices[Idx];
1730
1731    MachineInstr *Select =
1732      BuildMI(MBB, I, DL, get(SelOp), DstElt)
1733      .addReg(FalseReg, 0, SubIdx)
1734      .addReg(TrueReg, 0, SubIdx);
1735    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1736
1737    MIB.addReg(DstElt)
1738       .addImm(SubIdx);
1739  }
1740}
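// e.g. a 64-bit VGPR select on VCCNZ expands to roughly (virtual register
// names are illustrative):
//   %lo  = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   %hi  = V_CNDMASK_B32_e32 %false.sub1, %true.sub1, implicit $vcc
//   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1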
1741
1742bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
1743  switch (MI.getOpcode()) {
1744  case AMDGPU::V_MOV_B32_e32:
1745  case AMDGPU::V_MOV_B32_e64:
1746  case AMDGPU::V_MOV_B64_PSEUDO: {
1747    // If there are additional implicit register operands, this may be used for
1748    // register indexing so the source register operand isn't simply copied.
1749    unsigned NumOps = MI.getDesc().getNumOperands() +
1750      MI.getDesc().getNumImplicitUses();
1751
1752    return MI.getNumOperands() == NumOps;
1753  }
1754  case AMDGPU::S_MOV_B32:
1755  case AMDGPU::S_MOV_B64:
1756  case AMDGPU::COPY:
1757    return true;
1758  default:
1759    return false;
1760  }
1761}
1762
1763static void removeModOperands(MachineInstr &MI) {
1764  unsigned Opc = MI.getOpcode();
1765  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1766                                              AMDGPU::OpName::src0_modifiers);
1767  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1768                                              AMDGPU::OpName::src1_modifiers);
1769  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1770                                              AMDGPU::OpName::src2_modifiers);
1771
1772  MI.RemoveOperand(Src2ModIdx);
1773  MI.RemoveOperand(Src1ModIdx);
1774  MI.RemoveOperand(Src0ModIdx);
1775}
1776
1777bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
1778                                unsigned Reg, MachineRegisterInfo *MRI) const {
1779  if (!MRI->hasOneNonDBGUse(Reg))
1780    return false;
1781
1782  unsigned Opc = UseMI.getOpcode();
1783  if (Opc == AMDGPU::COPY) {
1784    bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
1785    switch (DefMI.getOpcode()) {
1786    default:
1787      return false;
1788    case AMDGPU::S_MOV_B64:
1789      // TODO: We could fold 64-bit immediates, but this gets complicated
1790      // when there are sub-registers.
1791      return false;
1792
1793    case AMDGPU::V_MOV_B32_e32:
1794    case AMDGPU::S_MOV_B32:
1795      break;
1796    }
1797    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1798    const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
1799    assert(ImmOp);
1800    // FIXME: We could handle FrameIndex values here.
1801    if (!ImmOp->isImm()) {
1802      return false;
1803    }
1804    UseMI.setDesc(get(NewOpc));
1805    UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
1806    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
1807    return true;
1808  }
1809
1810  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
1811      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
1812    // Don't fold if we are using source or output modifiers. The new VOP2
1813    // instructions don't have them.
1814    if (hasAnyModifiersSet(UseMI))
1815      return false;
1816
1817    const MachineOperand &ImmOp = DefMI.getOperand(1);
1818
1819    // If this is a free constant, there's no reason to do this.
1820    // TODO: We could fold this here instead of letting SIFoldOperands do it
1821    // later.
1822    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
1823
1824    // Any src operand can be used for the legality check.
1825    if (isInlineConstant(UseMI, *Src0, ImmOp))
1826      return false;
1827
1828    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
1829    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
1830    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
1831
1832    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
1833    // We should only expect these to be on src0 due to canonicalizations.
1834    if (Src0->isReg() && Src0->getReg() == Reg) {
1835      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1836        return false;
1837
1838      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
1839        return false;
1840
1841      // We need to swap operands 0 and 1 since madmk constant is at operand 1.
1842
1843      const int64_t Imm = DefMI.getOperand(1).getImm();
1844
1845      // FIXME: This would be a lot easier if we could return a new instruction
1846      // instead of having to modify in place.
1847
1848      // Remove these first since they are at the end.
1849      UseMI.RemoveOperand(
1850          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1851      UseMI.RemoveOperand(
1852          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1853
1854      unsigned Src1Reg = Src1->getReg();
1855      unsigned Src1SubReg = Src1->getSubReg();
1856      Src0->setReg(Src1Reg);
1857      Src0->setSubReg(Src1SubReg);
1858      Src0->setIsKill(Src1->isKill());
1859
1860      if (Opc == AMDGPU::V_MAC_F32_e64 ||
1861          Opc == AMDGPU::V_MAC_F16_e64)
1862        UseMI.untieRegOperand(
1863            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1864
1865      Src1->ChangeToImmediate(Imm);
1866
1867      removeModOperands(UseMI);
1868      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
1869
1870      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1871      if (DeleteDef)
1872        DefMI.eraseFromParent();
1873
1874      return true;
1875    }
1876
1877    // Added part is the constant: Use v_madak_{f16, f32}.
1878    if (Src2->isReg() && Src2->getReg() == Reg) {
1879      // Not allowed to use constant bus for another operand.
1880      // We can however allow an inline immediate as src0.
1881      if (!Src0->isImm() &&
1882          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
1883        return false;
1884
1885      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1886        return false;
1887
1888      const int64_t Imm = DefMI.getOperand(1).getImm();
1889
1890      // FIXME: This would be a lot easier if we could return a new instruction
1891      // instead of having to modify in place.
1892
1893      // Remove these first since they are at the end.
1894      UseMI.RemoveOperand(
1895          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1896      UseMI.RemoveOperand(
1897          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1898
1899      if (Opc == AMDGPU::V_MAC_F32_e64 ||
1900          Opc == AMDGPU::V_MAC_F16_e64)
1901        UseMI.untieRegOperand(
1902            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1903
1904      // ChangeToImmediate adds Src2 back to the instruction.
1905      Src2->ChangeToImmediate(Imm);
1906
1907      // These come before src2.
1908      removeModOperands(UseMI);
1909      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
1910
1911      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1912      if (DeleteDef)
1913        DefMI.eraseFromParent();
1914
1915      return true;
1916    }
1917  }
1918
1919  return false;
1920}
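// e.g. given a single-use, non-inline constant feeding the multiply:
//   %k = V_MOV_B32_e32 0x41000000
//   %d = V_MAD_F32 %k, %a, %b
// the mad is rewritten in place to roughly
//   %d = V_MADMK_F32 %a, 0x41000000, %b
// and the v_mov is deleted; the added-operand case uses V_MADAK_F32/F16.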
1921
1922static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
1923                                int WidthB, int OffsetB) {
1924  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1925  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1926  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1927  return LowOffset + LowWidth <= HighOffset;
1928}
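// e.g. a 4-byte access at offset 0 and a 4-byte access at offset 4 do not
// overlap (0 + 4 <= 4), while 4-byte accesses at offsets 0 and 2 do.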
1929
1930bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
1931                                               MachineInstr &MIb) const {
1932  unsigned BaseReg0, BaseReg1;
1933  int64_t Offset0, Offset1;
1934
1935  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
1936      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
1937
1938    if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
1939      // FIXME: Handle ds_read2 / ds_write2.
1940      return false;
1941    }
1942    unsigned Width0 = (*MIa.memoperands_begin())->getSize();
1943    unsigned Width1 = (*MIb.memoperands_begin())->getSize();
1944    if (BaseReg0 == BaseReg1 &&
1945        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
1946      return true;
1947    }
1948  }
1949
1950  return false;
1951}
1952
1953bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
1954                                                  MachineInstr &MIb,
1955                                                  AliasAnalysis *AA) const {
1956  assert((MIa.mayLoad() || MIa.mayStore()) &&
1957         "MIa must load from or modify a memory location");
1958  assert((MIb.mayLoad() || MIb.mayStore()) &&
1959         "MIb must load from or modify a memory location");
1960
1961  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
1962    return false;
1963
1964  // XXX - Can we relax this between address spaces?
1965  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1966    return false;
1967
1968  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
1969    const MachineMemOperand *MMOa = *MIa.memoperands_begin();
1970    const MachineMemOperand *MMOb = *MIb.memoperands_begin();
1971    if (MMOa->getValue() && MMOb->getValue()) {
1972      MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
1973      MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
1974      if (!AA->alias(LocA, LocB))
1975        return true;
1976    }
1977  }
1978
1979  // TODO: Should we check the address space from the MachineMemOperand? That
1980  // would allow us to distinguish objects we know don't alias based on the
1981  // underlying address space, even if it was lowered to a different one,
1982  // e.g. private accesses lowered to use MUBUF instructions on a scratch
1983  // buffer.
1984  if (isDS(MIa)) {
1985    if (isDS(MIb))
1986      return checkInstOffsetsDoNotOverlap(MIa, MIb);
1987
1988    return !isFLAT(MIb);
1989  }
1990
1991  if (isMUBUF(MIa) || isMTBUF(MIa)) {
1992    if (isMUBUF(MIb) || isMTBUF(MIb))
1993      return checkInstOffsetsDoNotOverlap(MIa, MIb);
1994
1995    return !isFLAT(MIb) && !isSMRD(MIb);
1996  }
1997
1998  if (isSMRD(MIa)) {
1999    if (isSMRD(MIb))
2000      return checkInstOffsetsDoNotOverlap(MIa, MIb);
2001
2002    return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
2003  }
2004
2005  if (isFLAT(MIa)) {
2006    if (isFLAT(MIb))
2007      return checkInstOffsetsDoNotOverlap(MIa, MIb);
2008
2009    return false;
2010  }
2011
2012  return false;
2013}
2014
2015MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2016                                                 MachineInstr &MI,
2017                                                 LiveVariables *LV) const {
2018  bool IsF16 = false;
2019
2020  switch (MI.getOpcode()) {
2021  default:
2022    return nullptr;
2023  case AMDGPU::V_MAC_F16_e64:
2024    IsF16 = true;
2025    LLVM_FALLTHROUGH;
2026  case AMDGPU::V_MAC_F32_e64:
2027    break;
2028  case AMDGPU::V_MAC_F16_e32:
2029    IsF16 = true;
2030    LLVM_FALLTHROUGH;
2031  case AMDGPU::V_MAC_F32_e32: {
2032    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2033                                             AMDGPU::OpName::src0);
2034    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2035    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2036      return nullptr;
2037    break;
2038  }
2039  }
2040
2041  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2042  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2043  const MachineOperand *Src0Mods =
2044    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2045  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2046  const MachineOperand *Src1Mods =
2047    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2048  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2049  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2050  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2051
2052  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2053                 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
2054      .add(*Dst)
2055      .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2056      .add(*Src0)
2057      .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2058      .add(*Src1)
2059      .addImm(0) // src2_modifiers
2060      .add(*Src2)
2061      .addImm(Clamp ? Clamp->getImm() : 0)
2062      .addImm(Omod ? Omod->getImm() : 0);
2063}
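// e.g. the two-address  %acc = V_MAC_F32_e32 %a, %b, %acc(tied)
// becomes the three-address  %acc = V_MAD_F32 %a, %b, %acc
// (source modifier, clamp and omod operands omitted here for brevity).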
2064
2065// It's not generally safe to move VALU instructions across these since the
2066// VALU would then use the register as an indexed base rather than directly.
2067// XXX - Why isn't hasSideEffects sufficient for these?
2068static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2069  switch (MI.getOpcode()) {
2070  case AMDGPU::S_SET_GPR_IDX_ON:
2071  case AMDGPU::S_SET_GPR_IDX_MODE:
2072  case AMDGPU::S_SET_GPR_IDX_OFF:
2073    return true;
2074  default:
2075    return false;
2076  }
2077}
2078
2079bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2080                                       const MachineBasicBlock *MBB,
2081                                       const MachineFunction &MF) const {
2082  // XXX - Do we want the SP check in the base implementation?
2083
2084  // Target-independent instructions do not have an implicit-use of EXEC, even
2085  // when they operate on VGPRs. Treating EXEC modifications as scheduling
2086  // boundaries prevents incorrect movements of such instructions.
2087  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2088         MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2089         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2090         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2091         changesVGPRIndexingMode(MI);
2092}
2093
2094bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2095  switch (Imm.getBitWidth()) {
2096  case 32:
2097    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2098                                        ST.hasInv2PiInlineImm());
2099  case 64:
2100    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2101                                        ST.hasInv2PiInlineImm());
2102  case 16:
2103    return ST.has16BitInsts() &&
2104           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2105                                        ST.hasInv2PiInlineImm());
2106  default:
2107    llvm_unreachable("invalid bitwidth");
2108  }
2109}
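// e.g. for 32-bit operands the integers -16..64 and a small set of floats
// (+/-0.5, +/-1.0, +/-2.0, +/-4.0, 0.0, plus 1/(2*pi) when
// ST.hasInv2PiInlineImm()) can be encoded inline; anything else is a literal.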
2110
2111bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2112                                   uint8_t OperandType) const {
2113  if (!MO.isImm() ||
2114      OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2115      OperandType > AMDGPU::OPERAND_SRC_LAST)
2116    return false;
2117
2118  // MachineOperand provides no way to tell the true operand size, since it only
2119  // records a 64-bit value. We need to know the size to determine if a 32-bit
2120  // floating point immediate bit pattern is legal for an integer immediate. It
2121  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2122
2123  int64_t Imm = MO.getImm();
2124  switch (OperandType) {
2125  case AMDGPU::OPERAND_REG_IMM_INT32:
2126  case AMDGPU::OPERAND_REG_IMM_FP32:
2127  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2128  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
2129    int32_t Trunc = static_cast<int32_t>(Imm);
2130    return Trunc == Imm &&
2131           AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
2132  }
2133  case AMDGPU::OPERAND_REG_IMM_INT64:
2134  case AMDGPU::OPERAND_REG_IMM_FP64:
2135  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2136  case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
2137    return AMDGPU::isInlinableLiteral64(MO.getImm(),
2138                                        ST.hasInv2PiInlineImm());
2139  }
2140  case AMDGPU::OPERAND_REG_IMM_INT16:
2141  case AMDGPU::OPERAND_REG_IMM_FP16:
2142  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2143  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2144    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2145      // A few special case instructions have 16-bit operands on subtargets
2146      // where 16-bit instructions are not legal.
2147      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2148      // constants in these cases.
2149      int16_t Trunc = static_cast<int16_t>(Imm);
2150      return ST.has16BitInsts() &&
2151             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
2152    }
2153
2154    return false;
2155  }
2156  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
2157  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
2158    uint32_t Trunc = static_cast<uint32_t>(Imm);
2159    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
2160  }
2161  default:
2162    llvm_unreachable("invalid bitwidth");
2163  }
2164}
2165
2166bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
2167                                        const MCOperandInfo &OpInfo) const {
2168  switch (MO.getType()) {
2169  case MachineOperand::MO_Register:
2170    return false;
2171  case MachineOperand::MO_Immediate:
2172    return !isInlineConstant(MO, OpInfo);
2173  case MachineOperand::MO_FrameIndex:
2174  case MachineOperand::MO_MachineBasicBlock:
2175  case MachineOperand::MO_ExternalSymbol:
2176  case MachineOperand::MO_GlobalAddress:
2177  case MachineOperand::MO_MCSymbol:
2178    return true;
2179  default:
2180    llvm_unreachable("unexpected operand type");
2181  }
2182}
2183
2184static bool compareMachineOp(const MachineOperand &Op0,
2185                             const MachineOperand &Op1) {
2186  if (Op0.getType() != Op1.getType())
2187    return false;
2188
2189  switch (Op0.getType()) {
2190  case MachineOperand::MO_Register:
2191    return Op0.getReg() == Op1.getReg();
2192  case MachineOperand::MO_Immediate:
2193    return Op0.getImm() == Op1.getImm();
2194  default:
2195    llvm_unreachable("Didn't expect to be comparing these operand types");
2196  }
2197}
2198
2199bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2200                                    const MachineOperand &MO) const {
2201  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2202
2203  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2204
2205  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2206    return true;
2207
2208  if (OpInfo.RegClass < 0)
2209    return false;
2210
2211  if (MO.isImm() && isInlineConstant(MO, OpInfo))
2212    return RI.opCanUseInlineConstant(OpInfo.OperandType);
2213
2214  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2215}
2216
2217bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2218  int Op32 = AMDGPU::getVOPe32(Opcode);
2219  if (Op32 == -1)
2220    return false;
2221
2222  return pseudoToMCOpcode(Op32) != -1;
2223}
2224
2225bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2226  // The src0_modifiers operand is present on all instructions
2227  // that have modifiers.
2228
2229  return AMDGPU::getNamedOperandIdx(Opcode,
2230                                    AMDGPU::OpName::src0_modifiers) != -1;
2231}
2232
2233bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2234                                  unsigned OpName) const {
2235  const MachineOperand *Mods = getNamedOperand(MI, OpName);
2236  return Mods && Mods->getImm();
2237}
2238
2239bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2240  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2241         hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2242         hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2243         hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2244         hasModifiersSet(MI, AMDGPU::OpName::omod);
2245}
2246
2247bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2248                                  const MachineOperand &MO,
2249                                  const MCOperandInfo &OpInfo) const {
2250  // Literal constants use the constant bus.
2251  //if (isLiteralConstantLike(MO, OpInfo))
2252  // return true;
2253  if (MO.isImm())
2254    return !isInlineConstant(MO, OpInfo);
2255
2256  if (!MO.isReg())
2257    return true; // Misc other operands like FrameIndex
2258
2259  if (!MO.isUse())
2260    return false;
2261
2262  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2263    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2264
2265  // FLAT_SCR is just an SGPR pair.
2266  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2267    return true;
2268
2269  // EXEC register uses the constant bus.
2270  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2271    return true;
2272
2273  // SGPRs use the constant bus
2274  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2275          (!MO.isImplicit() &&
2276           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2277            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2278}
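// e.g. one SGPR source plus a VGPR source uses the constant bus once and is
// fine; two different SGPR sources, or an SGPR plus a literal constant, would
// exceed the single-use limit that verifyInstruction() checks below.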
2279
2280static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2281  for (const MachineOperand &MO : MI.implicit_operands()) {
2282    // We only care about reads.
2283    if (MO.isDef())
2284      continue;
2285
2286    switch (MO.getReg()) {
2287    case AMDGPU::VCC:
2288    case AMDGPU::M0:
2289    case AMDGPU::FLAT_SCR:
2290      return MO.getReg();
2291
2292    default:
2293      break;
2294    }
2295  }
2296
2297  return AMDGPU::NoRegister;
2298}
2299
2300static bool shouldReadExec(const MachineInstr &MI) {
2301  if (SIInstrInfo::isVALU(MI)) {
2302    switch (MI.getOpcode()) {
2303    case AMDGPU::V_READLANE_B32:
2304    case AMDGPU::V_READLANE_B32_si:
2305    case AMDGPU::V_READLANE_B32_vi:
2306    case AMDGPU::V_WRITELANE_B32:
2307    case AMDGPU::V_WRITELANE_B32_si:
2308    case AMDGPU::V_WRITELANE_B32_vi:
2309      return false;
2310    }
2311
2312    return true;
2313  }
2314
2315  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2316      SIInstrInfo::isSALU(MI) ||
2317      SIInstrInfo::isSMRD(MI))
2318    return false;
2319
2320  return true;
2321}
2322
2323static bool isSubRegOf(const SIRegisterInfo &TRI,
2324                       const MachineOperand &SuperVec,
2325                       const MachineOperand &SubReg) {
2326  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2327    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2328
2329  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2330         SubReg.getReg() == SuperVec.getReg();
2331}
2332
2333bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2334                                    StringRef &ErrInfo) const {
2335  uint16_t Opcode = MI.getOpcode();
2336  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2337    return true;
2338
2339  const MachineFunction *MF = MI.getParent()->getParent();
2340  const MachineRegisterInfo &MRI = MF->getRegInfo();
2341
2342  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2343  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2344  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2345
2346  // Make sure the number of operands is correct.
2347  const MCInstrDesc &Desc = get(Opcode);
2348  if (!Desc.isVariadic() &&
2349      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2350    ErrInfo = "Instruction has wrong number of operands.";
2351    return false;
2352  }
2353
2354  if (MI.isInlineAsm()) {
2355    // Verify register classes for inlineasm constraints.
2356    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2357         I != E; ++I) {
2358      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2359      if (!RC)
2360        continue;
2361
2362      const MachineOperand &Op = MI.getOperand(I);
2363      if (!Op.isReg())
2364        continue;
2365
2366      unsigned Reg = Op.getReg();
2367      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2368        ErrInfo = "inlineasm operand has incorrect register class.";
2369        return false;
2370      }
2371    }
2372
2373    return true;
2374  }
2375
2376  // Make sure the register classes are correct.
2377  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2378    if (MI.getOperand(i).isFPImm()) {
2379      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2380                "all fp values to integers.";
2381      return false;
2382    }
2383
2384    int RegClass = Desc.OpInfo[i].RegClass;
2385
2386    switch (Desc.OpInfo[i].OperandType) {
2387    case MCOI::OPERAND_REGISTER:
2388      if (MI.getOperand(i).isImm()) {
2389        ErrInfo = "Illegal immediate value for operand.";
2390        return false;
2391      }
2392      break;
2393    case AMDGPU::OPERAND_REG_IMM_INT32:
2394    case AMDGPU::OPERAND_REG_IMM_FP32:
2395      break;
2396    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2397    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
2398    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2399    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2400    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2401    case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2402      const MachineOperand &MO = MI.getOperand(i);
2403      if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2404        ErrInfo = "Illegal immediate value for operand.";
2405        return false;
2406      }
2407      break;
2408    }
2409    case MCOI::OPERAND_IMMEDIATE:
2410    case AMDGPU::OPERAND_KIMM32:
2411      // Check if this operand is an immediate.
2412      // FrameIndex operands will be replaced by immediates, so they are
2413      // allowed.
2414      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2415        ErrInfo = "Expected immediate, but got non-immediate";
2416        return false;
2417      }
2418      LLVM_FALLTHROUGH;
2419    default:
2420      continue;
2421    }
2422
2423    if (!MI.getOperand(i).isReg())
2424      continue;
2425
2426    if (RegClass != -1) {
2427      unsigned Reg = MI.getOperand(i).getReg();
2428      if (Reg == AMDGPU::NoRegister ||
2429          TargetRegisterInfo::isVirtualRegister(Reg))
2430        continue;
2431
2432      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2433      if (!RC->contains(Reg)) {
2434        ErrInfo = "Operand has incorrect register class.";
2435        return false;
2436      }
2437    }
2438  }
2439
2440  // Verify SDWA
2441  if (isSDWA(MI)) {
2442
2443    if (!ST.hasSDWA()) {
2444      ErrInfo = "SDWA is not supported on this target";
2445      return false;
2446    }
2447
2448    int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2449
2450    const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
2451
2452    for (int OpIdx : OpIndices) {
2453      if (OpIdx == -1)
2454        continue;
2455      const MachineOperand &MO = MI.getOperand(OpIdx);
2456
2457      if (!ST.hasSDWAScalar()) {
2458        // Only VGPRs on VI
2459        if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2460          ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2461          return false;
2462        }
2463      } else {
2464        // No immediates on GFX9
2465        if (!MO.isReg()) {
2466          ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2467          return false;
2468        }
2469      }
2470    }
2471
2472    if (!ST.hasSDWAOmod()) {
2473      // No omod allowed on VI
2474      const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2475      if (OMod != nullptr &&
2476          (!OMod->isImm() || OMod->getImm() != 0)) {
2477        ErrInfo = "OMod not allowed in SDWA instructions on VI";
2478        return false;
2479      }
2480    }
2481
2482    uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2483    if (isVOPC(BasicOpcode)) {
2484      if (!ST.hasSDWASdst() && DstIdx != -1) {
2485        // Only vcc allowed as dst on VI for VOPC
2486        const MachineOperand &Dst = MI.getOperand(DstIdx);
2487        if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2488          ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2489          return false;
2490        }
2491      } else if (!ST.hasSDWAOutModsVOPC()) {
2492        // No clamp allowed on GFX9 for VOPC
2493        const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2494        if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
2495          ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
2496          return false;
2497        }
2498
2499        // No omod allowed on GFX9 for VOPC
2500        const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2501        if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
2502          ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
2503          return false;
2504        }
2505      }
2506    }
2507  }
2508
2509  // Verify VOP*
2510  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
2511    // Only look at the true operands. Only a real operand can use the constant
2512    // bus, and we don't want to check pseudo-operands like the source modifier
2513    // flags.
2514    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
2515
2516    unsigned ConstantBusCount = 0;
2517
2518    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
2519      ++ConstantBusCount;
2520
2521    unsigned SGPRUsed = findImplicitSGPRRead(MI);
2522    if (SGPRUsed != AMDGPU::NoRegister)
2523      ++ConstantBusCount;
2524
2525    for (int OpIdx : OpIndices) {
2526      if (OpIdx == -1)
2527        break;
2528      const MachineOperand &MO = MI.getOperand(OpIdx);
2529      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
2530        if (MO.isReg()) {
2531          if (MO.getReg() != SGPRUsed)
2532            ++ConstantBusCount;
2533          SGPRUsed = MO.getReg();
2534        } else {
2535          ++ConstantBusCount;
2536        }
2537      }
2538    }
2539    if (ConstantBusCount > 1) {
2540      ErrInfo = "VOP* instruction uses the constant bus more than once";
2541      return false;
2542    }
2543  }
2544
2545  // Verify misc. restrictions on specific instructions.
2546  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
2547      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
2548    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2549    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
2550    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
2551    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
2552      if (!compareMachineOp(Src0, Src1) &&
2553          !compareMachineOp(Src0, Src2)) {
2554        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
2555        return false;
2556      }
2557    }
2558  }
2559
2560  if (isSOPK(MI)) {
2561    int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
2562    if (sopkIsZext(MI)) {
2563      if (!isUInt<16>(Imm)) {
2564        ErrInfo = "invalid immediate for SOPK instruction";
2565        return false;
2566      }
2567    } else {
2568      if (!isInt<16>(Imm)) {
2569        ErrInfo = "invalid immediate for SOPK instruction";
2570        return false;
2571      }
2572    }
2573  }
2574
2575  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
2576      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
2577      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2578      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
2579    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2580                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
2581
2582    const unsigned StaticNumOps = Desc.getNumOperands() +
2583      Desc.getNumImplicitUses();
2584    const unsigned NumImplicitOps = IsDst ? 2 : 1;
2585
2586    // Allow additional implicit operands. This allows a fixup done by the post
2587    // RA scheduler where the main implicit operand is killed and implicit-defs
2588    // are added for sub-registers that remain live after this instruction.
2589    if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
2590      ErrInfo = "missing implicit register operands";
2591      return false;
2592    }
2593
2594    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2595    if (IsDst) {
2596      if (!Dst->isUse()) {
2597        ErrInfo = "v_movreld_b32 vdst should be a use operand";
2598        return false;
2599      }
2600
2601      unsigned UseOpIdx;
2602      if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
2603          UseOpIdx != StaticNumOps + 1) {
2604        ErrInfo = "movrel implicit operands should be tied";
2605        return false;
2606      }
2607    }
2608
2609    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2610    const MachineOperand &ImpUse
2611      = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
2612    if (!ImpUse.isReg() || !ImpUse.isUse() ||
2613        !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
2614      ErrInfo = "src0 should be subreg of implicit vector use";
2615      return false;
2616    }
2617  }
2618
2619  // Make sure we aren't losing exec uses in the td files. This mostly requires
2620  // care when overriding 'let Uses' so the implicit exec use is not dropped.
2621  if (shouldReadExec(MI)) {
2622    if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
2623      ErrInfo = "VALU instruction does not implicitly read exec mask";
2624      return false;
2625    }
2626  }
2627
2628  if (isSMRD(MI)) {
2629    if (MI.mayStore()) {
2630      // The register offset form of scalar stores may only use m0 as the
2631      // soffset register.
2632      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
2633      if (Soff && Soff->getReg() != AMDGPU::M0) {
2634        ErrInfo = "scalar stores must use m0 as offset register";
2635        return false;
2636      }
2637    }
2638  }
2639
2640  if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
2641    const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
2642    if (Offset->getImm() != 0) {
2643      ErrInfo = "subtarget does not support offsets in flat instructions";
2644      return false;
2645    }
2646  }
2647
2648  return true;
2649}
2650
2651unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
2652  switch (MI.getOpcode()) {
2653  default: return AMDGPU::INSTRUCTION_LIST_END;
2654  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
2655  case AMDGPU::COPY: return AMDGPU::COPY;
2656  case AMDGPU::PHI: return AMDGPU::PHI;
2657  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
2658  case AMDGPU::S_MOV_B32:
2659    return MI.getOperand(1).isReg() ?
2660           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
2661  case AMDGPU::S_ADD_I32:
2662  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
2663  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
2664  case AMDGPU::S_SUB_I32:
2665  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
2666  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
2667  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
2668  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
2669  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
2670  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
2671  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
2672  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
2673  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
2674  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
2675  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
2676  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
2677  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
2678  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
2679  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
2680  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
2681  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
2682  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
2683  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
2684  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
2685  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
2686  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
2687  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
2688  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
2689  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
2690  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
2691  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
2692  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
2693  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
2694  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
2695  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
2696  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
2697  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
2698  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
2699  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
2700  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
2701  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
2702  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
2703  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
2704  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
2705  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
2706  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
2707  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
2708  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
2709  }
2710}
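// e.g. when an operand of  %d = S_AND_B32 %a, %b  turns out to be a VGPR,
// callers such as moveToVALU() use this table to rewrite it as V_AND_B32_e64.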
2711
2712bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
2713  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
2714}
2715
2716const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
2717                                                      unsigned OpNo) const {
2718  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2719  const MCInstrDesc &Desc = get(MI.getOpcode());
2720  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
2721      Desc.OpInfo[OpNo].RegClass == -1) {
2722    unsigned Reg = MI.getOperand(OpNo).getReg();
2723
2724    if (TargetRegisterInfo::isVirtualRegister(Reg))
2725      return MRI.getRegClass(Reg);
2726    return RI.getPhysRegClass(Reg);
2727  }
2728
2729  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
2730  return RI.getRegClass(RCID);
2731}
2732
2733bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
2734  switch (MI.getOpcode()) {
2735  case AMDGPU::COPY:
2736  case AMDGPU::REG_SEQUENCE:
2737  case AMDGPU::PHI:
2738  case AMDGPU::INSERT_SUBREG:
2739    return RI.hasVGPRs(getOpRegClass(MI, 0));
2740  default:
2741    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
2742  }
2743}
2744
2745void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
2746  MachineBasicBlock::iterator I = MI;
2747  MachineBasicBlock *MBB = MI.getParent();
2748  MachineOperand &MO = MI.getOperand(OpIdx);
2749  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
2750  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
2751  const TargetRegisterClass *RC = RI.getRegClass(RCID);
2752  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
2753  if (MO.isReg())
2754    Opcode = AMDGPU::COPY;
2755  else if (RI.isSGPRClass(RC))
2756    Opcode = AMDGPU::S_MOV_B32;
2757
2758  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
2759  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
2760    VRC = &AMDGPU::VReg_64RegClass;
2761  else
2762    VRC = &AMDGPU::VGPR_32RegClass;
2763
2764  unsigned Reg = MRI.createVirtualRegister(VRC);
2765  DebugLoc DL = MBB->findDebugLoc(I);
2766  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
2767  MO.ChangeToRegister(Reg, false);
2768}
2769
2770unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
2771                                         MachineRegisterInfo &MRI,
2772                                         MachineOperand &SuperReg,
2773                                         const TargetRegisterClass *SuperRC,
2774                                         unsigned SubIdx,
2775                                         const TargetRegisterClass *SubRC)
2776                                         const {
2777  MachineBasicBlock *MBB = MI->getParent();
2778  DebugLoc DL = MI->getDebugLoc();
2779  unsigned SubReg = MRI.createVirtualRegister(SubRC);
2780
2781  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
2782    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2783      .addReg(SuperReg.getReg(), 0, SubIdx);
2784    return SubReg;
2785  }
2786
2787  // Just in case the super register is itself a sub-register, copy it to a new
2788  // value so we don't need to worry about merging its subreg index with the
2789  // SubIdx passed to this function. The register coalescer should be able to
2790  // eliminate this extra copy.
2791  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
2792
2793  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
2794    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
2795
2796  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2797    .addReg(NewSuperReg, 0, SubIdx);
2798
2799  return SubReg;
2800}
2801
2802MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
2803  MachineBasicBlock::iterator MII,
2804  MachineRegisterInfo &MRI,
2805  MachineOperand &Op,
2806  const TargetRegisterClass *SuperRC,
2807  unsigned SubIdx,
2808  const TargetRegisterClass *SubRC) const {
2809  if (Op.isImm()) {
2810    if (SubIdx == AMDGPU::sub0)
2811      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
2812    if (SubIdx == AMDGPU::sub1)
2813      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
2814
2815    llvm_unreachable("Unhandled register index for immediate");
2816  }
2817
2818  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
2819                                       SubIdx, SubRC);
2820  return MachineOperand::CreateReg(SubReg, false);
2821}
2822
2823// Change the order of operands from (0, 1, 2) to (0, 2, 1)
2824void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
2825  assert(Inst.getNumExplicitOperands() == 3);
2826  MachineOperand Op1 = Inst.getOperand(1);
2827  Inst.RemoveOperand(1);
2828  Inst.addOperand(Op1);
2829}
2830
2831bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
2832                                    const MCOperandInfo &OpInfo,
2833                                    const MachineOperand &MO) const {
2834  if (!MO.isReg())
2835    return false;
2836
2837  unsigned Reg = MO.getReg();
2838  const TargetRegisterClass *RC =
2839    TargetRegisterInfo::isVirtualRegister(Reg) ?
2840    MRI.getRegClass(Reg) :
2841    RI.getPhysRegClass(Reg);
2842
2843  const SIRegisterInfo *TRI =
2844      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
2845  RC = TRI->getSubRegClass(RC, MO.getSubReg());
2846
2847  // In order to be legal, the common sub-class must be equal to the
2848  // class of the current operand.  For example:
2849  //
2850  // v_mov_b32 s0 ; Operand defined as vsrc_b32
2851  //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
2852  //
2853  // s_sendmsg 0, s0 ; Operand defined as m0reg
2854  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
2855
2856  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
2857}
2858
2859bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
2860                                     const MCOperandInfo &OpInfo,
2861                                     const MachineOperand &MO) const {
2862  if (MO.isReg())
2863    return isLegalRegOperand(MRI, OpInfo, MO);
2864
2865  // Handle non-register types that are treated like immediates.
2866  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2867  return true;
2868}
2869
2870bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
2871                                 const MachineOperand *MO) const {
2872  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2873  const MCInstrDesc &InstDesc = MI.getDesc();
2874  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
2875  const TargetRegisterClass *DefinedRC =
2876      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
2877  if (!MO)
2878    MO = &MI.getOperand(OpIdx);
2879
2880  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
2881
2882    RegSubRegPair SGPRUsed;
2883    if (MO->isReg())
2884      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
2885
2886    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2887      if (i == OpIdx)
2888        continue;
2889      const MachineOperand &Op = MI.getOperand(i);
2890      if (Op.isReg()) {
2891        if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
2892            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
2893          return false;
2894        }
2895      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
2896        return false;
2897      }
2898    }
2899  }
2900
2901  if (MO->isReg()) {
2902    assert(DefinedRC);
2903    return isLegalRegOperand(MRI, OpInfo, *MO);
2904  }
2905
2906  // Handle non-register types that are treated like immediates.
2907  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
2908
2909  if (!DefinedRC) {
2910    // This operand expects an immediate.
2911    return true;
2912  }
2913
2914  return isImmOperandLegal(MI, OpIdx, *MO);
2915}
2916
2917void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
2918                                       MachineInstr &MI) const {
2919  unsigned Opc = MI.getOpcode();
2920  const MCInstrDesc &InstrDesc = get(Opc);
2921
2922  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2923  MachineOperand &Src1 = MI.getOperand(Src1Idx);
2924
2925  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
2926  // we need to only have one constant bus use.
2927  //
2928  // Note we do not need to worry about literal constants here. They are
2929  // disabled for the operand type for instructions because they will always
2930  // violate the one constant bus use rule.
2931  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
2932  if (HasImplicitSGPR) {
2933    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2934    MachineOperand &Src0 = MI.getOperand(Src0Idx);
2935
2936    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
2937      legalizeOpWithMove(MI, Src0Idx);
2938  }
2939
2940  // VOP2 instructions accept all operand types for src0, so we don't need to
2941  // check its legality. If src1 is already legal, we don't need to do anything.
2942  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
2943    return;
2944
2945  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
2946  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
2947  // select is uniform.
2948  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
2949      RI.isVGPR(MRI, Src1.getReg())) {
2950    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2951    const DebugLoc &DL = MI.getDebugLoc();
2952    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
2953        .add(Src1);
2954    Src1.ChangeToRegister(Reg, false);
2955    return;
2956  }
2957
2958  // We do not use commuteInstruction here because it is too aggressive and will
2959  // commute if it is possible. We only want to commute here if it improves
2960  // legality. This can be called a fairly large number of times so don't waste
2961  // compile time pointlessly swapping and checking legality again.
2962  if (HasImplicitSGPR || !MI.isCommutable()) {
2963    legalizeOpWithMove(MI, Src1Idx);
2964    return;
2965  }
2966
2967  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2968  MachineOperand &Src0 = MI.getOperand(Src0Idx);
2969
2970  // If src0 can be used as src1, commuting will make the operands legal.
2971  // Otherwise we have to give up and insert a move.
2972  //
2973  // TODO: Other immediate-like operand kinds could be commuted if there was a
2974  // MachineOperand::ChangeTo* for them.
2975  if ((!Src1.isImm() && !Src1.isReg()) ||
2976      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
2977    legalizeOpWithMove(MI, Src1Idx);
2978    return;
2979  }
2980
2981  int CommutedOpc = commuteOpcode(MI);
2982  if (CommutedOpc == -1) {
2983    legalizeOpWithMove(MI, Src1Idx);
2984    return;
2985  }
2986
2987  MI.setDesc(get(CommutedOpc));
2988
2989  unsigned Src0Reg = Src0.getReg();
2990  unsigned Src0SubReg = Src0.getSubReg();
2991  bool Src0Kill = Src0.isKill();
2992
2993  if (Src1.isImm())
2994    Src0.ChangeToImmediate(Src1.getImm());
2995  else if (Src1.isReg()) {
2996    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
2997    Src0.setSubReg(Src1.getSubReg());
2998  } else
2999    llvm_unreachable("Should only have register or immediate operands");
3000
3001  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3002  Src1.setSubReg(Src0SubReg);
3003}
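// e.g. for  %d = V_ADD_F32_e32 %vgpr, %sgpr  src1 must be a VGPR, so either
// the operands are commuted (when src0 may legally become src1) or the SGPR
// is copied into a VGPR via legalizeOpWithMove().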
3004
3005// Legalize VOP3 operands. Because all operand types are supported for any
3006// operand, and since literal constants are not allowed and should never be
3007// seen, we only need to worry about inserting copies if we use multiple SGPR
3008// operands.
3009void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3010                                       MachineInstr &MI) const {
3011  unsigned Opc = MI.getOpcode();
3012
3013  int VOP3Idx[3] = {
3014    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3015    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3016    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3017  };
3018
3019  // Find the one SGPR operand we are allowed to use.
3020  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3021
3022  for (unsigned i = 0; i < 3; ++i) {
3023    int Idx = VOP3Idx[i];
3024    if (Idx == -1)
3025      break;
3026    MachineOperand &MO = MI.getOperand(Idx);
3027
3028    // We should never see a VOP3 instruction with an illegal immediate operand.
3029    if (!MO.isReg())
3030      continue;
3031
3032    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3033      continue; // VGPRs are legal
3034
3035    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3036      SGPRReg = MO.getReg();
3037      // We can use one SGPR in each VOP3 instruction.
3038      continue;
3039    }
3040
3041    // If we make it this far, then the operand is not legal and we must
3042    // legalize it.
3043    legalizeOpWithMove(MI, Idx);
3044  }
3045}
3046
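// Build an SGPR-class copy of an assumed-uniform VGPR value: read the first
// lane of each 32-bit sub-register with V_READFIRSTLANE_B32 and reassemble the
// pieces with a REG_SEQUENCE.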
3047unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3048                                         MachineRegisterInfo &MRI) const {
3049  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3050  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3051  unsigned DstReg = MRI.createVirtualRegister(SRC);
3052  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3053
3054  SmallVector<unsigned, 8> SRegs;
3055  for (unsigned i = 0; i < SubRegs; ++i) {
3056    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3057    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3058            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3059        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3060    SRegs.push_back(SGPR);
3061  }
3062
3063  MachineInstrBuilder MIB =
3064      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3065              get(AMDGPU::REG_SEQUENCE), DstReg);
3066  for (unsigned i = 0; i < SubRegs; ++i) {
3067    MIB.addReg(SRegs[i]);
3068    MIB.addImm(RI.getSubRegFromChannel(i));
3069  }
3070  return DstReg;
3071}
3072
3073void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3074                                       MachineInstr &MI) const {
3075
3076  // If the pointer is stored in VGPRs, then we need to move it to
3077  // SGPRs using v_readfirstlane.  This is safe because we only select
3078  // loads with uniform pointers to SMRD instructions, so we know the
3079  // pointer value is uniform.
3080  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3081  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3082      unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3083      SBase->setReg(SGPR);
3084  }
3085}
3086
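// If Op is not already in register class DstRC, insert a COPY of it into a new
// virtual register of that class before I and rewrite Op to use the copy; the
// copy may be folded away when Op's definition is a move-immediate.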
3087void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3088                                         MachineBasicBlock::iterator I,
3089                                         const TargetRegisterClass *DstRC,
3090                                         MachineOperand &Op,
3091                                         MachineRegisterInfo &MRI,
3092                                         const DebugLoc &DL) const {
3093
3094  unsigned OpReg = Op.getReg();
3095  unsigned OpSubReg = Op.getSubReg();
3096
3097  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3098      RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3099
3100  // Check if operand is already the correct register class.
3101  if (DstRC == OpRC)
3102    return;
3103
3104  unsigned DstReg = MRI.createVirtualRegister(DstRC);
3105  MachineInstr *Copy =
3106      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3107
3108  Op.setReg(DstReg);
3109  Op.setSubReg(0);
3110
3111  MachineInstr *Def = MRI.getVRegDef(OpReg);
3112  if (!Def)
3113    return;
3114
3115  // Try to eliminate the copy if it is copying an immediate value.
3116  if (Def->isMoveImmediate())
3117    FoldImmediate(*Copy, *Def, OpReg, &MRI);
3118}
3119
3120void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
3121  MachineFunction &MF = *MI.getParent()->getParent();
3122  MachineRegisterInfo &MRI = MF.getRegInfo();
3123
3124  // Legalize VOP2
3125  if (isVOP2(MI) || isVOPC(MI)) {
3126    legalizeOperandsVOP2(MRI, MI);
3127    return;
3128  }
3129
3130  // Legalize VOP3
3131  if (isVOP3(MI)) {
3132    legalizeOperandsVOP3(MRI, MI);
3133    return;
3134  }
3135
3136  // Legalize SMRD
3137  if (isSMRD(MI)) {
3138    legalizeOperandsSMRD(MRI, MI);
3139    return;
3140  }
3141
3142  // Legalize REG_SEQUENCE and PHI
3143  // The register class of the operands must be the same type as the register
3144  // class of the output.
3145  if (MI.getOpcode() == AMDGPU::PHI) {
3146    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3147    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3148      if (!MI.getOperand(i).isReg() ||
3149          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3150        continue;
3151      const TargetRegisterClass *OpRC =
3152          MRI.getRegClass(MI.getOperand(i).getReg());
3153      if (RI.hasVGPRs(OpRC)) {
3154        VRC = OpRC;
3155      } else {
3156        SRC = OpRC;
3157      }
3158    }
3159
3160    // If any of the operands are VGPR registers, then they all must be VGPRs;
3161    // otherwise we will create illegal VGPR->SGPR copies when legalizing
3162    // them.
3163    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3164      if (!VRC) {
3165        assert(SRC);
3166        VRC = RI.getEquivalentVGPRClass(SRC);
3167      }
3168      RC = VRC;
3169    } else {
3170      RC = SRC;
3171    }
3172
3173    // Update all the operands so they have the same type.
3174    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3175      MachineOperand &Op = MI.getOperand(I);
3176      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3177        continue;
3178
3179      // MI is a PHI instruction.
3180      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3181      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3182
3183      // Avoid creating no-op copies with the same src and dst reg class.  These
3184      // confuse some of the machine passes.
3185      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3186    }
3187  }
3188
3189  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3190  // VGPR dest type and SGPR sources, insert copies so all operands are
3191  // VGPRs. This seems to help operand folding / the register coalescer.
3192  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3193    MachineBasicBlock *MBB = MI.getParent();
3194    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3195    if (RI.hasVGPRs(DstRC)) {
3196      // Update all the operands so they are VGPR register classes. These may
3197      // not be the same register class because REG_SEQUENCE supports mixing
3198      // subregister index types e.g. sub0_sub1 + sub2 + sub3
3199      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3200        MachineOperand &Op = MI.getOperand(I);
3201        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3202          continue;
3203
3204        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3205        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3206        if (VRC == OpRC)
3207          continue;
3208
3209        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3210        Op.setIsKill();
3211      }
3212    }
3213
3214    return;
3215  }
3216
3217  // Legalize INSERT_SUBREG
3218  // src0 must have the same register class as dst
3219  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3220    unsigned Dst = MI.getOperand(0).getReg();
3221    unsigned Src0 = MI.getOperand(1).getReg();
3222    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3223    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3224    if (DstRC != Src0RC) {
3225      MachineBasicBlock *MBB = MI.getParent();
3226      MachineOperand &Op = MI.getOperand(1);
3227      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3228    }
3229    return;
3230  }
3231
3232  // Legalize MIMG and MUBUF/MTBUF for shaders.
3233  //
3234  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3235  // scratch memory access. In both cases, the legalization never involves
3236  // conversion to the addr64 form.
3237  if (isMIMG(MI) ||
3238      (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
3239       (isMUBUF(MI) || isMTBUF(MI)))) {
3240    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
3241    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
3242      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
3243      SRsrc->setReg(SGPR);
3244    }
3245
3246    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
3247    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
3248      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
3249      SSamp->setReg(SGPR);
3250    }
3251    return;
3252  }
3253
3254  // Legalize MUBUF* instructions by converting to addr64 form.
3255  // FIXME: If we start using the non-addr64 instructions for compute, we
3256  // may need to legalize them as above. This especially applies to the
3257  // buffer_load_format_* variants and variants with idxen (or bothen).
3258  int SRsrcIdx =
3259      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
3260  if (SRsrcIdx != -1) {
3261    // We have an MUBUF instruction
3262    MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
3263    unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
3264    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
3265                                             RI.getRegClass(SRsrcRC))) {
3266      // The operands are legal.
3267      // FIXME: We may need to legalize operands besides srsrc.
3268      return;
3269    }
3270
3271    MachineBasicBlock &MBB = *MI.getParent();
3272
3273    // Extract the ptr from the resource descriptor.
3274    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
3275      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3276
3277    // Create an empty resource descriptor
3278    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3279    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3280    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3281    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3282    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
3283
3284    // Zero64 = 0
3285    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
3286        .addImm(0);
3287
3288    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3289    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3290        .addImm(RsrcDataFormat & 0xFFFFFFFF);
3291
3292    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3293    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3294        .addImm(RsrcDataFormat >> 32);
3295
3296    // NewSRsrc = {Zero64, SRsrcFormat}
3297    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3298        .addReg(Zero64)
3299        .addImm(AMDGPU::sub0_sub1)
3300        .addReg(SRsrcFormatLo)
3301        .addImm(AMDGPU::sub2)
3302        .addReg(SRsrcFormatHi)
3303        .addImm(AMDGPU::sub3);
3304
3305    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3306    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3307    if (VAddr) {
3308      // This is already an ADDR64 instruction so we need to add the pointer
3309      // extracted from the resource descriptor to the current value of VAddr.
3310      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3311      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3312
3313      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
3314      DebugLoc DL = MI.getDebugLoc();
3315      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
3316        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3317        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
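      // Note: V_ADD_I32_e32 implicitly defines VCC with its carry-out, and the
      // V_ADDC_U32_e32 below implicitly reads it as carry-in, so together they
      // form a 64-bit add.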
3318
3319      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
3320      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
3321        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3322        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
3323
3324      // NewVaddr = {NewVaddrLo, NewVaddrHi}
3325      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
3326          .addReg(NewVAddrLo)
3327          .addImm(AMDGPU::sub0)
3328          .addReg(NewVAddrHi)
3329          .addImm(AMDGPU::sub1);
3330    } else {
3331      // This instruction is the _OFFSET variant, so we need to convert it to
3332      // ADDR64.
3333      assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
3334             < SISubtarget::VOLCANIC_ISLANDS &&
3335             "FIXME: Need to emit flat atomics here");
3336
3337      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
3338      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3339      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
3340      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
3341
3342      // Atomics with return have an additional tied operand and are
3343      // missing some of the special bits.
3344      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
3345      MachineInstr *Addr64;
3346
3347      if (!VDataIn) {
3348        // Regular buffer load / store.
3349        MachineInstrBuilder MIB =
3350            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3351                .add(*VData)
3352                .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3353                // This will be replaced later
3354                // with the new value of vaddr.
3355                .add(*SRsrc)
3356                .add(*SOffset)
3357                .add(*Offset);
3358
3359        // Atomics do not have this operand.
3360        if (const MachineOperand *GLC =
3361                getNamedOperand(MI, AMDGPU::OpName::glc)) {
3362          MIB.addImm(GLC->getImm());
3363        }
3364
3365        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
3366
3367        if (const MachineOperand *TFE =
3368                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
3369          MIB.addImm(TFE->getImm());
3370        }
3371
3372        MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3373        Addr64 = MIB;
3374      } else {
3375        // Atomics with return.
3376        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3377                     .add(*VData)
3378                     .add(*VDataIn)
3379                     .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3380                     // This will be replaced later
3381                     // with the new value of vaddr.
3382                     .add(*SRsrc)
3383                     .add(*SOffset)
3384                     .add(*Offset)
3385                     .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
3386                     .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3387      }
3388
3389      MI.removeFromParent();
3390
3391      // NewVaddr = {SRsrcPtr:sub0, SRsrcPtr:sub1}
3392      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
3393              NewVAddr)
3394          .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3395          .addImm(AMDGPU::sub0)
3396          .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3397          .addImm(AMDGPU::sub1);
3398
3399      VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
3400      SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
3401    }
3402
3403    // Update the instruction to use NewVaddr
3404    VAddr->setReg(NewVAddr);
3405    // Update the instruction to use NewSRsrc
3406    SRsrc->setReg(NewSRsrc);
3407  }
3408}
3409
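// Replace the SALU instruction TopInst with an equivalent VALU sequence, then
// iteratively do the same for any users that can no longer read the (now VGPR)
// result, using a worklist.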
3410void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
3411  SetVectorType Worklist;
3412  Worklist.insert(&TopInst);
3413
3414  while (!Worklist.empty()) {
3415    MachineInstr &Inst = *Worklist.pop_back_val();
3416    MachineBasicBlock *MBB = Inst.getParent();
3417    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3418
3419    unsigned Opcode = Inst.getOpcode();
3420    unsigned NewOpcode = getVALUOp(Inst);
3421
3422    // Handle some special cases
3423    switch (Opcode) {
3424    default:
3425      break;
3426    case AMDGPU::S_AND_B64:
3427      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
3428      Inst.eraseFromParent();
3429      continue;
3430
3431    case AMDGPU::S_OR_B64:
3432      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
3433      Inst.eraseFromParent();
3434      continue;
3435
3436    case AMDGPU::S_XOR_B64:
3437      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
3438      Inst.eraseFromParent();
3439      continue;
3440
3441    case AMDGPU::S_NOT_B64:
3442      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
3443      Inst.eraseFromParent();
3444      continue;
3445
3446    case AMDGPU::S_BCNT1_I32_B64:
3447      splitScalar64BitBCNT(Worklist, Inst);
3448      Inst.eraseFromParent();
3449      continue;
3450
3451    case AMDGPU::S_BFE_I64: {
3452      splitScalar64BitBFE(Worklist, Inst);
3453      Inst.eraseFromParent();
3454      continue;
3455    }
3456
3457    case AMDGPU::S_LSHL_B32:
3458      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3459        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
3460        swapOperands(Inst);
3461      }
3462      break;
3463    case AMDGPU::S_ASHR_I32:
3464      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3465        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
3466        swapOperands(Inst);
3467      }
3468      break;
3469    case AMDGPU::S_LSHR_B32:
3470      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3471        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
3472        swapOperands(Inst);
3473      }
3474      break;
3475    case AMDGPU::S_LSHL_B64:
3476      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3477        NewOpcode = AMDGPU::V_LSHLREV_B64;
3478        swapOperands(Inst);
3479      }
3480      break;
3481    case AMDGPU::S_ASHR_I64:
3482      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3483        NewOpcode = AMDGPU::V_ASHRREV_I64;
3484        swapOperands(Inst);
3485      }
3486      break;
3487    case AMDGPU::S_LSHR_B64:
3488      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3489        NewOpcode = AMDGPU::V_LSHRREV_B64;
3490        swapOperands(Inst);
3491      }
3492      break;
3493
3494    case AMDGPU::S_ABS_I32:
3495      lowerScalarAbs(Worklist, Inst);
3496      Inst.eraseFromParent();
3497      continue;
3498
3499    case AMDGPU::S_CBRANCH_SCC0:
3500    case AMDGPU::S_CBRANCH_SCC1:
3501      // Clear unused bits of vcc
3502      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
3503              AMDGPU::VCC)
3504          .addReg(AMDGPU::EXEC)
3505          .addReg(AMDGPU::VCC);
3506      break;
3507
3508    case AMDGPU::S_BFE_U64:
3509    case AMDGPU::S_BFM_B64:
3510      llvm_unreachable("Moving this op to VALU not implemented");
3511
3512    case AMDGPU::S_PACK_LL_B32_B16:
3513    case AMDGPU::S_PACK_LH_B32_B16:
3514    case AMDGPU::S_PACK_HH_B32_B16: {
3515      movePackToVALU(Worklist, MRI, Inst);
3516      Inst.eraseFromParent();
3517      continue;
3518    }
3519    }
3520
3521    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
3522      // We cannot move this instruction to the VALU, so we should try to
3523      // legalize its operands instead.
3524      legalizeOperands(Inst);
3525      continue;
3526    }
3527
3528    // Use the new VALU Opcode.
3529    const MCInstrDesc &NewDesc = get(NewOpcode);
3530    Inst.setDesc(NewDesc);
3531
3532    // Remove any references to SCC. Vector instructions can't read from it, and
3533    // we're just about to add the implicit use / defs of VCC, and we don't want
3534    // both.
3535    for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
3536      MachineOperand &Op = Inst.getOperand(i);
3537      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
3538        Inst.RemoveOperand(i);
3539        addSCCDefUsersToVALUWorklist(Inst, Worklist);
3540      }
3541    }
3542
3543    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
3544      // We are converting these to a BFE, so we need to add the missing
3545      // operands for the size and offset.
3546      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
3547      Inst.addOperand(MachineOperand::CreateImm(0));
3548      Inst.addOperand(MachineOperand::CreateImm(Size));
3549
3550    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
3551      // The VALU version adds the second operand to the result, so insert an
3552      // extra 0 operand.
3553      Inst.addOperand(MachineOperand::CreateImm(0));
3554    }
3555
3556    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
3557
3558    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
3559      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
3560      // If we need to move this to VGPRs, we need to unpack the second operand
3561      // back into the 2 separate ones for bit offset and width.
3562      assert(OffsetWidthOp.isImm() &&
3563             "Scalar BFE is only implemented for constant width and offset");
3564      uint32_t Imm = OffsetWidthOp.getImm();
3565
3566      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3567      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
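      // Worked example: Imm == 0x00100008 unpacks to Offset == 8 and
      // BitWidth == 16.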
3568      Inst.RemoveOperand(2);                     // Remove old immediate.
3569      Inst.addOperand(MachineOperand::CreateImm(Offset));
3570      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
3571    }
3572
3573    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
3574    unsigned NewDstReg = AMDGPU::NoRegister;
3575    if (HasDst) {
3576      unsigned DstReg = Inst.getOperand(0).getReg();
3577      if (TargetRegisterInfo::isPhysicalRegister(DstReg))
3578        continue;
3579
3580      // Update the destination register class.
3581      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
3582      if (!NewDstRC)
3583        continue;
3584
3585      if (Inst.isCopy() &&
3586          TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
3587          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
3588        // Instead of creating a copy where src and dst are the same register
3589        // class, we just replace all uses of dst with src.  These kinds of
3590        // copies interfere with the heuristics MachineSink uses to decide
3591        // whether or not to split a critical edge, since the pass assumes
3592        // that copies will end up as machine instructions and not be
3593        // eliminated.
3594        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
3595        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
3596        MRI.clearKillFlags(Inst.getOperand(1).getReg());
3597        Inst.getOperand(0).setReg(DstReg);
3598        continue;
3599      }
3600
3601      NewDstReg = MRI.createVirtualRegister(NewDstRC);
3602      MRI.replaceRegWith(DstReg, NewDstReg);
3603    }
3604
3605    // Legalize the operands
3606    legalizeOperands(Inst);
3607
3608    if (HasDst)
3609     addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
3610  }
3611}
3612
3613void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
3614                                 MachineInstr &Inst) const {
3615  MachineBasicBlock &MBB = *Inst.getParent();
3616  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3617  MachineBasicBlock::iterator MII = Inst;
3618  DebugLoc DL = Inst.getDebugLoc();
3619
3620  MachineOperand &Dest = Inst.getOperand(0);
3621  MachineOperand &Src = Inst.getOperand(1);
3622  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3623  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3624
3625  BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
3626    .addImm(0)
3627    .addReg(Src.getReg());
3628
3629  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
3630    .addReg(Src.getReg())
3631    .addReg(TmpReg);
3632
3633  MRI.replaceRegWith(Dest.getReg(), ResultReg);
3634  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3635}
3636
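// Split a 64-bit scalar unary operation into two 32-bit VALU operations on the
// sub0 and sub1 halves, recombining the results with a REG_SEQUENCE.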
3637void SIInstrInfo::splitScalar64BitUnaryOp(
3638    SetVectorType &Worklist, MachineInstr &Inst,
3639    unsigned Opcode) const {
3640  MachineBasicBlock &MBB = *Inst.getParent();
3641  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3642
3643  MachineOperand &Dest = Inst.getOperand(0);
3644  MachineOperand &Src0 = Inst.getOperand(1);
3645  DebugLoc DL = Inst.getDebugLoc();
3646
3647  MachineBasicBlock::iterator MII = Inst;
3648
3649  const MCInstrDesc &InstDesc = get(Opcode);
3650  const TargetRegisterClass *Src0RC = Src0.isReg() ?
3651    MRI.getRegClass(Src0.getReg()) :
3652    &AMDGPU::SGPR_32RegClass;
3653
3654  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3655
3656  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3657                                                       AMDGPU::sub0, Src0SubRC);
3658
3659  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3660  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3661  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3662
3663  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3664  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
3665
3666  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3667                                                       AMDGPU::sub1, Src0SubRC);
3668
3669  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3670  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
3671
3672  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3673  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3674    .addReg(DestSub0)
3675    .addImm(AMDGPU::sub0)
3676    .addReg(DestSub1)
3677    .addImm(AMDGPU::sub1);
3678
3679  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3680
3681  // We don't need to call legalizeOperands here because for a single operand, src0
3682  // will support any kind of input.
3683
3684  // Move all users of this moved value.
3685  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3686}
3687
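// Split a 64-bit scalar binary operation into two 32-bit VALU operations that
// process the sub0 and sub1 halves of both sources, recombining the results
// with a REG_SEQUENCE.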
3688void SIInstrInfo::splitScalar64BitBinaryOp(
3689    SetVectorType &Worklist, MachineInstr &Inst,
3690    unsigned Opcode) const {
3691  MachineBasicBlock &MBB = *Inst.getParent();
3692  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3693
3694  MachineOperand &Dest = Inst.getOperand(0);
3695  MachineOperand &Src0 = Inst.getOperand(1);
3696  MachineOperand &Src1 = Inst.getOperand(2);
3697  DebugLoc DL = Inst.getDebugLoc();
3698
3699  MachineBasicBlock::iterator MII = Inst;
3700
3701  const MCInstrDesc &InstDesc = get(Opcode);
3702  const TargetRegisterClass *Src0RC = Src0.isReg() ?
3703    MRI.getRegClass(Src0.getReg()) :
3704    &AMDGPU::SGPR_32RegClass;
3705
3706  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3707  const TargetRegisterClass *Src1RC = Src1.isReg() ?
3708    MRI.getRegClass(Src1.getReg()) :
3709    &AMDGPU::SGPR_32RegClass;
3710
3711  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
3712
3713  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3714                                                       AMDGPU::sub0, Src0SubRC);
3715  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3716                                                       AMDGPU::sub0, Src1SubRC);
3717
3718  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3719  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3720  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3721
3722  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3723  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
3724                              .add(SrcReg0Sub0)
3725                              .add(SrcReg1Sub0);
3726
3727  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3728                                                       AMDGPU::sub1, Src0SubRC);
3729  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3730                                                       AMDGPU::sub1, Src1SubRC);
3731
3732  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3733  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
3734                              .add(SrcReg0Sub1)
3735                              .add(SrcReg1Sub1);
3736
3737  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3738  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3739    .addReg(DestSub0)
3740    .addImm(AMDGPU::sub0)
3741    .addReg(DestSub1)
3742    .addImm(AMDGPU::sub1);
3743
3744  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3745
3746  // Try to legalize the operands in case we need to swap the order to keep it
3747  // valid.
3748  legalizeOperands(LoHalf);
3749  legalizeOperands(HiHalf);
3750
3751  // Move all users of this moved value.
3752  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3753}
3754
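// Lower S_BCNT1_I32_B64 to two V_BCNT_U32_B32_e64 instructions: count the low
// half into a temporary, then count the high half while accumulating the
// temporary into the final result.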
3755void SIInstrInfo::splitScalar64BitBCNT(
3756    SetVectorType &Worklist, MachineInstr &Inst) const {
3757  MachineBasicBlock &MBB = *Inst.getParent();
3758  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3759
3760  MachineBasicBlock::iterator MII = Inst;
3761  DebugLoc DL = Inst.getDebugLoc();
3762
3763  MachineOperand &Dest = Inst.getOperand(0);
3764  MachineOperand &Src = Inst.getOperand(1);
3765
3766  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
3767  const TargetRegisterClass *SrcRC = Src.isReg() ?
3768    MRI.getRegClass(Src.getReg()) :
3769    &AMDGPU::SGPR_32RegClass;
3770
3771  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3772  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3773
3774  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
3775
3776  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3777                                                      AMDGPU::sub0, SrcSubRC);
3778  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3779                                                      AMDGPU::sub1, SrcSubRC);
3780
3781  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
3782
3783  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
3784
3785  MRI.replaceRegWith(Dest.getReg(), ResultReg);
3786
3787  // We don't need to legalize operands here. src0 for either instruction can be
3788  // an SGPR, and the second input is unused or determined here.
3789  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3790}
3791
3792void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
3793                                      MachineInstr &Inst) const {
3794  MachineBasicBlock &MBB = *Inst.getParent();
3795  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3796  MachineBasicBlock::iterator MII = Inst;
3797  DebugLoc DL = Inst.getDebugLoc();
3798
3799  MachineOperand &Dest = Inst.getOperand(0);
3800  uint32_t Imm = Inst.getOperand(2).getImm();
3801  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3802  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3803
3804  (void) Offset;
3805
3806  // Only sext_inreg cases handled.
3807  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
3808         Offset == 0 && "Not implemented");
3809
3810  if (BitWidth < 32) {
3811    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3812    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3813    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3814
3815    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
3816        .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
3817        .addImm(0)
3818        .addImm(BitWidth);
3819
3820    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
3821      .addImm(31)
3822      .addReg(MidRegLo);
3823
3824    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3825      .addReg(MidRegLo)
3826      .addImm(AMDGPU::sub0)
3827      .addReg(MidRegHi)
3828      .addImm(AMDGPU::sub1);
3829
3830    MRI.replaceRegWith(Dest.getReg(), ResultReg);
3831    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3832    return;
3833  }
3834
3835  MachineOperand &Src = Inst.getOperand(1);
3836  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3837  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3838
3839  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
3840    .addImm(31)
3841    .addReg(Src.getReg(), 0, AMDGPU::sub0);
3842
3843  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3844    .addReg(Src.getReg(), 0, AMDGPU::sub0)
3845    .addImm(AMDGPU::sub0)
3846    .addReg(TmpReg)
3847    .addImm(AMDGPU::sub1);
3848
3849  MRI.replaceRegWith(Dest.getReg(), ResultReg);
3850  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3851}
3852
3853void SIInstrInfo::addUsersToMoveToVALUWorklist(
3854  unsigned DstReg,
3855  MachineRegisterInfo &MRI,
3856  SetVectorType &Worklist) const {
3857  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
3858         E = MRI.use_end(); I != E;) {
3859    MachineInstr &UseMI = *I->getParent();
3860    if (!canReadVGPR(UseMI, I.getOperandNo())) {
3861      Worklist.insert(&UseMI);
3862
3863      do {
3864        ++I;
3865      } while (I != E && I->getParent() == &UseMI);
3866    } else {
3867      ++I;
3868    }
3869  }
3870}
3871
3872void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
3873                                 MachineRegisterInfo &MRI,
3874                                 MachineInstr &Inst) const {
3875  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3876  MachineBasicBlock *MBB = Inst.getParent();
3877  MachineOperand &Src0 = Inst.getOperand(1);
3878  MachineOperand &Src1 = Inst.getOperand(2);
3879  const DebugLoc &DL = Inst.getDebugLoc();
3880
3881  switch (Inst.getOpcode()) {
3882  case AMDGPU::S_PACK_LL_B32_B16: {
3883    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3884    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3885
3886    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
3887    // 0.
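    // Expansion: ResultReg = (Src1 << 16) | (Src0 & 0xffff).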
3888    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3889      .addImm(0xffff);
3890
3891    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
3892      .addReg(ImmReg, RegState::Kill)
3893      .add(Src0);
3894
3895    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
3896      .add(Src1)
3897      .addImm(16)
3898      .addReg(TmpReg, RegState::Kill);
3899    break;
3900  }
3901  case AMDGPU::S_PACK_LH_B32_B16: {
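    // Expansion: ResultReg = (Src0 & 0xffff) | (Src1 & 0xffff0000), i.e. the
    // low half of Src0 and the high half of Src1, selected with V_BFI_B32.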
3902    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3903    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3904      .addImm(0xffff);
3905    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
3906      .addReg(ImmReg, RegState::Kill)
3907      .add(Src0)
3908      .add(Src1);
3909    break;
3910  }
3911  case AMDGPU::S_PACK_HH_B32_B16: {
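    // Expansion: ResultReg = (Src0 >> 16) | (Src1 & 0xffff0000).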
3912    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3913    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3914    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
3915      .addImm(16)
3916      .add(Src0);
3917    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3918      .addImm(0xffff0000);
3919    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
3920      .add(Src1)
3921      .addReg(ImmReg, RegState::Kill)
3922      .addReg(TmpReg, RegState::Kill);
3923    break;
3924  }
3925  default:
3926    llvm_unreachable("unhandled s_pack_* instruction");
3927  }
3928
3929  MachineOperand &Dest = Inst.getOperand(0);
3930  MRI.replaceRegWith(Dest.getReg(), ResultReg);
3931  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3932}
3933
3934void SIInstrInfo::addSCCDefUsersToVALUWorklist(
3935    MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
3936  // This assumes that all the users of SCC are in the same block
3937  // as the SCC def.
3938  for (MachineInstr &MI :
3939       llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
3940                        SCCDefInst.getParent()->end())) {
3941    // Exit if we find another SCC def.
3942    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
3943      return;
3944
3945    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
3946      Worklist.insert(&MI);
3947  }
3948}
3949
3950const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
3951  const MachineInstr &Inst) const {
3952  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
3953
3954  switch (Inst.getOpcode()) {
3955  // For target instructions, getOpRegClass just returns the virtual register
3956  // class associated with the operand, so we need to find an equivalent VGPR
3957  // register class in order to move the instruction to the VALU.
3958  case AMDGPU::COPY:
3959  case AMDGPU::PHI:
3960  case AMDGPU::REG_SEQUENCE:
3961  case AMDGPU::INSERT_SUBREG:
3962    if (RI.hasVGPRs(NewDstRC))
3963      return nullptr;
3964
3965    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
3966    if (!NewDstRC)
3967      return nullptr;
3968    return NewDstRC;
3969  default:
3970    return NewDstRC;
3971  }
3972}
3973
3974// Find the one SGPR operand we are allowed to use.
3975unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
3976                                   int OpIndices[3]) const {
3977  const MCInstrDesc &Desc = MI.getDesc();
3978
3979  // Find the one SGPR operand we are allowed to use.
3980  //
3981  // First we need to consider the instruction's operand requirements before
3982  // legalizing. Some operands are required to be SGPRs, such as implicit uses
3983  // of VCC, but we are still bound by the constant bus requirement to only use
3984  // one.
3985  //
3986  // If the operand's class is an SGPR, we can never move it.
3987
3988  unsigned SGPRReg = findImplicitSGPRRead(MI);
3989  if (SGPRReg != AMDGPU::NoRegister)
3990    return SGPRReg;
3991
3992  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
3993  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3994
3995  for (unsigned i = 0; i < 3; ++i) {
3996    int Idx = OpIndices[i];
3997    if (Idx == -1)
3998      break;
3999
4000    const MachineOperand &MO = MI.getOperand(Idx);
4001    if (!MO.isReg())
4002      continue;
4003
4004    // Is this operand statically required to be an SGPR based on the operand
4005    // constraints?
4006    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
4007    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
4008    if (IsRequiredSGPR)
4009      return MO.getReg();
4010
4011    // If this could be a VGPR or an SGPR, check the dynamic register class.
4012    unsigned Reg = MO.getReg();
4013    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
4014    if (RI.isSGPRClass(RegRC))
4015      UsedSGPRs[i] = Reg;
4016  }
4017
4018  // We don't have a required SGPR operand, so we have a bit more freedom in
4019  // selecting operands to move.
4020
4021  // Try to select the most used SGPR. If an SGPR is equal to one of the
4022  // others, we choose that.
4023  //
4024  // e.g.
4025  // V_FMA_F32 v0, s0, s0, s0 -> No moves
4026  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
4027
4028  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
4029  // prefer those.
4030
4031  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
4032    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
4033      SGPRReg = UsedSGPRs[0];
4034  }
4035
4036  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
4037    if (UsedSGPRs[1] == UsedSGPRs[2])
4038      SGPRReg = UsedSGPRs[1];
4039  }
4040
4041  return SGPRReg;
4042}
4043
4044MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
4045                                             unsigned OperandName) const {
4046  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
4047  if (Idx == -1)
4048    return nullptr;
4049
4050  return &MI.getOperand(Idx);
4051}
4052
4053uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
4054  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
4055  if (ST.isAmdHsaOS()) {
4056    // Set ATC = 1. GFX9 doesn't have this bit.
4057    if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
4058      RsrcDataFormat |= (1ULL << 56);
4059
4060    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
4061    // BTW, it disables TC L2 and therefore decreases performance.
4062    if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
4063      RsrcDataFormat |= (2ULL << 59);
4064  }
4065
4066  return RsrcDataFormat;
4067}
4068
4069uint64_t SIInstrInfo::getScratchRsrcWords23() const {
4070  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
4071                    AMDGPU::RSRC_TID_ENABLE |
4072                    0xffffffff; // Size;
4073
4074  // GFX9 doesn't have ELEMENT_SIZE.
4075  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
4076    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
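    // For example, a maximum private element size of 16 bytes encodes as
    // Log2_32(16) - 1 == 3.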
4077    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
4078  }
4079
4080  // IndexStride = 64.
4081  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
4082
4083  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
4084  // Clear them unless we want a huge stride.
4085  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
4086    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
4087
4088  return Rsrc23;
4089}
4090
4091bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
4092  unsigned Opc = MI.getOpcode();
4093
4094  return isSMRD(Opc);
4095}
4096
4097bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
4098  unsigned Opc = MI.getOpcode();
4099
4100  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
4101}
4102
4103unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
4104                                    int &FrameIndex) const {
4105  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4106  if (!Addr || !Addr->isFI())
4107    return AMDGPU::NoRegister;
4108
4109  assert(!MI.memoperands_empty() &&
4110         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
4111
4112  FrameIndex = Addr->getIndex();
4113  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
4114}
4115
4116unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
4117                                        int &FrameIndex) const {
4118  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
4119  assert(Addr && Addr->isFI());
4120  FrameIndex = Addr->getIndex();
4121  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
4122}
4123
4124unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
4125                                          int &FrameIndex) const {
4126
4127  if (!MI.mayLoad())
4128    return AMDGPU::NoRegister;
4129
4130  if (isMUBUF(MI) || isVGPRSpill(MI))
4131    return isStackAccess(MI, FrameIndex);
4132
4133  if (isSGPRSpill(MI))
4134    return isSGPRStackAccess(MI, FrameIndex);
4135
4136  return AMDGPU::NoRegister;
4137}
4138
4139unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
4140                                         int &FrameIndex) const {
4141  if (!MI.mayStore())
4142    return AMDGPU::NoRegister;
4143
4144  if (isMUBUF(MI) || isVGPRSpill(MI))
4145    return isStackAccess(MI, FrameIndex);
4146
4147  if (isSGPRSpill(MI))
4148    return isSGPRStackAccess(MI, FrameIndex);
4149
4150  return AMDGPU::NoRegister;
4151}
4152
4153unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
4154  unsigned Opc = MI.getOpcode();
4155  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
4156  unsigned DescSize = Desc.getSize();
4157
4158  // If we have a definitive size, we can use it. Otherwise we need to inspect
4159  // the operands to know the size.
4160  //
4161  // FIXME: Instructions that have a base 32-bit encoding report their size as
4162  // 4, even though they are really 8 bytes if they have a literal operand.
4163  if (DescSize != 0 && DescSize != 4)
4164    return DescSize;
4165
4166  // 4-byte instructions may have a 32-bit literal encoded after them. Check
4167  // operands that could ever be literals.
4168  if (isVALU(MI) || isSALU(MI)) {
4169    if (isFixedSize(MI))
4170      return DescSize;
4171
4172    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
4173    if (Src0Idx == -1)
4174      return 4; // No operands.
4175
4176    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
4177      return 8;
4178
4179    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4180    if (Src1Idx == -1)
4181      return 4;
4182
4183    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
4184      return 8;
4185
4186    return 4;
4187  }
4188
4189  if (DescSize == 4)
4190    return 4;
4191
4192  switch (Opc) {
4193  case TargetOpcode::IMPLICIT_DEF:
4194  case TargetOpcode::KILL:
4195  case TargetOpcode::DBG_VALUE:
4196  case TargetOpcode::BUNDLE:
4197  case TargetOpcode::EH_LABEL:
4198    return 0;
4199  case TargetOpcode::INLINEASM: {
4200    const MachineFunction *MF = MI.getParent()->getParent();
4201    const char *AsmStr = MI.getOperand(0).getSymbolName();
4202    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
4203  }
4204  default:
4205    llvm_unreachable("unable to find instruction size");
4206  }
4207}
4208
4209bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
4210  if (!isFLAT(MI))
4211    return false;
4212
4213  if (MI.memoperands_empty())
4214    return true;
4215
4216  for (const MachineMemOperand *MMO : MI.memoperands()) {
4217    if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
4218      return true;
4219  }
4220  return false;
4221}
4222
4223bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
4224  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
4225}
4226
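// Rewrite a non-uniform if region: the SI_NON_UNIFORM_BRCOND_PSEUDO terminating
// IfEntry is replaced with SI_IF, and a matching SI_END_CF is inserted at the
// start of IfEnd.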
4227void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
4228                                            MachineBasicBlock *IfEnd) const {
4229  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
4230  assert(TI != IfEntry->end());
4231
4232  MachineInstr *Branch = &(*TI);
4233  MachineFunction *MF = IfEntry->getParent();
4234  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
4235
4236  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4237    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4238    MachineInstr *SIIF =
4239        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
4240            .add(Branch->getOperand(0))
4241            .add(Branch->getOperand(1));
4242    MachineInstr *SIEND =
4243        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
4244            .addReg(DstReg);
4245
4246    IfEntry->erase(TI);
4247    IfEntry->insert(IfEntry->end(), SIIF);
4248    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
4249  }
4250}
4251
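// Rewrite a non-uniform loop back edge: a PHI in the loop header accumulates an
// exit mask (zero on the entry edges), SI_IF_BREAK updates it with the branch
// condition at the latch, and SI_LOOP uses it to branch back to the header.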
4252void SIInstrInfo::convertNonUniformLoopRegion(
4253    MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
4254  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
4255  // We expect 2 terminators, one conditional and one unconditional.
4256  assert(TI != LoopEnd->end());
4257
4258  MachineInstr *Branch = &(*TI);
4259  MachineFunction *MF = LoopEnd->getParent();
4260  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
4261
4262  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4263
4264    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4265    unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4266    MachineInstrBuilder HeaderPHIBuilder =
4267        BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
4268    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
4269                                          E = LoopEntry->pred_end();
4270         PI != E; ++PI) {
4271      if (*PI == LoopEnd) {
4272        HeaderPHIBuilder.addReg(BackEdgeReg);
4273      } else {
4274        MachineBasicBlock *PMBB = *PI;
4275        unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4276        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
4277                             ZeroReg, 0);
4278        HeaderPHIBuilder.addReg(ZeroReg);
4279      }
4280      HeaderPHIBuilder.addMBB(*PI);
4281    }
4282    MachineInstr *HeaderPhi = HeaderPHIBuilder;
4283    MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
4284                                      get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
4285                                  .addReg(DstReg)
4286                                  .add(Branch->getOperand(0));
4287    MachineInstr *SILOOP =
4288        BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
4289            .addReg(BackEdgeReg)
4290            .addMBB(LoopEntry);
4291
4292    LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
4293    LoopEnd->erase(TI);
4294    LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
4295    LoopEnd->insert(LoopEnd->end(), SILOOP);
4296  }
4297}
4298
4299ArrayRef<std::pair<int, const char *>>
4300SIInstrInfo::getSerializableTargetIndices() const {
4301  static const std::pair<int, const char *> TargetIndices[] = {
4302      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
4303      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
4304      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
4305      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
4306      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
4307  return makeArrayRef(TargetIndices);
4308}
4309
4310/// This is used by the post-RA scheduler (SchedulePostRAList.cpp).  The
4311/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
4312ScheduleHazardRecognizer *
4313SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
4314                                            const ScheduleDAG *DAG) const {
4315  return new GCNHazardRecognizer(DAG->MF);
4316}
4317
4318/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
4319/// pass.
4320ScheduleHazardRecognizer *
4321SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
4322  return new GCNHazardRecognizer(MF);
4323}
4324
4325std::pair<unsigned, unsigned>
4326SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
4327  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
4328}
4329
4330ArrayRef<std::pair<unsigned, const char *>>
4331SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
4332  static const std::pair<unsigned, const char *> TargetFlags[] = {
4333    { MO_GOTPCREL, "amdgpu-gotprel" },
4334    { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
4335    { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
4336    { MO_REL32_LO, "amdgpu-rel32-lo" },
4337    { MO_REL32_HI, "amdgpu-rel32-hi" }
4338  };
4339
4340  return makeArrayRef(TargetFlags);
4341}
4342
4343bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
4344  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
4345         MI.modifiesRegister(AMDGPU::EXEC, &RI);
4346}
4347
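// Build a V_ADD_I32_e64 into DestReg whose carry-out is written to a fresh
// virtual SGPR pair marked dead, for callers that do not care about the carry.
// The source operands are expected to be appended by the caller.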
4348MachineInstrBuilder
4349SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
4350                           MachineBasicBlock::iterator I,
4351                           const DebugLoc &DL,
4352                           unsigned DestReg) const {
4353  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4354
4355  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4356
4357  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
4358           .addReg(UnusedCarry, RegState::Define | RegState::Dead);
4359}
4360