1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPUInstrInfo.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPURegisterBankInfo.h"
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "SIMachineFunctionInfo.h"
22#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27#include "llvm/CodeGen/GlobalISel/Utils.h"
28#include "llvm/CodeGen/MachineBasicBlock.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineInstr.h"
31#include "llvm/CodeGen/MachineInstrBuilder.h"
32#include "llvm/CodeGen/MachineRegisterInfo.h"
33#include "llvm/IR/Type.h"
34#include "llvm/Support/Debug.h"
35#include "llvm/Support/raw_ostream.h"
36
37#define DEBUG_TYPE "amdgpu-isel"
38
39using namespace llvm;
40using namespace MIPatternMatch;
41
42static cl::opt<bool> AllowRiskySelect(
43  "amdgpu-global-isel-risky-select",
44  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45  cl::init(false),
46  cl::ReallyHidden);
47
48#define GET_GLOBALISEL_IMPL
49#define AMDGPUSubtarget GCNSubtarget
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_IMPL
52#undef AMDGPUSubtarget
53
54AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56    const AMDGPUTargetMachine &TM)
57    : InstructionSelector(), TII(*STI.getInstrInfo()),
58      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59      STI(STI),
60      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61#define GET_GLOBALISEL_PREDICATES_INIT
62#include "AMDGPUGenGlobalISel.inc"
63#undef GET_GLOBALISEL_PREDICATES_INIT
64#define GET_GLOBALISEL_TEMPORARIES_INIT
65#include "AMDGPUGenGlobalISel.inc"
66#undef GET_GLOBALISEL_TEMPORARIES_INIT
67{
68}
69
70const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71
72void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73                                        CodeGenCoverage &CoverageInfo) {
74  MRI = &MF.getRegInfo();
75  InstructionSelector::setupMF(MF, KB, CoverageInfo);
76}
77
78bool AMDGPUInstructionSelector::isVCC(Register Reg,
79                                      const MachineRegisterInfo &MRI) const {
80  if (Register::isPhysicalRegister(Reg))
81    return Reg == TRI.getVCC();
82
83  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84  const TargetRegisterClass *RC =
85      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86  if (RC) {
87    const LLT Ty = MRI.getType(Reg);
88    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
89           Ty.isValid() && Ty.getSizeInBits() == 1;
90  }
91
92  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
93  return RB->getID() == AMDGPU::VCCRegBankID;
94}
95
96bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
97                                                        unsigned NewOpc) const {
98  MI.setDesc(TII.get(NewOpc));
99  MI.RemoveOperand(1); // Remove intrinsic ID.
100  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
101
102  MachineOperand &Dst = MI.getOperand(0);
103  MachineOperand &Src = MI.getOperand(1);
104
105  // TODO: This should be legalized to s32 if needed
106  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
107    return false;
108
109  const TargetRegisterClass *DstRC
110    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
111  const TargetRegisterClass *SrcRC
112    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
113  if (!DstRC || DstRC != SrcRC)
114    return false;
115
116  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
117         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
118}
119
120bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
121  const DebugLoc &DL = I.getDebugLoc();
122  MachineBasicBlock *BB = I.getParent();
123  I.setDesc(TII.get(TargetOpcode::COPY));
124
125  const MachineOperand &Src = I.getOperand(1);
126  MachineOperand &Dst = I.getOperand(0);
127  Register DstReg = Dst.getReg();
128  Register SrcReg = Src.getReg();
129
130  if (isVCC(DstReg, *MRI)) {
131    if (SrcReg == AMDGPU::SCC) {
132      const TargetRegisterClass *RC
133        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
134      if (!RC)
135        return true;
136      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
137    }
138
139    if (!isVCC(SrcReg, *MRI)) {
140      // TODO: Should probably leave the copy and let copyPhysReg expand it.
141      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
142        return false;
143
144      const TargetRegisterClass *SrcRC
145        = TRI.getConstrainedRegClassForOperand(Src, *MRI);
146
147      Register MaskedReg = MRI->createVirtualRegister(SrcRC);
148
149      // We can't trust the high bits at this point, so clear them.
150
151      // TODO: Skip masking high bits if def is known boolean.
152
153      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
154        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
155      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
156        .addImm(1)
157        .addReg(SrcReg);
158      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
159        .addImm(0)
160        .addReg(MaskedReg);
161
162      if (!MRI->getRegClassOrNull(SrcReg))
163        MRI->setRegClass(SrcReg, SrcRC);
164      I.eraseFromParent();
165      return true;
166    }
167
168    const TargetRegisterClass *RC =
169      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
170    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
171      return false;
172
173    // Don't constrain the source register to a class so the def instruction
174    // handles it (unless it's undef).
175    //
176    // FIXME: This is a hack. When selecting the def, we neeed to know
177    // specifically know that the result is VCCRegBank, and not just an SGPR
178    // with size 1. An SReg_32 with size 1 is ambiguous with wave32.
179    if (Src.isUndef()) {
180      const TargetRegisterClass *SrcRC =
181        TRI.getConstrainedRegClassForOperand(Src, *MRI);
182      if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
183        return false;
184    }
185
186    return true;
187  }
188
189  for (const MachineOperand &MO : I.operands()) {
190    if (Register::isPhysicalRegister(MO.getReg()))
191      continue;
192
193    const TargetRegisterClass *RC =
194            TRI.getConstrainedRegClassForOperand(MO, *MRI);
195    if (!RC)
196      continue;
197    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
198  }
199  return true;
200}
201
202bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
203  const Register DefReg = I.getOperand(0).getReg();
204  const LLT DefTy = MRI->getType(DefReg);
205  if (DefTy == LLT::scalar(1)) {
206    if (!AllowRiskySelect) {
207      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
208      return false;
209    }
210
211    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
212  }
213
214  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
215
216  const RegClassOrRegBank &RegClassOrBank =
217    MRI->getRegClassOrRegBank(DefReg);
218
219  const TargetRegisterClass *DefRC
220    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
221  if (!DefRC) {
222    if (!DefTy.isValid()) {
223      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
224      return false;
225    }
226
227    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
228    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
229    if (!DefRC) {
230      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
231      return false;
232    }
233  }
234
235  // TODO: Verify that all registers have the same bank
236  I.setDesc(TII.get(TargetOpcode::PHI));
237  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
238}
239
240MachineOperand
241AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
242                                           const TargetRegisterClass &SubRC,
243                                           unsigned SubIdx) const {
244
245  MachineInstr *MI = MO.getParent();
246  MachineBasicBlock *BB = MO.getParent()->getParent();
247  Register DstReg = MRI->createVirtualRegister(&SubRC);
248
249  if (MO.isReg()) {
250    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
251    Register Reg = MO.getReg();
252    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
253            .addReg(Reg, 0, ComposedSubIdx);
254
255    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
256                                     MO.isKill(), MO.isDead(), MO.isUndef(),
257                                     MO.isEarlyClobber(), 0, MO.isDebug(),
258                                     MO.isInternalRead());
259  }
260
261  assert(MO.isImm());
262
263  APInt Imm(64, MO.getImm());
264
265  switch (SubIdx) {
266  default:
267    llvm_unreachable("do not know to split immediate with this sub index.");
268  case AMDGPU::sub0:
269    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
270  case AMDGPU::sub1:
271    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
272  }
273}
274
275static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
276  switch (Opc) {
277  case AMDGPU::G_AND:
278    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
279  case AMDGPU::G_OR:
280    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
281  case AMDGPU::G_XOR:
282    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
283  default:
284    llvm_unreachable("not a bit op");
285  }
286}
287
288bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
289  MachineOperand &Dst = I.getOperand(0);
290  MachineOperand &Src0 = I.getOperand(1);
291  MachineOperand &Src1 = I.getOperand(2);
292  Register DstReg = Dst.getReg();
293  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
294
295  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
296  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
297    const TargetRegisterClass *RC = TRI.getBoolRC();
298    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
299                                           RC == &AMDGPU::SReg_64RegClass);
300    I.setDesc(TII.get(InstOpc));
301    // Dead implicit-def of scc
302    I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
303                                           true, // isImp
304                                           false, // isKill
305                                           true)); // isDead
306
307    // FIXME: Hack to avoid turning the register bank into a register class.
308    // The selector for G_ICMP relies on seeing the register bank for the result
309    // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will
310    // be ambiguous whether it's a scalar or vector bool.
311    if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg()))
312      MRI->setRegClass(Src0.getReg(), RC);
313    if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg()))
314      MRI->setRegClass(Src1.getReg(), RC);
315
316    return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
317  }
318
319  // TODO: Should this allow an SCC bank result, and produce a copy from SCC for
320  // the result?
321  if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
322    unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
323    I.setDesc(TII.get(InstOpc));
324    // Dead implicit-def of scc
325    I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
326                                           true, // isImp
327                                           false, // isKill
328                                           true)); // isDead
329    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
330  }
331
332  return false;
333}
334
335bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
336  MachineBasicBlock *BB = I.getParent();
337  MachineFunction *MF = BB->getParent();
338  Register DstReg = I.getOperand(0).getReg();
339  const DebugLoc &DL = I.getDebugLoc();
340  LLT Ty = MRI->getType(DstReg);
341  if (Ty.isVector())
342    return false;
343
344  unsigned Size = Ty.getSizeInBits();
345  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
346  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
347  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
348
349  if (Size == 32) {
350    if (IsSALU) {
351      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
352      MachineInstr *Add =
353        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
354        .add(I.getOperand(1))
355        .add(I.getOperand(2));
356      I.eraseFromParent();
357      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
358    }
359
360    if (STI.hasAddNoCarry()) {
361      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
362      I.setDesc(TII.get(Opc));
363      I.addOperand(*MF, MachineOperand::CreateImm(0));
364      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
365      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
366    }
367
368    const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;
369
370    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
371    MachineInstr *Add
372      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
373      .addDef(UnusedCarry, RegState::Dead)
374      .add(I.getOperand(1))
375      .add(I.getOperand(2))
376      .addImm(0);
377    I.eraseFromParent();
378    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
379  }
380
381  assert(!Sub && "illegal sub should not reach here");
382
383  const TargetRegisterClass &RC
384    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
385  const TargetRegisterClass &HalfRC
386    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
387
388  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
389  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
390  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
391  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
392
393  Register DstLo = MRI->createVirtualRegister(&HalfRC);
394  Register DstHi = MRI->createVirtualRegister(&HalfRC);
395
396  if (IsSALU) {
397    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
398      .add(Lo1)
399      .add(Lo2);
400    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
401      .add(Hi1)
402      .add(Hi2);
403  } else {
404    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
405    Register CarryReg = MRI->createVirtualRegister(CarryRC);
406    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
407      .addDef(CarryReg)
408      .add(Lo1)
409      .add(Lo2)
410      .addImm(0);
411    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
412      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
413      .add(Hi1)
414      .add(Hi2)
415      .addReg(CarryReg, RegState::Kill)
416      .addImm(0);
417
418    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
419      return false;
420  }
421
422  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
423    .addReg(DstLo)
424    .addImm(AMDGPU::sub0)
425    .addReg(DstHi)
426    .addImm(AMDGPU::sub1);
427
428
429  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
430    return false;
431
432  I.eraseFromParent();
433  return true;
434}
435
436bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
437  MachineInstr &I) const {
438  MachineBasicBlock *BB = I.getParent();
439  MachineFunction *MF = BB->getParent();
440  const DebugLoc &DL = I.getDebugLoc();
441  Register Dst0Reg = I.getOperand(0).getReg();
442  Register Dst1Reg = I.getOperand(1).getReg();
443  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
444                     I.getOpcode() == AMDGPU::G_UADDE;
445  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
446                          I.getOpcode() == AMDGPU::G_USUBE;
447
448  if (isVCC(Dst1Reg, *MRI)) {
449      // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
450      // carry out despite the _i32 name. These were renamed in VI to _U32.
451      // FIXME: We should probably rename the opcodes here.
452    unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
453    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
454    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
455    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
456    I.addOperand(*MF, MachineOperand::CreateImm(0));
457    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
458  }
459
460  Register Src0Reg = I.getOperand(2).getReg();
461  Register Src1Reg = I.getOperand(3).getReg();
462
463  if (HasCarryIn) {
464    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
465      .addReg(I.getOperand(4).getReg());
466  }
467
468  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
469  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
470
471  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
472    .add(I.getOperand(2))
473    .add(I.getOperand(3));
474  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
475    .addReg(AMDGPU::SCC);
476
477  if (!MRI->getRegClassOrNull(Dst1Reg))
478    MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
479
480  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
481      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
482      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
483    return false;
484
485  if (HasCarryIn &&
486      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
487                                    AMDGPU::SReg_32RegClass, *MRI))
488    return false;
489
490  I.eraseFromParent();
491  return true;
492}
493
494// TODO: We should probably legalize these to only using 32-bit results.
495bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
496  MachineBasicBlock *BB = I.getParent();
497  Register DstReg = I.getOperand(0).getReg();
498  Register SrcReg = I.getOperand(1).getReg();
499  LLT DstTy = MRI->getType(DstReg);
500  LLT SrcTy = MRI->getType(SrcReg);
501  const unsigned SrcSize = SrcTy.getSizeInBits();
502  unsigned DstSize = DstTy.getSizeInBits();
503
504  // TODO: Should handle any multiple of 32 offset.
505  unsigned Offset = I.getOperand(2).getImm();
506  if (Offset % 32 != 0 || DstSize > 128)
507    return false;
508
509  // 16-bit operations really use 32-bit registers.
510  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
511  if (DstSize == 16)
512    DstSize = 32;
513
514  const TargetRegisterClass *DstRC =
515    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
516  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
517    return false;
518
519  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
520  const TargetRegisterClass *SrcRC =
521    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
522  if (!SrcRC)
523    return false;
524  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
525                                                         DstSize / 32);
526  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
527  if (!SrcRC)
528    return false;
529
530  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
531                                    *SrcRC, I.getOperand(1));
532  const DebugLoc &DL = I.getDebugLoc();
533  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
534    .addReg(SrcReg, 0, SubReg);
535
536  I.eraseFromParent();
537  return true;
538}
539
540bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
541  MachineBasicBlock *BB = MI.getParent();
542  Register DstReg = MI.getOperand(0).getReg();
543  LLT DstTy = MRI->getType(DstReg);
544  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
545
546  const unsigned SrcSize = SrcTy.getSizeInBits();
547  if (SrcSize < 32)
548    return selectImpl(MI, *CoverageInfo);
549
550  const DebugLoc &DL = MI.getDebugLoc();
551  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
552  const unsigned DstSize = DstTy.getSizeInBits();
553  const TargetRegisterClass *DstRC =
554    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
555  if (!DstRC)
556    return false;
557
558  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
559  MachineInstrBuilder MIB =
560    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
561  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
562    MachineOperand &Src = MI.getOperand(I + 1);
563    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
564    MIB.addImm(SubRegs[I]);
565
566    const TargetRegisterClass *SrcRC
567      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
568    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
569      return false;
570  }
571
572  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
573    return false;
574
575  MI.eraseFromParent();
576  return true;
577}
578
579bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
580  MachineBasicBlock *BB = MI.getParent();
581  const int NumDst = MI.getNumOperands() - 1;
582
583  MachineOperand &Src = MI.getOperand(NumDst);
584
585  Register SrcReg = Src.getReg();
586  Register DstReg0 = MI.getOperand(0).getReg();
587  LLT DstTy = MRI->getType(DstReg0);
588  LLT SrcTy = MRI->getType(SrcReg);
589
590  const unsigned DstSize = DstTy.getSizeInBits();
591  const unsigned SrcSize = SrcTy.getSizeInBits();
592  const DebugLoc &DL = MI.getDebugLoc();
593  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
594
595  const TargetRegisterClass *SrcRC =
596    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
597  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
598    return false;
599
600  const unsigned SrcFlags = getUndefRegState(Src.isUndef());
601
602  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
603  // source, and this relies on the fact that the same subregister indices are
604  // used for both.
605  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
606  for (int I = 0, E = NumDst; I != E; ++I) {
607    MachineOperand &Dst = MI.getOperand(I);
608    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
609      .addReg(SrcReg, SrcFlags, SubRegs[I]);
610
611    const TargetRegisterClass *DstRC =
612      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
613    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
614      return false;
615  }
616
617  MI.eraseFromParent();
618  return true;
619}
620
621static bool isZero(Register Reg, const MachineRegisterInfo &MRI) {
622  int64_t Val;
623  return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0;
624}
625
626bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
627  MachineInstr &MI) const {
628  if (selectImpl(MI, *CoverageInfo))
629    return true;
630
631  const LLT S32 = LLT::scalar(32);
632  const LLT V2S16 = LLT::vector(2, 16);
633
634  Register Dst = MI.getOperand(0).getReg();
635  if (MRI->getType(Dst) != V2S16)
636    return false;
637
638  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
639  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
640    return false;
641
642  Register Src0 = MI.getOperand(1).getReg();
643  Register Src1 = MI.getOperand(2).getReg();
644  if (MRI->getType(Src0) != S32)
645    return false;
646
647  const DebugLoc &DL = MI.getDebugLoc();
648  MachineBasicBlock *BB = MI.getParent();
649
650  // TODO: This should probably be a combine somewhere
651  // (build_vector_trunc $src0, undef -> copy $src0
652  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
653  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
654    MI.setDesc(TII.get(AMDGPU::COPY));
655    MI.RemoveOperand(2);
656    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
657           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
658  }
659
660  Register ShiftSrc0;
661  Register ShiftSrc1;
662  int64_t ShiftAmt;
663
664  // With multiple uses of the shift, this will duplicate the shift and
665  // increase register pressure.
666  //
667  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
668  //  => (S_PACK_HH_B32_B16 $src0, $src1)
669  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
670  //  => (S_PACK_LH_B32_B16 $src0, $src1)
671  // (build_vector_trunc $src0, $src1)
672  //  => (S_PACK_LL_B32_B16 $src0, $src1)
673
674  // FIXME: This is an inconvenient way to check a specific value
675  bool Shift0 = mi_match(
676    Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
677    ShiftAmt == 16;
678
679  bool Shift1 = mi_match(
680    Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
681    ShiftAmt == 16;
682
683  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
684  if (Shift0 && Shift1) {
685    Opc = AMDGPU::S_PACK_HH_B32_B16;
686    MI.getOperand(1).setReg(ShiftSrc0);
687    MI.getOperand(2).setReg(ShiftSrc1);
688  } else if (Shift1) {
689    Opc = AMDGPU::S_PACK_LH_B32_B16;
690    MI.getOperand(2).setReg(ShiftSrc1);
691  } else if (Shift0 && isZero(Src1, *MRI)) {
692    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
693    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
694      .addReg(ShiftSrc0)
695      .addImm(16);
696
697    MI.eraseFromParent();
698    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
699  }
700
701  MI.setDesc(TII.get(Opc));
702  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
703}
704
705bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
706  return selectG_ADD_SUB(I);
707}
708
709bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
710  const MachineOperand &MO = I.getOperand(0);
711
712  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
713  // regbank check here is to know why getConstrainedRegClassForOperand failed.
714  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
715  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
716      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
717    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
718    return true;
719  }
720
721  return false;
722}
723
724bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
725  MachineBasicBlock *BB = I.getParent();
726
727  Register DstReg = I.getOperand(0).getReg();
728  Register Src0Reg = I.getOperand(1).getReg();
729  Register Src1Reg = I.getOperand(2).getReg();
730  LLT Src1Ty = MRI->getType(Src1Reg);
731
732  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
733  unsigned InsSize = Src1Ty.getSizeInBits();
734
735  int64_t Offset = I.getOperand(3).getImm();
736
737  // FIXME: These cases should have been illegal and unnecessary to check here.
738  if (Offset % 32 != 0 || InsSize % 32 != 0)
739    return false;
740
741  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
742  if (SubReg == AMDGPU::NoSubRegister)
743    return false;
744
745  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
746  const TargetRegisterClass *DstRC =
747    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
748  if (!DstRC)
749    return false;
750
751  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
752  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
753  const TargetRegisterClass *Src0RC =
754    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
755  const TargetRegisterClass *Src1RC =
756    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
757
758  // Deal with weird cases where the class only partially supports the subreg
759  // index.
760  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
761  if (!Src0RC || !Src1RC)
762    return false;
763
764  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
765      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
766      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
767    return false;
768
769  const DebugLoc &DL = I.getDebugLoc();
770  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
771    .addReg(Src0Reg)
772    .addReg(Src1Reg)
773    .addImm(SubReg);
774
775  I.eraseFromParent();
776  return true;
777}
778
779bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
780  if (STI.getLDSBankCount() != 16)
781    return selectImpl(MI, *CoverageInfo);
782
783  Register Dst = MI.getOperand(0).getReg();
784  Register Src0 = MI.getOperand(2).getReg();
785  Register M0Val = MI.getOperand(6).getReg();
786  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
787      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
788      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
789    return false;
790
791  // This requires 2 instructions. It is possible to write a pattern to support
792  // this, but the generated isel emitter doesn't correctly deal with multiple
793  // output instructions using the same physical register input. The copy to m0
794  // is incorrectly placed before the second instruction.
795  //
796  // TODO: Match source modifiers.
797
798  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
799  const DebugLoc &DL = MI.getDebugLoc();
800  MachineBasicBlock *MBB = MI.getParent();
801
802  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
803    .addReg(M0Val);
804  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
805    .addImm(2)
806    .addImm(MI.getOperand(4).getImm())  // $attr
807    .addImm(MI.getOperand(3).getImm()); // $attrchan
808
809  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
810    .addImm(0)                          // $src0_modifiers
811    .addReg(Src0)                       // $src0
812    .addImm(MI.getOperand(4).getImm())  // $attr
813    .addImm(MI.getOperand(3).getImm())  // $attrchan
814    .addImm(0)                          // $src2_modifiers
815    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
816    .addImm(MI.getOperand(5).getImm())  // $high
817    .addImm(0)                          // $clamp
818    .addImm(0);                         // $omod
819
820  MI.eraseFromParent();
821  return true;
822}
823
824// We need to handle this here because tablegen doesn't support matching
825// instructions with multiple outputs.
826bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
827  Register Dst0 = MI.getOperand(0).getReg();
828  Register Dst1 = MI.getOperand(1).getReg();
829
830  LLT Ty = MRI->getType(Dst0);
831  unsigned Opc;
832  if (Ty == LLT::scalar(32))
833    Opc = AMDGPU::V_DIV_SCALE_F32;
834  else if (Ty == LLT::scalar(64))
835    Opc = AMDGPU::V_DIV_SCALE_F64;
836  else
837    return false;
838
839  const DebugLoc &DL = MI.getDebugLoc();
840  MachineBasicBlock *MBB = MI.getParent();
841
842  Register Numer = MI.getOperand(3).getReg();
843  Register Denom = MI.getOperand(4).getReg();
844  unsigned ChooseDenom = MI.getOperand(5).getImm();
845
846  Register Src0 = ChooseDenom != 0 ? Numer : Denom;
847
848  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
849    .addDef(Dst1)
850    .addUse(Src0)
851    .addUse(Denom)
852    .addUse(Numer);
853
854  MI.eraseFromParent();
855  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
856}
857
858bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
859  unsigned IntrinsicID = I.getIntrinsicID();
860  switch (IntrinsicID) {
861  case Intrinsic::amdgcn_if_break: {
862    MachineBasicBlock *BB = I.getParent();
863
864    // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
865    // SelectionDAG uses for wave32 vs wave64.
866    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
867      .add(I.getOperand(0))
868      .add(I.getOperand(2))
869      .add(I.getOperand(3));
870
871    Register DstReg = I.getOperand(0).getReg();
872    Register Src0Reg = I.getOperand(2).getReg();
873    Register Src1Reg = I.getOperand(3).getReg();
874
875    I.eraseFromParent();
876
877    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
878      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
879
880    return true;
881  }
882  case Intrinsic::amdgcn_interp_p1_f16:
883    return selectInterpP1F16(I);
884  case Intrinsic::amdgcn_wqm:
885    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
886  case Intrinsic::amdgcn_softwqm:
887    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
888  case Intrinsic::amdgcn_wwm:
889    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
890  case Intrinsic::amdgcn_div_scale:
891    return selectDivScale(I);
892  case Intrinsic::amdgcn_icmp:
893    return selectIntrinsicIcmp(I);
894  case Intrinsic::amdgcn_ballot:
895    return selectBallot(I);
896  default:
897    return selectImpl(I, *CoverageInfo);
898  }
899}
900
901static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
902  if (Size != 32 && Size != 64)
903    return -1;
904  switch (P) {
905  default:
906    llvm_unreachable("Unknown condition code!");
907  case CmpInst::ICMP_NE:
908    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
909  case CmpInst::ICMP_EQ:
910    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
911  case CmpInst::ICMP_SGT:
912    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
913  case CmpInst::ICMP_SGE:
914    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
915  case CmpInst::ICMP_SLT:
916    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
917  case CmpInst::ICMP_SLE:
918    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
919  case CmpInst::ICMP_UGT:
920    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
921  case CmpInst::ICMP_UGE:
922    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
923  case CmpInst::ICMP_ULT:
924    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
925  case CmpInst::ICMP_ULE:
926    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
927  }
928}
929
930int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
931                                              unsigned Size) const {
932  if (Size == 64) {
933    if (!STI.hasScalarCompareEq64())
934      return -1;
935
936    switch (P) {
937    case CmpInst::ICMP_NE:
938      return AMDGPU::S_CMP_LG_U64;
939    case CmpInst::ICMP_EQ:
940      return AMDGPU::S_CMP_EQ_U64;
941    default:
942      return -1;
943    }
944  }
945
946  if (Size != 32)
947    return -1;
948
949  switch (P) {
950  case CmpInst::ICMP_NE:
951    return AMDGPU::S_CMP_LG_U32;
952  case CmpInst::ICMP_EQ:
953    return AMDGPU::S_CMP_EQ_U32;
954  case CmpInst::ICMP_SGT:
955    return AMDGPU::S_CMP_GT_I32;
956  case CmpInst::ICMP_SGE:
957    return AMDGPU::S_CMP_GE_I32;
958  case CmpInst::ICMP_SLT:
959    return AMDGPU::S_CMP_LT_I32;
960  case CmpInst::ICMP_SLE:
961    return AMDGPU::S_CMP_LE_I32;
962  case CmpInst::ICMP_UGT:
963    return AMDGPU::S_CMP_GT_U32;
964  case CmpInst::ICMP_UGE:
965    return AMDGPU::S_CMP_GE_U32;
966  case CmpInst::ICMP_ULT:
967    return AMDGPU::S_CMP_LT_U32;
968  case CmpInst::ICMP_ULE:
969    return AMDGPU::S_CMP_LE_U32;
970  default:
971    llvm_unreachable("Unknown condition code!");
972  }
973}
974
975bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
976  MachineBasicBlock *BB = I.getParent();
977  const DebugLoc &DL = I.getDebugLoc();
978
979  Register SrcReg = I.getOperand(2).getReg();
980  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
981
982  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
983
984  Register CCReg = I.getOperand(0).getReg();
985  if (!isVCC(CCReg, *MRI)) {
986    int Opcode = getS_CMPOpcode(Pred, Size);
987    if (Opcode == -1)
988      return false;
989    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
990            .add(I.getOperand(2))
991            .add(I.getOperand(3));
992    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
993      .addReg(AMDGPU::SCC);
994    bool Ret =
995        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
996        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
997    I.eraseFromParent();
998    return Ret;
999  }
1000
1001  int Opcode = getV_CMPOpcode(Pred, Size);
1002  if (Opcode == -1)
1003    return false;
1004
1005  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1006            I.getOperand(0).getReg())
1007            .add(I.getOperand(2))
1008            .add(I.getOperand(3));
1009  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1010                               *TRI.getBoolRC(), *MRI);
1011  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1012  I.eraseFromParent();
1013  return Ret;
1014}
1015
1016bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1017  Register Dst = I.getOperand(0).getReg();
1018  if (isVCC(Dst, *MRI))
1019    return false;
1020
1021  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1022    return false;
1023
1024  MachineBasicBlock *BB = I.getParent();
1025  const DebugLoc &DL = I.getDebugLoc();
1026  Register SrcReg = I.getOperand(2).getReg();
1027  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1028  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1029
1030  int Opcode = getV_CMPOpcode(Pred, Size);
1031  if (Opcode == -1)
1032    return false;
1033
1034  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1035                           .add(I.getOperand(2))
1036                           .add(I.getOperand(3));
1037  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1038                               *MRI);
1039  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1040  I.eraseFromParent();
1041  return Ret;
1042}
1043
1044bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1045  MachineBasicBlock *BB = I.getParent();
1046  const DebugLoc &DL = I.getDebugLoc();
1047  Register DstReg = I.getOperand(0).getReg();
1048  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1049  const bool Is64 = Size == 64;
1050
1051  if (Size != STI.getWavefrontSize())
1052    return false;
1053
1054  Optional<ValueAndVReg> Arg =
1055      getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1056
1057  if (Arg.hasValue()) {
1058    const int64_t Value = Arg.getValue().Value;
1059    if (Value == 0) {
1060      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1061      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1062    } else if (Value == -1) { // all ones
1063      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1064      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1065    } else
1066      return false;
1067  } else {
1068    Register SrcReg = I.getOperand(2).getReg();
1069    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1070  }
1071
1072  I.eraseFromParent();
1073  return true;
1074}
1075
1076bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1077  // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
1078  // SelectionDAG uses for wave32 vs wave64.
1079  MachineBasicBlock *BB = MI.getParent();
1080  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1081      .add(MI.getOperand(1));
1082
1083  Register Reg = MI.getOperand(1).getReg();
1084  MI.eraseFromParent();
1085
1086  if (!MRI->getRegClassOrNull(Reg))
1087    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1088  return true;
1089}
1090
1091static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
1092  switch (MF.getFunction().getCallingConv()) {
1093  case CallingConv::AMDGPU_PS:
1094    return 1;
1095  case CallingConv::AMDGPU_VS:
1096    return 2;
1097  case CallingConv::AMDGPU_GS:
1098    return 3;
1099  case CallingConv::AMDGPU_HS:
1100  case CallingConv::AMDGPU_LS:
1101  case CallingConv::AMDGPU_ES:
1102    report_fatal_error("ds_ordered_count unsupported for this calling conv");
1103  case CallingConv::AMDGPU_CS:
1104  case CallingConv::AMDGPU_KERNEL:
1105  case CallingConv::C:
1106  case CallingConv::Fast:
1107  default:
1108    // Assume other calling conventions are various compute callable functions
1109    return 0;
1110  }
1111}
1112
1113bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1114  MachineInstr &MI, Intrinsic::ID IntrID) const {
1115  MachineBasicBlock *MBB = MI.getParent();
1116  MachineFunction *MF = MBB->getParent();
1117  const DebugLoc &DL = MI.getDebugLoc();
1118
1119  unsigned IndexOperand = MI.getOperand(7).getImm();
1120  bool WaveRelease = MI.getOperand(8).getImm() != 0;
1121  bool WaveDone = MI.getOperand(9).getImm() != 0;
1122
1123  if (WaveDone && !WaveRelease)
1124    report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1125
1126  unsigned OrderedCountIndex = IndexOperand & 0x3f;
1127  IndexOperand &= ~0x3f;
1128  unsigned CountDw = 0;
1129
1130  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1131    CountDw = (IndexOperand >> 24) & 0xf;
1132    IndexOperand &= ~(0xf << 24);
1133
1134    if (CountDw < 1 || CountDw > 4) {
1135      report_fatal_error(
1136        "ds_ordered_count: dword count must be between 1 and 4");
1137    }
1138  }
1139
1140  if (IndexOperand)
1141    report_fatal_error("ds_ordered_count: bad index operand");
1142
1143  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1144  unsigned ShaderType = getDSShaderTypeValue(*MF);
1145
1146  unsigned Offset0 = OrderedCountIndex << 2;
1147  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1148                     (Instruction << 4);
1149
1150  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1151    Offset1 |= (CountDw - 1) << 6;
1152
1153  unsigned Offset = Offset0 | (Offset1 << 8);
1154
1155  Register M0Val = MI.getOperand(2).getReg();
1156  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1157    .addReg(M0Val);
1158
1159  Register DstReg = MI.getOperand(0).getReg();
1160  Register ValReg = MI.getOperand(3).getReg();
1161  MachineInstrBuilder DS =
1162    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1163      .addReg(ValReg)
1164      .addImm(Offset)
1165      .cloneMemRefs(MI);
1166
1167  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1168    return false;
1169
1170  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1171  MI.eraseFromParent();
1172  return Ret;
1173}
1174
1175static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1176  switch (IntrID) {
1177  case Intrinsic::amdgcn_ds_gws_init:
1178    return AMDGPU::DS_GWS_INIT;
1179  case Intrinsic::amdgcn_ds_gws_barrier:
1180    return AMDGPU::DS_GWS_BARRIER;
1181  case Intrinsic::amdgcn_ds_gws_sema_v:
1182    return AMDGPU::DS_GWS_SEMA_V;
1183  case Intrinsic::amdgcn_ds_gws_sema_br:
1184    return AMDGPU::DS_GWS_SEMA_BR;
1185  case Intrinsic::amdgcn_ds_gws_sema_p:
1186    return AMDGPU::DS_GWS_SEMA_P;
1187  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1188    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1189  default:
1190    llvm_unreachable("not a gws intrinsic");
1191  }
1192}
1193
1194bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1195                                                     Intrinsic::ID IID) const {
1196  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1197      !STI.hasGWSSemaReleaseAll())
1198    return false;
1199
1200  // intrinsic ID, vsrc, offset
1201  const bool HasVSrc = MI.getNumOperands() == 3;
1202  assert(HasVSrc || MI.getNumOperands() == 2);
1203
1204  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1205  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1206  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1207    return false;
1208
1209  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1210  assert(OffsetDef);
1211
1212  unsigned ImmOffset;
1213
1214  MachineBasicBlock *MBB = MI.getParent();
1215  const DebugLoc &DL = MI.getDebugLoc();
1216
1217  MachineInstr *Readfirstlane = nullptr;
1218
1219  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1220  // incoming offset, in case there's an add of a constant. We'll have to put it
1221  // back later.
1222  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1223    Readfirstlane = OffsetDef;
1224    BaseOffset = OffsetDef->getOperand(1).getReg();
1225    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1226  }
1227
1228  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1229    // If we have a constant offset, try to use the 0 in m0 as the base.
1230    // TODO: Look into changing the default m0 initialization value. If the
1231    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1232    // the immediate offset.
1233
1234    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1235    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1236      .addImm(0);
1237  } else {
1238    std::tie(BaseOffset, ImmOffset, OffsetDef)
1239      = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1240
1241    if (Readfirstlane) {
1242      // We have the constant offset now, so put the readfirstlane back on the
1243      // variable component.
1244      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1245        return false;
1246
1247      Readfirstlane->getOperand(1).setReg(BaseOffset);
1248      BaseOffset = Readfirstlane->getOperand(0).getReg();
1249    } else {
1250      if (!RBI.constrainGenericRegister(BaseOffset,
1251                                        AMDGPU::SReg_32RegClass, *MRI))
1252        return false;
1253    }
1254
1255    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1256    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1257      .addReg(BaseOffset)
1258      .addImm(16);
1259
1260    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1261      .addReg(M0Base);
1262  }
1263
1264  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1265  // offset field) % 64. Some versions of the programming guide omit the m0
1266  // part, or claim it's from offset 0.
1267  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1268
1269  if (HasVSrc) {
1270    Register VSrc = MI.getOperand(1).getReg();
1271    MIB.addReg(VSrc);
1272    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1273      return false;
1274  }
1275
1276  MIB.addImm(ImmOffset)
1277     .addImm(-1) // $gds
1278     .cloneMemRefs(MI);
1279
1280  MI.eraseFromParent();
1281  return true;
1282}
1283
1284bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1285                                                      bool IsAppend) const {
1286  Register PtrBase = MI.getOperand(2).getReg();
1287  LLT PtrTy = MRI->getType(PtrBase);
1288  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1289
1290  unsigned Offset;
1291  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1292
1293  // TODO: Should this try to look through readfirstlane like GWS?
1294  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1295    PtrBase = MI.getOperand(2).getReg();
1296    Offset = 0;
1297  }
1298
1299  MachineBasicBlock *MBB = MI.getParent();
1300  const DebugLoc &DL = MI.getDebugLoc();
1301  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1302
1303  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1304    .addReg(PtrBase);
1305  BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1306    .addImm(Offset)
1307    .addImm(IsGDS ? -1 : 0)
1308    .cloneMemRefs(MI);
1309  MI.eraseFromParent();
1310  return true;
1311}
1312
1313static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1314                         bool &IsTexFail) {
1315  if (TexFailCtrl)
1316    IsTexFail = true;
1317
1318  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1319  TexFailCtrl &= ~(uint64_t)0x1;
1320  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1321  TexFailCtrl &= ~(uint64_t)0x2;
1322
1323  return TexFailCtrl == 0;
1324}
1325
1326static bool parseCachePolicy(uint64_t Value,
1327                             bool *GLC, bool *SLC, bool *DLC) {
1328  if (GLC) {
1329    *GLC = (Value & 0x1) ? 1 : 0;
1330    Value &= ~(uint64_t)0x1;
1331  }
1332  if (SLC) {
1333    *SLC = (Value & 0x2) ? 1 : 0;
1334    Value &= ~(uint64_t)0x2;
1335  }
1336  if (DLC) {
1337    *DLC = (Value & 0x4) ? 1 : 0;
1338    Value &= ~(uint64_t)0x4;
1339  }
1340
1341  return Value == 0;
1342}
1343
1344bool AMDGPUInstructionSelector::selectImageIntrinsic(
1345  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1346  MachineBasicBlock *MBB = MI.getParent();
1347  const DebugLoc &DL = MI.getDebugLoc();
1348
1349  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1350    AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1351
1352  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1353  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1354      AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1355  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1356      AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1357  unsigned IntrOpcode = Intr->BaseOpcode;
1358  const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1359
1360  const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1361                                             MI.getNumExplicitDefs());
1362  int NumVAddr, NumGradients;
1363  std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1364
1365  Register VDataIn, VDataOut;
1366  LLT VDataTy;
1367  int NumVDataDwords = -1;
1368  bool IsD16 = false;
1369
1370  // XXX - Can we just get the second to last argument for ctrl?
1371  unsigned CtrlIdx; // Index of texfailctrl argument
1372  bool Unorm;
1373  if (!BaseOpcode->Sampler) {
1374    Unorm = true;
1375    CtrlIdx = VAddrIdx + NumVAddr + 1;
1376  } else {
1377    Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1378    CtrlIdx = VAddrIdx + NumVAddr + 3;
1379  }
1380
1381  bool TFE;
1382  bool LWE;
1383  bool IsTexFail = false;
1384  if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1385    return false;
1386
1387  const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1388  const bool IsA16 = (Flags & 1) != 0;
1389  const bool IsG16 = (Flags & 2) != 0;
1390
1391  // A16 implies 16 bit gradients
1392  if (IsA16 && !IsG16)
1393    return false;
1394
1395  unsigned DMask = 0;
1396  unsigned DMaskLanes = 0;
1397
1398  if (BaseOpcode->Atomic) {
1399    VDataOut = MI.getOperand(0).getReg();
1400    VDataIn = MI.getOperand(2).getReg();
1401    LLT Ty = MRI->getType(VDataIn);
1402
1403    // Be careful to allow atomic swap on 16-bit element vectors.
1404    const bool Is64Bit = BaseOpcode->AtomicX2 ?
1405      Ty.getSizeInBits() == 128 :
1406      Ty.getSizeInBits() == 64;
1407
1408    if (BaseOpcode->AtomicX2) {
1409      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1410
1411      DMask = Is64Bit ? 0xf : 0x3;
1412      NumVDataDwords = Is64Bit ? 4 : 2;
1413    } else {
1414      DMask = Is64Bit ? 0x3 : 0x1;
1415      NumVDataDwords = Is64Bit ? 2 : 1;
1416    }
1417  } else {
1418    const int DMaskIdx = 2; // Input/output + intrinsic ID.
1419
1420    DMask = MI.getOperand(DMaskIdx).getImm();
1421    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1422
1423    if (BaseOpcode->Store) {
1424      VDataIn = MI.getOperand(1).getReg();
1425      VDataTy = MRI->getType(VDataIn);
1426      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1427    } else {
1428      VDataOut = MI.getOperand(0).getReg();
1429      VDataTy = MRI->getType(VDataOut);
1430      NumVDataDwords = DMaskLanes;
1431
1432      // One memoperand is mandatory, except for getresinfo.
1433      // FIXME: Check this in verifier.
1434      if (!MI.memoperands_empty()) {
1435        const MachineMemOperand *MMO = *MI.memoperands_begin();
1436
1437        // Infer d16 from the memory size, as the register type will be mangled by
1438        // unpacked subtargets, or by TFE.
1439        IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1440
1441        if (IsD16 && !STI.hasUnpackedD16VMem())
1442          NumVDataDwords = (DMaskLanes + 1) / 2;
1443      }
1444    }
1445  }
1446
1447  // Optimize _L to _LZ when _L is zero
1448  if (LZMappingInfo) {
1449    // The legalizer replaced the register with an immediate 0 if we need to
1450    // change the opcode.
1451    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1452    if (Lod.isImm()) {
1453      assert(Lod.getImm() == 0);
1454      IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1455    }
1456  }
1457
1458  // Optimize _mip away, when 'lod' is zero
1459  if (MIPMappingInfo) {
1460    const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1461    if (Lod.isImm()) {
1462      assert(Lod.getImm() == 0);
1463      IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1464    }
1465  }
1466
1467  // Set G16 opcode
1468  if (IsG16 && !IsA16) {
1469    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1470        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1471    assert(G16MappingInfo);
1472    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1473  }
1474
1475  // TODO: Check this in verifier.
1476  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1477
1478  bool GLC = false;
1479  bool SLC = false;
1480  bool DLC = false;
1481  if (BaseOpcode->Atomic) {
1482    GLC = true; // TODO no-return optimization
1483    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1484                          IsGFX10 ? &DLC : nullptr))
1485      return false;
1486  } else {
1487    if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1488                          IsGFX10 ? &DLC : nullptr))
1489      return false;
1490  }
1491
1492  int NumVAddrRegs = 0;
1493  int NumVAddrDwords = 0;
1494  for (int I = 0; I < NumVAddr; ++I) {
1495    // Skip the $noregs and 0s inserted during legalization.
1496    MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1497    if (!AddrOp.isReg())
1498      continue; // XXX - Break?
1499
1500    Register Addr = AddrOp.getReg();
1501    if (!Addr)
1502      break;
1503
1504    ++NumVAddrRegs;
1505    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1506  }
1507
1508  // The legalizer preprocessed the intrinsic arguments. If we aren't using
1509  // NSA, these should have beeen packed into a single value in the first
1510  // address register
1511  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1512  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1513    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1514    return false;
1515  }
1516
1517  if (IsTexFail)
1518    ++NumVDataDwords;
1519
1520  int Opcode = -1;
1521  if (IsGFX10) {
1522    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1523                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
1524                                          : AMDGPU::MIMGEncGfx10Default,
1525                                   NumVDataDwords, NumVAddrDwords);
1526  } else {
1527    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1528      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1529                                     NumVDataDwords, NumVAddrDwords);
1530    if (Opcode == -1)
1531      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1532                                     NumVDataDwords, NumVAddrDwords);
1533  }
1534  assert(Opcode != -1);
1535
1536  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1537    .cloneMemRefs(MI);
1538
1539  if (VDataOut) {
1540    if (BaseOpcode->AtomicX2) {
1541      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1542
1543      Register TmpReg = MRI->createVirtualRegister(
1544        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1545      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1546
1547      MIB.addDef(TmpReg);
1548      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1549        .addReg(TmpReg, RegState::Kill, SubReg);
1550
1551    } else {
1552      MIB.addDef(VDataOut); // vdata output
1553    }
1554  }
1555
1556  if (VDataIn)
1557    MIB.addReg(VDataIn); // vdata input
1558
1559  for (int i = 0; i != NumVAddrRegs; ++i) {
1560    MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1561    if (SrcOp.isReg()) {
1562      assert(SrcOp.getReg() != 0);
1563      MIB.addReg(SrcOp.getReg());
1564    }
1565  }
1566
1567  MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1568  if (BaseOpcode->Sampler)
1569    MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1570
1571  MIB.addImm(DMask); // dmask
1572
1573  if (IsGFX10)
1574    MIB.addImm(DimInfo->Encoding);
1575  MIB.addImm(Unorm);
1576  if (IsGFX10)
1577    MIB.addImm(DLC);
1578
1579  MIB.addImm(GLC);
1580  MIB.addImm(SLC);
1581  MIB.addImm(IsA16 &&  // a16 or r128
1582             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1583  if (IsGFX10)
1584    MIB.addImm(IsA16 ? -1 : 0);
1585
1586  MIB.addImm(TFE); // tfe
1587  MIB.addImm(LWE); // lwe
1588  if (!IsGFX10)
1589    MIB.addImm(DimInfo->DA ? -1 : 0);
1590  if (BaseOpcode->HasD16)
1591    MIB.addImm(IsD16 ? -1 : 0);
1592
1593  MI.eraseFromParent();
1594  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1595}
1596
1597bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1598    MachineInstr &I) const {
1599  unsigned IntrinsicID = I.getIntrinsicID();
1600  switch (IntrinsicID) {
1601  case Intrinsic::amdgcn_end_cf:
1602    return selectEndCfIntrinsic(I);
1603  case Intrinsic::amdgcn_ds_ordered_add:
1604  case Intrinsic::amdgcn_ds_ordered_swap:
1605    return selectDSOrderedIntrinsic(I, IntrinsicID);
1606  case Intrinsic::amdgcn_ds_gws_init:
1607  case Intrinsic::amdgcn_ds_gws_barrier:
1608  case Intrinsic::amdgcn_ds_gws_sema_v:
1609  case Intrinsic::amdgcn_ds_gws_sema_br:
1610  case Intrinsic::amdgcn_ds_gws_sema_p:
1611  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1612    return selectDSGWSIntrinsic(I, IntrinsicID);
1613  case Intrinsic::amdgcn_ds_append:
1614    return selectDSAppendConsume(I, true);
1615  case Intrinsic::amdgcn_ds_consume:
1616    return selectDSAppendConsume(I, false);
1617  default: {
1618    return selectImpl(I, *CoverageInfo);
1619  }
1620  }
1621}
1622
1623bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1624  if (selectImpl(I, *CoverageInfo))
1625    return true;
1626
1627  MachineBasicBlock *BB = I.getParent();
1628  const DebugLoc &DL = I.getDebugLoc();
1629
1630  Register DstReg = I.getOperand(0).getReg();
1631  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1632  assert(Size <= 32 || Size == 64);
1633  const MachineOperand &CCOp = I.getOperand(1);
1634  Register CCReg = CCOp.getReg();
1635  if (!isVCC(CCReg, *MRI)) {
1636    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1637                                         AMDGPU::S_CSELECT_B32;
1638    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1639            .addReg(CCReg);
1640
1641    // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1642    // bank, because it does not cover the register class that we used to represent
1643    // for it.  So we need to manually set the register class here.
1644    if (!MRI->getRegClassOrNull(CCReg))
1645        MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1646    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1647            .add(I.getOperand(2))
1648            .add(I.getOperand(3));
1649
1650    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1651               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1652    I.eraseFromParent();
1653    return Ret;
1654  }
1655
1656  // Wide VGPR select should have been split in RegBankSelect.
1657  if (Size > 32)
1658    return false;
1659
1660  MachineInstr *Select =
1661      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1662              .addImm(0)
1663              .add(I.getOperand(3))
1664              .addImm(0)
1665              .add(I.getOperand(2))
1666              .add(I.getOperand(1));
1667
1668  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1669  I.eraseFromParent();
1670  return Ret;
1671}
1672
1673bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
1674  initM0(I);
1675  return selectImpl(I, *CoverageInfo);
1676}
1677
1678static int sizeToSubRegIndex(unsigned Size) {
1679  switch (Size) {
1680  case 32:
1681    return AMDGPU::sub0;
1682  case 64:
1683    return AMDGPU::sub0_sub1;
1684  case 96:
1685    return AMDGPU::sub0_sub1_sub2;
1686  case 128:
1687    return AMDGPU::sub0_sub1_sub2_sub3;
1688  case 256:
1689    return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1690  default:
1691    if (Size < 32)
1692      return AMDGPU::sub0;
1693    if (Size > 256)
1694      return -1;
1695    return sizeToSubRegIndex(PowerOf2Ceil(Size));
1696  }
1697}
1698
1699bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1700  Register DstReg = I.getOperand(0).getReg();
1701  Register SrcReg = I.getOperand(1).getReg();
1702  const LLT DstTy = MRI->getType(DstReg);
1703  const LLT SrcTy = MRI->getType(SrcReg);
1704  const LLT S1 = LLT::scalar(1);
1705
1706  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1707  const RegisterBank *DstRB;
1708  if (DstTy == S1) {
1709    // This is a special case. We don't treat s1 for legalization artifacts as
1710    // vcc booleans.
1711    DstRB = SrcRB;
1712  } else {
1713    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1714    if (SrcRB != DstRB)
1715      return false;
1716  }
1717
1718  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1719
1720  unsigned DstSize = DstTy.getSizeInBits();
1721  unsigned SrcSize = SrcTy.getSizeInBits();
1722
1723  const TargetRegisterClass *SrcRC
1724    = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1725  const TargetRegisterClass *DstRC
1726    = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1727  if (!SrcRC || !DstRC)
1728    return false;
1729
1730  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1731      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1732    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1733    return false;
1734  }
1735
1736  if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1737    MachineBasicBlock *MBB = I.getParent();
1738    const DebugLoc &DL = I.getDebugLoc();
1739
1740    Register LoReg = MRI->createVirtualRegister(DstRC);
1741    Register HiReg = MRI->createVirtualRegister(DstRC);
1742    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1743      .addReg(SrcReg, 0, AMDGPU::sub0);
1744    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1745      .addReg(SrcReg, 0, AMDGPU::sub1);
1746
1747    if (IsVALU && STI.hasSDWA()) {
1748      // Write the low 16-bits of the high element into the high 16-bits of the
1749      // low element.
1750      MachineInstr *MovSDWA =
1751        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1752        .addImm(0)                             // $src0_modifiers
1753        .addReg(HiReg)                         // $src0
1754        .addImm(0)                             // $clamp
1755        .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1756        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1757        .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1758        .addReg(LoReg, RegState::Implicit);
1759      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1760    } else {
1761      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1762      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1763      Register ImmReg = MRI->createVirtualRegister(DstRC);
1764      if (IsVALU) {
1765        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1766          .addImm(16)
1767          .addReg(HiReg);
1768      } else {
1769        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1770          .addReg(HiReg)
1771          .addImm(16);
1772      }
1773
1774      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1775      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1776      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1777
1778      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1779        .addImm(0xffff);
1780      BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1781        .addReg(LoReg)
1782        .addReg(ImmReg);
1783      BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1784        .addReg(TmpReg0)
1785        .addReg(TmpReg1);
1786    }
1787
1788    I.eraseFromParent();
1789    return true;
1790  }
1791
1792  if (!DstTy.isScalar())
1793    return false;
1794
1795  if (SrcSize > 32) {
1796    int SubRegIdx = sizeToSubRegIndex(DstSize);
1797    if (SubRegIdx == -1)
1798      return false;
1799
1800    // Deal with weird cases where the class only partially supports the subreg
1801    // index.
1802    const TargetRegisterClass *SrcWithSubRC
1803      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1804    if (!SrcWithSubRC)
1805      return false;
1806
1807    if (SrcWithSubRC != SrcRC) {
1808      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1809        return false;
1810    }
1811
1812    I.getOperand(1).setSubReg(SubRegIdx);
1813  }
1814
1815  I.setDesc(TII.get(TargetOpcode::COPY));
1816  return true;
1817}
1818
1819/// \returns true if a bitmask for \p Size bits will be an inline immediate.
1820static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1821  Mask = maskTrailingOnes<unsigned>(Size);
1822  int SignedMask = static_cast<int>(Mask);
1823  return SignedMask >= -16 && SignedMask <= 64;
1824}
1825
1826// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1827const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1828  Register Reg, const MachineRegisterInfo &MRI,
1829  const TargetRegisterInfo &TRI) const {
1830  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1831  if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1832    return RB;
1833
1834  // Ignore the type, since we don't use vcc in artifacts.
1835  if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1836    return &RBI.getRegBankFromRegClass(*RC, LLT());
1837  return nullptr;
1838}
1839
1840bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1841  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1842  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1843  const DebugLoc &DL = I.getDebugLoc();
1844  MachineBasicBlock &MBB = *I.getParent();
1845  const Register DstReg = I.getOperand(0).getReg();
1846  const Register SrcReg = I.getOperand(1).getReg();
1847
1848  const LLT DstTy = MRI->getType(DstReg);
1849  const LLT SrcTy = MRI->getType(SrcReg);
1850  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1851    I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1852  const unsigned DstSize = DstTy.getSizeInBits();
1853  if (!DstTy.isScalar())
1854    return false;
1855
1856  if (I.getOpcode() == AMDGPU::G_ANYEXT)
1857    return selectCOPY(I);
1858
1859  // Artifact casts should never use vcc.
1860  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1861
1862  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1863    // 64-bit should have been split up in RegBankSelect
1864
1865    // Try to use an and with a mask if it will save code size.
1866    unsigned Mask;
1867    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1868      MachineInstr *ExtI =
1869      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
1870        .addImm(Mask)
1871        .addReg(SrcReg);
1872      I.eraseFromParent();
1873      return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1874    }
1875
1876    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
1877    MachineInstr *ExtI =
1878      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
1879      .addReg(SrcReg)
1880      .addImm(0) // Offset
1881      .addImm(SrcSize); // Width
1882    I.eraseFromParent();
1883    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1884  }
1885
1886  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
1887    const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
1888      AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
1889    if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
1890      return false;
1891
1892    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
1893      const unsigned SextOpc = SrcSize == 8 ?
1894        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
1895      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
1896        .addReg(SrcReg);
1897      I.eraseFromParent();
1898      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1899    }
1900
1901    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
1902    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
1903
1904    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
1905    if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
1906      // We need a 64-bit register source, but the high bits don't matter.
1907      Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
1908      Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1909      unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
1910
1911      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1912      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
1913        .addReg(SrcReg, 0, SubReg)
1914        .addImm(AMDGPU::sub0)
1915        .addReg(UndefReg)
1916        .addImm(AMDGPU::sub1);
1917
1918      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
1919        .addReg(ExtReg)
1920        .addImm(SrcSize << 16);
1921
1922      I.eraseFromParent();
1923      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
1924    }
1925
1926    unsigned Mask;
1927    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1928      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
1929        .addReg(SrcReg)
1930        .addImm(Mask);
1931    } else {
1932      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
1933        .addReg(SrcReg)
1934        .addImm(SrcSize << 16);
1935    }
1936
1937    I.eraseFromParent();
1938    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1939  }
1940
1941  return false;
1942}
1943
1944bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
1945  MachineBasicBlock *BB = I.getParent();
1946  MachineOperand &ImmOp = I.getOperand(1);
1947
1948  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
1949  if (ImmOp.isFPImm()) {
1950    const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
1951    ImmOp.ChangeToImmediate(Imm.getZExtValue());
1952  } else if (ImmOp.isCImm()) {
1953    ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
1954  }
1955
1956  Register DstReg = I.getOperand(0).getReg();
1957  unsigned Size;
1958  bool IsSgpr;
1959  const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
1960  if (RB) {
1961    IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
1962    Size = MRI->getType(DstReg).getSizeInBits();
1963  } else {
1964    const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
1965    IsSgpr = TRI.isSGPRClass(RC);
1966    Size = TRI.getRegSizeInBits(*RC);
1967  }
1968
1969  if (Size != 32 && Size != 64)
1970    return false;
1971
1972  unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1973  if (Size == 32) {
1974    I.setDesc(TII.get(Opcode));
1975    I.addImplicitDefUseOperands(*MF);
1976    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
1977  }
1978
1979  const DebugLoc &DL = I.getDebugLoc();
1980
1981  APInt Imm(Size, I.getOperand(1).getImm());
1982
1983  MachineInstr *ResInst;
1984  if (IsSgpr && TII.isInlineConstant(Imm)) {
1985    ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1986      .addImm(I.getOperand(1).getImm());
1987  } else {
1988    const TargetRegisterClass *RC = IsSgpr ?
1989      &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
1990    Register LoReg = MRI->createVirtualRegister(RC);
1991    Register HiReg = MRI->createVirtualRegister(RC);
1992
1993    BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
1994      .addImm(Imm.trunc(32).getZExtValue());
1995
1996    BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
1997      .addImm(Imm.ashr(32).getZExtValue());
1998
1999    ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2000      .addReg(LoReg)
2001      .addImm(AMDGPU::sub0)
2002      .addReg(HiReg)
2003      .addImm(AMDGPU::sub1);
2004  }
2005
2006  // We can't call constrainSelectedInstRegOperands here, because it doesn't
2007  // work for target independent opcodes
2008  I.eraseFromParent();
2009  const TargetRegisterClass *DstRC =
2010    TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2011  if (!DstRC)
2012    return true;
2013  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2014}
2015
2016bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2017  // Only manually handle the f64 SGPR case.
2018  //
2019  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2020  // the bit ops theoretically have a second result due to the implicit def of
2021  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2022  // that is easy by disabling the check. The result works, but uses a
2023  // nonsensical sreg32orlds_and_sreg_1 regclass.
2024  //
2025  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2026  // the variadic REG_SEQUENCE operands.
2027
2028  Register Dst = MI.getOperand(0).getReg();
2029  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2030  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2031      MRI->getType(Dst) != LLT::scalar(64))
2032    return false;
2033
2034  Register Src = MI.getOperand(1).getReg();
2035  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2036  if (Fabs)
2037    Src = Fabs->getOperand(1).getReg();
2038
2039  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2040      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2041    return false;
2042
2043  MachineBasicBlock *BB = MI.getParent();
2044  const DebugLoc &DL = MI.getDebugLoc();
2045  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2046  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2047  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2048  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2049
2050  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2051    .addReg(Src, 0, AMDGPU::sub0);
2052  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2053    .addReg(Src, 0, AMDGPU::sub1);
2054  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2055    .addImm(0x80000000);
2056
2057  // Set or toggle sign bit.
2058  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2059  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2060    .addReg(HiReg)
2061    .addReg(ConstReg);
2062  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2063    .addReg(LoReg)
2064    .addImm(AMDGPU::sub0)
2065    .addReg(OpReg)
2066    .addImm(AMDGPU::sub1);
2067  MI.eraseFromParent();
2068  return true;
2069}
2070
2071// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2072bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2073  Register Dst = MI.getOperand(0).getReg();
2074  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2075  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2076      MRI->getType(Dst) != LLT::scalar(64))
2077    return false;
2078
2079  Register Src = MI.getOperand(1).getReg();
2080  MachineBasicBlock *BB = MI.getParent();
2081  const DebugLoc &DL = MI.getDebugLoc();
2082  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2083  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2084  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2085  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2086
2087  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2088      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2089    return false;
2090
2091  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2092    .addReg(Src, 0, AMDGPU::sub0);
2093  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2094    .addReg(Src, 0, AMDGPU::sub1);
2095  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2096    .addImm(0x7fffffff);
2097
2098  // Clear sign bit.
2099  // TODO: Should this used S_BITSET0_*?
2100  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2101    .addReg(HiReg)
2102    .addReg(ConstReg);
2103  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2104    .addReg(LoReg)
2105    .addImm(AMDGPU::sub0)
2106    .addReg(OpReg)
2107    .addImm(AMDGPU::sub1);
2108
2109  MI.eraseFromParent();
2110  return true;
2111}
2112
2113static bool isConstant(const MachineInstr &MI) {
2114  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2115}
2116
2117void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2118    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2119
2120  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2121
2122  assert(PtrMI);
2123
2124  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2125    return;
2126
2127  GEPInfo GEPInfo(*PtrMI);
2128
2129  for (unsigned i = 1; i != 3; ++i) {
2130    const MachineOperand &GEPOp = PtrMI->getOperand(i);
2131    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2132    assert(OpDef);
2133    if (i == 2 && isConstant(*OpDef)) {
2134      // TODO: Could handle constant base + variable offset, but a combine
2135      // probably should have commuted it.
2136      assert(GEPInfo.Imm == 0);
2137      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2138      continue;
2139    }
2140    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2141    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2142      GEPInfo.SgprParts.push_back(GEPOp.getReg());
2143    else
2144      GEPInfo.VgprParts.push_back(GEPOp.getReg());
2145  }
2146
2147  AddrInfo.push_back(GEPInfo);
2148  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2149}
2150
2151bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2152  if (!MI.hasOneMemOperand())
2153    return false;
2154
2155  const MachineMemOperand *MMO = *MI.memoperands_begin();
2156  const Value *Ptr = MMO->getValue();
2157
2158  // UndefValue means this is a load of a kernel input.  These are uniform.
2159  // Sometimes LDS instructions have constant pointers.
2160  // If Ptr is null, then that means this mem operand contains a
2161  // PseudoSourceValue like GOT.
2162  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2163      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2164    return true;
2165
2166  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2167    return true;
2168
2169  const Instruction *I = dyn_cast<Instruction>(Ptr);
2170  return I && I->getMetadata("amdgpu.uniform");
2171}
2172
2173bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2174  for (const GEPInfo &GEPInfo : AddrInfo) {
2175    if (!GEPInfo.VgprParts.empty())
2176      return true;
2177  }
2178  return false;
2179}
2180
2181void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2182  MachineBasicBlock *BB = I.getParent();
2183
2184  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2185  unsigned AS = PtrTy.getAddressSpace();
2186  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2187      STI.ldsRequiresM0Init()) {
2188    // If DS instructions require M0 initializtion, insert it before selecting.
2189    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2190      .addImm(-1);
2191  }
2192}
2193
2194bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
2195  initM0(I);
2196  return selectImpl(I, *CoverageInfo);
2197}
2198
2199// TODO: No rtn optimization.
2200bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2201  MachineInstr &MI) const {
2202  Register PtrReg = MI.getOperand(1).getReg();
2203  const LLT PtrTy = MRI->getType(PtrReg);
2204  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2205      STI.useFlatForGlobal())
2206    return selectImpl(MI, *CoverageInfo);
2207
2208  Register DstReg = MI.getOperand(0).getReg();
2209  const LLT Ty = MRI->getType(DstReg);
2210  const bool Is64 = Ty.getSizeInBits() == 64;
2211  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2212  Register TmpReg = MRI->createVirtualRegister(
2213    Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2214
2215  const DebugLoc &DL = MI.getDebugLoc();
2216  MachineBasicBlock *BB = MI.getParent();
2217
2218  Register VAddr, RSrcReg, SOffset;
2219  int64_t Offset = 0;
2220
2221  unsigned Opcode;
2222  if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2223    Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2224                             AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2225  } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2226                                   RSrcReg, SOffset, Offset)) {
2227    Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2228                    AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2229  } else
2230    return selectImpl(MI, *CoverageInfo);
2231
2232  auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2233    .addReg(MI.getOperand(2).getReg());
2234
2235  if (VAddr)
2236    MIB.addReg(VAddr);
2237
2238  MIB.addReg(RSrcReg);
2239  if (SOffset)
2240    MIB.addReg(SOffset);
2241  else
2242    MIB.addImm(0);
2243
2244  MIB.addImm(Offset);
2245  MIB.addImm(0); // slc
2246  MIB.cloneMemRefs(MI);
2247
2248  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2249    .addReg(TmpReg, RegState::Kill, SubReg);
2250
2251  MI.eraseFromParent();
2252
2253  MRI->setRegClass(
2254    DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2255  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2256}
2257
2258bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2259  MachineBasicBlock *BB = I.getParent();
2260  MachineOperand &CondOp = I.getOperand(0);
2261  Register CondReg = CondOp.getReg();
2262  const DebugLoc &DL = I.getDebugLoc();
2263
2264  unsigned BrOpcode;
2265  Register CondPhysReg;
2266  const TargetRegisterClass *ConstrainRC;
2267
2268  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2269  // whether the branch is uniform when selecting the instruction. In
2270  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2271  // RegBankSelect knows what it's doing if the branch condition is scc, even
2272  // though it currently does not.
2273  if (!isVCC(CondReg, *MRI)) {
2274    if (MRI->getType(CondReg) != LLT::scalar(32))
2275      return false;
2276
2277    CondPhysReg = AMDGPU::SCC;
2278    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2279    // FIXME: Hack for isSCC tests
2280    ConstrainRC = &AMDGPU::SGPR_32RegClass;
2281  } else {
2282    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2283    // We sort of know that a VCC producer based on the register bank, that ands
2284    // inactive lanes with 0. What if there was a logical operation with vcc
2285    // producers in different blocks/with different exec masks?
2286    // FIXME: Should scc->vcc copies and with exec?
2287    CondPhysReg = TRI.getVCC();
2288    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2289    ConstrainRC = TRI.getBoolRC();
2290  }
2291
2292  if (!MRI->getRegClassOrNull(CondReg))
2293    MRI->setRegClass(CondReg, ConstrainRC);
2294
2295  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2296    .addReg(CondReg);
2297  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2298    .addMBB(I.getOperand(1).getMBB());
2299
2300  I.eraseFromParent();
2301  return true;
2302}
2303
2304bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
2305  MachineInstr &I) const {
2306  Register DstReg = I.getOperand(0).getReg();
2307  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2308  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2309  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2310  if (IsVGPR)
2311    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2312
2313  return RBI.constrainGenericRegister(
2314    DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2315}
2316
2317bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2318  Register DstReg = I.getOperand(0).getReg();
2319  Register SrcReg = I.getOperand(1).getReg();
2320  Register MaskReg = I.getOperand(2).getReg();
2321  LLT Ty = MRI->getType(DstReg);
2322  LLT MaskTy = MRI->getType(MaskReg);
2323
2324  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2325  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2326  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2327  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2328  if (DstRB != SrcRB) // Should only happen for hand written MIR.
2329    return false;
2330
2331  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2332  const TargetRegisterClass &RegRC
2333    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2334
2335  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2336                                                                  *MRI);
2337  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2338                                                                  *MRI);
2339  const TargetRegisterClass *MaskRC =
2340      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2341
2342  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2343      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2344      !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2345    return false;
2346
2347  MachineBasicBlock *BB = I.getParent();
2348  const DebugLoc &DL = I.getDebugLoc();
2349  if (Ty.getSizeInBits() == 32) {
2350    assert(MaskTy.getSizeInBits() == 32 &&
2351           "ptrmask should have been narrowed during legalize");
2352
2353    BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2354      .addReg(SrcReg)
2355      .addReg(MaskReg);
2356    I.eraseFromParent();
2357    return true;
2358  }
2359
2360  Register HiReg = MRI->createVirtualRegister(&RegRC);
2361  Register LoReg = MRI->createVirtualRegister(&RegRC);
2362
2363  // Extract the subregisters from the source pointer.
2364  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2365    .addReg(SrcReg, 0, AMDGPU::sub0);
2366  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2367    .addReg(SrcReg, 0, AMDGPU::sub1);
2368
2369  Register MaskedLo, MaskedHi;
2370
2371  // Try to avoid emitting a bit operation when we only need to touch half of
2372  // the 64-bit pointer.
2373  APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2374
2375  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2376  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2377  if ((MaskOnes & MaskLo32) == MaskLo32) {
2378    // If all the bits in the low half are 1, we only need a copy for it.
2379    MaskedLo = LoReg;
2380  } else {
2381    // Extract the mask subregister and apply the and.
2382    Register MaskLo = MRI->createVirtualRegister(&RegRC);
2383    MaskedLo = MRI->createVirtualRegister(&RegRC);
2384
2385    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2386      .addReg(MaskReg, 0, AMDGPU::sub0);
2387    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2388      .addReg(LoReg)
2389      .addReg(MaskLo);
2390  }
2391
2392  if ((MaskOnes & MaskHi32) == MaskHi32) {
2393    // If all the bits in the high half are 1, we only need a copy for it.
2394    MaskedHi = HiReg;
2395  } else {
2396    Register MaskHi = MRI->createVirtualRegister(&RegRC);
2397    MaskedHi = MRI->createVirtualRegister(&RegRC);
2398
2399    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2400      .addReg(MaskReg, 0, AMDGPU::sub1);
2401    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2402      .addReg(HiReg)
2403      .addReg(MaskHi);
2404  }
2405
2406  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2407    .addReg(MaskedLo)
2408    .addImm(AMDGPU::sub0)
2409    .addReg(MaskedHi)
2410    .addImm(AMDGPU::sub1);
2411  I.eraseFromParent();
2412  return true;
2413}
2414
2415/// Return the register to use for the index value, and the subregister to use
2416/// for the indirectly accessed register.
2417static std::pair<Register, unsigned>
2418computeIndirectRegIndex(MachineRegisterInfo &MRI,
2419                        const SIRegisterInfo &TRI,
2420                        const TargetRegisterClass *SuperRC,
2421                        Register IdxReg,
2422                        unsigned EltSize) {
2423  Register IdxBaseReg;
2424  int Offset;
2425  MachineInstr *Unused;
2426
2427  std::tie(IdxBaseReg, Offset, Unused)
2428    = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2429  if (IdxBaseReg == AMDGPU::NoRegister) {
2430    // This will happen if the index is a known constant. This should ordinarily
2431    // be legalized out, but handle it as a register just in case.
2432    assert(Offset == 0);
2433    IdxBaseReg = IdxReg;
2434  }
2435
2436  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2437
2438  // Skip out of bounds offsets, or else we would end up using an undefined
2439  // register.
2440  if (static_cast<unsigned>(Offset) >= SubRegs.size())
2441    return std::make_pair(IdxReg, SubRegs[0]);
2442  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2443}
2444
2445bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2446  MachineInstr &MI) const {
2447  Register DstReg = MI.getOperand(0).getReg();
2448  Register SrcReg = MI.getOperand(1).getReg();
2449  Register IdxReg = MI.getOperand(2).getReg();
2450
2451  LLT DstTy = MRI->getType(DstReg);
2452  LLT SrcTy = MRI->getType(SrcReg);
2453
2454  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2455  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2456  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2457
2458  // The index must be scalar. If it wasn't RegBankSelect should have moved this
2459  // into a waterfall loop.
2460  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2461    return false;
2462
2463  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2464                                                                  *MRI);
2465  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2466                                                                  *MRI);
2467  if (!SrcRC || !DstRC)
2468    return false;
2469  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2470      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2471      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2472    return false;
2473
2474  MachineBasicBlock *BB = MI.getParent();
2475  const DebugLoc &DL = MI.getDebugLoc();
2476  const bool Is64 = DstTy.getSizeInBits() == 64;
2477
2478  unsigned SubReg;
2479  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2480                                                     DstTy.getSizeInBits() / 8);
2481
2482  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2483    if (DstTy.getSizeInBits() != 32 && !Is64)
2484      return false;
2485
2486    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2487      .addReg(IdxReg);
2488
2489    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2490    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2491      .addReg(SrcReg, 0, SubReg)
2492      .addReg(SrcReg, RegState::Implicit);
2493    MI.eraseFromParent();
2494    return true;
2495  }
2496
2497  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2498    return false;
2499
2500  if (!STI.useVGPRIndexMode()) {
2501    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2502      .addReg(IdxReg);
2503    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2504      .addReg(SrcReg, RegState::Undef, SubReg)
2505      .addReg(SrcReg, RegState::Implicit);
2506    MI.eraseFromParent();
2507    return true;
2508  }
2509
2510  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2511    .addReg(IdxReg)
2512    .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2513  BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2514    .addReg(SrcReg, RegState::Undef, SubReg)
2515    .addReg(SrcReg, RegState::Implicit)
2516    .addReg(AMDGPU::M0, RegState::Implicit);
2517  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2518
2519  MI.eraseFromParent();
2520  return true;
2521}
2522
2523// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2524bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2525  MachineInstr &MI) const {
2526  Register DstReg = MI.getOperand(0).getReg();
2527  Register VecReg = MI.getOperand(1).getReg();
2528  Register ValReg = MI.getOperand(2).getReg();
2529  Register IdxReg = MI.getOperand(3).getReg();
2530
2531  LLT VecTy = MRI->getType(DstReg);
2532  LLT ValTy = MRI->getType(ValReg);
2533  unsigned VecSize = VecTy.getSizeInBits();
2534  unsigned ValSize = ValTy.getSizeInBits();
2535
2536  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2537  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2538  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2539
2540  assert(VecTy.getElementType() == ValTy);
2541
2542  // The index must be scalar. If it wasn't RegBankSelect should have moved this
2543  // into a waterfall loop.
2544  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2545    return false;
2546
2547  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2548                                                                  *MRI);
2549  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2550                                                                  *MRI);
2551
2552  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2553      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2554      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2555      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2556    return false;
2557
2558  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2559    return false;
2560
2561  unsigned SubReg;
2562  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2563                                                     ValSize / 8);
2564
2565  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2566                         STI.useVGPRIndexMode();
2567
2568  MachineBasicBlock *BB = MI.getParent();
2569  const DebugLoc &DL = MI.getDebugLoc();
2570
2571  if (IndexMode) {
2572    BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2573      .addReg(IdxReg)
2574      .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2575  } else {
2576    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2577      .addReg(IdxReg);
2578  }
2579
2580  const MCInstrDesc &RegWriteOp
2581    = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2582                                    VecRB->getID() == AMDGPU::SGPRRegBankID);
2583  BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2584    .addReg(VecReg)
2585    .addReg(ValReg)
2586    .addImm(SubReg);
2587
2588  if (IndexMode)
2589    BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2590
2591  MI.eraseFromParent();
2592  return true;
2593}
2594
2595static bool isZeroOrUndef(int X) {
2596  return X == 0 || X == -1;
2597}
2598
2599static bool isOneOrUndef(int X) {
2600  return X == 1 || X == -1;
2601}
2602
2603static bool isZeroOrOneOrUndef(int X) {
2604  return X == 0 || X == 1 || X == -1;
2605}
2606
2607// Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2608// 32-bit register.
2609static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2610                                   ArrayRef<int> Mask) {
2611  NewMask[0] = Mask[0];
2612  NewMask[1] = Mask[1];
2613  if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2614    return Src0;
2615
2616  assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2617  assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2618
2619  // Shift the mask inputs to be 0/1;
2620  NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2621  NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2622  return Src1;
2623}
2624
2625// This is only legal with VOP3P instructions as an aid to op_sel matching.
2626bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2627  MachineInstr &MI) const {
2628  Register DstReg = MI.getOperand(0).getReg();
2629  Register Src0Reg = MI.getOperand(1).getReg();
2630  Register Src1Reg = MI.getOperand(2).getReg();
2631  ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2632
2633  const LLT V2S16 = LLT::vector(2, 16);
2634  if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2635    return false;
2636
2637  if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2638    return false;
2639
2640  assert(ShufMask.size() == 2);
2641  assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2642
2643  MachineBasicBlock *MBB = MI.getParent();
2644  const DebugLoc &DL = MI.getDebugLoc();
2645
2646  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2647  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2648  const TargetRegisterClass &RC = IsVALU ?
2649    AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2650
2651  // Handle the degenerate case which should have folded out.
2652  if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2653    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2654
2655    MI.eraseFromParent();
2656    return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2657  }
2658
2659  // A legal VOP3P mask only reads one of the sources.
2660  int Mask[2];
2661  Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2662
2663  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2664      !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2665    return false;
2666
2667  // TODO: This also should have been folded out
2668  if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2669    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2670      .addReg(SrcVec);
2671
2672    MI.eraseFromParent();
2673    return true;
2674  }
2675
2676  if (Mask[0] == 1 && Mask[1] == -1) {
2677    if (IsVALU) {
2678      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2679        .addImm(16)
2680        .addReg(SrcVec);
2681    } else {
2682      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2683        .addReg(SrcVec)
2684        .addImm(16);
2685    }
2686  } else if (Mask[0] == -1 && Mask[1] == 0) {
2687    if (IsVALU) {
2688      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2689        .addImm(16)
2690        .addReg(SrcVec);
2691    } else {
2692      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2693        .addReg(SrcVec)
2694        .addImm(16);
2695    }
2696  } else if (Mask[0] == 0 && Mask[1] == 0) {
2697    if (IsVALU) {
2698      // Write low half of the register into the high half.
2699      MachineInstr *MovSDWA =
2700        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2701        .addImm(0)                             // $src0_modifiers
2702        .addReg(SrcVec)                        // $src0
2703        .addImm(0)                             // $clamp
2704        .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2705        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2706        .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2707        .addReg(SrcVec, RegState::Implicit);
2708      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2709    } else {
2710      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2711        .addReg(SrcVec)
2712        .addReg(SrcVec);
2713    }
2714  } else if (Mask[0] == 1 && Mask[1] == 1) {
2715    if (IsVALU) {
2716      // Write high half of the register into the low half.
2717      MachineInstr *MovSDWA =
2718        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2719        .addImm(0)                             // $src0_modifiers
2720        .addReg(SrcVec)                        // $src0
2721        .addImm(0)                             // $clamp
2722        .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2723        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2724        .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2725        .addReg(SrcVec, RegState::Implicit);
2726      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2727    } else {
2728      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2729        .addReg(SrcVec)
2730        .addReg(SrcVec);
2731    }
2732  } else if (Mask[0] == 1 && Mask[1] == 0) {
2733    if (IsVALU) {
2734      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2735        .addReg(SrcVec)
2736        .addReg(SrcVec)
2737        .addImm(16);
2738    } else {
2739      Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2740      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2741        .addReg(SrcVec)
2742        .addImm(16);
2743      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2744        .addReg(TmpReg)
2745        .addReg(SrcVec);
2746    }
2747  } else
2748    llvm_unreachable("all shuffle masks should be handled");
2749
2750  MI.eraseFromParent();
2751  return true;
2752}
2753
2754bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2755  if (I.isPHI())
2756    return selectPHI(I);
2757
2758  if (!I.isPreISelOpcode()) {
2759    if (I.isCopy())
2760      return selectCOPY(I);
2761    return true;
2762  }
2763
2764  switch (I.getOpcode()) {
2765  case TargetOpcode::G_AND:
2766  case TargetOpcode::G_OR:
2767  case TargetOpcode::G_XOR:
2768    if (selectImpl(I, *CoverageInfo))
2769      return true;
2770    return selectG_AND_OR_XOR(I);
2771  case TargetOpcode::G_ADD:
2772  case TargetOpcode::G_SUB:
2773    if (selectImpl(I, *CoverageInfo))
2774      return true;
2775    return selectG_ADD_SUB(I);
2776  case TargetOpcode::G_UADDO:
2777  case TargetOpcode::G_USUBO:
2778  case TargetOpcode::G_UADDE:
2779  case TargetOpcode::G_USUBE:
2780    return selectG_UADDO_USUBO_UADDE_USUBE(I);
2781  case TargetOpcode::G_INTTOPTR:
2782  case TargetOpcode::G_BITCAST:
2783  case TargetOpcode::G_PTRTOINT:
2784    return selectCOPY(I);
2785  case TargetOpcode::G_CONSTANT:
2786  case TargetOpcode::G_FCONSTANT:
2787    return selectG_CONSTANT(I);
2788  case TargetOpcode::G_FNEG:
2789    if (selectImpl(I, *CoverageInfo))
2790      return true;
2791    return selectG_FNEG(I);
2792  case TargetOpcode::G_FABS:
2793    if (selectImpl(I, *CoverageInfo))
2794      return true;
2795    return selectG_FABS(I);
2796  case TargetOpcode::G_EXTRACT:
2797    return selectG_EXTRACT(I);
2798  case TargetOpcode::G_MERGE_VALUES:
2799  case TargetOpcode::G_BUILD_VECTOR:
2800  case TargetOpcode::G_CONCAT_VECTORS:
2801    return selectG_MERGE_VALUES(I);
2802  case TargetOpcode::G_UNMERGE_VALUES:
2803    return selectG_UNMERGE_VALUES(I);
2804  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2805    return selectG_BUILD_VECTOR_TRUNC(I);
2806  case TargetOpcode::G_PTR_ADD:
2807    return selectG_PTR_ADD(I);
2808  case TargetOpcode::G_IMPLICIT_DEF:
2809    return selectG_IMPLICIT_DEF(I);
2810  case TargetOpcode::G_INSERT:
2811    return selectG_INSERT(I);
2812  case TargetOpcode::G_INTRINSIC:
2813    return selectG_INTRINSIC(I);
2814  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2815    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2816  case TargetOpcode::G_ICMP:
2817    if (selectG_ICMP(I))
2818      return true;
2819    return selectImpl(I, *CoverageInfo);
2820  case TargetOpcode::G_LOAD:
2821  case TargetOpcode::G_ATOMIC_CMPXCHG:
2822  case TargetOpcode::G_ATOMICRMW_XCHG:
2823  case TargetOpcode::G_ATOMICRMW_ADD:
2824  case TargetOpcode::G_ATOMICRMW_SUB:
2825  case TargetOpcode::G_ATOMICRMW_AND:
2826  case TargetOpcode::G_ATOMICRMW_OR:
2827  case TargetOpcode::G_ATOMICRMW_XOR:
2828  case TargetOpcode::G_ATOMICRMW_MIN:
2829  case TargetOpcode::G_ATOMICRMW_MAX:
2830  case TargetOpcode::G_ATOMICRMW_UMIN:
2831  case TargetOpcode::G_ATOMICRMW_UMAX:
2832  case TargetOpcode::G_ATOMICRMW_FADD:
2833    return selectG_LOAD_ATOMICRMW(I);
2834  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2835    return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2836  case TargetOpcode::G_SELECT:
2837    return selectG_SELECT(I);
2838  case TargetOpcode::G_STORE:
2839    return selectG_STORE(I);
2840  case TargetOpcode::G_TRUNC:
2841    return selectG_TRUNC(I);
2842  case TargetOpcode::G_SEXT:
2843  case TargetOpcode::G_ZEXT:
2844  case TargetOpcode::G_ANYEXT:
2845  case TargetOpcode::G_SEXT_INREG:
2846    if (selectImpl(I, *CoverageInfo))
2847      return true;
2848    return selectG_SZA_EXT(I);
2849  case TargetOpcode::G_BRCOND:
2850    return selectG_BRCOND(I);
2851  case TargetOpcode::G_FRAME_INDEX:
2852  case TargetOpcode::G_GLOBAL_VALUE:
2853    return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
2854  case TargetOpcode::G_PTRMASK:
2855    return selectG_PTRMASK(I);
2856  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2857    return selectG_EXTRACT_VECTOR_ELT(I);
2858  case TargetOpcode::G_INSERT_VECTOR_ELT:
2859    return selectG_INSERT_VECTOR_ELT(I);
2860  case TargetOpcode::G_SHUFFLE_VECTOR:
2861    return selectG_SHUFFLE_VECTOR(I);
2862  case AMDGPU::G_AMDGPU_ATOMIC_INC:
2863  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2864    initM0(I);
2865    return selectImpl(I, *CoverageInfo);
2866  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2867  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2868    const AMDGPU::ImageDimIntrinsicInfo *Intr
2869      = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
2870    assert(Intr && "not an image intrinsic with image pseudo");
2871    return selectImageIntrinsic(I, Intr);
2872  }
2873  default:
2874    return selectImpl(I, *CoverageInfo);
2875  }
2876  return false;
2877}
2878
2879InstructionSelector::ComplexRendererFns
2880AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
2881  return {{
2882      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2883  }};
2884
2885}
2886
2887std::pair<Register, unsigned>
2888AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
2889  Register Src = Root.getReg();
2890  Register OrigSrc = Src;
2891  unsigned Mods = 0;
2892  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
2893
2894  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
2895    Src = MI->getOperand(1).getReg();
2896    Mods |= SISrcMods::NEG;
2897    MI = getDefIgnoringCopies(Src, *MRI);
2898  }
2899
2900  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
2901    Src = MI->getOperand(1).getReg();
2902    Mods |= SISrcMods::ABS;
2903  }
2904
2905  if (Mods != 0 &&
2906      RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
2907    MachineInstr *UseMI = Root.getParent();
2908
2909    // If we looked through copies to find source modifiers on an SGPR operand,
2910    // we now have an SGPR register source. To avoid potentially violating the
2911    // constant bus restriction, we need to insert a copy to a VGPR.
2912    Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
2913    BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
2914            TII.get(AMDGPU::COPY), VGPRSrc)
2915      .addReg(Src);
2916    Src = VGPRSrc;
2917  }
2918
2919  return std::make_pair(Src, Mods);
2920}
2921
2922///
2923/// This will select either an SGPR or VGPR operand and will save us from
2924/// having to write an extra tablegen pattern.
2925InstructionSelector::ComplexRendererFns
2926AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
2927  return {{
2928      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2929  }};
2930}
2931
2932InstructionSelector::ComplexRendererFns
2933AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
2934  Register Src;
2935  unsigned Mods;
2936  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
2937
2938  return {{
2939      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
2940      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
2941      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
2942      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
2943  }};
2944}
2945
2946InstructionSelector::ComplexRendererFns
2947AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
2948  return {{
2949      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
2950      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
2951      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
2952  }};
2953}
2954
2955InstructionSelector::ComplexRendererFns
2956AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
2957  Register Src;
2958  unsigned Mods;
2959  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
2960
2961  return {{
2962      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
2963      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
2964  }};
2965}
2966
2967InstructionSelector::ComplexRendererFns
2968AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
2969  Register Reg = Root.getReg();
2970  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
2971  if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
2972              Def->getOpcode() == AMDGPU::G_FABS))
2973    return {};
2974  return {{
2975      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
2976  }};
2977}
2978
2979std::pair<Register, unsigned>
2980AMDGPUInstructionSelector::selectVOP3PModsImpl(
2981  Register Src, const MachineRegisterInfo &MRI) const {
2982  unsigned Mods = 0;
2983  MachineInstr *MI = MRI.getVRegDef(Src);
2984
2985  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
2986      // It's possible to see an f32 fneg here, but unlikely.
2987      // TODO: Treat f32 fneg as only high bit.
2988      MRI.getType(Src) == LLT::vector(2, 16)) {
2989    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2990    Src = MI->getOperand(1).getReg();
2991    MI = MRI.getVRegDef(Src);
2992  }
2993
2994  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
2995
2996  // Packed instructions do not have abs modifiers.
2997  Mods |= SISrcMods::OP_SEL_1;
2998
2999  return std::make_pair(Src, Mods);
3000}
3001
3002InstructionSelector::ComplexRendererFns
3003AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3004  MachineRegisterInfo &MRI
3005    = Root.getParent()->getParent()->getParent()->getRegInfo();
3006
3007  Register Src;
3008  unsigned Mods;
3009  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3010
3011  return {{
3012      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3013      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3014  }};
3015}
3016
3017InstructionSelector::ComplexRendererFns
3018AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3019  Register Src;
3020  unsigned Mods;
3021  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3022  if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3023    return None;
3024
3025  return {{
3026      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3027      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3028  }};
3029}
3030
3031InstructionSelector::ComplexRendererFns
3032AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3033  // FIXME: Handle op_sel
3034  return {{
3035      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3036      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3037  }};
3038}
3039
3040InstructionSelector::ComplexRendererFns
3041AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3042  SmallVector<GEPInfo, 4> AddrInfo;
3043  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3044
3045  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3046    return None;
3047
3048  const GEPInfo &GEPInfo = AddrInfo[0];
3049  Optional<int64_t> EncodedImm =
3050      AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3051  if (!EncodedImm)
3052    return None;
3053
3054  unsigned PtrReg = GEPInfo.SgprParts[0];
3055  return {{
3056    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3057    [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3058  }};
3059}
3060
3061InstructionSelector::ComplexRendererFns
3062AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3063  SmallVector<GEPInfo, 4> AddrInfo;
3064  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3065
3066  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3067    return None;
3068
3069  const GEPInfo &GEPInfo = AddrInfo[0];
3070  Register PtrReg = GEPInfo.SgprParts[0];
3071  Optional<int64_t> EncodedImm =
3072      AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3073  if (!EncodedImm)
3074    return None;
3075
3076  return {{
3077    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3078    [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3079  }};
3080}
3081
3082InstructionSelector::ComplexRendererFns
3083AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3084  MachineInstr *MI = Root.getParent();
3085  MachineBasicBlock *MBB = MI->getParent();
3086
3087  SmallVector<GEPInfo, 4> AddrInfo;
3088  getAddrModeInfo(*MI, *MRI, AddrInfo);
3089
3090  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3091  // then we can select all ptr + 32-bit offsets not just immediate offsets.
3092  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3093    return None;
3094
3095  const GEPInfo &GEPInfo = AddrInfo[0];
3096  // SGPR offset is unsigned.
3097  if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3098    return None;
3099
3100  // If we make it this far we have a load with an 32-bit immediate offset.
3101  // It is OK to select this using a sgpr offset, because we have already
3102  // failed trying to select this load into one of the _IMM variants since
3103  // the _IMM Patterns are considered before the _SGPR patterns.
3104  Register PtrReg = GEPInfo.SgprParts[0];
3105  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3106  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3107          .addImm(GEPInfo.Imm);
3108  return {{
3109    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3110    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3111  }};
3112}
3113
3114template <bool Signed>
3115InstructionSelector::ComplexRendererFns
3116AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3117  MachineInstr *MI = Root.getParent();
3118
3119  InstructionSelector::ComplexRendererFns Default = {{
3120      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3121      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
3122      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3123    }};
3124
3125  if (!STI.hasFlatInstOffsets())
3126    return Default;
3127
3128  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3129  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3130    return Default;
3131
3132  Optional<int64_t> Offset =
3133    getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3134  if (!Offset.hasValue())
3135    return Default;
3136
3137  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3138  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3139    return Default;
3140
3141  Register BasePtr = OpDef->getOperand(1).getReg();
3142
3143  return {{
3144      [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3145      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3146      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3147    }};
3148}
3149
3150InstructionSelector::ComplexRendererFns
3151AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3152  return selectFlatOffsetImpl<false>(Root);
3153}
3154
3155InstructionSelector::ComplexRendererFns
3156AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3157  return selectFlatOffsetImpl<true>(Root);
3158}
3159
3160static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3161  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3162  return PSV && PSV->isStack();
3163}
3164
3165InstructionSelector::ComplexRendererFns
3166AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3167  MachineInstr *MI = Root.getParent();
3168  MachineBasicBlock *MBB = MI->getParent();
3169  MachineFunction *MF = MBB->getParent();
3170  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3171
3172  int64_t Offset = 0;
3173  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3174      Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3175    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3176
3177    // TODO: Should this be inside the render function? The iterator seems to
3178    // move.
3179    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3180            HighBits)
3181      .addImm(Offset & ~4095);
3182
3183    return {{[=](MachineInstrBuilder &MIB) { // rsrc
3184               MIB.addReg(Info->getScratchRSrcReg());
3185             },
3186             [=](MachineInstrBuilder &MIB) { // vaddr
3187               MIB.addReg(HighBits);
3188             },
3189             [=](MachineInstrBuilder &MIB) { // soffset
3190               const MachineMemOperand *MMO = *MI->memoperands_begin();
3191               const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3192
3193               if (isStackPtrRelative(PtrInfo))
3194                 MIB.addReg(Info->getStackPtrOffsetReg());
3195               else
3196                 MIB.addImm(0);
3197             },
3198             [=](MachineInstrBuilder &MIB) { // offset
3199               MIB.addImm(Offset & 4095);
3200             }}};
3201  }
3202
3203  assert(Offset == 0 || Offset == -1);
3204
3205  // Try to fold a frame index directly into the MUBUF vaddr field, and any
3206  // offsets.
3207  Optional<int> FI;
3208  Register VAddr = Root.getReg();
3209  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3210    if (isBaseWithConstantOffset(Root, *MRI)) {
3211      const MachineOperand &LHS = RootDef->getOperand(1);
3212      const MachineOperand &RHS = RootDef->getOperand(2);
3213      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3214      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3215      if (LHSDef && RHSDef) {
3216        int64_t PossibleOffset =
3217            RHSDef->getOperand(1).getCImm()->getSExtValue();
3218        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3219            (!STI.privateMemoryResourceIsRangeChecked() ||
3220             KnownBits->signBitIsZero(LHS.getReg()))) {
3221          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3222            FI = LHSDef->getOperand(1).getIndex();
3223          else
3224            VAddr = LHS.getReg();
3225          Offset = PossibleOffset;
3226        }
3227      }
3228    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3229      FI = RootDef->getOperand(1).getIndex();
3230    }
3231  }
3232
3233  return {{[=](MachineInstrBuilder &MIB) { // rsrc
3234             MIB.addReg(Info->getScratchRSrcReg());
3235           },
3236           [=](MachineInstrBuilder &MIB) { // vaddr
3237             if (FI.hasValue())
3238               MIB.addFrameIndex(FI.getValue());
3239             else
3240               MIB.addReg(VAddr);
3241           },
3242           [=](MachineInstrBuilder &MIB) { // soffset
3243             // If we don't know this private access is a local stack object, it
3244             // needs to be relative to the entry point's scratch wave offset.
3245             // TODO: Should split large offsets that don't fit like above.
3246             // TODO: Don't use scratch wave offset just because the offset
3247             // didn't fit.
3248             if (!Info->isEntryFunction() && FI.hasValue())
3249               MIB.addReg(Info->getStackPtrOffsetReg());
3250             else
3251               MIB.addImm(0);
3252           },
3253           [=](MachineInstrBuilder &MIB) { // offset
3254             MIB.addImm(Offset);
3255           }}};
3256}
3257
3258bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3259                                                int64_t Offset,
3260                                                unsigned OffsetBits) const {
3261  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3262      (OffsetBits == 8 && !isUInt<8>(Offset)))
3263    return false;
3264
3265  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3266    return true;
3267
3268  // On Southern Islands instruction with a negative base value and an offset
3269  // don't seem to work.
3270  return KnownBits->signBitIsZero(Base);
3271}
3272
3273InstructionSelector::ComplexRendererFns
3274AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3275    MachineOperand &Root) const {
3276  MachineInstr *MI = Root.getParent();
3277  MachineBasicBlock *MBB = MI->getParent();
3278
3279  int64_t Offset = 0;
3280  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3281      !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3282    return {};
3283
3284  const MachineFunction *MF = MBB->getParent();
3285  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3286  const MachineMemOperand *MMO = *MI->memoperands_begin();
3287  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3288
3289  return {{
3290      [=](MachineInstrBuilder &MIB) { // rsrc
3291        MIB.addReg(Info->getScratchRSrcReg());
3292      },
3293      [=](MachineInstrBuilder &MIB) { // soffset
3294        if (isStackPtrRelative(PtrInfo))
3295          MIB.addReg(Info->getStackPtrOffsetReg());
3296        else
3297          MIB.addImm(0);
3298      },
3299      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3300  }};
3301}
3302
3303std::pair<Register, unsigned>
3304AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3305  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3306  if (!RootDef)
3307    return std::make_pair(Root.getReg(), 0);
3308
3309  int64_t ConstAddr = 0;
3310
3311  Register PtrBase;
3312  int64_t Offset;
3313  std::tie(PtrBase, Offset) =
3314    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3315
3316  if (Offset) {
3317    if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3318      // (add n0, c0)
3319      return std::make_pair(PtrBase, Offset);
3320    }
3321  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3322    // TODO
3323
3324
3325  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3326    // TODO
3327
3328  }
3329
3330  return std::make_pair(Root.getReg(), 0);
3331}
3332
3333InstructionSelector::ComplexRendererFns
3334AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3335  Register Reg;
3336  unsigned Offset;
3337  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3338  return {{
3339      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3340      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3341    }};
3342}
3343
3344InstructionSelector::ComplexRendererFns
3345AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3346  Register Reg;
3347  unsigned Offset;
3348  std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
3349  return {{
3350      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3351      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3352      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3353    }};
3354}
3355
3356std::pair<Register, unsigned>
3357AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
3358  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3359  if (!RootDef)
3360    return std::make_pair(Root.getReg(), 0);
3361
3362  int64_t ConstAddr = 0;
3363
3364  Register PtrBase;
3365  int64_t Offset;
3366  std::tie(PtrBase, Offset) =
3367    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3368
3369  if (Offset) {
3370    int64_t DWordOffset0 = Offset / 4;
3371    int64_t DWordOffset1 = DWordOffset0 + 1;
3372    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
3373      // (add n0, c0)
3374      return std::make_pair(PtrBase, DWordOffset0);
3375    }
3376  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3377    // TODO
3378
3379  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3380    // TODO
3381
3382  }
3383
3384  return std::make_pair(Root.getReg(), 0);
3385}
3386
3387/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3388/// the base value with the constant offset. There may be intervening copies
3389/// between \p Root and the identified constant. Returns \p Root, 0 if this does
3390/// not match the pattern.
3391std::pair<Register, int64_t>
3392AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3393  Register Root, const MachineRegisterInfo &MRI) const {
3394  MachineInstr *RootI = MRI.getVRegDef(Root);
3395  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3396    return {Root, 0};
3397
3398  MachineOperand &RHS = RootI->getOperand(2);
3399  Optional<ValueAndVReg> MaybeOffset
3400    = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3401  if (!MaybeOffset)
3402    return {Root, 0};
3403  return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3404}
3405
3406static void addZeroImm(MachineInstrBuilder &MIB) {
3407  MIB.addImm(0);
3408}
3409
3410/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3411/// BasePtr is not valid, a null base pointer will be used.
3412static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3413                          uint32_t FormatLo, uint32_t FormatHi,
3414                          Register BasePtr) {
3415  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3416  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3417  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3418  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3419
3420  B.buildInstr(AMDGPU::S_MOV_B32)
3421    .addDef(RSrc2)
3422    .addImm(FormatLo);
3423  B.buildInstr(AMDGPU::S_MOV_B32)
3424    .addDef(RSrc3)
3425    .addImm(FormatHi);
3426
3427  // Build the half of the subregister with the constants before building the
3428  // full 128-bit register. If we are building multiple resource descriptors,
3429  // this will allow CSEing of the 2-component register.
3430  B.buildInstr(AMDGPU::REG_SEQUENCE)
3431    .addDef(RSrcHi)
3432    .addReg(RSrc2)
3433    .addImm(AMDGPU::sub0)
3434    .addReg(RSrc3)
3435    .addImm(AMDGPU::sub1);
3436
3437  Register RSrcLo = BasePtr;
3438  if (!BasePtr) {
3439    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3440    B.buildInstr(AMDGPU::S_MOV_B64)
3441      .addDef(RSrcLo)
3442      .addImm(0);
3443  }
3444
3445  B.buildInstr(AMDGPU::REG_SEQUENCE)
3446    .addDef(RSrc)
3447    .addReg(RSrcLo)
3448    .addImm(AMDGPU::sub0_sub1)
3449    .addReg(RSrcHi)
3450    .addImm(AMDGPU::sub2_sub3);
3451
3452  return RSrc;
3453}
3454
3455static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3456                                const SIInstrInfo &TII, Register BasePtr) {
3457  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3458
3459  // FIXME: Why are half the "default" bits ignored based on the addressing
3460  // mode?
3461  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3462}
3463
3464static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3465                               const SIInstrInfo &TII, Register BasePtr) {
3466  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3467
3468  // FIXME: Why are half the "default" bits ignored based on the addressing
3469  // mode?
3470  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3471}
3472
3473AMDGPUInstructionSelector::MUBUFAddressData
3474AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3475  MUBUFAddressData Data;
3476  Data.N0 = Src;
3477
3478  Register PtrBase;
3479  int64_t Offset;
3480
3481  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3482  if (isUInt<32>(Offset)) {
3483    Data.N0 = PtrBase;
3484    Data.Offset = Offset;
3485  }
3486
3487  if (MachineInstr *InputAdd
3488      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3489    Data.N2 = InputAdd->getOperand(1).getReg();
3490    Data.N3 = InputAdd->getOperand(2).getReg();
3491
3492    // FIXME: Need to fix extra SGPR->VGPRcopies inserted
3493    // FIXME: Don't know this was defined by operand 0
3494    //
3495    // TODO: Remove this when we have copy folding optimizations after
3496    // RegBankSelect.
3497    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3498    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3499  }
3500
3501  return Data;
3502}
3503
3504/// Return if the addr64 mubuf mode should be used for the given address.
3505bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3506  // (ptr_add N2, N3) -> addr64, or
3507  // (ptr_add (ptr_add N2, N3), C1) -> addr64
3508  if (Addr.N2)
3509    return true;
3510
3511  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3512  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3513}
3514
3515/// Split an immediate offset \p ImmOffset depending on whether it fits in the
3516/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3517/// component.
3518void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3519  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3520  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3521    return;
3522
3523  // Illegal offset, store it in soffset.
3524  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3525  B.buildInstr(AMDGPU::S_MOV_B32)
3526    .addDef(SOffset)
3527    .addImm(ImmOffset);
3528  ImmOffset = 0;
3529}
3530
3531bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3532  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3533  Register &SOffset, int64_t &Offset) const {
3534  // FIXME: Predicates should stop this from reaching here.
3535  // addr64 bit was removed for volcanic islands.
3536  if (!STI.hasAddr64() || STI.useFlatForGlobal())
3537    return false;
3538
3539  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3540  if (!shouldUseAddr64(AddrData))
3541    return false;
3542
3543  Register N0 = AddrData.N0;
3544  Register N2 = AddrData.N2;
3545  Register N3 = AddrData.N3;
3546  Offset = AddrData.Offset;
3547
3548  // Base pointer for the SRD.
3549  Register SRDPtr;
3550
3551  if (N2) {
3552    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3553      assert(N3);
3554      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3555        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3556        // addr64, and construct the default resource from a 0 address.
3557        VAddr = N0;
3558      } else {
3559        SRDPtr = N3;
3560        VAddr = N2;
3561      }
3562    } else {
3563      // N2 is not divergent.
3564      SRDPtr = N2;
3565      VAddr = N3;
3566    }
3567  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3568    // Use the default null pointer in the resource
3569    VAddr = N0;
3570  } else {
3571    // N0 -> offset, or
3572    // (N0 + C1) -> offset
3573    SRDPtr = N0;
3574  }
3575
3576  MachineIRBuilder B(*Root.getParent());
3577  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3578  splitIllegalMUBUFOffset(B, SOffset, Offset);
3579  return true;
3580}
3581
3582bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3583  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3584  int64_t &Offset) const {
3585  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3586  if (shouldUseAddr64(AddrData))
3587    return false;
3588
3589  // N0 -> offset, or
3590  // (N0 + C1) -> offset
3591  Register SRDPtr = AddrData.N0;
3592  Offset = AddrData.Offset;
3593
3594  // TODO: Look through extensions for 32-bit soffset.
3595  MachineIRBuilder B(*Root.getParent());
3596
3597  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3598  splitIllegalMUBUFOffset(B, SOffset, Offset);
3599  return true;
3600}
3601
3602InstructionSelector::ComplexRendererFns
3603AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3604  Register VAddr;
3605  Register RSrcReg;
3606  Register SOffset;
3607  int64_t Offset = 0;
3608
3609  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3610    return {};
3611
3612  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3613  // pattern.
3614  return {{
3615      [=](MachineInstrBuilder &MIB) {  // rsrc
3616        MIB.addReg(RSrcReg);
3617      },
3618      [=](MachineInstrBuilder &MIB) { // vaddr
3619        MIB.addReg(VAddr);
3620      },
3621      [=](MachineInstrBuilder &MIB) { // soffset
3622        if (SOffset)
3623          MIB.addReg(SOffset);
3624        else
3625          MIB.addImm(0);
3626      },
3627      [=](MachineInstrBuilder &MIB) { // offset
3628        MIB.addImm(Offset);
3629      },
3630      addZeroImm, //  glc
3631      addZeroImm, //  slc
3632      addZeroImm, //  tfe
3633      addZeroImm, //  dlc
3634      addZeroImm  //  swz
3635    }};
3636}
3637
3638InstructionSelector::ComplexRendererFns
3639AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3640  Register RSrcReg;
3641  Register SOffset;
3642  int64_t Offset = 0;
3643
3644  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3645    return {};
3646
3647  return {{
3648      [=](MachineInstrBuilder &MIB) {  // rsrc
3649        MIB.addReg(RSrcReg);
3650      },
3651      [=](MachineInstrBuilder &MIB) { // soffset
3652        if (SOffset)
3653          MIB.addReg(SOffset);
3654        else
3655          MIB.addImm(0);
3656      },
3657      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3658      addZeroImm, //  glc
3659      addZeroImm, //  slc
3660      addZeroImm, //  tfe
3661      addZeroImm, //  dlc
3662      addZeroImm  //  swz
3663    }};
3664}
3665
3666InstructionSelector::ComplexRendererFns
3667AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3668  Register VAddr;
3669  Register RSrcReg;
3670  Register SOffset;
3671  int64_t Offset = 0;
3672
3673  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3674    return {};
3675
3676  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3677  // pattern.
3678  return {{
3679      [=](MachineInstrBuilder &MIB) {  // rsrc
3680        MIB.addReg(RSrcReg);
3681      },
3682      [=](MachineInstrBuilder &MIB) { // vaddr
3683        MIB.addReg(VAddr);
3684      },
3685      [=](MachineInstrBuilder &MIB) { // soffset
3686        if (SOffset)
3687          MIB.addReg(SOffset);
3688        else
3689          MIB.addImm(0);
3690      },
3691      [=](MachineInstrBuilder &MIB) { // offset
3692        MIB.addImm(Offset);
3693      },
3694      addZeroImm //  slc
3695    }};
3696}
3697
3698InstructionSelector::ComplexRendererFns
3699AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3700  Register RSrcReg;
3701  Register SOffset;
3702  int64_t Offset = 0;
3703
3704  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3705    return {};
3706
3707  return {{
3708      [=](MachineInstrBuilder &MIB) {  // rsrc
3709        MIB.addReg(RSrcReg);
3710      },
3711      [=](MachineInstrBuilder &MIB) { // soffset
3712        if (SOffset)
3713          MIB.addReg(SOffset);
3714        else
3715          MIB.addImm(0);
3716      },
3717      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3718      addZeroImm //  slc
3719    }};
3720}
3721
3722/// Get an immediate that must be 32-bits, and treated as zero extended.
3723static Optional<uint64_t> getConstantZext32Val(Register Reg,
3724                                               const MachineRegisterInfo &MRI) {
3725  // getConstantVRegVal sexts any values, so see if that matters.
3726  Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3727  if (!OffsetVal || !isInt<32>(*OffsetVal))
3728    return None;
3729  return Lo_32(*OffsetVal);
3730}
3731
3732InstructionSelector::ComplexRendererFns
3733AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3734  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3735  if (!OffsetVal)
3736    return {};
3737
3738  Optional<int64_t> EncodedImm =
3739      AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3740  if (!EncodedImm)
3741    return {};
3742
3743  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3744}
3745
3746InstructionSelector::ComplexRendererFns
3747AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3748  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3749
3750  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3751  if (!OffsetVal)
3752    return {};
3753
3754  Optional<int64_t> EncodedImm
3755    = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3756  if (!EncodedImm)
3757    return {};
3758
3759  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3760}
3761
3762void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3763                                                 const MachineInstr &MI,
3764                                                 int OpIdx) const {
3765  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3766         "Expected G_CONSTANT");
3767  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3768}
3769
3770void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3771                                                const MachineInstr &MI,
3772                                                int OpIdx) const {
3773  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3774         "Expected G_CONSTANT");
3775  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3776}
3777
3778void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3779                                                 const MachineInstr &MI,
3780                                                 int OpIdx) const {
3781  assert(OpIdx == -1);
3782
3783  const MachineOperand &Op = MI.getOperand(1);
3784  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3785    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3786  else {
3787    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3788    MIB.addImm(Op.getCImm()->getSExtValue());
3789  }
3790}
3791
3792void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
3793                                                const MachineInstr &MI,
3794                                                int OpIdx) const {
3795  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3796         "Expected G_CONSTANT");
3797  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
3798}
3799
3800/// This only really exists to satisfy DAG type checking machinery, so is a
3801/// no-op here.
3802void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
3803                                                const MachineInstr &MI,
3804                                                int OpIdx) const {
3805  MIB.addImm(MI.getOperand(OpIdx).getImm());
3806}
3807
3808void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
3809                                                 const MachineInstr &MI,
3810                                                 int OpIdx) const {
3811  assert(OpIdx >= 0 && "expected to match an immediate operand");
3812  MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
3813}
3814
3815void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
3816                                                 const MachineInstr &MI,
3817                                                 int OpIdx) const {
3818  assert(OpIdx >= 0 && "expected to match an immediate operand");
3819  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
3820}
3821
3822void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
3823                                                 const MachineInstr &MI,
3824                                                 int OpIdx) const {
3825  assert(OpIdx >= 0 && "expected to match an immediate operand");
3826  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
3827}
3828
3829void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
3830                                                 const MachineInstr &MI,
3831                                                 int OpIdx) const {
3832  assert(OpIdx >= 0 && "expected to match an immediate operand");
3833  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
3834}
3835
3836bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
3837  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
3838}
3839
3840bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
3841  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
3842}
3843
3844bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
3845  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
3846}
3847
3848bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
3849  return TII.isInlineConstant(Imm);
3850}
3851