1//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the RegisterBankInfo class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPURegisterBankInfo.h"
15#include "AMDGPUInstrInfo.h"
16#include "AMDGPUSubtarget.h"
17#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18#include "SIMachineFunctionInfo.h"
19#include "SIRegisterInfo.h"
20#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
21#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
25#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
26#include "llvm/CodeGen/TargetRegisterInfo.h"
27#include "llvm/CodeGen/TargetSubtargetInfo.h"
28#include "llvm/IR/Constants.h"
29
30#define GET_TARGET_REGBANK_IMPL
31#include "AMDGPUGenRegisterBank.inc"
32
33// This file will be TableGen'ed at some point.
34#include "AMDGPUGenRegisterBankInfo.def"
35
36using namespace llvm;
37using namespace MIPatternMatch;
38
39namespace {
40
41// Observer to apply a register bank to new registers created by LegalizerHelper.
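// The intended pattern (used by applyMappingWideLoad below) is to wrap this in
// a GISelObserverWrapper, attach it to the MachineIRBuilder driving a
// LegalizerHelper, and let the destructor assign NewBank to every register the
// helper introduced.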
42class ApplyRegBankMapping final : public GISelChangeObserver {
43private:
44  const AMDGPURegisterBankInfo &RBI;
45  MachineRegisterInfo &MRI;
46  const RegisterBank *NewBank;
47  SmallVector<MachineInstr *, 4> NewInsts;
48
49public:
50  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
51                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
52    : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
53
54  ~ApplyRegBankMapping() {
55    for (MachineInstr *MI : NewInsts)
56      applyBank(*MI);
57  }
58
59  /// Assign the new bank to any registers that lack a register class or bank.
60  void applyBank(MachineInstr &MI) {
61    const unsigned Opc = MI.getOpcode();
62    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
63        Opc == AMDGPU::G_SEXT) {
64      // LegalizerHelper wants to use the basic legalization artifacts when
65      // widening etc. We don't handle selection with vcc in artifact sources,
66      // so we need to use a select instead to handle these properly.
67      Register DstReg = MI.getOperand(0).getReg();
68      Register SrcReg = MI.getOperand(1).getReg();
69      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
70      if (SrcBank == &AMDGPU::VCCRegBank) {
71        const LLT S32 = LLT::scalar(32);
72        assert(MRI.getType(SrcReg) == LLT::scalar(1));
73        assert(MRI.getType(DstReg) == S32);
74        assert(NewBank == &AMDGPU::VGPRRegBank);
75
76        // Replace the extension with a select, which really uses the boolean
77        // source.
78        MachineIRBuilder B(MI);
79        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
80        auto False = B.buildConstant(S32, 0);
81        B.buildSelect(DstReg, SrcReg, True, False);
82        MRI.setRegBank(True.getReg(0), *NewBank);
83        MRI.setRegBank(False.getReg(0), *NewBank);
84        MI.eraseFromParent();
85      }
86
87      assert(!MRI.getRegClassOrRegBank(DstReg));
88      MRI.setRegBank(DstReg, *NewBank);
89      return;
90    }
91
92#ifndef NDEBUG
93    if (Opc == AMDGPU::G_TRUNC) {
94      Register DstReg = MI.getOperand(0).getReg();
95      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
96      assert(DstBank != &AMDGPU::VCCRegBank);
97    }
98#endif
99
100    for (MachineOperand &Op : MI.operands()) {
101      if (!Op.isReg())
102        continue;
103
104      Register Reg = Op.getReg();
105      if (MRI.getRegClassOrRegBank(Reg))
106        continue;
107
108      const RegisterBank *RB = NewBank;
109      if (MRI.getType(Reg) == LLT::scalar(1)) {
110        assert(NewBank == &AMDGPU::VGPRRegBank &&
111               "s1 operands should only be used for vector bools");
112        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
113                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
114               "not expecting legalization artifacts here");
115        RB = &AMDGPU::VCCRegBank;
116      }
117
118      MRI.setRegBank(Reg, *RB);
119    }
120  }
121
122  void erasingInstr(MachineInstr &MI) override {}
123
124  void createdInstr(MachineInstr &MI) override {
125    // At this point, the instruction was just inserted and has no operands.
126    NewInsts.push_back(&MI);
127  }
128
129  void changingInstr(MachineInstr &MI) override {}
130  void changedInstr(MachineInstr &MI) override {}
131};
132
133} // end anonymous namespace
134AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
135    : AMDGPUGenRegisterBankInfo(),
136      Subtarget(ST),
137      TRI(Subtarget.getRegisterInfo()),
138      TII(Subtarget.getInstrInfo()) {
139
140  // HACK: Until this is fully tablegen'd.
141  static bool AlreadyInit = false;
142  if (AlreadyInit)
143    return;
144
145  AlreadyInit = true;
146
147  assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
148         &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
149         &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
150}
151
152static bool isVectorRegisterBank(const RegisterBank &Bank) {
153  unsigned BankID = Bank.getID();
154  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
155}
156
157unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
158                                          const RegisterBank &Src,
159                                          unsigned Size) const {
160  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
161  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
162      isVectorRegisterBank(Src)) {
163    return std::numeric_limits<unsigned>::max();
164  }
165
166  // Bool values are tricky, because the meaning is based on context. The SCC
167  // and VCC banks are for the natural scalar and vector conditions produced by
168  // a compare.
169  //
170  // Legalization doesn't know about the necessary context, so an s1 use may
171  // have been a truncate from an arbitrary value, in which case a copy (lowered
172  // as a compare with 0) needs to be inserted.
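  // For example, an s1 value produced in VCC cannot be moved to an SGPR with a
  // plain register move; it has to be lowered with a compare as noted above, so
  // such copies are reported as prohibitively expensive to steer RegBankSelect
  // toward other mappings.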
173  if (Size == 1 &&
174      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
175      (isVectorRegisterBank(Src) ||
176       Src.getID() == AMDGPU::SGPRRegBankID ||
177       Src.getID() == AMDGPU::VCCRegBankID))
178    return std::numeric_limits<unsigned>::max();
179
180  if (Src.getID() == AMDGPU::VCCRegBankID)
181    return std::numeric_limits<unsigned>::max();
182
183  // There is no direct copy between AGPRs.
184  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
185      Src.getID() == AMDGPU::AGPRRegBankID)
186    return 4;
187
188  return RegisterBankInfo::copyCost(Dst, Src, Size);
189}
190
191unsigned AMDGPURegisterBankInfo::getBreakDownCost(
192  const ValueMapping &ValMapping,
193  const RegisterBank *CurBank) const {
194  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
195  // VGPR.
196  // FIXME: Is there a better way to do this?
197  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
198    return 10; // This is expensive.
199
200  assert(ValMapping.NumBreakDowns == 2 &&
201         ValMapping.BreakDown[0].Length == 32 &&
202         ValMapping.BreakDown[0].StartIdx == 0 &&
203         ValMapping.BreakDown[1].Length == 32 &&
204         ValMapping.BreakDown[1].StartIdx == 32 &&
205         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
206
207  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
208  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
209  // want.
210
211  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
212  // alignment restrictions, but this probably isn't important.
213  return 1;
214}
215
216const RegisterBank &
217AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
218                                               LLT Ty) const {
219  if (&RC == &AMDGPU::SReg_1RegClass)
220    return AMDGPU::VCCRegBank;
221
222  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
223  // VCC-like use.
224  if (TRI->isSGPRClass(&RC)) {
225    // FIXME: This probably came from a copy from a physical register, which
226    // should be inferable from the copied to-type. We don't have many boolean
227    // physical register constraints so just assume a normal SGPR for now.
228    if (!Ty.isValid())
229      return AMDGPU::SGPRRegBank;
230
231    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
232  }
233
234  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
235}
236
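/// Build alternative instruction mappings for \p MI from \p Table. Each table
/// entry supplies one candidate register bank per operand index listed in
/// \p RegSrcOpIdx, together with a relative cost; explicit defs default to a
/// VGPR mapping.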
237template <unsigned NumOps>
238RegisterBankInfo::InstructionMappings
239AMDGPURegisterBankInfo::addMappingFromTable(
240    const MachineInstr &MI, const MachineRegisterInfo &MRI,
241    const std::array<unsigned, NumOps> RegSrcOpIdx,
242    ArrayRef<OpRegBankEntry<NumOps>> Table) const {
243
244  InstructionMappings AltMappings;
245
246  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
247
248  unsigned Sizes[NumOps];
249  for (unsigned I = 0; I < NumOps; ++I) {
250    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
251    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
252  }
253
254  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
255    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
256    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
257  }
258
259  // getInstrMapping's default mapping uses ID 1, so start at 2.
260  unsigned MappingID = 2;
261  for (const auto &Entry : Table) {
262    for (unsigned I = 0; I < NumOps; ++I) {
263      int OpIdx = RegSrcOpIdx[I];
264      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
265    }
266
267    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
268                                                 getOperandsMapping(Operands),
269                                                 Operands.size()));
270  }
271
272  return AltMappings;
273}
274
275RegisterBankInfo::InstructionMappings
276AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
277    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
278  switch (MI.getIntrinsicID()) {
279  case Intrinsic::amdgcn_readlane: {
280    static const OpRegBankEntry<3> Table[2] = {
281      // Perfectly legal.
282      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
283
284      // Need a readfirstlane for the index.
285      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
286    };
287
288    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
289    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
290  }
291  case Intrinsic::amdgcn_writelane: {
292    static const OpRegBankEntry<4> Table[4] = {
293      // Perfectly legal.
294      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
295
296      // Need readfirstlane of first op
297      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
298
299      // Need readfirstlane of second op
300      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
301
302      // Need readfirstlane of both ops
303      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
304    };
305
306    // dst, value to write, lane select, original value
307    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
308    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
309  }
310  default:
311    return RegisterBankInfo::getInstrAlternativeMappings(MI);
312  }
313}
314
315RegisterBankInfo::InstructionMappings
316AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
317    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
318
319  switch (MI.getIntrinsicID()) {
320  case Intrinsic::amdgcn_buffer_load: {
321    static const OpRegBankEntry<3> Table[4] = {
322      // Perfectly legal.
323      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
324      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
325
326      // Waterfall loop needed for rsrc. In the worst case this will execute
327      // approximately an extra 10 * wavesize + 2 instructions.
328      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
329      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
330    };
331
332    // rsrc, voffset, offset
333    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
334    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
335  }
336  case Intrinsic::amdgcn_s_buffer_load: {
337    static const OpRegBankEntry<2> Table[4] = {
338      // Perfectly legal.
339      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
340
341      // Only need 1 register in loop
342      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
343
344      // Have to waterfall the resource.
345      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
346
347      // Have to waterfall the resource, and the offset.
348      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
349    };
350
351    // rsrc, offset
352    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
353    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
354  }
355  case Intrinsic::amdgcn_ds_ordered_add:
356  case Intrinsic::amdgcn_ds_ordered_swap: {
357    // VGPR = M0, VGPR
358    static const OpRegBankEntry<3> Table[2] = {
359      // Perfectly legal.
360      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
361
362      // Need a readfirstlane for m0
363      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
364    };
365
366    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
367    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368  }
369  case Intrinsic::amdgcn_s_sendmsg:
370  case Intrinsic::amdgcn_s_sendmsghalt: {
371    // FIXME: Should have no register for immediate
372    static const OpRegBankEntry<1> Table[2] = {
373      // Perfectly legal.
374      { { AMDGPU::SGPRRegBankID }, 1 },
375
376      // Need readlane
377      { { AMDGPU::VGPRRegBankID }, 3 }
378    };
379
380    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
381    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
382  }
383  default:
384    return RegisterBankInfo::getInstrAlternativeMappings(MI);
385  }
386}
387
388static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
389  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
390  return I && I->getMetadata("amdgpu.noclobber");
391}
392
393// FIXME: Returns uniform if there's no source value information. This is
394// probably wrong.
395static bool isScalarLoadLegal(const MachineInstr &MI) {
396  if (!MI.hasOneMemOperand())
397    return false;
398
399  const MachineMemOperand *MMO = *MI.memoperands_begin();
400  const unsigned AS = MMO->getAddrSpace();
401  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
402                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
403
404  // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
405  return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
406    // Can't do a scalar atomic load.
407    !MMO->isAtomic() &&
408    // Don't use scalar loads for volatile accesses to non-constant address
409    // spaces.
410    (IsConst || !MMO->isVolatile()) &&
411    // Memory must be known constant, or not written before this load.
412    (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
413    AMDGPUInstrInfo::isUniformMMO(MMO);
414}
415
416RegisterBankInfo::InstructionMappings
417AMDGPURegisterBankInfo::getInstrAlternativeMappings(
418    const MachineInstr &MI) const {
419
420  const MachineFunction &MF = *MI.getParent()->getParent();
421  const MachineRegisterInfo &MRI = MF.getRegInfo();
422
423
424  InstructionMappings AltMappings;
425  switch (MI.getOpcode()) {
426  case TargetOpcode::G_CONSTANT: {
427    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
428    if (Size == 1) {
429      static const OpRegBankEntry<1> Table[3] = {
430        { { AMDGPU::VGPRRegBankID }, 1 },
431        { { AMDGPU::SGPRRegBankID }, 1 },
432        { { AMDGPU::VCCRegBankID }, 1 }
433      };
434
435      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
436    }
437
438    LLVM_FALLTHROUGH;
439  }
440  case TargetOpcode::G_FCONSTANT:
441  case TargetOpcode::G_FRAME_INDEX:
442  case TargetOpcode::G_GLOBAL_VALUE: {
443    static const OpRegBankEntry<1> Table[2] = {
444      { { AMDGPU::VGPRRegBankID }, 1 },
445      { { AMDGPU::SGPRRegBankID }, 1 }
446    };
447
448    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
449  }
450  case TargetOpcode::G_AND:
451  case TargetOpcode::G_OR:
452  case TargetOpcode::G_XOR: {
453    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
454
455    if (Size == 1) {
456      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
457      const InstructionMapping &SCCMapping = getInstructionMapping(
458        1, 1, getOperandsMapping(
459          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
460           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
461           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
462        3); // Num Operands
463      AltMappings.push_back(&SCCMapping);
464
465      const InstructionMapping &VCCMapping0 = getInstructionMapping(
466        2, 1, getOperandsMapping(
467          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
468           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
469           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
470        3); // Num Operands
471      AltMappings.push_back(&VCCMapping0);
472      return AltMappings;
473    }
474
475    if (Size != 64)
476      break;
477
478    const InstructionMapping &SSMapping = getInstructionMapping(
479      1, 1, getOperandsMapping(
480        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
481         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
482         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
483      3); // Num Operands
484    AltMappings.push_back(&SSMapping);
485
486    const InstructionMapping &VVMapping = getInstructionMapping(
487      2, 2, getOperandsMapping(
488        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
489         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
490         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
491      3); // Num Operands
492    AltMappings.push_back(&VVMapping);
493
494    const InstructionMapping &SVMapping = getInstructionMapping(
495      3, 3, getOperandsMapping(
496        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
497         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
498         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
499      3); // Num Operands
500    AltMappings.push_back(&SVMapping);
501
502    // SGPR in LHS is slightly preferable, so make VS more expensive than
503    // SV.
504    const InstructionMapping &VSMapping = getInstructionMapping(
505      3, 4, getOperandsMapping(
506        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
507         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
508         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
509      3); // Num Operands
510    AltMappings.push_back(&VSMapping);
511    break;
512  }
513  case TargetOpcode::G_LOAD:
514  case TargetOpcode::G_ZEXTLOAD:
515  case TargetOpcode::G_SEXTLOAD: {
516    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
517    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
518    unsigned PtrSize = PtrTy.getSizeInBits();
519    unsigned AS = PtrTy.getAddressSpace();
520    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
521
522    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
523         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
524        isScalarLoadLegal(MI)) {
525      const InstructionMapping &SSMapping = getInstructionMapping(
526          1, 1, getOperandsMapping(
527                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
528                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
529          2); // Num Operands
530      AltMappings.push_back(&SSMapping);
531    }
532
533    const InstructionMapping &VVMapping = getInstructionMapping(
534        2, 1, getOperandsMapping(
535          {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
536           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
537        2); // Num Operands
538    AltMappings.push_back(&VVMapping);
539
540    // It may be possible to have a vgpr = load sgpr mapping here, because
541    // the mubuf instructions support this kind of load, but probably only for
542    // gfx7 and older.  However, the addressing mode matching in the instruction
543    // selector should be able to do a better job of detecting and selecting
544    // these kinds of loads from the vgpr = load vgpr mapping.
545
546    return AltMappings;
547
548  }
549  case TargetOpcode::G_ICMP: {
550    // TODO: Should report 32-bit for scalar output type.
551    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
552    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
553      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
554                          nullptr, // Predicate operand.
555                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
556                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
557      4); // Num Operands
558    AltMappings.push_back(&SSMapping);
559
560    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
561      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
562                          nullptr, // Predicate operand.
563                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
564                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
565      4); // Num Operands
566    AltMappings.push_back(&SVMapping);
567
568    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
569      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
570                          nullptr, // Predicate operand.
571                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
572                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
573      4); // Num Operands
574    AltMappings.push_back(&VSMapping);
575
576    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
577      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
578                          nullptr, // Predicate operand.
579                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
580                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
581      4); // Num Operands
582    AltMappings.push_back(&VVMapping);
583
584    return AltMappings;
585  }
586  case TargetOpcode::G_SELECT: {
587    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
588    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
589      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
590                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
591                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
592                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
593      4); // Num Operands
594    AltMappings.push_back(&SSMapping);
595
596    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
597      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
598                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
599                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
600                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
601      4); // Num Operands
602    AltMappings.push_back(&VVMapping);
603
604    return AltMappings;
605  }
606  case TargetOpcode::G_SMIN:
607  case TargetOpcode::G_SMAX:
608  case TargetOpcode::G_UMIN:
609  case TargetOpcode::G_UMAX: {
610    static const OpRegBankEntry<3> Table[4] = {
611      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
612      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
613      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
614
615      // Scalar requires cmp+select, and extends if 16-bit.
616      // FIXME: Should there be separate costs for 32 and 16-bit
617      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
618    };
619
620    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
621    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
622  }
623  case TargetOpcode::G_UADDE:
624  case TargetOpcode::G_USUBE:
625  case TargetOpcode::G_SADDE:
626  case TargetOpcode::G_SSUBE: {
627    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
628    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
629      getOperandsMapping(
630        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
631         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
632         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
633         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
634         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
635      5); // Num Operands
636    AltMappings.push_back(&SSMapping);
637
638    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
639      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
640                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
641                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
642                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
643                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
644      5); // Num Operands
645    AltMappings.push_back(&VVMapping);
646    return AltMappings;
647  }
648  case AMDGPU::G_BRCOND: {
649    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
650
651    // TODO: Change type to 32 for scalar
652    const InstructionMapping &SMapping = getInstructionMapping(
653      1, 1, getOperandsMapping(
654        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
655      2); // Num Operands
656    AltMappings.push_back(&SMapping);
657
658    const InstructionMapping &VMapping = getInstructionMapping(
659      1, 1, getOperandsMapping(
660        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
661      2); // Num Operands
662    AltMappings.push_back(&VMapping);
663    return AltMappings;
664  }
665  case AMDGPU::G_INTRINSIC:
666    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
667  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
668    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
669  default:
670    break;
671  }
672  return RegisterBankInfo::getInstrAlternativeMappings(MI);
673}
674
675void AMDGPURegisterBankInfo::split64BitValueForMapping(
676  MachineIRBuilder &B,
677  SmallVector<Register, 2> &Regs,
678  LLT HalfTy,
679  Register Reg) const {
680  assert(HalfTy.getSizeInBits() == 32);
681  MachineRegisterInfo *MRI = B.getMRI();
682  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
683  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
684  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
685  MRI->setRegBank(LoLHS, *Bank);
686  MRI->setRegBank(HiLHS, *Bank);
687
688  Regs.push_back(LoLHS);
689  Regs.push_back(HiLHS);
690
691  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
692    .addDef(LoLHS)
693    .addDef(HiLHS)
694    .addUse(Reg);
695}
696
697/// Replace the current type each register in \p Regs has with \p NewTy
698static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
699                          LLT NewTy) {
700  for (Register Reg : Regs) {
701    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
702    MRI.setType(Reg, NewTy);
703  }
704}
705
706static LLT getHalfSizedType(LLT Ty) {
707  if (Ty.isVector()) {
708    assert(Ty.getNumElements() % 2 == 0);
709    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
710  }
711
712  assert(Ty.getSizeInBits() % 2 == 0);
713  return LLT::scalar(Ty.getSizeInBits() / 2);
714}
715
716/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
717/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
718/// execute the instruction for each unique combination of values in all lanes
719/// in the wave. The block will be split such that rest of the instructions are
720/// moved to a new block.
721///
722/// Essentially performs this loop:
723///
724/// Save Execution Mask
725/// For (Lane : Wavefront) {
726///   Enable Lane, Disable all other lanes
727///   SGPR = read SGPR value for current lane from VGPR
728///   VGPRResult[Lane] = use_op SGPR
729/// }
730/// Restore Execution Mask
731///
732/// There is additional complexity in comparing the operand values so that the
733/// loop only needs to execute once per unique value actually used.
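/// The registers defined inside \p Range are threaded through G_PHIs in the
/// loop header, so each iteration sees either the initial IMPLICIT_DEF or the
/// value produced by the previous iteration.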
734bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
735  MachineIRBuilder &B,
736  iterator_range<MachineBasicBlock::iterator> Range,
737  SmallSet<Register, 4> &SGPROperandRegs,
738  MachineRegisterInfo &MRI) const {
739  SmallVector<Register, 4> ResultRegs;
740  SmallVector<Register, 4> InitResultRegs;
741  SmallVector<Register, 4> PhiRegs;
742
743  MachineBasicBlock &MBB = B.getMBB();
744  MachineFunction *MF = &B.getMF();
745
746  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
747  const unsigned WaveAndOpc = Subtarget.isWave32() ?
748    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
749  const unsigned MovTermOpc = Subtarget.isWave32() ?
750    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
751  const unsigned XorTermOpc = Subtarget.isWave32() ?
752    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
753  const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
754    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
755  const unsigned ExecReg =  Subtarget.isWave32() ?
756    AMDGPU::EXEC_LO : AMDGPU::EXEC;
757
758  for (MachineInstr &MI : Range) {
759    for (MachineOperand &Def : MI.defs()) {
760      LLT ResTy = MRI.getType(Def.getReg());
761      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
762      ResultRegs.push_back(Def.getReg());
763      Register InitReg = B.buildUndef(ResTy).getReg(0);
764      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
765      InitResultRegs.push_back(InitReg);
766      PhiRegs.push_back(PhiReg);
767      MRI.setRegBank(PhiReg, *DefBank);
768      MRI.setRegBank(InitReg, *DefBank);
769    }
770  }
771
772  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
773  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
774
775  // Don't bother using generic instructions/registers for the exec mask.
776  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
777    .addDef(InitSaveExecReg);
778
779  Register PhiExec = MRI.createVirtualRegister(WaveRC);
780  Register NewExec = MRI.createVirtualRegister(WaveRC);
781
782  // To insert the loop we need to split the block. Move everything before this
783  // point to a new block, and insert a new empty block before this instruction.
784  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
785  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
786  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
787  MachineFunction::iterator MBBI(MBB);
788  ++MBBI;
789  MF->insert(MBBI, LoopBB);
790  MF->insert(MBBI, RestoreExecBB);
791  MF->insert(MBBI, RemainderBB);
792
793  LoopBB->addSuccessor(RestoreExecBB);
794  LoopBB->addSuccessor(LoopBB);
795
796  // Move the rest of the block into a new block.
797  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
798  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
799
800  MBB.addSuccessor(LoopBB);
801  RestoreExecBB->addSuccessor(RemainderBB);
802
803  B.setInsertPt(*LoopBB, LoopBB->end());
804
805  B.buildInstr(TargetOpcode::PHI)
806    .addDef(PhiExec)
807    .addReg(InitSaveExecReg)
808    .addMBB(&MBB)
809    .addReg(NewExec)
810    .addMBB(LoopBB);
811
812  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
813    B.buildInstr(TargetOpcode::G_PHI)
814      .addDef(std::get<2>(Result))
815      .addReg(std::get<0>(Result)) // Initial value / implicit_def
816      .addMBB(&MBB)
817      .addReg(std::get<1>(Result)) // Mid-loop value.
818      .addMBB(LoopBB);
819  }
820
821  const DebugLoc &DL = B.getDL();
822
823  // Figure out the iterator range after splicing the instructions.
824  auto NewBegin = std::prev(LoopBB->end());
825
826  // Move the instruction into the loop. Note we moved everything after
827  // Range.end() already into a new block, so Range.end() is no longer valid.
828  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
829
830  auto NewEnd = LoopBB->end();
831
832  MachineBasicBlock::iterator I = Range.begin();
833  B.setInsertPt(*LoopBB, I);
834
835  Register CondReg;
836
837  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
838    for (MachineOperand &Op : MI.uses()) {
839      if (!Op.isReg() || Op.isDef())
840        continue;
841
842      if (SGPROperandRegs.count(Op.getReg())) {
843        LLT OpTy = MRI.getType(Op.getReg());
844        unsigned OpSize = OpTy.getSizeInBits();
845
846        // Can only do a readlane of 32-bit pieces.
847        if (OpSize == 32) {
848          // Avoid extra copies in the simple case of one 32-bit register.
849          Register CurrentLaneOpReg
850            = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
851          MRI.setType(CurrentLaneOpReg, OpTy);
852
853          constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
854          // Read the next variant <- also loop target.
855          BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
856                  CurrentLaneOpReg)
857            .addReg(Op.getReg());
858
859          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
860          bool First = CondReg == AMDGPU::NoRegister;
861          if (First)
862            CondReg = NewCondReg;
863
864          // Compare the just read SGPR value against the value in each lane.
865          B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
866            .addDef(NewCondReg)
867            .addReg(CurrentLaneOpReg)
868            .addReg(Op.getReg());
869          Op.setReg(CurrentLaneOpReg);
870
871          if (!First) {
872            Register AndReg = MRI.createVirtualRegister(WaveRC);
873
874            // If there are multiple operands to consider, AND the conditions together.
875            B.buildInstr(WaveAndOpc)
876              .addDef(AndReg)
877              .addReg(NewCondReg)
878              .addReg(CondReg);
879            CondReg = AndReg;
880          }
881        } else {
882          LLT S32 = LLT::scalar(32);
883          SmallVector<Register, 8> ReadlanePieces;
884
885          // The compares can be done as 64-bit, but the extract needs to be done
886          // in 32-bit pieces.
887
888          bool Is64 = OpSize % 64 == 0;
889
890          LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
891          unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
892            : AMDGPU::V_CMP_EQ_U32_e64;
893
897          // Insert the unmerge before the loop.
898
899          B.setMBB(MBB);
900          auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
901          B.setInstr(*I);
902
903          unsigned NumPieces = Unmerge->getNumOperands() - 1;
904          for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
905            Register UnmergePiece = Unmerge.getReg(PieceIdx);
906
907            Register CurrentLaneOpReg;
908            if (Is64) {
909              Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
910              Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
911
912              MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
913              MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
914              MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
915
916              // Read the next variant <- also loop target.
917              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
918                      CurrentLaneOpRegLo)
919                .addReg(UnmergePiece, 0, AMDGPU::sub0);
920
921              // Read the next variant <- also loop target.
922              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
923                      CurrentLaneOpRegHi)
924                .addReg(UnmergePiece, 0, AMDGPU::sub1);
925
926              CurrentLaneOpReg =
927                B.buildMerge(LLT::scalar(64),
928                             {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
929                .getReg(0);
930
931              MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
932
933              if (OpTy.getScalarSizeInBits() == 64) {
934                // If we need to produce a 64-bit element vector, use the
935                // merged pieces.
936                ReadlanePieces.push_back(CurrentLaneOpReg);
937              } else {
938                // 32-bit element type.
939                ReadlanePieces.push_back(CurrentLaneOpRegLo);
940                ReadlanePieces.push_back(CurrentLaneOpRegHi);
941              }
942            } else {
943              CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
944              MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
945              MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
946
947              // Read the next variant <- also loop target.
948              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
949                      CurrentLaneOpReg)
950                .addReg(UnmergePiece);
951              ReadlanePieces.push_back(CurrentLaneOpReg);
952            }
953
954            Register NewCondReg = MRI.createVirtualRegister(WaveRC);
955            bool First = CondReg == AMDGPU::NoRegister;
956            if (First)
957              CondReg = NewCondReg;
958
959            B.buildInstr(CmpOp)
960              .addDef(NewCondReg)
961              .addReg(CurrentLaneOpReg)
962              .addReg(UnmergePiece);
963
964            if (!First) {
965              Register AndReg = MRI.createVirtualRegister(WaveRC);
966
967              // If there are multiple operands to consider, AND the conditions together.
968              B.buildInstr(WaveAndOpc)
969                .addDef(AndReg)
970                .addReg(NewCondReg)
971                .addReg(CondReg);
972              CondReg = AndReg;
973            }
974          }
975
976          // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
977          // BUILD_VECTOR
978          if (OpTy.isVector()) {
979            auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
980            Op.setReg(Merge.getReg(0));
981          } else {
982            auto Merge = B.buildMerge(OpTy, ReadlanePieces);
983            Op.setReg(Merge.getReg(0));
984          }
985
986          MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
987        }
988      }
989    }
990  }
991
992  B.setInsertPt(*LoopBB, LoopBB->end());
993
994  // Update EXEC, save the original EXEC value to VCC.
995  B.buildInstr(AndSaveExecOpc)
996    .addDef(NewExec)
997    .addReg(CondReg, RegState::Kill);
998
999  MRI.setSimpleHint(NewExec, CondReg);
1000
1001  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1002  B.buildInstr(XorTermOpc)
1003    .addDef(ExecReg)
1004    .addReg(ExecReg)
1005    .addReg(NewExec);
1006
1007  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1008  // s_cbranch_scc0?
1009
1010  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1011  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1012    .addMBB(LoopBB);
1013
1014  // Save the EXEC mask before the loop.
1015  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1016    .addReg(ExecReg);
1017
1018  // Restore the EXEC mask after the loop.
1019  B.setMBB(*RestoreExecBB);
1020  B.buildInstr(MovTermOpc)
1021    .addDef(ExecReg)
1022    .addReg(SaveExecReg);
1023
1024  // Set the insert point after the original instruction, so any new
1025  // instructions will be in the remainder.
1026  B.setInsertPt(*RemainderBB, RemainderBB->begin());
1027
1028  return true;
1029}
1030
1031// Return any unique registers used by \p MI at \p OpIndices that need to be
1032// handled in a waterfall loop. Returns these registers in \p
1033// SGPROperandRegs. Returns true if there are any operands to handle and a
1034// waterfall loop is necessary.
1035bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1036  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1037  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1038  for (unsigned Op : OpIndices) {
1039    assert(MI.getOperand(Op).isUse());
1040    Register Reg = MI.getOperand(Op).getReg();
1041    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1042    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
1043      SGPROperandRegs.insert(Reg);
1044  }
1045
1046  // No operands need to be replaced, so no need to loop.
1047  return !SGPROperandRegs.empty();
1048}
1049
1050bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1051  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1052  ArrayRef<unsigned> OpIndices) const {
1053  // Use a set to avoid extra readfirstlanes in the case where multiple operands
1054  // are the same register.
1055  SmallSet<Register, 4> SGPROperandRegs;
1056
1057  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1058    return false;
1059
1060  MachineBasicBlock::iterator I = MI.getIterator();
1061  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1062                                SGPROperandRegs, MRI);
1063}
1064
1065bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1066  MachineInstr &MI, MachineRegisterInfo &MRI,
1067  ArrayRef<unsigned> OpIndices) const {
1068  MachineIRBuilder B(MI);
1069  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1070}
1071
1072// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1073void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1074    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1075  Register Reg = MI.getOperand(OpIdx).getReg();
1076  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1077  if (Bank != &AMDGPU::VGPRRegBank)
1078    return;
1079
1080  MachineIRBuilder B(MI);
1081  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1082  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1083    .addDef(SGPR)
1084    .addReg(Reg);
1085
1086  MRI.setType(SGPR, MRI.getType(Reg));
1087
1088  const TargetRegisterClass *Constrained =
1089      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1090  (void)Constrained;
1091  assert(Constrained && "Failed to constrain readfirstlane src reg");
1092
1093  MI.getOperand(OpIdx).setReg(SGPR);
1094}
1095
1096// When regbankselect repairs registers, it will insert a repair instruction
1097// which defines the repaired register.  Then it calls applyMapping and expects
1098// that the targets will either delete or rewrite the instructions that originally
1099// wrote to the repaired registers.  Because of this, we end up in a situation where
1100// we have 2 instructions defining the same registers.
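// (This temporarily breaks SSA form; the extra definition disappears once the
// original instruction has been rewritten or erased.)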
1101static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
1102                                     Register Reg,
1103                                     const MachineInstr &MI) {
1104  // Is there some way we can assert that there are exactly 2 def instructions?
1105  for (MachineInstr &Other : MRI.def_instructions(Reg)) {
1106    if (&Other != &MI)
1107      return &Other;
1108  }
1109
1110  return nullptr;
1111}
1112
1113bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
1114                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1115                                              MachineRegisterInfo &MRI) const {
1116  Register DstReg = MI.getOperand(0).getReg();
1117  const LLT LoadTy =  MRI.getType(DstReg);
1118  unsigned LoadSize = LoadTy.getSizeInBits();
1119  const unsigned MaxNonSmrdLoadSize = 128;
1120  // 128-bit loads are supported for all instruction types.
1121  if (LoadSize <= MaxNonSmrdLoadSize)
1122    return false;
1123
1124  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
1125  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
1126
1127  // If the pointer is an SGPR, we have nothing to do.
1128  if (SrcRegs.empty()) {
1129    const RegisterBank *PtrBank =
1130      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1131    if (PtrBank == &AMDGPU::SGPRRegBank)
1132      return false;
1133    SrcRegs.push_back(MI.getOperand(1).getReg());
1134  }
1135
1136  assert(LoadSize % MaxNonSmrdLoadSize == 0);
1137
1138  // We want to get the repair instruction now, because it will help us
1139  // determine which instruction the legalizer inserts that will also
1140  // write to DstReg.
1141  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
1142
1143  // RegBankSelect only emits scalar types, so we need to reset the pointer
1144  // operand to a pointer type.
1145  Register BasePtrReg = SrcRegs[0];
1146  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1147  MRI.setType(BasePtrReg, PtrTy);
1148
1149  MachineIRBuilder B(MI);
1150
1151  unsigned SplitElts =
1152      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
1153  const LLT LoadSplitTy =  LLT::vector(SplitElts, LoadTy.getScalarType());
1154  ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
1155  GISelObserverWrapper Observer(&O);
1156  B.setChangeObserver(Observer);
1157  LegalizerHelper Helper(B.getMF(), Observer, B);
1158  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1159    return false;
1160
1161  // At this point, the legalizer has split the original load into smaller
1162  // loads.  At the end of lowering, it inserts an instruction (LegalizedInst)
1163  // that combines the outputs of the lower loads and writes it to DstReg.
1164  // The register bank selector has also added the RepairInst which writes to
1165  // DstReg as well.
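  // For example, a 256-bit load is split into two 128-bit loads here, and the
  // per-element extracts built below forward the combined result into the
  // registers RegBankSelect set up for the original wide destination.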
1166
1167  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
1168
1169  // Replace the output of the LegalizedInst with a temporary register, since
1170  // RepairInst already defines DstReg.
1171  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
1172  LegalizedInst->getOperand(0).setReg(TmpReg);
1173  B.setInsertPt(*RepairInst->getParent(), RepairInst);
1174
1175  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
1176    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
1177    B.buildConstant(IdxReg, DefIdx);
1178    MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
1179    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
1180  }
1181
1182  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1183  return true;
1184}
1185
1186bool AMDGPURegisterBankInfo::applyMappingImage(
1187    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1188    MachineRegisterInfo &MRI, int RsrcIdx) const {
1189  const int NumDefs = MI.getNumExplicitDefs();
1190
1191  // The reported argument index is relative to the IR intrinsic call arguments,
1192  // so we need to shift by the number of defs and the intrinsic ID.
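  // e.g. with a single def, IR argument I becomes machine operand I + 2 (the
  // def, then the intrinsic ID, then the arguments).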
1193  RsrcIdx += NumDefs + 1;
1194
1195  // Insert copies to VGPR arguments.
1196  applyDefaultMapping(OpdMapper);
1197
1198  // Fixup any SGPR arguments.
1199  SmallVector<unsigned, 4> SGPRIndexes;
1200  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1201    if (!MI.getOperand(I).isReg())
1202      continue;
1203
1204    // If this intrinsic has a sampler, it immediately follows rsrc.
1205    if (I == RsrcIdx || I == RsrcIdx + 1)
1206      SGPRIndexes.push_back(I);
1207  }
1208
1209  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1210  return true;
1211}
1212
1213// FIXME: Duplicated from LegalizerHelper
1214static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
1215  switch (Opc) {
1216  case TargetOpcode::G_SMIN:
1217    return CmpInst::ICMP_SLT;
1218  case TargetOpcode::G_SMAX:
1219    return CmpInst::ICMP_SGT;
1220  case TargetOpcode::G_UMIN:
1221    return CmpInst::ICMP_ULT;
1222  case TargetOpcode::G_UMAX:
1223    return CmpInst::ICMP_UGT;
1224  default:
1225    llvm_unreachable("not in integer min/max");
1226  }
1227}
1228
1229// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
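// A scalar boolean lives in a 32-bit SGPR rather than a 1-bit VCC value, so
// the compare below is built with a 32-bit result type and assigned to the
// SGPR bank.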
1230void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
1231                                               MachineInstr &MI) const {
1232  Register Dst = MI.getOperand(0).getReg();
1233  Register Src0 = MI.getOperand(1).getReg();
1234  Register Src1 = MI.getOperand(2).getReg();
1235
1236  const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
1237  LLT CmpType = LLT::scalar(32);
1238
1239  auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
1240  B.buildSelect(Dst, Cmp, Src0, Src1);
1241
1242  B.getMRI()->setRegBank(Cmp.getReg(0), AMDGPU::SGPRRegBank);
1243  MI.eraseFromParent();
1244}
1245
1246// For cases where only a single copy is inserted for matching register banks,
1247// replace the register in the instruction operand.
1248static void substituteSimpleCopyRegs(
1249  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1250  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1251  if (!SrcReg.empty()) {
1252    assert(SrcReg.size() == 1);
1253    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1254  }
1255}
1256
1257/// Handle register layout difference for f16 images for some subtargets.
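/// On subtargets with unpacked D16 memory instructions each 16-bit element
/// occupies its own 32-bit register, so e.g. a <4 x s16> store value is
/// widened here into four 32-bit registers before selection.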
1258Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1259                                                MachineRegisterInfo &MRI,
1260                                                Register Reg) const {
1261  if (!Subtarget.hasUnpackedD16VMem())
1262    return Reg;
1263
1264  const LLT S16 = LLT::scalar(16);
1265  LLT StoreVT = MRI.getType(Reg);
1266  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1267    return Reg;
1268
1269  auto Unmerge = B.buildUnmerge(S16, Reg);
1270
1271
1272  SmallVector<Register, 4> WideRegs;
1273  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1274    WideRegs.push_back(Unmerge.getReg(I));
1275
1276  const LLT S32 = LLT::scalar(32);
1277  int NumElts = StoreVT.getNumElements();
1278
1279  return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1280}
1281
1282static std::pair<Register, unsigned>
1283getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1284  int64_t Const;
1285  if (mi_match(Reg, MRI, m_ICst(Const)))
1286    return std::make_pair(Register(), Const);
1287
1288  Register Base;
1289  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1290    return std::make_pair(Base, Const);
1291
1292  // TODO: Handle G_OR used for add case
1293  return std::make_pair(Reg, 0);
1294}
1295
1296std::pair<Register, unsigned>
1297AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1298                                           Register OrigOffset) const {
1299  const unsigned MaxImm = 4095;
1300  Register BaseReg;
1301  unsigned ImmOffset;
1302  const LLT S32 = LLT::scalar(32);
1303
1304  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1305                                                           OrigOffset);
1306
1307  unsigned C1 = 0;
1308  if (ImmOffset != 0) {
1309    // If the immediate value is too big for the immoffset field, put the value
1310    // and -4096 into the immoffset field so that the value that is copied/added
1311    // for the voffset field is a multiple of 4096, and it stands more chance
1312    // of being CSEd with the copy/add for another similar load/store.
1313    // However, do not do that rounding down to a multiple of 4096 if that is a
1314    // negative number, as it appears to be illegal to have a negative offset
1315    // in the vgpr, even if adding the immediate offset makes it positive.
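    // For example, an incoming offset of 4100 is split into Overflow = 4096,
    // which is added into the base register below, and C1 = 4, which is left
    // for the instruction's immediate offset field.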
1316    unsigned Overflow = ImmOffset & ~MaxImm;
1317    ImmOffset -= Overflow;
1318    if ((int32_t)Overflow < 0) {
1319      Overflow += ImmOffset;
1320      ImmOffset = 0;
1321    }
1322
1323    C1 = ImmOffset;
1324    if (Overflow != 0) {
1325      if (!BaseReg)
1326        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1327      else {
1328        auto OverflowVal = B.buildConstant(S32, Overflow);
1329        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1330      }
1331    }
1332  }
1333
1334  if (!BaseReg)
1335    BaseReg = B.buildConstant(S32, 0).getReg(0);
1336
1337  return {BaseReg, C1};
1338}
1339
1340static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1341  int64_t C;
1342  return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1343}
1344
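// The cache policy immediate packs glc in bit 0, slc in bit 1 and dlc in
// bit 2; the helpers below extract the individual bits for use as separate
// MUBUF operands.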
1345static unsigned extractGLC(unsigned CachePolicy) {
1346  return CachePolicy & 1;
1347}
1348
1349static unsigned extractSLC(unsigned CachePolicy) {
1350  return (CachePolicy >> 1) & 1;
1351}
1352
1353static unsigned extractDLC(unsigned CachePolicy) {
1354  return (CachePolicy >> 2) & 1;
1355}
1356
1357MachineInstr *
1358AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1359                                             MachineInstr &MI) const {
1360  MachineRegisterInfo &MRI = *B.getMRI();
1361  executeInWaterfallLoop(B, MI, MRI, {2, 4});
1362
1363  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1364
1365  Register VData = MI.getOperand(1).getReg();
1366  LLT Ty = MRI.getType(VData);
1367
1368  int EltSize = Ty.getScalarSizeInBits();
1369  int Size = Ty.getSizeInBits();
1370
1371  // FIXME: Broken integer truncstore.
1372  if (EltSize != 32)
1373    report_fatal_error("unhandled intrinsic store");
1374
1375  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1376  const int MemSize = (*MI.memoperands_begin())->getSize();
1377
1378
1379  Register RSrc = MI.getOperand(2).getReg();
1380  Register VOffset = MI.getOperand(3).getReg();
1381  Register SOffset = MI.getOperand(4).getReg();
1382  unsigned CachePolicy = MI.getOperand(5).getImm();
1383
1384  unsigned ImmOffset;
1385  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1386
1387  const bool Offen = !isZero(VOffset, MRI);
1388
1389  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1390  switch (8 * MemSize) {
1391  case 8:
1392    Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1393                  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1394    break;
1395  case 16:
1396    Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1397                  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1398    break;
1399  default:
1400    Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1401                  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1402    if (Size > 32)
1403      Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1404    break;
1405  }
1406
1408  // Set the insertion point back to the instruction in case it was moved into a
1409  // loop.
1410  B.setInstr(MI);
1411
1412  MachineInstrBuilder MIB = B.buildInstr(Opc)
1413    .addUse(VData);
1414
1415  if (Offen)
1416    MIB.addUse(VOffset);
1417
1418  MIB.addUse(RSrc)
1419     .addUse(SOffset)
1420     .addImm(ImmOffset)
1421     .addImm(extractGLC(CachePolicy))
1422     .addImm(extractSLC(CachePolicy))
1423     .addImm(0) // tfe: FIXME: Remove from inst
1424     .addImm(extractDLC(CachePolicy))
1425     .cloneMemRefs(MI);
1426
1427  // FIXME: We need a way to report failure from applyMappingImpl.
1428  // Insert constrain copies before inserting the loop.
1429  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1430    report_fatal_error("failed to constrain selected store intrinsic");
1431
1432  return MIB;
1433}
1434
1435bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1436                                        Register SrcReg) const {
1437  MachineRegisterInfo &MRI = *B.getMRI();
1438  LLT SrcTy = MRI.getType(SrcReg);
1439  if (SrcTy.getSizeInBits() == 32) {
1440    // Use a v_mov_b32 here to make the exec dependency explicit.
1441    B.buildInstr(AMDGPU::V_MOV_B32_e32)
1442      .addDef(DstReg)
1443      .addUse(SrcReg);
1444    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1445           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1446  }
1447
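  // For 64-bit values, copy each 32-bit half with a v_mov_b32 and rebuild the
  // result with a REG_SEQUENCE.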
1448  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1449  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1450
1451  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1452    .addDef(TmpReg0)
1453    .addUse(SrcReg, 0, AMDGPU::sub0);
1454  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1455    .addDef(TmpReg1)
1456    .addUse(SrcReg, 0, AMDGPU::sub1);
1457  B.buildInstr(AMDGPU::REG_SEQUENCE)
1458    .addDef(DstReg)
1459    .addUse(TmpReg0)
1460    .addImm(AMDGPU::sub0)
1461    .addUse(TmpReg1)
1462    .addImm(AMDGPU::sub1);
1463
1464  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1465         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1466}
1467
1468void AMDGPURegisterBankInfo::applyMappingImpl(
1469    const OperandsMapper &OpdMapper) const {
1470  MachineInstr &MI = OpdMapper.getMI();
1471  unsigned Opc = MI.getOpcode();
1472  MachineRegisterInfo &MRI = OpdMapper.getMRI();
1473  switch (Opc) {
1474  case AMDGPU::G_PHI: {
1475    Register DstReg = MI.getOperand(0).getReg();
1476    LLT DstTy = MRI.getType(DstReg);
1477    if (DstTy != LLT::scalar(1))
1478      break;
1479
1480    const LLT S32 = LLT::scalar(32);
1481    const RegisterBank *DstBank =
1482      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1483    if (DstBank == &AMDGPU::VCCRegBank) {
1484      applyDefaultMapping(OpdMapper);
1485      // The standard handling only considers the result register bank for
1486      // phis. For VCC, blindly inserting a copy when the phi is lowered will
1487      // produce an invalid copy. We can only copy with some kind of compare to
      // get a vector boolean result. Insert a register bank copy that will be
1489      // correctly lowered to a compare.
1490      MachineIRBuilder B(*MI.getParent()->getParent());
1491
1492      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1493        Register SrcReg = MI.getOperand(I).getReg();
1494        const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1495
1496        if (SrcBank != &AMDGPU::VCCRegBank) {
1497          MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
1498          B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
1499
1500          auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
1501          MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
1502          MI.getOperand(I).setReg(Copy.getReg(0));
1503        }
1504      }
1505
1506      return;
1507    }
1508
1509    // Phi handling is strange and only considers the bank of the destination.
1510    substituteSimpleCopyRegs(OpdMapper, 0);
1511
1512    // Promote SGPR/VGPR booleans to s32
1513    MachineFunction *MF = MI.getParent()->getParent();
1514    ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1515    GISelObserverWrapper Observer(&ApplyBank);
1516    MachineIRBuilder B(MI);
1517    LegalizerHelper Helper(*MF, Observer, B);
1518
1519    if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1520      llvm_unreachable("widen scalar should have succeeded");
1521
1522    return;
1523  }
1524  case AMDGPU::G_ICMP:
1525  case AMDGPU::G_UADDO:
1526  case AMDGPU::G_USUBO:
1527  case AMDGPU::G_UADDE:
1528  case AMDGPU::G_SADDE:
1529  case AMDGPU::G_USUBE:
1530  case AMDGPU::G_SSUBE: {
1531    unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
1532    Register DstReg = MI.getOperand(BoolDstOp).getReg();
1533
1534    const RegisterBank *DstBank =
1535      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1536    if (DstBank != &AMDGPU::SGPRRegBank)
1537      break;
1538
1539    const bool HasCarryIn = MI.getNumOperands() == 5;
1540
1541    // If this is a scalar compare, promote the result to s32, as the selection
1542    // will end up using a copy to a 32-bit vreg.
1543    const LLT S32 = LLT::scalar(32);
1544    Register NewDstReg = MRI.createGenericVirtualRegister(S32);
1545    MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
1546    MI.getOperand(BoolDstOp).setReg(NewDstReg);
1547    MachineIRBuilder B(MI);
1548
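    // The carry-in is also a boolean, so zero-extend it to the promoted s32
    // type.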
1549    if (HasCarryIn) {
1550      Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
1551      MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
1552      B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
1553      MI.getOperand(4).setReg(NewSrcReg);
1554    }
1555
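    // Truncate the widened s32 result back into the original boolean def
    // after the instruction.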
1556    MachineBasicBlock *MBB = MI.getParent();
1557    B.setInsertPt(*MBB, std::next(MI.getIterator()));
1558    B.buildTrunc(DstReg, NewDstReg);
1559    return;
1560  }
1561  case AMDGPU::G_SELECT: {
1562    Register DstReg = MI.getOperand(0).getReg();
1563    LLT DstTy = MRI.getType(DstReg);
1564
1565    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
1566    if (CondRegs.empty())
1567      CondRegs.push_back(MI.getOperand(1).getReg());
1568    else {
1569      assert(CondRegs.size() == 1);
1570    }
1571
1572    const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
1573    if (CondBank == &AMDGPU::SGPRRegBank) {
1574      MachineIRBuilder B(MI);
1575      const LLT S32 = LLT::scalar(32);
1576      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
1577      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
1578
1579      MI.getOperand(1).setReg(NewCondReg);
1580      B.buildZExt(NewCondReg, CondRegs[0]);
1581    }
1582
1583    if (DstTy.getSizeInBits() != 64)
1584      break;
1585
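    // A 64-bit select is split into two 32-bit selects over the half
    // registers.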
1586    MachineIRBuilder B(MI);
1587    LLT HalfTy = getHalfSizedType(DstTy);
1588
1589    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1590    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1591    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
1592
1593    // All inputs are SGPRs, nothing special to do.
1594    if (DefRegs.empty()) {
1595      assert(Src1Regs.empty() && Src2Regs.empty());
1596      break;
1597    }
1598
1599    if (Src1Regs.empty())
1600      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1601    else {
1602      setRegsToType(MRI, Src1Regs, HalfTy);
1603    }
1604
1605    if (Src2Regs.empty())
1606      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
1607    else
1608      setRegsToType(MRI, Src2Regs, HalfTy);
1609
1610    setRegsToType(MRI, DefRegs, HalfTy);
1611
1612    B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
1613    B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
1614
1615    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1616    MI.eraseFromParent();
1617    return;
1618  }
1619  case AMDGPU::G_BRCOND: {
1620    Register CondReg = MI.getOperand(0).getReg();
1621    // FIXME: Should use legalizer helper, but should change bool ext type.
1622    const RegisterBank *CondBank =
1623      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1624
1625    if (CondBank == &AMDGPU::SGPRRegBank) {
1626      MachineIRBuilder B(MI);
1627      const LLT S32 = LLT::scalar(32);
1628      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
1629      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
1630
1631      MI.getOperand(0).setReg(NewCondReg);
1632      B.buildZExt(NewCondReg, CondReg);
1633      return;
1634    }
1635
1636    break;
1637  }
1638  case AMDGPU::G_AND:
1639  case AMDGPU::G_OR:
1640  case AMDGPU::G_XOR: {
    // 64-bit G_AND/G_OR/G_XOR are only available on the SALU, so split into 2
    // 32-bit ops if there is a VGPR input.
1643    Register DstReg = MI.getOperand(0).getReg();
1644    LLT DstTy = MRI.getType(DstReg);
1645
1646    if (DstTy.getSizeInBits() == 1) {
1647      const RegisterBank *DstBank =
1648        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1649      if (DstBank == &AMDGPU::VCCRegBank)
1650        break;
1651
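      // Booleans outside of VCC have no 1-bit instructions, so widen the
      // operation to 32 bits.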
1652      MachineFunction *MF = MI.getParent()->getParent();
1653      ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1654      GISelObserverWrapper Observer(&ApplyBank);
1655      MachineIRBuilder B(MI);
1656      LegalizerHelper Helper(*MF, Observer, B);
1657
1658      if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1659          LegalizerHelper::Legalized)
1660        llvm_unreachable("widen scalar should have succeeded");
1661      return;
1662    }
1663
1664    if (DstTy.getSizeInBits() != 64)
1665      break;
1666
1667    LLT HalfTy = getHalfSizedType(DstTy);
1668    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1669    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
1670    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1671
1672    // All inputs are SGPRs, nothing special to do.
1673    if (DefRegs.empty()) {
1674      assert(Src0Regs.empty() && Src1Regs.empty());
1675      break;
1676    }
1677
1678    assert(DefRegs.size() == 2);
1679    assert(Src0Regs.size() == Src1Regs.size() &&
1680           (Src0Regs.empty() || Src0Regs.size() == 2));
1681
1682    // Depending on where the source registers came from, the generic code may
1683    // have decided to split the inputs already or not. If not, we still need to
1684    // extract the values.
1685    MachineIRBuilder B(MI);
1686
1687    if (Src0Regs.empty())
1688      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
1689    else
1690      setRegsToType(MRI, Src0Regs, HalfTy);
1691
1692    if (Src1Regs.empty())
1693      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1694    else
1695      setRegsToType(MRI, Src1Regs, HalfTy);
1696
1697    setRegsToType(MRI, DefRegs, HalfTy);
1698
1699    B.buildInstr(Opc)
1700      .addDef(DefRegs[0])
1701      .addUse(Src0Regs[0])
1702      .addUse(Src1Regs[0]);
1703
1704    B.buildInstr(Opc)
1705      .addDef(DefRegs[1])
1706      .addUse(Src0Regs[1])
1707      .addUse(Src1Regs[1]);
1708
1709    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1710    MI.eraseFromParent();
1711    return;
1712  }
1713  case AMDGPU::G_ADD:
1714  case AMDGPU::G_SUB:
1715  case AMDGPU::G_MUL: {
1716    Register DstReg = MI.getOperand(0).getReg();
1717    LLT DstTy = MRI.getType(DstReg);
1718    if (DstTy != LLT::scalar(16))
1719      break;
1720
1721    const RegisterBank *DstBank =
1722      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1723    if (DstBank == &AMDGPU::VGPRRegBank)
1724      break;
1725
1726    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
1727    MachineFunction *MF = MI.getParent()->getParent();
1728    MachineIRBuilder B(MI);
1729    ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
1730    GISelObserverWrapper Observer(&ApplySALU);
1731    LegalizerHelper Helper(*MF, Observer, B);
1732
1733    if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1734        LegalizerHelper::Legalized)
1735      llvm_unreachable("widen scalar should have succeeded");
1736    return;
1737  }
1738  case AMDGPU::G_SMIN:
1739  case AMDGPU::G_SMAX:
1740  case AMDGPU::G_UMIN:
1741  case AMDGPU::G_UMAX: {
1742    Register DstReg = MI.getOperand(0).getReg();
1743    const RegisterBank *DstBank =
1744      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1745    if (DstBank == &AMDGPU::VGPRRegBank)
1746      break;
1747
1748    MachineFunction *MF = MI.getParent()->getParent();
1749    MachineIRBuilder B(MI);
1750
1751    // Turn scalar min/max into a compare and select.
1752    LLT Ty = MRI.getType(DstReg);
1753    LLT S32 = LLT::scalar(32);
1754    LLT S16 = LLT::scalar(16);
1755
1756    if (Ty == S16) {
1757      ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
1758      GISelObserverWrapper Observer(&ApplySALU);
1759      LegalizerHelper Helper(*MF, Observer, B);
1760
1761      // Need to widen to s32, and expand as cmp + select.
1762      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1763        llvm_unreachable("widenScalar should have succeeded");
1764
1765      // FIXME: This is relying on widenScalar leaving MI in place.
1766      lowerScalarMinMax(B, MI);
1767    } else
1768      lowerScalarMinMax(B, MI);
1769
1770    return;
1771  }
1772  case AMDGPU::G_SEXT:
1773  case AMDGPU::G_ZEXT: {
1774    Register SrcReg = MI.getOperand(1).getReg();
1775    LLT SrcTy = MRI.getType(SrcReg);
1776    bool Signed = Opc == AMDGPU::G_SEXT;
1777
1778    MachineIRBuilder B(MI);
1779    const RegisterBank *SrcBank =
1780      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1781
1782    Register DstReg = MI.getOperand(0).getReg();
1783    LLT DstTy = MRI.getType(DstReg);
1784    if (DstTy.isScalar() &&
1785        SrcBank != &AMDGPU::SGPRRegBank &&
1786        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
1789        DstTy.getSizeInBits() == 64 &&
1790        SrcTy.getSizeInBits() <= 32) {
1791      const LLT S32 = LLT::scalar(32);
1792      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1793
1794      // Extend to 32-bit, and then extend the low half.
1795      if (Signed) {
1796        // TODO: Should really be buildSExtOrCopy
1797        B.buildSExtOrTrunc(DefRegs[0], SrcReg);
1798
1799        // Replicate sign bit from 32-bit extended part.
1800        auto ShiftAmt = B.buildConstant(S32, 31);
1801        MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1802        B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
1803      } else {
1804        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
1805        B.buildConstant(DefRegs[1], 0);
1806      }
1807
1808      MRI.setRegBank(DstReg, *SrcBank);
1809      MI.eraseFromParent();
1810      return;
1811    }
1812
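    // Everything below handles boolean (s1) sources.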
1813    if (SrcTy != LLT::scalar(1))
1814      return;
1815
1816    if (SrcBank == &AMDGPU::VCCRegBank) {
1817      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1818
1819      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
1820
1821      unsigned DstSize = DstTy.getSizeInBits();
1822      // 64-bit select is SGPR only
1823      const bool UseSel64 = DstSize > 32 &&
1824        SrcBank->getID() == AMDGPU::SGPRRegBankID;
1825
1826      // TODO: Should s16 select be legal?
1827      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
1828      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
1829      auto False = B.buildConstant(SelType, 0);
1830
1831      MRI.setRegBank(True.getReg(0), *DstBank);
1832      MRI.setRegBank(False.getReg(0), *DstBank);
1833      MRI.setRegBank(DstReg, *DstBank);
1834
1835      if (DstSize > 32) {
1836        B.buildSelect(DefRegs[0], SrcReg, True, False);
1837        B.buildCopy(DefRegs[1], DefRegs[0]);
1838      } else if (DstSize < 32) {
1839        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
1840        MRI.setRegBank(Sel.getReg(0), *DstBank);
1841        B.buildTrunc(DstReg, Sel);
1842      } else {
1843        B.buildSelect(DstReg, SrcReg, True, False);
1844      }
1845
1846      MI.eraseFromParent();
1847      return;
1848    }
1849
    // Fix up the case with an s1 src that isn't a condition register. Use
    // shifts instead of introducing a compare to avoid an unnecessary
    // condition register (and since there are no scalar 16-bit compares).
1853    auto Ext = B.buildAnyExt(DstTy, SrcReg);
1854    auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
1855    auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
1856
1857    if (MI.getOpcode() == AMDGPU::G_SEXT)
1858      B.buildAShr(DstReg, Shl, ShiftAmt);
1859    else
1860      B.buildLShr(DstReg, Shl, ShiftAmt);
1861
1862    MRI.setRegBank(DstReg, *SrcBank);
1863    MRI.setRegBank(Ext.getReg(0), *SrcBank);
1864    MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1865    MRI.setRegBank(Shl.getReg(0), *SrcBank);
1866    MI.eraseFromParent();
1867    return;
1868  }
1869  case AMDGPU::G_BUILD_VECTOR:
1870  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
1871    Register DstReg = MI.getOperand(0).getReg();
1872    LLT DstTy = MRI.getType(DstReg);
1873    if (DstTy != LLT::vector(2, 16))
1874      break;
1875
1876    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
1877    substituteSimpleCopyRegs(OpdMapper, 1);
1878    substituteSimpleCopyRegs(OpdMapper, 2);
1879
1880    const RegisterBank *DstBank =
1881      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1882    if (DstBank == &AMDGPU::SGPRRegBank)
1883      break; // Can use S_PACK_* instructions.
1884
1885    MachineIRBuilder B(MI);
1886
1887    Register Lo = MI.getOperand(1).getReg();
1888    Register Hi = MI.getOperand(2).getReg();
1889    const LLT S32 = LLT::scalar(32);
1890
1891    const RegisterBank *BankLo =
1892      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1893    const RegisterBank *BankHi =
1894      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1895
1896    Register ZextLo;
1897    Register ShiftHi;
1898
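    // Lower to a 32-bit or of the low element and the high element shifted
    // left by 16. G_BUILD_VECTOR zero-extends the 16-bit sources, while the
    // TRUNC form masks the low input and lets the shift discard the high bits
    // of the other.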
1899    if (Opc == AMDGPU::G_BUILD_VECTOR) {
1900      ZextLo = B.buildZExt(S32, Lo).getReg(0);
1901      MRI.setRegBank(ZextLo, *BankLo);
1902
1903      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
1904      MRI.setRegBank(ZextHi, *BankHi);
1905
1906      auto ShiftAmt = B.buildConstant(S32, 16);
1907      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1908
1909      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
1910      MRI.setRegBank(ShiftHi, *BankHi);
1911    } else {
1912      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
1913      MRI.setRegBank(MaskLo, *BankLo);
1914
1915      auto ShiftAmt = B.buildConstant(S32, 16);
1916      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1917
1918      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
1919      MRI.setRegBank(ShiftHi, *BankHi);
1920
1921      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
1922      MRI.setRegBank(ZextLo, *BankLo);
1923    }
1924
1925    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
1926    MRI.setRegBank(Or.getReg(0), *DstBank);
1927
1928    B.buildBitcast(DstReg, Or);
1929    MI.eraseFromParent();
1930    return;
1931  }
1932  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
1933    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1934
1935    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
1936
1937    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1938    MachineIRBuilder B(MI);
1939
1940    const ValueMapping &DstMapping
1941      = OpdMapper.getInstrMapping().getOperandMapping(0);
1942    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
1943    const RegisterBank *SrcBank =
1944      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1945
1946    Register DstReg = MI.getOperand(0).getReg();
1947    Register SrcReg = MI.getOperand(1).getReg();
1948    Register IdxReg = MI.getOperand(2).getReg();
1949
1950    // If this is a VGPR result only because the index was a VGPR result, the
1951    // actual indexing will be done on the SGPR source vector, which will
1952    // produce a scalar result. We need to copy to the VGPR result inside the
1953    // waterfall loop.
1954    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
1955                                SrcBank == &AMDGPU::SGPRRegBank;
1956    if (DstRegs.empty()) {
1957      applyDefaultMapping(OpdMapper);
1958
1959      executeInWaterfallLoop(MI, MRI, { 2 });
1960
1961      if (NeedCopyToVGPR) {
1962        // We don't want a phi for this temporary reg.
1963        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
1964        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
1965        MI.getOperand(0).setReg(TmpReg);
1966        B.setInsertPt(*MI.getParent(), ++MI.getIterator());
1967
1968        // Use a v_mov_b32 here to make the exec dependency explicit.
1969        buildVCopy(B, DstReg, TmpReg);
1970      }
1971
1972      return;
1973    }
1974
1975    assert(DstTy.getSizeInBits() == 64);
1976
1977    LLT SrcTy = MRI.getType(SrcReg);
1978    const LLT S32 = LLT::scalar(32);
1979    LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
1980
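    // Bitcast to a vector of 32-bit elements so the two halves of the 64-bit
    // element can be extracted separately.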
1981    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
1982    auto One = B.buildConstant(S32, 1);
1983
1984    // Split the vector index into 32-bit pieces. Prepare to move all of the
1985    // new instructions into a waterfall loop if necessary.
1986    //
1987    // Don't put the bitcast or constant in the loop.
1988    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
1989
1990    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
1991    auto IdxLo = B.buildShl(S32, IdxReg, One);
1992    auto IdxHi = B.buildAdd(S32, IdxLo, One);
1993
1994    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
1995    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
1996
1997    MRI.setRegBank(DstReg, *DstBank);
1998    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
1999    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2000    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2001    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2002
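    // If the index is already uniform, no waterfall loop is needed.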
2003    SmallSet<Register, 4> OpsToWaterfall;
2004    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2005      MI.eraseFromParent();
2006      return;
2007    }
2008
2009    // Remove the original instruction to avoid potentially confusing the
2010    // waterfall loop logic.
2011    B.setInstr(*Span.begin());
2012    MI.eraseFromParent();
2013    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2014                           OpsToWaterfall, MRI);
2015
2016    if (NeedCopyToVGPR) {
2017      MachineBasicBlock *LoopBB = Extract1->getParent();
2018      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2019      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2020      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2021      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2022
2023      Extract0->getOperand(0).setReg(TmpReg0);
2024      Extract1->getOperand(0).setReg(TmpReg1);
2025
2026      B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2027
2028      buildVCopy(B, DstRegs[0], TmpReg0);
2029      buildVCopy(B, DstRegs[1], TmpReg1);
2030    }
2031
2032    return;
2033  }
2034  case AMDGPU::G_INSERT_VECTOR_ELT: {
2035    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2036
2037    assert(OpdMapper.getVRegs(0).empty());
2038    assert(OpdMapper.getVRegs(1).empty());
2039    assert(OpdMapper.getVRegs(3).empty());
2040
2041    if (InsRegs.empty()) {
2042      applyDefaultMapping(OpdMapper);
2043      executeInWaterfallLoop(MI, MRI, { 3 });
2044      return;
2045    }
2046
2047    Register DstReg = MI.getOperand(0).getReg();
2048    Register SrcReg = MI.getOperand(1).getReg();
2049    Register InsReg = MI.getOperand(2).getReg();
2050    Register IdxReg = MI.getOperand(3).getReg();
2051    LLT SrcTy = MRI.getType(SrcReg);
2052    LLT InsTy = MRI.getType(InsReg);
2053    (void)InsTy;
2054
2055    assert(InsTy.getSizeInBits() == 64);
2056
2057    const LLT S32 = LLT::scalar(32);
2058    LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
2059
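    // As in the extract case, bitcast to 32-bit elements and insert the two
    // halves of the value at consecutive indices.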
2060    MachineIRBuilder B(MI);
2061    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2062    auto One = B.buildConstant(S32, 1);
2063
2064    // Split the vector index into 32-bit pieces. Prepare to move all of the
2065    // new instructions into a waterfall loop if necessary.
2066    //
2067    // Don't put the bitcast or constant in the loop.
2068    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2069
2070    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2071    auto IdxLo = B.buildShl(S32, IdxReg, One);
2072    auto IdxHi = B.buildAdd(S32, IdxLo, One);
2073
2074    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2075    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2076    B.buildBitcast(DstReg, InsHi);
2077
2078    const RegisterBank *DstBank =
2079      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2080    const RegisterBank *SrcBank =
2081      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2082    const RegisterBank *InsSrcBank =
2083      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2084
2085    MRI.setRegBank(InsReg, *InsSrcBank);
2086    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2087    MRI.setRegBank(InsLo.getReg(0), *DstBank);
2088    MRI.setRegBank(InsHi.getReg(0), *DstBank);
2089    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2090    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2091    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2093
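    // Skip the waterfall loop if the index turned out to be uniform.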
2094    SmallSet<Register, 4> OpsToWaterfall;
2095    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2096      MI.eraseFromParent();
2097      return;
2098    }
2099
2100    B.setInstr(*Span.begin());
2101    MI.eraseFromParent();
2102
2103    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2104                           OpsToWaterfall, MRI);
2105    return;
2106  }
2107  case AMDGPU::G_INTRINSIC: {
2108    switch (MI.getIntrinsicID()) {
2109    case Intrinsic::amdgcn_s_buffer_load: {
2110      // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
2111      executeInWaterfallLoop(MI, MRI, { 2, 3 });
2112      return;
2113    }
2114    case Intrinsic::amdgcn_readlane: {
2115      substituteSimpleCopyRegs(OpdMapper, 2);
2116
2117      assert(OpdMapper.getVRegs(0).empty());
2118      assert(OpdMapper.getVRegs(3).empty());
2119
2120      // Make sure the index is an SGPR. It doesn't make sense to run this in a
2121      // waterfall loop, so assume it's a uniform value.
2122      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2123      return;
2124    }
2125    case Intrinsic::amdgcn_writelane: {
2126      assert(OpdMapper.getVRegs(0).empty());
2127      assert(OpdMapper.getVRegs(2).empty());
2128      assert(OpdMapper.getVRegs(3).empty());
2129
2130      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2131      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2132      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2133      return;
2134    }
2135    default:
2136      break;
2137    }
2138    break;
2139  }
2140  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2141    auto IntrID = MI.getIntrinsicID();
2142    switch (IntrID) {
2143    case Intrinsic::amdgcn_buffer_load: {
2144      executeInWaterfallLoop(MI, MRI, { 2 });
2145      return;
2146    }
2147    case Intrinsic::amdgcn_ds_ordered_add:
2148    case Intrinsic::amdgcn_ds_ordered_swap: {
2149      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2150      assert(OpdMapper.getVRegs(0).empty());
2151      substituteSimpleCopyRegs(OpdMapper, 3);
2152      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2153      return;
2154    }
2155    case Intrinsic::amdgcn_ds_gws_init:
2156    case Intrinsic::amdgcn_ds_gws_barrier:
2157    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
2159      substituteSimpleCopyRegs(OpdMapper, 1);
2160      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2161      return;
2162    }
2163    case Intrinsic::amdgcn_ds_gws_sema_v:
2164    case Intrinsic::amdgcn_ds_gws_sema_p:
2165    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
2167      constrainOpWithReadfirstlane(MI, MRI, 1); // M0
2168      return;
2169    }
2170    case Intrinsic::amdgcn_s_sendmsg:
2171    case Intrinsic::amdgcn_s_sendmsghalt: {
2172      // FIXME: Should this use a waterfall loop?
2173      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2174      return;
2175    }
2176    case Intrinsic::amdgcn_raw_buffer_load:
2177    case Intrinsic::amdgcn_raw_buffer_load_format:
2178    case Intrinsic::amdgcn_raw_tbuffer_load:
2179    case Intrinsic::amdgcn_raw_buffer_store:
2180    case Intrinsic::amdgcn_raw_buffer_store_format:
2181    case Intrinsic::amdgcn_raw_tbuffer_store: {
2182      applyDefaultMapping(OpdMapper);
2183      executeInWaterfallLoop(MI, MRI, {2, 4});
2184      return;
2185    }
2186    case Intrinsic::amdgcn_struct_buffer_load:
2187    case Intrinsic::amdgcn_struct_buffer_store:
2188    case Intrinsic::amdgcn_struct_tbuffer_load:
2189    case Intrinsic::amdgcn_struct_tbuffer_store: {
2190      applyDefaultMapping(OpdMapper);
2191      executeInWaterfallLoop(MI, MRI, {2, 5});
2192      return;
2193    }
2194    default: {
2195      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2196              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2197        // Non-images can have complications from operands that allow both SGPR
2198        // and VGPR. For now it's too complicated to figure out the final opcode
2199        // to derive the register bank from the MCInstrDesc.
2200        if (RSrcIntrin->IsImage) {
2201          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2202          return;
2203        }
2204      }
2205
2206      break;
2207    }
2208    }
2209    break;
2210  }
2211  case AMDGPU::G_LOAD:
2212  case AMDGPU::G_ZEXTLOAD:
2213  case AMDGPU::G_SEXTLOAD: {
2214    if (applyMappingWideLoad(MI, OpdMapper, MRI))
2215      return;
2216    break;
2217  }
2218  default:
2219    break;
2220  }
2221
2222  return applyDefaultMapping(OpdMapper);
2223}
2224
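// Return true if every register operand that has already been assigned a bank
// is an SGPR.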
2225bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
2226  const MachineFunction &MF = *MI.getParent()->getParent();
2227  const MachineRegisterInfo &MRI = MF.getRegInfo();
2228  for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
2229    if (!MI.getOperand(i).isReg())
2230      continue;
2231    Register Reg = MI.getOperand(i).getReg();
2232    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
2233      if (Bank->getID() != AMDGPU::SGPRRegBankID)
2234        return false;
2235    }
2236  }
2237  return true;
2238}
2239
2240const RegisterBankInfo::InstructionMapping &
2241AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
2242  const MachineFunction &MF = *MI.getParent()->getParent();
2243  const MachineRegisterInfo &MRI = MF.getRegInfo();
2244  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2245
2246  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2247    unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2248    OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2249  }
2250  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2251                               MI.getNumOperands());
2252}
2253
2254const RegisterBankInfo::InstructionMapping &
2255AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
2256  const MachineFunction &MF = *MI.getParent()->getParent();
2257  const MachineRegisterInfo &MRI = MF.getRegInfo();
2258  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2259  unsigned OpdIdx = 0;
2260
2261  unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2262  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
2263
2264  if (MI.getOperand(OpdIdx).isIntrinsicID())
2265    OpdsMapping[OpdIdx++] = nullptr;
2266
2267  Register Reg1 = MI.getOperand(OpdIdx).getReg();
2268  unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
2269
2270  unsigned DefaultBankID = Size1 == 1 ?
2271    AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
2272  unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
2273
2274  OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
2275
2276  for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
2277    const MachineOperand &MO = MI.getOperand(OpdIdx);
2278    if (!MO.isReg())
2279      continue;
2280
2281    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
2282    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
2283    OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
2284  }
2285
2286  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2287                               MI.getNumOperands());
2288}
2289
2290const RegisterBankInfo::InstructionMapping &
2291AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
2292  const MachineFunction &MF = *MI.getParent()->getParent();
2293  const MachineRegisterInfo &MRI = MF.getRegInfo();
2294  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2295
2296  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
2297    const MachineOperand &Op = MI.getOperand(I);
2298    if (!Op.isReg())
2299      continue;
2300
2301    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
2302    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2303  }
2304
2305  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2306                               MI.getNumOperands());
2307}
2308
2309const RegisterBankInfo::InstructionMapping &
2310AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
2311                                        const MachineInstr &MI,
2312                                        int RsrcIdx) const {
2313  // The reported argument index is relative to the IR intrinsic call arguments,
2314  // so we need to shift by the number of defs and the intrinsic ID.
2315  RsrcIdx += MI.getNumExplicitDefs() + 1;
2316
2317  const int NumOps = MI.getNumOperands();
2318  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
2319
2320  // TODO: Should packed/unpacked D16 difference be reported here as part of
2321  // the value mapping?
2322  for (int I = 0; I != NumOps; ++I) {
2323    if (!MI.getOperand(I).isReg())
2324      continue;
2325
2326    Register OpReg = MI.getOperand(I).getReg();
2327    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
2328
2329    // FIXME: Probably need a new intrinsic register bank searchable table to
2330    // handle arbitrary intrinsics easily.
2331    //
2332    // If this has a sampler, it immediately follows rsrc.
2333    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
2334
2335    if (MustBeSGPR) {
      // This must be an SGPR, but report whatever bank the register currently
      // has as legal.
2337      unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2338      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
2339    } else {
2340      // Some operands must be VGPR, and these are easy to copy to.
2341      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2342    }
2343  }
2344
2345  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
2346}
2347
2348const RegisterBankInfo::InstructionMapping &
2349AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
2350
2351  const MachineFunction &MF = *MI.getParent()->getParent();
2352  const MachineRegisterInfo &MRI = MF.getRegInfo();
2353  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
2354  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2355  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
2356  Register PtrReg = MI.getOperand(1).getReg();
2357  LLT PtrTy = MRI.getType(PtrReg);
2358  unsigned AS = PtrTy.getAddressSpace();
2359  unsigned PtrSize = PtrTy.getSizeInBits();
2360
2361  const ValueMapping *ValMapping;
2362  const ValueMapping *PtrMapping;
2363
2364  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
2365
2366  if (PtrBank == &AMDGPU::SGPRRegBank &&
2367      (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
2368       AS != AMDGPUAS::PRIVATE_ADDRESS) &&
2369      isScalarLoadLegal(MI)) {
2370    // We have a uniform instruction so we want to use an SMRD load
2371    ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2372    PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
2373  } else {
2374    ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
2375    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
2376  }
2377
2378  OpdsMapping[0] = ValMapping;
2379  OpdsMapping[1] = PtrMapping;
2380  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
2381      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
2382  return Mapping;
2383
2384  // FIXME: Do we want to add a mapping for FLAT load, or should we just
2385  // handle that during instruction selection?
2386}
2387
2388unsigned
2389AMDGPURegisterBankInfo::getRegBankID(Register Reg,
2390                                     const MachineRegisterInfo &MRI,
2391                                     const TargetRegisterInfo &TRI,
2392                                     unsigned Default) const {
2393  const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
2394  return Bank ? Bank->getID() : Default;
2395}
2396
2398static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
2399  return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
2400    AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2401}
2402
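// Union of boolean register banks, where -1 means no bank has been determined
// yet.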
2403static int regBankBoolUnion(int RB0, int RB1) {
2404  if (RB0 == -1)
2405    return RB1;
2406  if (RB1 == -1)
2407    return RB0;
2408
2409  // vcc, vcc -> vcc
2410  // vcc, sgpr -> vcc
2411  // vcc, vgpr -> vcc
2412  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
2413    return AMDGPU::VCCRegBankID;
2414
  // sgpr, sgpr -> sgpr
  // sgpr, vgpr -> vgpr
  // vgpr, vgpr -> vgpr
2416  return regBankUnion(RB0, RB1);
2417}
2418
2419const RegisterBankInfo::ValueMapping *
2420AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
2421                                         const MachineRegisterInfo &MRI,
2422                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR;
  // applyMapping will have to deal with it as a waterfall loop.
2425  unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
2426  unsigned Size = getSizeInBits(Reg, MRI, TRI);
2427  return AMDGPU::getValueMapping(Bank, Size);
2428}
2429
2430const RegisterBankInfo::ValueMapping *
2431AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
2432                                         const MachineRegisterInfo &MRI,
2433                                         const TargetRegisterInfo &TRI) const {
2434  unsigned Size = getSizeInBits(Reg, MRI, TRI);
2435  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2436}
2437
2438const RegisterBankInfo::ValueMapping *
2439AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
2440                                         const MachineRegisterInfo &MRI,
2441                                         const TargetRegisterInfo &TRI) const {
2442  unsigned Size = getSizeInBits(Reg, MRI, TRI);
2443  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
2444}
2445
2446///
2447/// This function must return a legal mapping, because
2448/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a copy from
/// VGPR to SGPR to be generated is illegal.
2451///
2452const RegisterBankInfo::InstructionMapping &
2453AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
2454  const MachineFunction &MF = *MI.getParent()->getParent();
2455  const MachineRegisterInfo &MRI = MF.getRegInfo();
2456
2457  if (MI.isRegSequence()) {
2458    // If any input is a VGPR, the result must be a VGPR. The default handling
2459    // assumes any copy between banks is legal.
2460    unsigned BankID = AMDGPU::SGPRRegBankID;
2461
2462    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2463      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
2464      // It doesn't make sense to use vcc or scc banks here, so just ignore
2465      // them.
2466      if (OpBank != AMDGPU::SGPRRegBankID) {
2467        BankID = AMDGPU::VGPRRegBankID;
2468        break;
2469      }
2470    }
2471    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2472
2473    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
2474    return getInstructionMapping(
2475        1, /*Cost*/ 1,
2476        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2477  }
2478
2479  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
2480  // properly.
2481  //
2482  // TODO: There are additional exec masking dependencies to analyze.
2483  if (MI.getOpcode() == TargetOpcode::G_PHI) {
2484    // TODO: Generate proper invalid bank enum.
2485    int ResultBank = -1;
2486    Register DstReg = MI.getOperand(0).getReg();
2487
2488    // Sometimes the result may have already been assigned a bank.
2489    if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
2490      ResultBank = DstBank->getID();
2491
2492    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2493      Register Reg = MI.getOperand(I).getReg();
2494      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
2495
2496      // FIXME: Assuming VGPR for any undetermined inputs.
2497      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
2498        ResultBank = AMDGPU::VGPRRegBankID;
2499        break;
2500      }
2501
2502      // FIXME: Need to promote SGPR case to s32
2503      unsigned OpBank = Bank->getID();
2504      ResultBank = regBankBoolUnion(ResultBank, OpBank);
2505    }
2506
2507    assert(ResultBank != -1);
2508
2509    unsigned Size = MRI.getType(DstReg).getSizeInBits();
2510
2511    const ValueMapping &ValMap =
2512        getValueMapping(0, Size, getRegBank(ResultBank));
2513    return getInstructionMapping(
2514        1, /*Cost*/ 1,
2515        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2516  }
2517
2518  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
2519  if (Mapping.isValid())
2520    return Mapping;
2521
2522  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2523
2524  switch (MI.getOpcode()) {
2525  default:
2526    return getInvalidInstructionMapping();
2527
2528  case AMDGPU::G_AND:
2529  case AMDGPU::G_OR:
2530  case AMDGPU::G_XOR: {
2531    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2532    if (Size == 1) {
2533      const RegisterBank *DstBank
2534        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
2535
2536      unsigned TargetBankID = -1;
2537      unsigned BankLHS = -1;
2538      unsigned BankRHS = -1;
2539      if (DstBank) {
2540        TargetBankID = DstBank->getID();
2541        if (DstBank == &AMDGPU::VCCRegBank) {
2542          TargetBankID = AMDGPU::VCCRegBankID;
2543          BankLHS = AMDGPU::VCCRegBankID;
2544          BankRHS = AMDGPU::VCCRegBankID;
2545        } else {
2546          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2547                                 AMDGPU::SGPRRegBankID);
2548          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2549                                 AMDGPU::SGPRRegBankID);
2550        }
2551      } else {
2552        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2553                               AMDGPU::VCCRegBankID);
2554        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2555                               AMDGPU::VCCRegBankID);
2556
2557        // Both inputs should be true booleans to produce a boolean result.
2558        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
2559          TargetBankID = AMDGPU::VGPRRegBankID;
2560        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
2561          TargetBankID = AMDGPU::VCCRegBankID;
2562          BankLHS = AMDGPU::VCCRegBankID;
2563          BankRHS = AMDGPU::VCCRegBankID;
2564        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
2565          TargetBankID = AMDGPU::SGPRRegBankID;
2566        }
2567      }
2568
2569      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
2570      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
2571      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
2572      break;
2573    }
2574
2575    if (Size == 64) {
2577      if (isSALUMapping(MI)) {
2578        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
2579        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
2580      } else {
2581        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
2582        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
2583        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
2584
2585        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
2586        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
2587      }
2588
2589      break;
2590    }
2591
2592    LLVM_FALLTHROUGH;
2593  }
2594  case AMDGPU::G_PTR_ADD:
2595  case AMDGPU::G_ADD:
2596  case AMDGPU::G_SUB:
2597  case AMDGPU::G_MUL:
2598  case AMDGPU::G_SHL:
2599  case AMDGPU::G_LSHR:
2600  case AMDGPU::G_ASHR:
2601  case AMDGPU::G_UADDO:
2602  case AMDGPU::G_USUBO:
2603  case AMDGPU::G_UADDE:
2604  case AMDGPU::G_SADDE:
2605  case AMDGPU::G_USUBE:
2606  case AMDGPU::G_SSUBE:
2607  case AMDGPU::G_SMIN:
2608  case AMDGPU::G_SMAX:
2609  case AMDGPU::G_UMIN:
2610  case AMDGPU::G_UMAX:
2611    if (isSALUMapping(MI))
2612      return getDefaultMappingSOP(MI);
2613    LLVM_FALLTHROUGH;
2614
2615  case AMDGPU::G_FADD:
2616  case AMDGPU::G_FSUB:
2617  case AMDGPU::G_FPTOSI:
2618  case AMDGPU::G_FPTOUI:
2619  case AMDGPU::G_FMUL:
2620  case AMDGPU::G_FMA:
2621  case AMDGPU::G_FMAD:
2622  case AMDGPU::G_FSQRT:
2623  case AMDGPU::G_FFLOOR:
2624  case AMDGPU::G_FCEIL:
2625  case AMDGPU::G_FRINT:
2626  case AMDGPU::G_SITOFP:
2627  case AMDGPU::G_UITOFP:
2628  case AMDGPU::G_FPTRUNC:
2629  case AMDGPU::G_FPEXT:
2630  case AMDGPU::G_FEXP2:
2631  case AMDGPU::G_FLOG2:
2632  case AMDGPU::G_FMINNUM:
2633  case AMDGPU::G_FMAXNUM:
2634  case AMDGPU::G_FMINNUM_IEEE:
2635  case AMDGPU::G_FMAXNUM_IEEE:
2636  case AMDGPU::G_FCANONICALIZE:
2637  case AMDGPU::G_INTRINSIC_TRUNC:
2638  case AMDGPU::G_AMDGPU_FFBH_U32:
2639    return getDefaultMappingVOP(MI);
2640  case AMDGPU::G_UMULH:
2641  case AMDGPU::G_SMULH: {
2642    if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
2643      return getDefaultMappingSOP(MI);
2644    return getDefaultMappingVOP(MI);
2645  }
2646  case AMDGPU::G_IMPLICIT_DEF: {
2647    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2648    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2649    break;
2650  }
2651  case AMDGPU::G_FCONSTANT:
2652  case AMDGPU::G_CONSTANT:
2653  case AMDGPU::G_GLOBAL_VALUE:
2654  case AMDGPU::G_BLOCK_ADDR:
2655  case AMDGPU::G_READCYCLECOUNTER: {
2656    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2657    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2658    break;
2659  }
2660  case AMDGPU::G_FRAME_INDEX: {
2661    // TODO: This should be the same as other constants, but eliminateFrameIndex
2662    // currently assumes VALU uses.
2663    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2664    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2665    break;
2666  }
2667  case AMDGPU::G_INSERT: {
2668    unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2669                                          AMDGPU::VGPRRegBankID;
2670    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2671    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2672    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
2673    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2674    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2675    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
2676    OpdsMapping[3] = nullptr;
2677    break;
2678  }
2679  case AMDGPU::G_EXTRACT: {
2680    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2681    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2682    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2683    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2684    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2685    OpdsMapping[2] = nullptr;
2686    break;
2687  }
2688  case AMDGPU::G_BUILD_VECTOR:
2689  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2690    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
2691    if (DstTy == LLT::vector(2, 16)) {
2692      unsigned DstSize = DstTy.getSizeInBits();
2693      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2694      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2695      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2696      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
2697
2698      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
2699      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
2700      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
2701      break;
2702    }
2703
2704    LLVM_FALLTHROUGH;
2705  }
2706  case AMDGPU::G_MERGE_VALUES:
2707  case AMDGPU::G_CONCAT_VECTORS: {
2708    unsigned Bank = isSALUMapping(MI) ?
2709      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2710    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2711    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2712
2713    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2714    // Op1 and Dst should use the same register bank.
2715    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
2716      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
2717    break;
2718  }
2719  case AMDGPU::G_BITCAST:
2720  case AMDGPU::G_INTTOPTR:
2721  case AMDGPU::G_PTRTOINT:
2722  case AMDGPU::G_CTLZ:
2723  case AMDGPU::G_CTLZ_ZERO_UNDEF:
2724  case AMDGPU::G_CTTZ:
2725  case AMDGPU::G_CTTZ_ZERO_UNDEF:
2726  case AMDGPU::G_CTPOP:
2727  case AMDGPU::G_BSWAP:
2728  case AMDGPU::G_BITREVERSE:
2729  case AMDGPU::G_FABS:
2730  case AMDGPU::G_FNEG: {
2731    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2732    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2733    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
2734    break;
2735  }
2736  case AMDGPU::G_TRUNC: {
2737    Register Dst = MI.getOperand(0).getReg();
2738    Register Src = MI.getOperand(1).getReg();
2739    unsigned Bank = getRegBankID(Src, MRI, *TRI);
2740    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2741    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
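    // A truncate to s1 from a non-SGPR source produces a VCC boolean.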
2742    OpdsMapping[0] = DstSize == 1 && Bank != AMDGPU::SGPRRegBankID ?
2743      AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize) :
2744      AMDGPU::getValueMapping(Bank, DstSize);
2745    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
2746    break;
2747  }
2748  case AMDGPU::G_ZEXT:
2749  case AMDGPU::G_SEXT:
2750  case AMDGPU::G_ANYEXT: {
2751    Register Dst = MI.getOperand(0).getReg();
2752    Register Src = MI.getOperand(1).getReg();
2753    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2754    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2755
2756    unsigned DstBank;
2757    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
2758    assert(SrcBank);
2759    switch (SrcBank->getID()) {
2760    case AMDGPU::SGPRRegBankID:
2761      DstBank = AMDGPU::SGPRRegBankID;
2762      break;
2763    default:
2764      DstBank = AMDGPU::VGPRRegBankID;
2765      break;
2766    }
2767
    // TODO: Should anyext be split into 32-bit pieces as well?
2769    if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
2770      OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
2771      OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
2772    } else {
2773      // Scalar extend can use 64-bit BFE, but VGPRs require extending to
2774      // 32-bits, and then to 64.
2775      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
2776      OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
2777                                                         SrcSize);
2778    }
2779    break;
2780  }
2781  case AMDGPU::G_FCMP: {
2782    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2783    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2784    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2785    OpdsMapping[1] = nullptr; // Predicate Operand.
2786    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2787    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2788    break;
2789  }
2790  case AMDGPU::G_STORE: {
2791    assert(MI.getOperand(0).isReg());
2792    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2793    // FIXME: We need to specify a different reg bank once scalar stores
2794    // are supported.
2795    const ValueMapping *ValMapping =
2796        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2797    // FIXME: Depending on the type of store, the pointer could be in
2798    // the SGPR Reg bank.
2799    // FIXME: Pointer size should be based on the address space.
2800    const ValueMapping *PtrMapping =
2801        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
2802
2803    OpdsMapping[0] = ValMapping;
2804    OpdsMapping[1] = PtrMapping;
2805    break;
2806  }
2807
2808  case AMDGPU::G_ICMP: {
2809    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
2810    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2811    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2812    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2813
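    // The compare can use SCC only if both sources are SGPRs and the type is
    // 32-bit, or 64-bit eq/ne on subtargets with scalar 64-bit compares.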
2814    bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
2815                     Op3Bank == AMDGPU::SGPRRegBankID &&
2816      (Size == 32 || (Size == 64 &&
2817                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
2818                      Subtarget.hasScalarCompareEq64()));
2819
2820    unsigned Op0Bank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
2821
2822    // TODO: Use 32-bit for scalar output size.
2823    // SCC results will need to be copied to a 32-bit SGPR virtual register.
2824    const unsigned ResultSize = 1;
2825
2826    OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, ResultSize);
2827    OpdsMapping[1] = nullptr; // Predicate Operand.
2828    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2829    OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
2830    break;
2831  }
2832  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2833    // VGPR index can be used for waterfall when indexing a SGPR vector.
2834    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2835    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2836    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2837    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2838    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
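    // The result can only be an SGPR if both the vector and index are SGPRs.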
2839    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
2840
2841    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
2842    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
2843
2844    // The index can be in either bank if the source vector is in a VGPR.
2845    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2846    break;
2847  }
2848  case AMDGPU::G_INSERT_VECTOR_ELT: {
2849    unsigned OutputBankID = isSALUMapping(MI) ?
2850      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2851
2852    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2853    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2854    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2855    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2856    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
2857                                            MRI, *TRI);
2858    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2859
2860    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
2861    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
2862    OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
2863                                                       InsertSize);
2864
2865    // The index can be in either bank if the source vector is in a VGPR.
2866    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
2867    break;
2868  }
2869  case AMDGPU::G_UNMERGE_VALUES: {
2870    unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2871      AMDGPU::VGPRRegBankID;
2872
2873    // Op1 and Dst should use the same register bank.
2874    // FIXME: Shouldn't this be the default? Why do we need to handle this?
2875    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2876      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2877      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
2878    }
2879    break;
2880  }
2881  case AMDGPU::G_INTRINSIC: {
2882    switch (MI.getIntrinsicID()) {
2883    default:
2884      return getInvalidInstructionMapping();
2885    case Intrinsic::amdgcn_div_fmas:
2886    case Intrinsic::amdgcn_div_fixup:
2887    case Intrinsic::amdgcn_trig_preop:
2888    case Intrinsic::amdgcn_sin:
2889    case Intrinsic::amdgcn_cos:
2890    case Intrinsic::amdgcn_log_clamp:
2891    case Intrinsic::amdgcn_rcp:
2892    case Intrinsic::amdgcn_rcp_legacy:
2893    case Intrinsic::amdgcn_rsq:
2894    case Intrinsic::amdgcn_rsq_legacy:
2895    case Intrinsic::amdgcn_rsq_clamp:
2896    case Intrinsic::amdgcn_ldexp:
2897    case Intrinsic::amdgcn_frexp_mant:
2898    case Intrinsic::amdgcn_frexp_exp:
2899    case Intrinsic::amdgcn_fract:
2900    case Intrinsic::amdgcn_cvt_pkrtz:
2901    case Intrinsic::amdgcn_cvt_pknorm_i16:
2902    case Intrinsic::amdgcn_cvt_pknorm_u16:
2903    case Intrinsic::amdgcn_cvt_pk_i16:
2904    case Intrinsic::amdgcn_cvt_pk_u16:
2905    case Intrinsic::amdgcn_fmed3:
2906    case Intrinsic::amdgcn_cubeid:
2907    case Intrinsic::amdgcn_cubema:
2908    case Intrinsic::amdgcn_cubesc:
2909    case Intrinsic::amdgcn_cubetc:
2910    case Intrinsic::amdgcn_sffbh:
2911    case Intrinsic::amdgcn_fmad_ftz:
2912    case Intrinsic::amdgcn_mbcnt_lo:
2913    case Intrinsic::amdgcn_mbcnt_hi:
2914    case Intrinsic::amdgcn_ubfe:
2915    case Intrinsic::amdgcn_sbfe:
2916    case Intrinsic::amdgcn_mul_u24:
2917    case Intrinsic::amdgcn_mul_i24:
2918    case Intrinsic::amdgcn_lerp:
2919    case Intrinsic::amdgcn_sad_u8:
2920    case Intrinsic::amdgcn_msad_u8:
2921    case Intrinsic::amdgcn_sad_hi_u8:
2922    case Intrinsic::amdgcn_sad_u16:
2923    case Intrinsic::amdgcn_qsad_pk_u16_u8:
2924    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
2925    case Intrinsic::amdgcn_mqsad_u32_u8:
2926    case Intrinsic::amdgcn_cvt_pk_u8_f32:
2927    case Intrinsic::amdgcn_alignbit:
2928    case Intrinsic::amdgcn_alignbyte:
2929    case Intrinsic::amdgcn_fdot2:
2930    case Intrinsic::amdgcn_sdot2:
2931    case Intrinsic::amdgcn_udot2:
2932    case Intrinsic::amdgcn_sdot4:
2933    case Intrinsic::amdgcn_udot4:
2934    case Intrinsic::amdgcn_sdot8:
2935    case Intrinsic::amdgcn_udot8:
2936    case Intrinsic::amdgcn_wwm:
2937    case Intrinsic::amdgcn_wqm:
2938      return getDefaultMappingVOP(MI);
2939    case Intrinsic::amdgcn_ds_swizzle:
2940    case Intrinsic::amdgcn_ds_permute:
2941    case Intrinsic::amdgcn_ds_bpermute:
2942    case Intrinsic::amdgcn_update_dpp:
2943      return getDefaultMappingAllVGPR(MI);
2944    case Intrinsic::amdgcn_kernarg_segment_ptr:
2945    case Intrinsic::amdgcn_s_getpc:
2946    case Intrinsic::amdgcn_groupstaticsize: {
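      // These all produce wave-uniform values, so the result is an SGPR.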
2947      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2948      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2949      break;
2950    }
2951    case Intrinsic::amdgcn_wqm_vote: {
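      // Both the result and the source are divergent booleans (lane masks).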
2952      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2953      OpdsMapping[0] = OpdsMapping[2]
2954        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
2955      break;
2956    }
2957    case Intrinsic::amdgcn_s_buffer_load: {
2958      // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
2959      Register RSrc = MI.getOperand(2).getReg();   // SGPR
2960      Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
2961
2962      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2963      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2964      unsigned Size3 = MRI.getType(Offset).getSizeInBits();
2965
2966      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2967      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2968
2969      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
2970      OpdsMapping[1] = nullptr; // intrinsic id
2971
2972      // Lie and claim everything is legal, even though some operands must be
2973      // SGPRs. applyMapping will have to handle them with a waterfall loop.
2974      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2975      OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
2976      OpdsMapping[4] = nullptr;
2977      break;
2978    }
2979    case Intrinsic::amdgcn_div_scale: {
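      // The second result is a divergent boolean flag used by div_fmas, so it
      // lives in VCC.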
2980      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2981      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2982      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
2983      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
2984
2985      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2986      OpdsMapping[3] = AMDGPU::getValueMapping(
2987        getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
2988      OpdsMapping[4] = AMDGPU::getValueMapping(
2989        getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
2990
2991      break;
2992    }
2993    case Intrinsic::amdgcn_class: {
2994      Register Src0Reg = MI.getOperand(2).getReg();
2995      Register Src1Reg = MI.getOperand(3).getReg();
2996      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
2997      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
2998      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2999      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
3000      OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
3001                                               Src0Size);
3002      OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
3003                                               Src1Size);
3004      break;
3005    }
3006    case Intrinsic::amdgcn_icmp:
3007    case Intrinsic::amdgcn_fcmp: {
3008      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3009      // This is not VCCRegBank because this is not used in boolean contexts.
3010      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
3011      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3012      unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3013      unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
3014      OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
3015      OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
3016      break;
3017    }
3018    case Intrinsic::amdgcn_readlane: {
3019      // This must be an SGPR, but accept a VGPR.
3020      Register IdxReg = MI.getOperand(3).getReg();
3021      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
3022      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
3023      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
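      // Fall through to share the result/source mapping with readfirstlane.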
3024      LLVM_FALLTHROUGH;
3025    }
3026    case Intrinsic::amdgcn_readfirstlane: {
3027      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3028      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3029      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
3030      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3031      break;
3032    }
3033    case Intrinsic::amdgcn_writelane: {
3034      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3035      Register SrcReg = MI.getOperand(2).getReg();
3036      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
3037      unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
3038      Register IdxReg = MI.getOperand(3).getReg();
3039      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
3040      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
3041      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
3042
3043      // These two must be SGPRs, but accept VGPRs; readfirstlanes will be
3044      // inserted to legalize them.
3045      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
3046      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3047      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3048      break;
3049    }
3050    case Intrinsic::amdgcn_if_break: {
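      // The result and the incoming loop mask are wave-size SGPRs; the
      // condition is a divergent boolean (VCC).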
3051      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3052      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3053      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3054      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3055      break;
3056    }
3057    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
3058    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
3059    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
3060    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
3061    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
3062    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
3063    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
3064    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
3065    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
3066    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
3067    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
3068    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
3069    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
3070    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
3071    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
3072    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
3073    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
3074    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
3075    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
3076    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
3077      // Default for MAI intrinsics.
3078      // srcC can also be an immediate which can be folded later.
3079      // FIXME: Should we eventually add an alternative mapping with AGPR src
3080      // for srcA/srcB?
3081      //
3082      // vdst, srcA, srcB, srcC
3083      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3084      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3085      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3086      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3087      break;
3088    }
3089    }
3090    break;
3091  }
3092  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3093    auto IntrID = MI.getIntrinsicID();
3094    switch (IntrID) {
3095    case Intrinsic::amdgcn_s_getreg:
3096    case Intrinsic::amdgcn_s_memtime:
3097    case Intrinsic::amdgcn_s_memrealtime:
3098    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
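      // These read wave-uniform state, so the result belongs in an SGPR.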
3099      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3100      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3101      break;
3102    }
3103    case Intrinsic::amdgcn_ds_append:
3104    case Intrinsic::amdgcn_ds_consume:
3105    case Intrinsic::amdgcn_ds_fadd:
3106    case Intrinsic::amdgcn_ds_fmin:
3107    case Intrinsic::amdgcn_ds_fmax:
3108    case Intrinsic::amdgcn_atomic_inc:
3109    case Intrinsic::amdgcn_atomic_dec:
3110      return getDefaultMappingAllVGPR(MI);
3111    case Intrinsic::amdgcn_ds_ordered_add:
3112    case Intrinsic::amdgcn_ds_ordered_swap: {
3113      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3114      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
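      // This operand is copied into m0, so it must be an SGPR; accept a VGPR.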
3115      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3116                                     AMDGPU::SGPRRegBankID);
3117      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
3118      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3119      break;
3120    }
3121    case Intrinsic::amdgcn_exp_compr:
3122      OpdsMapping[0] = nullptr; // IntrinsicID
3123      // FIXME: These are immediate values which can't be read from registers.
3124      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3125      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3126      // FIXME: Could we support packed types here?
3127      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3128      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3129      // FIXME: These are immediate values which can't be read from registers.
3130      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3131      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3132      break;
3133    case Intrinsic::amdgcn_exp:
3134      // FIXME: Could we support packed types here?
3135      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3136      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3137      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3138      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3139      break;
3140    case Intrinsic::amdgcn_buffer_load: {
3141      Register RSrc = MI.getOperand(2).getReg();   // SGPR
3142      Register VIndex = MI.getOperand(3).getReg(); // VGPR
3143      Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
3144
3145      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3146      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
3147      unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
3148      unsigned Size4 = MRI.getType(Offset).getSizeInBits();
3149
3150      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
3151      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
3152
3153      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
3154      OpdsMapping[1] = nullptr; // intrinsic id
3155
3156      // Lie and claim everything is legal, even though some operands must be
3157      // SGPRs. applyMapping will have to handle them with a waterfall loop.
3158      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
3159      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
3160      OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
3161      OpdsMapping[5] = nullptr;
3162      OpdsMapping[6] = nullptr;
3163      break;
3164    }
3165    case Intrinsic::amdgcn_s_sendmsg:
3166    case Intrinsic::amdgcn_s_sendmsghalt: {
3167      // This must be an SGPR, but accept a VGPR.
3168      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3169                                   AMDGPU::SGPRRegBankID);
3170      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
3171      break;
3172    }
3173    case Intrinsic::amdgcn_end_cf:
3174    case Intrinsic::amdgcn_init_exec: {
3175      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3176      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3177      break;
3178    }
3179    case Intrinsic::amdgcn_else: {
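      // Result 0 is the divergent condition for the else block; result 1 and
      // the source are wave-size exec masks.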
3180      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3181      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3182      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
3183      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
3184      break;
3185    }
3186    case Intrinsic::amdgcn_kill: {
3187      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3188      break;
3189    }
3190    case Intrinsic::amdgcn_raw_buffer_load:
3191    case Intrinsic::amdgcn_raw_tbuffer_load: {
3192      // FIXME: Should make the intrinsic ID the last operand of the
3193      // instruction; then this would be the same as a store.
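      // dst (vdata), intrinsic ID, rsrc, voffset, soffset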
3194      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3195      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3196      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3197      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3198      break;
3199    }
3200    case Intrinsic::amdgcn_raw_buffer_store:
3201    case Intrinsic::amdgcn_raw_buffer_store_format:
3202    case Intrinsic::amdgcn_raw_tbuffer_store: {
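      // intrinsic ID, vdata, rsrc, voffset, soffset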
3203      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3204      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3205      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3206      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3207      break;
3208    }
3209    case Intrinsic::amdgcn_struct_buffer_load:
3210    case Intrinsic::amdgcn_struct_tbuffer_load: {
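      // dst (vdata), intrinsic ID, rsrc, vindex, voffset, soffset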
3211      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3212      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3213      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3214      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3215      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3216      break;
3217    }
3218    case Intrinsic::amdgcn_struct_buffer_store:
3219    case Intrinsic::amdgcn_struct_tbuffer_store: {
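      // intrinsic ID, vdata, rsrc, vindex, voffset, soffset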
3220      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3221      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3222      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3223      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3224      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3225      break;
3226    }
3227    case Intrinsic::amdgcn_init_exec_from_input: {
3228      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3229      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3230      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3231      break;
3232    }
3233    case Intrinsic::amdgcn_ds_gws_init:
3234    case Intrinsic::amdgcn_ds_gws_barrier:
3235    case Intrinsic::amdgcn_ds_gws_sema_br: {
3236      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3237
3238      // This must be an SGPR, but accept a VGPR.
3239      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3240                                   AMDGPU::SGPRRegBankID);
3241      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
3242      break;
3243    }
3244    case Intrinsic::amdgcn_ds_gws_sema_v:
3245    case Intrinsic::amdgcn_ds_gws_sema_p:
3246    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3247      // This must be an SGPR, but accept a VGPR.
3248      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
3249                                   AMDGPU::SGPRRegBankID);
3250      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
3251      break;
3252    }
3253    default:
3254      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3255              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3256        // Non-images can have complications from operands that allow both SGPR
3257        // and VGPR. For now it's too complicated to figure out the final opcode
3258        // to derive the register bank from the MCInstrDesc.
3259        if (RSrcIntrin->IsImage)
3260          return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
3261      }
3262
3263      return getInvalidInstructionMapping();
3264    }
3265    break;
3266  }
3267  case AMDGPU::G_SELECT: {
3268    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3269    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3270                                    AMDGPU::SGPRRegBankID);
3271    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
3272                                    AMDGPU::SGPRRegBankID);
3273    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
3274                    Op3Bank == AMDGPU::SGPRRegBankID;
3275
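    // The select can only stay on the SALU if both values and the condition
    // are uniform; otherwise the values go to VGPRs and the condition becomes
    // a VCC mask.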
3276    unsigned CondBankDefault = SGPRSrcs ?
3277      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3278    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
3279                                     CondBankDefault);
3280    if (CondBank == AMDGPU::SGPRRegBankID)
3281      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3282    else if (CondBank == AMDGPU::VGPRRegBankID)
3283      CondBank = AMDGPU::VCCRegBankID;
3284
3285    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
3286      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3287
3288    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
3289
3290    // TODO: Should report 32-bit for scalar condition type.
3291    if (Size == 64) {
3292      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
3293      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
3294      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
3295      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
3296    } else {
3297      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
3298      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
3299      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
3300      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
3301    }
3302
3303    break;
3304  }
3305
3306  case AMDGPU::G_LOAD:
3307  case AMDGPU::G_ZEXTLOAD:
3308  case AMDGPU::G_SEXTLOAD:
3309    return getInstrMappingForLoad(MI);
3310
3311  case AMDGPU::G_ATOMICRMW_XCHG:
3312  case AMDGPU::G_ATOMICRMW_ADD:
3313  case AMDGPU::G_ATOMICRMW_SUB:
3314  case AMDGPU::G_ATOMICRMW_AND:
3315  case AMDGPU::G_ATOMICRMW_OR:
3316  case AMDGPU::G_ATOMICRMW_XOR:
3317  case AMDGPU::G_ATOMICRMW_MAX:
3318  case AMDGPU::G_ATOMICRMW_MIN:
3319  case AMDGPU::G_ATOMICRMW_UMAX:
3320  case AMDGPU::G_ATOMICRMW_UMIN:
3321  case AMDGPU::G_ATOMICRMW_FADD:
3322  case AMDGPU::G_ATOMIC_CMPXCHG:
3323  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
3324    return getDefaultMappingAllVGPR(MI);
3325  }
3326  case AMDGPU::G_BRCOND: {
3327    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
3328                                 AMDGPU::SGPRRegBankID);
3329    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
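    // Only a condition known to be uniform can use a scalar branch; anything
    // else is treated as divergent and must be a VCC mask.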
3330    if (Bank != AMDGPU::SGPRRegBankID)
3331      Bank = AMDGPU::VCCRegBankID;
3332
3333    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
3334    break;
3335  }
3336  }
3337
3338  return getInstructionMapping(/*ID*/1, /*Cost*/1,
3339                               getOperandsMapping(OpdsMapping),
3340                               MI.getNumOperands());
3341}
3342