//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

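  // Exactly one union member above is meaningful; Kind records which one, and
  // is set from the folded operand's type in the constructor below.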
  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false,
                int ShrinkOp = -1) :
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool needsShrink() const { return ShrinkOpcode != -1; }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const MachineOperand &OpToFold) const;

  bool updateOperand(FoldCandidate &Fold) const;

  bool canUseImmWithOpSel(FoldCandidate &Fold) const;

  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        MachineOperand *OpToFold) const;
  bool isUseSafeToFold(const MachineInstr &MI,
                       const MachineOperand &UseMO) const;
  bool
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg, uint8_t OpTy) const;
  bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
                      unsigned UseOpIdx,
                      SmallVectorImpl<FoldCandidate> &FoldList) const;
  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
  bool tryConstantFoldOp(MachineInstr *MI) const;
  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
  bool tryFoldFoldableCopy(MachineInstr &MI,
                           MachineOperand *&CurrentKnownM0Val) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldPhiAGPR(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

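// Return the register class of MO, narrowed to the class of its subregister
// if MO uses one.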
static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
                                             const TargetRegisterInfo &TRI,
                                             const MachineOperand &MO) {
  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
  if (const TargetRegisterClass *SubRC =
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
    RC = SubRC;
  return RC;
}

// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}

// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                       const MachineOperand &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  const uint64_t TSFlags = MI->getDesc().TSFlags;

  assert(Old.isReg() && Fold.isImm());

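  // Immediate folding via op_sel is only attempted for packed instructions;
  // MAI, WMMA, SWMMAC, and DOT instructions subject to the op_sel hazard are
  // rejected below.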
  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
      (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
    return false;

  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
  switch (OpType) {
  default:
    return false;
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    break;
  }

  return true;
}

bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

  // If the literal can be inlined as-is, apply it and short-circuit the
  // tests below. The main motivation for this is to avoid unintuitive
  // uses of opsel.
  if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  // Refer to op_sel/op_sel_hi and check if we can change the immediate and
  // op_sel in a way that allows an inline constant.
  int ModIdx = -1;
  unsigned SrcIdx = ~0;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
    ModIdx = AMDGPU::OpName::src0_modifiers;
    SrcIdx = 0;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
    ModIdx = AMDGPU::OpName::src1_modifiers;
    SrcIdx = 1;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
    ModIdx = AMDGPU::OpName::src2_modifiers;
    SrcIdx = 2;
  }
  assert(ModIdx != -1);
  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned ModVal = Mod.getImm();

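  // Reconstruct the 32-bit value the instruction currently reads: each half is
  // taken from the low or high 16 bits of the literal according to the current
  // op_sel bits, which are then cleared before the re-encoding attempts below.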
  uint16_t ImmLo = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
  uint16_t ImmHi = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
  unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);

  // Helper function that attempts to inline the given value with a newly
  // chosen opsel pattern.
  auto tryFoldToInline = [&](uint32_t Imm) -> bool {
    if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
      Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
      Old.ChangeToImmediate(Imm);
      return true;
    }

    // Try to shuffle the halves around and leverage opsel to get an inline
    // constant.
    uint16_t Lo = static_cast<uint16_t>(Imm);
    uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
    if (Lo == Hi) {
      if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
        Mod.setImm(NewModVal);
        Old.ChangeToImmediate(Lo);
        return true;
      }

      if (static_cast<int16_t>(Lo) < 0) {
        int32_t SExt = static_cast<int16_t>(Lo);
        if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
          Mod.setImm(NewModVal);
          Old.ChangeToImmediate(SExt);
          return true;
        }
      }

      // This check is only useful for integer instructions
      if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
          OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) {
        if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
          Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
          return true;
        }
      }
    } else {
      uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
      if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
        Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
        Old.ChangeToImmediate(Swapped);
        return true;
      }
    }

    return false;
  };

  if (tryFoldToInline(Imm))
    return true;

  // Replace integer addition by subtraction and vice versa if it allows
  // folding the immediate to an inline constant.
  //
  // We should only ever get here for SrcIdx == 1 due to canonicalization
  // earlier in the pipeline, but we double-check here to be safe / fully
  // general.
  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
  if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
    unsigned ClampIdx =
        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;

    if (!Clamp) {
      uint16_t NegLo = -static_cast<uint16_t>(Imm);
      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
      uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

      if (tryFoldToInline(NegImm)) {
        unsigned NegOpcode =
            IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
        MI->setDesc(TII->get(NegOpcode));
        return true;
      }
    }
  }

  return false;
}

bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
    if (tryFoldImmWithOpSel(Fold))
      return true;

    // We can't represent the candidate as an inline constant. Try as a literal
    // with the original opsel, checking constant bus limitations.
    MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

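    // Dst1 is the carry output of the VOP3 form; the shrunk VOP2 form writes
    // the carry implicitly to VCC, so any remaining uses of it must be
    // preserved with an explicit copy from VCC below.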
    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
              Dst1.getReg())
        .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not have to invert how this pass looks at operands to
    // avoid this. It should track the set of foldable movs instead of
    // discovering them by looking for uses when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    if (Fold.Commuted)
      TII->commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    if (Old.isTied()) {
      int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
      if (NewMFMAOpc == -1)
        return false;
      MI->setDesc(TII->get(NewMFMAOpc));
      MI->untieRegOperand(0);
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
}

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n  " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}

bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                                      MachineInstr *MI, unsigned OpNo,
                                      MachineOperand *OpToFold) const {
  const unsigned Opc = MI->getOpcode();

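  // S_FMAAK_F32 takes its addend as a literal constant (operand 3), while
  // S_FMAMK_F32 takes one multiplicand as a literal (operand 2). Rewriting
  // s_fmac_f32 into one of these forms lets an immediate fold into the
  // corresponding operand and unties Src2.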
  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold->isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    // We have to fold into the operand that will hold the immediate, not
    // into OpNo.
    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      // Untie Src2 of fmac.
      MI->untieRegOperand(3);
      // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
      if (OpNo == 1) {
        MachineOperand &Op1 = MI->getOperand(1);
        MachineOperand &Op2 = MI->getOperand(2);
        Register OldReg = Op1.getReg();
        // Operand 2 might be an inlinable constant
        if (Op2.isImm()) {
          Op1.ChangeToImmediate(Op2.getImm());
          Op2.ChangeToRegister(OldReg, false);
        } else {
          Op1.setReg(Op2.getReg());
          Op2.setReg(OldReg);
        }
      }
      return true;
    }
    MI->setDesc(TII->get(Opc));
    return false;
  };

  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
  if (!IsLegal && OpToFold->isImm()) {
    FoldCandidate Fold(MI, OpNo, OpToFold);
    IsLegal = canUseImmWithOpSel(Fold);
  }

  if (!IsLegal) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
                      AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
      if (AddOpSel)
        MI->addOperand(MachineOperand::CreateImm(0));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      if (AddOpSel)
        MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_fmac_f32 if we are trying to fold into Src2.
    // By transforming into fmaak we can untie Src2 and make folding legal.
    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    // Special case for s_setreg_b32
    if (OpToFold->isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
    if (!CanCommute)
      return false;

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call to commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
      return false;

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
      return false;

    int Op32 = -1;
    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
          (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }

      // Verify the other operand is a VGPR, otherwise we would violate the
      // constant bus restriction.
      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||
          !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
        return false;

      assert(MI->getOperand(1).isDef());

      // Make sure to get the 32-bit version of the commuted opcode.
      unsigned MaybeCommutedOpc = MI->getOpcode();
      Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
    return true;
  }

  // An inlinable constant might already have been folded into the Imm operand
  // of fmaak or fmamk while we are trying to fold a non-inlinable constant.
  if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
      !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
    unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
    MachineOperand &OpImm = MI->getOperand(ImmIdx);
    if (!OpImm.isReg() &&
        TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
      return tryToFoldAsFMAAKorMK();
  }

  // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
  // By changing into fmamk we can untie Src2.
  // If folding for Src0 happens first and it is identical to Src1, we should
  // avoid transforming into fmamk, which requires commuting, as it would
  // cause folding into Src1 to fail later on due to the wrong OpNo being used.
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }

  // Check the case where we might introduce a second constant operand to a
  // scalar instruction
  if (TII->isSALU(MI->getOpcode())) {
    const MCInstrDesc &InstDesc = MI->getDesc();
    const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

    // Fine if the operand can be encoded as an inline constant
    if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
      // Otherwise check for another constant
      for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
        auto &Op = MI->getOperand(i);
        if (OpNo != i && !Op.isReg() &&
            !TII->isInlineConstant(Op, InstDesc.operands()[i]))
          return false;
      }
    }
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}

bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
                                     const MachineOperand &UseMO) const {
  // Operands of SDWA instructions must be registers.
  return !TII->isSDWA(MI);
}

// Find the def of UseReg, check if it is a reg_sequence, and find the
// initializer for each subreg, tracing it back to a foldable inline immediate
// if possible. Returns true on success.
bool SIFoldOperands::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy) const {
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

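  // Walk the subregister operands of the REG_SEQUENCE, following chains of
  // foldable copies back to an inline-immediate def where possible.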
  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    MachineOperand *Sub = &Def->getOperand(I);
    assert(Sub->isReg());

    for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
         SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
         !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
         SubDef = MRI->getVRegDef(Sub->getReg())) {
      MachineOperand *Op = &SubDef->getOperand(1);
      if (Op->isImm()) {
        if (TII->isInlineConstant(*Op, OpTy))
          Sub = Op;
        break;
      }
      if (!Op->isReg() || Op->getReg().isPhysical())
        break;
      Sub = Op;
    }

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
  }

  return true;
}

bool SIFoldOperands::tryToFoldACImm(
    const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
    return false;

  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  if (isUseMIInFoldList(FoldList, UseMI))
    return false;

  // Maybe it is just a COPY of an immediate itself.
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
      return true;
    }
  }

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, OpTy))
    return false;

  int32_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return false;

    auto SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      if (!TII->isInlineConstant(*Op, OpTy) ||
          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;

      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
  return true;
}

void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  int UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg() &&
      (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
    return;

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
      MachineInstr *RSUseMI = RSUse.getParent();

      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
                         RSUseMI->getOperandNo(&RSUse), FoldList))
        continue;

      if (RSUse.getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());

    const unsigned Opc = UseMI->getOpcode();
    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
      UseMI->setDesc(TII->get(NewOpc));
    }

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      if (DestRC == &AMDGPU::AGPR_32RegClass &&
          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        CopiesToReplace.push_back(UseMI);
        return;
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);

      // Remove kill flags as kills may now be out of order with uses.
      MRI->clearKillFlags(OpToFold.getReg());

      // Storing a value into an AGPR is tricky: v_accvgpr_write_b32 can only
      // accept a VGPR or an inline immediate. Recreate the reg_sequence with
      // its initializers right here, so we will rematerialize immediates and
      // avoid copies via different reg classes.
      SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        const DebugLoc &DL = UseMI->getDebugLoc();
        MachineBasicBlock &MBB = *UseMI->getParent();

        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
          UseMI->removeOperand(I);

        MachineInstrBuilder B(*MBB.getParent(), UseMI);
        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
        for (unsigned I = 0; I < Size / 4; ++I) {
          MachineOperand *Def = Defs[I].first;
          TargetInstrInfo::RegSubRegPair CopyToVGPR;
          if (Def->isImm() &&
              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
            int64_t Imm = Def->getImm();

            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
            B.addReg(Tmp);
          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
            auto Src = getRegSubRegPair(*Def);
            Def->setIsKill(false);
            if (!SeenAGPRs.insert(Src)) {
              // We cannot build a reg_sequence out of the same registers; they
              // must be copied. Better to do it here, before copyPhysReg()
              // creates several reads to do the AGPR->VGPR->AGPR copy.
              CopyToVGPR = Src;
            } else {
              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
                       Src.SubReg);
            }
          } else {
            assert(Def->isReg());
            Def->setIsKill(false);
            auto Src = getRegSubRegPair(*Def);

            // A direct copy from SGPR to AGPR is not possible. To avoid
            // copyPhysReg() later expanding this into SGPR->VGPR->AGPR copies,
            // create the copy here and track whether we already have one.
            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
              CopyToVGPR = Src;
            } else {
              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
              B.addReg(Tmp);
            }
          }

          if (CopyToVGPR.Reg) {
            Register Vgpr;
            if (VGPRCopies.count(CopyToVGPR)) {
              Vgpr = VGPRCopies[CopyToVGPR];
            } else {
              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
              VGPRCopies[CopyToVGPR] = Vgpr;
            }
            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
            B.addReg(Tmp);
          }

          B.addImm(Defs[I].second);
        }
        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
        return;
      }

      if (Size != 4)
        return;

      Register Reg0 = UseMI->getOperand(0).getReg();
      Register Reg1 = UseMI->getOperand(1).getReg();
      if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
      else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
      else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
               TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
         AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes.  Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() || UseOp.isImplicit() ||
        UseDesc.operands()[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      // Don't fold if OpToFold doesn't hold an aligned register.
      const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        unsigned SubReg = OpToFold.getSubReg();
        if (const TargetRegisterClass *SubRC =
                TRI->getSubRegisterClass(RC, SubReg))
          RC = SubRC;
      }

      if (!RC || !TRI->isProperlyAlignedRC(*RC))
        return;
    }

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities.  The shrink operands pass
    // already does this.
    return;
  }


  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
      TRI->getRegClass(FoldDesc.operands()[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
    Register UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
    if (AMDGPU::getRegBitWidth(*UseRC) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
}

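// Evaluate a two-source 32-bit ALU opcode on known constant inputs, mirroring
// the hardware semantics (shift amounts only use the low five bits). Used by
// tryConstantFoldOp below.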
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction. e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}

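// Return the immediate that feeds Op through a move-immediate def, or Op
// itself if no such immediate can be found.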
MachineOperand *
SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
  // If this has a subregister, it obviously is a register source.
  if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
      !Op.getReg().isVirtual())
    return &Op;

  MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return &ImmSrc;
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;
  MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0->isImm()) {
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;
  MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one
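// A select whose sources are identical (directly, or as materialized
// immediates) and that carries no source modifiers is just a copy/mov of
// src0, roughly (a sketch of the MIR, not taken from a real test):
//   %r = V_CNDMASK_B32_e64 0, %a, 0, %a, %cc  -->  %r = COPY %a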
bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    auto *Src0Imm = getImmOrMaterializedImm(*Src0);
    auto *Src1Imm = getImmOrMaterializedImm(*Src1);
    if (!Src1Imm->isIdenticalTo(*Src0Imm))
      return false;
  }

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  mutateCopyOp(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}

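// Fold (and x, 0xffff) away when the instruction defining x is already known
// to zero the high 16 bits of its result; uses of the AND are rewritten to the
// defining instruction's result.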
bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0->isImm() || Src0->getImm() != 0xffff)
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  Register Dst = MI.getOperand(0).getReg();
  MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);
  bool Changed = false;

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (tryConstantFoldOp(&UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

  SmallVector<MachineOperand *, 4> UsesToProcess;
  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
    UsesToProcess.push_back(&Use);
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();
    foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                CopiesToReplace);
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);
    } else if (Fold.Commuted) {
      // Restore the instruction's original operand order if the fold has
      // failed.
1419      TII->commuteInstruction(*Fold.UseMI, false);
1420    }
1421  }
1422  return true;
1423}
1424
1425bool SIFoldOperands::tryFoldFoldableCopy(
1426    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1427  // Specially track simple redefs of m0 to the same value in a block, so we
1428  // can erase the later ones.
1429  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1430    MachineOperand &NewM0Val = MI.getOperand(1);
1431    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1432      MI.eraseFromParent();
1433      return true;
1434    }
1435
1436    // We aren't tracking other physical registers
1437    CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1438                            ? nullptr
1439                            : &NewM0Val;
1440    return false;
1441  }
1442
1443  MachineOperand &OpToFold = MI.getOperand(1);
1444  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1445
1446  // FIXME: We could also be folding things like TargetIndexes.
1447  if (!FoldingImm && !OpToFold.isReg())
1448    return false;
1449
1450  if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1451    return false;
1452
1453  // Prevent folding operands backwards in the function. For example,
1454  // the COPY opcode must not be replaced by 1 in this example:
1455  //
1456  //    %3 = COPY %vgpr0; VGPR_32:%3
1457  //    ...
1458  //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1459  if (!MI.getOperand(0).getReg().isVirtual())
1460    return false;
1461
1462  bool Changed = foldInstOperand(MI, OpToFold);
1463
1464  // If we managed to fold all uses of this copy then we might as well
1465  // delete it now.
1466  // The only reason we need to follow chains of copies here is that
1467  // tryFoldRegSequence looks forward through copies before folding a
1468  // REG_SEQUENCE into its eventual users.
1469  auto *InstToErase = &MI;
1470  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1471    auto &SrcOp = InstToErase->getOperand(1);
1472    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1473    InstToErase->eraseFromParent();
1474    Changed = true;
1475    InstToErase = nullptr;
1476    if (!SrcReg || SrcReg.isPhysical())
1477      break;
1478    InstToErase = MRI->getVRegDef(SrcReg);
1479    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1480      break;
1481  }
1482
1483  if (InstToErase && InstToErase->isRegSequence() &&
1484      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1485    InstToErase->eraseFromParent();
1486    Changed = true;
1487  }
1488
1489  return Changed;
1490}
1491
1492// Clamp patterns are canonically selected to v_max_* instructions, so only
1493// handle them.
1494const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1495  unsigned Op = MI.getOpcode();
1496  switch (Op) {
1497  case AMDGPU::V_MAX_F32_e64:
1498  case AMDGPU::V_MAX_F16_e64:
1499  case AMDGPU::V_MAX_F16_t16_e64:
1500  case AMDGPU::V_MAX_F16_fake16_e64:
1501  case AMDGPU::V_MAX_F64_e64:
1502  case AMDGPU::V_MAX_NUM_F64_e64:
1503  case AMDGPU::V_PK_MAX_F16: {
1504    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1505      return nullptr;
1506
1507    // Make sure sources are identical.
1508    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1509    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1510    if (!Src0->isReg() || !Src1->isReg() ||
1511        Src0->getReg() != Src1->getReg() ||
1512        Src0->getSubReg() != Src1->getSubReg() ||
1513        Src0->getSubReg() != AMDGPU::NoSubRegister)
1514      return nullptr;
1515
1516    // Can't fold up if we have modifiers.
1517    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1518      return nullptr;
1519
1520    unsigned Src0Mods
1521      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1522    unsigned Src1Mods
1523      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1524
1525    // Having a 0 op_sel_hi would require swizzling the output in the source
1526    // instruction, which we can't do.
1527    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1528                                                      : 0u;
1529    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1530      return nullptr;
1531    return Src0;
1532  }
1533  default:
1534    return nullptr;
1535  }
1536}
1537
1538// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
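// Illustrative fold (operands sketched, registers hypothetical): given
//   %1:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, /*clamp*/ 0, /*omod*/ 0
//   %2:vgpr_32 = V_MAX_F32_e64 0, %1, 0, %1, /*clamp*/ 1, /*omod*/ 0
// the clamp bit is set on the V_ADD, uses of %2 are rewritten to %1, and the
// V_MAX is erased, provided %1 has no other users.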
1539bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1540  const MachineOperand *ClampSrc = isClamp(MI);
1541  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1542    return false;
1543
1544  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1545
1546  // The type of clamp must be compatible.
1547  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1548    return false;
1549
1550  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1551  if (!DefClamp)
1552    return false;
1553
1554  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1555
1556  // Clamp is applied after omod, so it is OK if omod is set.
1557  DefClamp->setImm(1);
1558  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1559  MI.eraseFromParent();
1560
1561  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1562  // instruction, so we might as well convert it to the more flexible VOP3-only
1563  // mad/fma form.
1564  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1565    Def->eraseFromParent();
1566
1567  return true;
1568}
1569
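// Map a multiplier immediate to the matching output-modifier encoding, e.g.
// the f32 bit pattern 0x40000000 (2.0) yields SIOutMods::MUL2; unsupported
// immediate values yield SIOutMods::NONE.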
1570static int getOModValue(unsigned Opc, int64_t Val) {
1571  switch (Opc) {
1572  case AMDGPU::V_MUL_F64_e64:
1573  case AMDGPU::V_MUL_F64_pseudo_e64: {
1574    switch (Val) {
1575    case 0x3fe0000000000000: // 0.5
1576      return SIOutMods::DIV2;
1577    case 0x4000000000000000: // 2.0
1578      return SIOutMods::MUL2;
1579    case 0x4010000000000000: // 4.0
1580      return SIOutMods::MUL4;
1581    default:
1582      return SIOutMods::NONE;
1583    }
1584  }
1585  case AMDGPU::V_MUL_F32_e64: {
1586    switch (static_cast<uint32_t>(Val)) {
1587    case 0x3f000000: // 0.5
1588      return SIOutMods::DIV2;
1589    case 0x40000000: // 2.0
1590      return SIOutMods::MUL2;
1591    case 0x40800000: // 4.0
1592      return SIOutMods::MUL4;
1593    default:
1594      return SIOutMods::NONE;
1595    }
1596  }
1597  case AMDGPU::V_MUL_F16_e64:
1598  case AMDGPU::V_MUL_F16_t16_e64:
1599  case AMDGPU::V_MUL_F16_fake16_e64: {
1600    switch (static_cast<uint16_t>(Val)) {
1601    case 0x3800: // 0.5
1602      return SIOutMods::DIV2;
1603    case 0x4000: // 2.0
1604      return SIOutMods::MUL2;
1605    case 0x4400: // 4.0
1606      return SIOutMods::MUL4;
1607    default:
1608      return SIOutMods::NONE;
1609    }
1610  }
1611  default:
1612    llvm_unreachable("invalid mul opcode");
1613  }
1614}
1615
1616// FIXME: Does this really not support denormals with f16?
1617// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1618// handled, so will anything other than that break?
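// Illustrative matches (operands sketched, registers hypothetical):
//   %1:vgpr_32 = V_MUL_F32_e64 0, 2.0, 0, %0, 0, 0  --> (%0, SIOutMods::MUL2)
//   %1:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %0, 0, 0   --> (%0, SIOutMods::MUL2)
// The add form corresponds to the DAGCombiner canonicalization
// fmul x, 2.0 -> fadd x, x handled below.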
1619std::pair<const MachineOperand *, int>
1620SIFoldOperands::isOMod(const MachineInstr &MI) const {
1621  unsigned Op = MI.getOpcode();
1622  switch (Op) {
1623  case AMDGPU::V_MUL_F64_e64:
1624  case AMDGPU::V_MUL_F64_pseudo_e64:
1625  case AMDGPU::V_MUL_F32_e64:
1626  case AMDGPU::V_MUL_F16_t16_e64:
1627  case AMDGPU::V_MUL_F16_fake16_e64:
1628  case AMDGPU::V_MUL_F16_e64: {
1629    // If output denormals are enabled, omod is ignored.
1630    if ((Op == AMDGPU::V_MUL_F32_e64 &&
1631         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1632        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1633          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
1634          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1635         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1636      return std::pair(nullptr, SIOutMods::NONE);
1637
1638    const MachineOperand *RegOp = nullptr;
1639    const MachineOperand *ImmOp = nullptr;
1640    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1641    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1642    if (Src0->isImm()) {
1643      ImmOp = Src0;
1644      RegOp = Src1;
1645    } else if (Src1->isImm()) {
1646      ImmOp = Src1;
1647      RegOp = Src0;
1648    } else
1649      return std::pair(nullptr, SIOutMods::NONE);
1650
1651    int OMod = getOModValue(Op, ImmOp->getImm());
1652    if (OMod == SIOutMods::NONE ||
1653        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1654        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1655        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1656        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1657      return std::pair(nullptr, SIOutMods::NONE);
1658
1659    return std::pair(RegOp, OMod);
1660  }
1661  case AMDGPU::V_ADD_F64_e64:
1662  case AMDGPU::V_ADD_F64_pseudo_e64:
1663  case AMDGPU::V_ADD_F32_e64:
1664  case AMDGPU::V_ADD_F16_e64:
1665  case AMDGPU::V_ADD_F16_t16_e64:
1666  case AMDGPU::V_ADD_F16_fake16_e64: {
1667    // If output denormals are enabled, omod is ignored.
1668    if ((Op == AMDGPU::V_ADD_F32_e64 &&
1669         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1670        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1671          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
1672          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1673         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1674      return std::pair(nullptr, SIOutMods::NONE);
1675
1676    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1677    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1678    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1679
1680    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1681        Src0->getSubReg() == Src1->getSubReg() &&
1682        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1683        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1684        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1685        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1686      return std::pair(Src0, SIOutMods::MUL2);
1687
1688    return std::pair(nullptr, SIOutMods::NONE);
1689  }
1690  default:
1691    return std::pair(nullptr, SIOutMods::NONE);
1692  }
1693}
1694
1695// FIXME: Does this need to check IEEE bit on function?
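// Illustrative fold (operands sketched, registers hypothetical): if %1 below
// has no other users,
//   %1:vgpr_32 = V_FMA_F32_e64 ..., /*clamp*/ 0, /*omod*/ 0
//   %2:vgpr_32 = V_MUL_F32_e64 0, 2.0, 0, %1, 0, 0
// the fma's omod field becomes SIOutMods::MUL2, uses of %2 are rewritten to
// %1, and the V_MUL is erased.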
1696bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1697  const MachineOperand *RegOp;
1698  int OMod;
1699  std::tie(RegOp, OMod) = isOMod(MI);
1700  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1701      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1702      !MRI->hasOneNonDBGUser(RegOp->getReg()))
1703    return false;
1704
1705  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1706  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1707  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1708    return false;
1709
1710  // Clamp is applied after omod. If the source already has clamp set, don't
1711  // fold it.
1712  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1713    return false;
1714
1715  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1716
1717  DefOMod->setImm(OMod);
1718  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1719  MI.eraseFromParent();
1720
1721  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1722  // instruction, so we might as well convert it to the more flexible VOP3-only
1723  // mad/fma form.
1724  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1725    Def->eraseFromParent();
1726
1727  return true;
1728}
1729
1730// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1731// instruction which can take an agpr. So far that means a store.
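// Illustrative case (operands sketched, registers hypothetical):
//   %1:vreg_128 = REG_SEQUENCE %a0:agpr_32, %subreg.sub0, ...
//   GLOBAL_STORE_DWORDX4 %ptr, %1, ...
// A new areg_128 REG_SEQUENCE is built from the AGPR inputs and fed to the
// store directly, provided the use operand's class is an AV superclass.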
1732bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1733  assert(MI.isRegSequence());
1734  auto Reg = MI.getOperand(0).getReg();
1735
1736  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1737      !MRI->hasOneNonDBGUse(Reg))
1738    return false;
1739
1740  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
1741  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1742    return false;
1743
1744  for (auto &Def : Defs) {
1745    const auto *Op = Def.first;
1746    if (!Op->isReg())
1747      return false;
1748    if (TRI->isAGPR(*MRI, Op->getReg()))
1749      continue;
    // Maybe this is a COPY from an AGPR.
1751    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1752    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1753      return false;
1754    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1755      return false;
1756  }
1757
1758  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1759  MachineInstr *UseMI = Op->getParent();
1760  while (UseMI->isCopy() && !Op->getSubReg()) {
1761    Reg = UseMI->getOperand(0).getReg();
1762    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1763      return false;
1764    Op = &*MRI->use_nodbg_begin(Reg);
1765    UseMI = Op->getParent();
1766  }
1767
1768  if (Op->getSubReg())
1769    return false;
1770
1771  unsigned OpIdx = Op - &UseMI->getOperand(0);
1772  const MCInstrDesc &InstDesc = UseMI->getDesc();
1773  const TargetRegisterClass *OpRC =
1774      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1775  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1776    return false;
1777
1778  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1779  auto Dst = MRI->createVirtualRegister(NewDstRC);
1780  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1781                    TII->get(AMDGPU::REG_SEQUENCE), Dst);
1782
1783  for (unsigned I = 0; I < Defs.size(); ++I) {
1784    MachineOperand *Def = Defs[I].first;
1785    Def->setIsKill(false);
1786    if (TRI->isAGPR(*MRI, Def->getReg())) {
1787      RS.add(*Def);
1788    } else { // This is a copy
1789      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1790      SubDef->getOperand(1).setIsKill(false);
1791      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1792    }
1793    RS.addImm(Defs[I].second);
1794  }
1795
1796  Op->setReg(Dst);
1797  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1798    Op->setReg(Reg);
1799    RS->eraseFromParent();
1800    return false;
1801  }
1802
1803  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1804
1805  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1806  // in which case we can erase them all later in runOnMachineFunction.
1807  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1808    MI.eraseFromParent();
1809  return true;
1810}
1811
/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
/// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
1814static bool isAGPRCopy(const SIRegisterInfo &TRI,
1815                       const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1816                       Register &OutReg, unsigned &OutSubReg) {
1817  assert(Copy.isCopy());
1818
1819  const MachineOperand &CopySrc = Copy.getOperand(1);
1820  Register CopySrcReg = CopySrc.getReg();
1821  if (!CopySrcReg.isVirtual())
1822    return false;
1823
1824  // Common case: copy from AGPR directly, e.g.
1825  //  %1:vgpr_32 = COPY %0:agpr_32
1826  if (TRI.isAGPR(MRI, CopySrcReg)) {
1827    OutReg = CopySrcReg;
1828    OutSubReg = CopySrc.getSubReg();
1829    return true;
1830  }
1831
1832  // Sometimes it can also involve two copies, e.g.
1833  //  %1:vgpr_256 = COPY %0:agpr_256
1834  //  %2:vgpr_32 = COPY %1:vgpr_256.sub0
1835  const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
1836  if (!CopySrcDef || !CopySrcDef->isCopy())
1837    return false;
1838
1839  const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
1840  Register OtherCopySrcReg = OtherCopySrc.getReg();
1841  if (!OtherCopySrcReg.isVirtual() ||
1842      CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
1843      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
1844      !TRI.isAGPR(MRI, OtherCopySrcReg))
1845    return false;
1846
1847  OutReg = OtherCopySrcReg;
1848  OutSubReg = CopySrc.getSubReg();
1849  return true;
1850}
1851
1852// Try to hoist an AGPR to VGPR copy across a PHI.
1853// This should allow folding of an AGPR into a consumer which may support it.
1854//
1855// Example 1: LCSSA PHI
1856//      loop:
1857//        %1:vreg = COPY %0:areg
1858//      exit:
1859//        %2:vreg = PHI %1:vreg, %loop
1860//  =>
1861//      loop:
1862//      exit:
1863//        %1:areg = PHI %0:areg, %loop
1864//        %2:vreg = COPY %1:areg
1865//
1866// Example 2: PHI with multiple incoming values:
1867//      entry:
1868//        %1:vreg = GLOBAL_LOAD(..)
1869//      loop:
1870//        %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1871//        %3:areg = COPY %2:vreg
1872//        %4:areg = (instr using %3:areg)
1873//        %5:vreg = COPY %4:areg
1874//  =>
1875//      entry:
1876//        %1:vreg = GLOBAL_LOAD(..)
1877//        %2:areg = COPY %1:vreg
1878//      loop:
1879//        %3:areg = PHI %2:areg, %entry, %X:areg,
1880//        %4:areg = (instr using %3:areg)
1881bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1882  assert(PHI.isPHI());
1883
1884  Register PhiOut = PHI.getOperand(0).getReg();
1885  if (!TRI->isVGPR(*MRI, PhiOut))
1886    return false;
1887
1888  // Iterate once over all incoming values of the PHI to check if this PHI is
1889  // eligible, and determine the exact AGPR RC we'll target.
1890  const TargetRegisterClass *ARC = nullptr;
1891  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1892    MachineOperand &MO = PHI.getOperand(K);
1893    MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
1894    if (!Copy || !Copy->isCopy())
1895      continue;
1896
1897    Register AGPRSrc;
1898    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1899    if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
1900      continue;
1901
1902    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
1903    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1904      CopyInRC = SubRC;
1905
1906    if (ARC && !ARC->hasSubClassEq(CopyInRC))
1907      return false;
1908    ARC = CopyInRC;
1909  }
1910
1911  if (!ARC)
1912    return false;
1913
1914  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1915
1916  // Rewrite the PHI's incoming values to ARC.
1917  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1918  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1919    MachineOperand &MO = PHI.getOperand(K);
1920    Register Reg = MO.getReg();
1921
1922    MachineBasicBlock::iterator InsertPt;
1923    MachineBasicBlock *InsertMBB = nullptr;
1924
1925    // Look at the def of Reg, ignoring all copies.
1926    unsigned CopyOpc = AMDGPU::COPY;
1927    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1928
1929      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1930      // the copy was single-use, it will be removed by DCE later.
1931      if (Def->isCopy()) {
1932        Register AGPRSrc;
1933        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1934        if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
1935          MO.setReg(AGPRSrc);
1936          MO.setSubReg(AGPRSubReg);
1937          continue;
1938        }
1939
        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
        // to fold the sgpr -> vgpr -> agpr copy into an sgpr -> agpr copy,
        // which is unlikely to be profitable.
1944        //
1945        // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
1946        MachineOperand &CopyIn = Def->getOperand(1);
1947        if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1948            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1949          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1950      }
1951
1952      InsertMBB = Def->getParent();
1953      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
1954    } else {
1955      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
1956      InsertPt = InsertMBB->getFirstTerminator();
1957    }
1958
1959    Register NewReg = MRI->createVirtualRegister(ARC);
1960    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
1961                               TII->get(CopyOpc), NewReg)
1962                           .addReg(Reg);
1963    MO.setReg(NewReg);
1964
1965    (void)MI;
1966    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
1967  }
1968
1969  // Replace the PHI's result with a new register.
1970  Register NewReg = MRI->createVirtualRegister(ARC);
1971  PHI.getOperand(0).setReg(NewReg);
1972
  // COPY that new register back to the original PhiOut register. This COPY
  // will usually be folded out later.
1975  MachineBasicBlock *MBB = PHI.getParent();
1976  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
1977          TII->get(AMDGPU::COPY), PhiOut)
1978      .addReg(NewReg);
1979
1980  LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
1981  return true;
1982}
1983
1984// Attempt to convert VGPR load to an AGPR load.
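// Illustrative case (operands sketched, registers hypothetical):
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0
// If every transitive use of %0 is a copy or reg_sequence that ends up in an
// AGPR, %0 is switched to agpr_32 so the load writes the AGPR directly.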
1985bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
1986  assert(MI.mayLoad());
1987  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
1988    return false;
1989
1990  MachineOperand &Def = MI.getOperand(0);
1991  if (!Def.isDef())
1992    return false;
1993
1994  Register DefReg = Def.getReg();
1995
1996  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
1997    return false;
1998
1999  SmallVector<const MachineInstr*, 8> Users;
2000  SmallVector<Register, 8> MoveRegs;
2001  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
2002    Users.push_back(&I);
2003
2004  if (Users.empty())
2005    return false;
2006
  // Check that all uses are either copies to an agpr or reg_sequences
  // producing an agpr.
2008  while (!Users.empty()) {
2009    const MachineInstr *I = Users.pop_back_val();
2010    if (!I->isCopy() && !I->isRegSequence())
2011      return false;
2012    Register DstReg = I->getOperand(0).getReg();
    // Physical registers may have more than one defining instruction.
2014    if (DstReg.isPhysical())
2015      return false;
2016    if (TRI->isAGPR(*MRI, DstReg))
2017      continue;
2018    MoveRegs.push_back(DstReg);
2019    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2020      Users.push_back(&U);
2021  }
2022
2023  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2024  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2025  if (!TII->isOperandLegal(MI, 0, &Def)) {
2026    MRI->setRegClass(DefReg, RC);
2027    return false;
2028  }
2029
2030  while (!MoveRegs.empty()) {
2031    Register Reg = MoveRegs.pop_back_val();
2032    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2033  }
2034
2035  LLVM_DEBUG(dbgs() << "Folded " << MI);
2036
2037  return true;
2038}
2039
2040// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2041// For GFX90A and later, this is pretty much always a good thing, but for GFX908
// there are cases where it can create a lot more AGPR-AGPR copies, which are
2043// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2044//
2045// This function looks at all AGPR PHIs in a basic block and collects their
// operands. Then, it checks for registers that are used more than once across
2047// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2048// having to create one VGPR temporary per use, which can get very messy if
2049// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2050// element).
2051//
2052// Example
2053//      a:
2054//        %in:agpr_256 = COPY %foo:vgpr_256
2055//      c:
2056//        %x:agpr_32 = ..
2057//      b:
2058//        %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2059//        %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2060//        %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2061//  =>
2062//      a:
2063//        %in:agpr_256 = COPY %foo:vgpr_256
2064//        %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2065//        %tmp_agpr:agpr_32 = COPY %tmp
2066//      c:
2067//        %x:agpr_32 = ..
2068//      b:
2069//        %0:areg = PHI %tmp_agpr, %a, %x, %c
2070//        %1:areg = PHI %tmp_agpr, %a, %y, %c
2071//        %2:areg = PHI %tmp_agpr, %a, %z, %c
2072bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2073  // This is only really needed on GFX908 where AGPR-AGPR copies are
2074  // unreasonably difficult.
2075  if (ST->hasGFX90AInsts())
2076    return false;
2077
2078  // Look at all AGPR Phis and collect the register + subregister used.
2079  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2080      RegToMO;
2081
2082  for (auto &MI : MBB) {
2083    if (!MI.isPHI())
2084      break;
2085
2086    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2087      continue;
2088
2089    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2090      MachineOperand &PhiMO = MI.getOperand(K);
2091      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2092    }
2093  }
2094
  // For all (Reg, SubReg) pairs that are used more than once, cache the value
  // in a VGPR.
2097  bool Changed = false;
2098  for (const auto &[Entry, MOs] : RegToMO) {
2099    if (MOs.size() == 1)
2100      continue;
2101
2102    const auto [Reg, SubReg] = Entry;
2103    MachineInstr *Def = MRI->getVRegDef(Reg);
2104    MachineBasicBlock *DefMBB = Def->getParent();
2105
2106    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2107    // out.
2108    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2109    Register TempVGPR =
2110        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2111    MachineInstr *VGPRCopy =
2112        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2113                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2114            .addReg(Reg, /* flags */ 0, SubReg);
2115
2116    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2117    Register TempAGPR = MRI->createVirtualRegister(ARC);
2118    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2119            TII->get(AMDGPU::COPY), TempAGPR)
2120        .addReg(TempVGPR);
2121
2122    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2123    for (MachineOperand *MO : MOs) {
2124      MO->setReg(TempAGPR);
2125      MO->setSubReg(AMDGPU::NoSubRegister);
2126      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
2127    }
2128
2129    Changed = true;
2130  }
2131
2132  return Changed;
2133}
2134
2135bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
2136  if (skipFunction(MF.getFunction()))
2137    return false;
2138
2139  MRI = &MF.getRegInfo();
2140  ST = &MF.getSubtarget<GCNSubtarget>();
2141  TII = ST->getInstrInfo();
2142  TRI = &TII->getRegisterInfo();
2143  MFI = MF.getInfo<SIMachineFunctionInfo>();
2144
  // omod is ignored by the hardware if the IEEE bit is enabled. omod also does
  // not correctly handle signed zeros.
2147  //
2148  // FIXME: Also need to check strictfp
2149  bool IsIEEEMode = MFI->getMode().IEEE;
2150  bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2151
2152  bool Changed = false;
2153  for (MachineBasicBlock *MBB : depth_first(&MF)) {
2154    MachineOperand *CurrentKnownM0Val = nullptr;
2155    for (auto &MI : make_early_inc_range(*MBB)) {
2156      Changed |= tryFoldCndMask(MI);
2157
2158      if (tryFoldZeroHighBits(MI)) {
2159        Changed = true;
2160        continue;
2161      }
2162
2163      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2164        Changed = true;
2165        continue;
2166      }
2167
2168      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2169        Changed = true;
2170        continue;
2171      }
2172
2173      if (MI.mayLoad() && tryFoldLoad(MI)) {
2174        Changed = true;
2175        continue;
2176      }
2177
2178      if (TII->isFoldableCopy(MI)) {
2179        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2180        continue;
2181      }
2182
2183      // Saw an unknown clobber of m0, so we no longer know what it is.
2184      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2185        CurrentKnownM0Val = nullptr;
2186
2187      // TODO: Omod might be OK if there is NSZ only on the source
2188      // instruction, and not the omod multiply.
2189      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2190          !tryFoldOMod(MI))
2191        Changed |= tryFoldClamp(MI);
2192    }
2193
2194    Changed |= tryOptimizeAGPRPhis(*MBB);
2195  }
2196
2197  return Changed;
2198}
2199