SIPeepholeSDWA.cpp revision 344779
1327952Sdim//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2317017Sdim//
3317017Sdim//                     The LLVM Compiler Infrastructure
4317017Sdim//
5317017Sdim// This file is distributed under the University of Illinois Open Source
6317017Sdim// License. See LICENSE.TXT for details.
7317017Sdim//
8317017Sdim//===----------------------------------------------------------------------===//
9317017Sdim//
10317017Sdim/// \file This pass tries to apply several peephole SDWA patterns.
11317017Sdim///
12317017Sdim/// E.g. original:
13327952Sdim///   V_LSHRREV_B32_e32 %0, 16, %1
14327952Sdim///   V_ADD_I32_e32 %2, %0, %3
15327952Sdim///   V_LSHLREV_B32_e32 %4, 16, %2
16317017Sdim///
17317017Sdim/// Replace:
18327952Sdim///   V_ADD_I32_sdwa %4, %1, %3
19317017Sdim///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
20317017Sdim///
21317017Sdim//===----------------------------------------------------------------------===//
22317017Sdim
23317017Sdim#include "AMDGPU.h"
24317017Sdim#include "AMDGPUSubtarget.h"
25317017Sdim#include "SIDefines.h"
26317017Sdim#include "SIInstrInfo.h"
27327952Sdim#include "SIRegisterInfo.h"
28341825Sdim#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29327952Sdim#include "Utils/AMDGPUBaseInfo.h"
30327952Sdim#include "llvm/ADT/None.h"
31327952Sdim#include "llvm/ADT/Optional.h"
32319799Sdim#include "llvm/ADT/STLExtras.h"
33327952Sdim#include "llvm/ADT/SmallVector.h"
34317017Sdim#include "llvm/ADT/Statistic.h"
35327952Sdim#include "llvm/CodeGen/MachineBasicBlock.h"
36327952Sdim#include "llvm/CodeGen/MachineFunction.h"
37317017Sdim#include "llvm/CodeGen/MachineFunctionPass.h"
38327952Sdim#include "llvm/CodeGen/MachineInstr.h"
39317017Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
40327952Sdim#include "llvm/CodeGen/MachineOperand.h"
41327952Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
42327952Sdim#include "llvm/CodeGen/TargetRegisterInfo.h"
43341825Sdim#include "llvm/Config/llvm-config.h"
44327952Sdim#include "llvm/MC/LaneBitmask.h"
45327952Sdim#include "llvm/MC/MCInstrDesc.h"
46327952Sdim#include "llvm/Pass.h"
47327952Sdim#include "llvm/Support/Debug.h"
48327952Sdim#include "llvm/Support/raw_ostream.h"
49327952Sdim#include <algorithm>
50327952Sdim#include <cassert>
51327952Sdim#include <cstdint>
52327952Sdim#include <memory>
53317017Sdim#include <unordered_map>
54317017Sdim
55317017Sdimusing namespace llvm;
56317017Sdim
57317017Sdim#define DEBUG_TYPE "si-peephole-sdwa"
58317017Sdim
59317017SdimSTATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
60317017SdimSTATISTIC(NumSDWAInstructionsPeepholed,
61317017Sdim          "Number of instruction converted to SDWA.");
62317017Sdim
63317017Sdimnamespace {
64317017Sdim
65317017Sdimclass SDWAOperand;
66327952Sdimclass SDWADstOperand;
67317017Sdim
68317017Sdimclass SIPeepholeSDWA : public MachineFunctionPass {
69318681Sdimpublic:
70327952Sdim  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
71318681Sdim
72317017Sdimprivate:
73317017Sdim  MachineRegisterInfo *MRI;
74317017Sdim  const SIRegisterInfo *TRI;
75317017Sdim  const SIInstrInfo *TII;
76317017Sdim
77317017Sdim  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
78318681Sdim  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
79319250Sdim  SmallVector<MachineInstr *, 8> ConvertedInstructions;
80317017Sdim
81317017Sdim  Optional<int64_t> foldToImm(const MachineOperand &Op) const;
82317017Sdim
83317017Sdimpublic:
84317017Sdim  static char ID;
85317017Sdim
86317017Sdim  SIPeepholeSDWA() : MachineFunctionPass(ID) {
87317017Sdim    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
88317017Sdim  }
89317017Sdim
90317017Sdim  bool runOnMachineFunction(MachineFunction &MF) override;
91341825Sdim  void matchSDWAOperands(MachineBasicBlock &MBB);
92327952Sdim  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
93344779Sdim  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
94344779Sdim  void pseudoOpConvertToVOP2(MachineInstr &MI,
95344779Sdim                             const GCNSubtarget &ST) const;
96317017Sdim  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
97341825Sdim  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
98317017Sdim
99317017Sdim  StringRef getPassName() const override { return "SI Peephole SDWA"; }
100317017Sdim
101317017Sdim  void getAnalysisUsage(AnalysisUsage &AU) const override {
102317017Sdim    AU.setPreservesCFG();
103317017Sdim    MachineFunctionPass::getAnalysisUsage(AU);
104317017Sdim  }
105317017Sdim};
106317017Sdim
107317017Sdimclass SDWAOperand {
108317017Sdimprivate:
109317017Sdim  MachineOperand *Target; // Operand that would be used in converted instruction
110317017Sdim  MachineOperand *Replaced; // Operand that would be replace by Target
111317017Sdim
112317017Sdimpublic:
113317017Sdim  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
114317017Sdim      : Target(TargetOp), Replaced(ReplacedOp) {
115317017Sdim    assert(Target->isReg());
116317017Sdim    assert(Replaced->isReg());
117317017Sdim  }
118317017Sdim
119327952Sdim  virtual ~SDWAOperand() = default;
120317017Sdim
121317017Sdim  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
122317017Sdim  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
123317017Sdim
124317017Sdim  MachineOperand *getTargetOperand() const { return Target; }
125317017Sdim  MachineOperand *getReplacedOperand() const { return Replaced; }
126317017Sdim  MachineInstr *getParentInst() const { return Target->getParent(); }
127327952Sdim
128317017Sdim  MachineRegisterInfo *getMRI() const {
129317017Sdim    return &getParentInst()->getParent()->getParent()->getRegInfo();
130317017Sdim  }
131327952Sdim
132327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
133327952Sdim  virtual void print(raw_ostream& OS) const = 0;
134327952Sdim  void dump() const { print(dbgs()); }
135327952Sdim#endif
136317017Sdim};
137317017Sdim
138317017Sdimusing namespace AMDGPU::SDWA;
139317017Sdim
140317017Sdimclass SDWASrcOperand : public SDWAOperand {
141317017Sdimprivate:
142317017Sdim  SdwaSel SrcSel;
143317017Sdim  bool Abs;
144317017Sdim  bool Neg;
145317017Sdim  bool Sext;
146317017Sdim
147317017Sdimpublic:
148317017Sdim  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
149317017Sdim                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
150317017Sdim                 bool Sext_ = false)
151327952Sdim      : SDWAOperand(TargetOp, ReplacedOp),
152327952Sdim        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
153317017Sdim
154327952Sdim  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
155327952Sdim  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
156317017Sdim
157317017Sdim  SdwaSel getSrcSel() const { return SrcSel; }
158317017Sdim  bool getAbs() const { return Abs; }
159317017Sdim  bool getNeg() const { return Neg; }
160317017Sdim  bool getSext() const { return Sext; }
161317017Sdim
162319799Sdim  uint64_t getSrcMods(const SIInstrInfo *TII,
163319799Sdim                      const MachineOperand *SrcOp) const;
164327952Sdim
165327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
166327952Sdim  void print(raw_ostream& OS) const override;
167327952Sdim#endif
168317017Sdim};
169317017Sdim
170317017Sdimclass SDWADstOperand : public SDWAOperand {
171317017Sdimprivate:
172317017Sdim  SdwaSel DstSel;
173317017Sdim  DstUnused DstUn;
174317017Sdim
175317017Sdimpublic:
176327952Sdim
177317017Sdim  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
178317017Sdim                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
179327952Sdim    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
180317017Sdim
181327952Sdim  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
182327952Sdim  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
183317017Sdim
184317017Sdim  SdwaSel getDstSel() const { return DstSel; }
185317017Sdim  DstUnused getDstUnused() const { return DstUn; }
186327952Sdim
187327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
188327952Sdim  void print(raw_ostream& OS) const override;
189327952Sdim#endif
190317017Sdim};
191317017Sdim
192327952Sdimclass SDWADstPreserveOperand : public SDWADstOperand {
193327952Sdimprivate:
194327952Sdim  MachineOperand *Preserve;
195317017Sdim
196327952Sdimpublic:
197327952Sdim  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
198327952Sdim                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
199327952Sdim      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
200327952Sdim        Preserve(PreserveOp) {}
201327952Sdim
202327952Sdim  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
203327952Sdim
204327952Sdim  MachineOperand *getPreservedOperand() const { return Preserve; }
205327952Sdim
206327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
207327952Sdim  void print(raw_ostream& OS) const override;
208327952Sdim#endif
209327952Sdim};
210327952Sdim
211327952Sdim} // end anonymous namespace
212327952Sdim
213317017SdimINITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
214317017Sdim
215317017Sdimchar SIPeepholeSDWA::ID = 0;
216317017Sdim
217317017Sdimchar &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
218317017Sdim
219317017SdimFunctionPass *llvm::createSIPeepholeSDWAPass() {
220317017Sdim  return new SIPeepholeSDWA();
221317017Sdim}
222317017Sdim
223317017Sdim
224327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
225341825Sdimstatic raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
226317017Sdim  switch(Sel) {
227317017Sdim  case BYTE_0: OS << "BYTE_0"; break;
228317017Sdim  case BYTE_1: OS << "BYTE_1"; break;
229317017Sdim  case BYTE_2: OS << "BYTE_2"; break;
230317017Sdim  case BYTE_3: OS << "BYTE_3"; break;
231317017Sdim  case WORD_0: OS << "WORD_0"; break;
232317017Sdim  case WORD_1: OS << "WORD_1"; break;
233317017Sdim  case DWORD:  OS << "DWORD"; break;
234317017Sdim  }
235317017Sdim  return OS;
236317017Sdim}
237317017Sdim
238317017Sdimstatic raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
239317017Sdim  switch(Un) {
240317017Sdim  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
241317017Sdim  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
242317017Sdim  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
243317017Sdim  }
244317017Sdim  return OS;
245317017Sdim}
246317017Sdim
247327952Sdimstatic raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
248327952Sdim  Operand.print(OS);
249317017Sdim  return OS;
250317017Sdim}
251317017Sdim
252327952SdimLLVM_DUMP_METHOD
253327952Sdimvoid SDWASrcOperand::print(raw_ostream& OS) const {
254327952Sdim  OS << "SDWA src: " << *getTargetOperand()
255327952Sdim    << " src_sel:" << getSrcSel()
256327952Sdim    << " abs:" << getAbs() << " neg:" << getNeg()
257327952Sdim    << " sext:" << getSext() << '\n';
258317017Sdim}
259317017Sdim
260327952SdimLLVM_DUMP_METHOD
261327952Sdimvoid SDWADstOperand::print(raw_ostream& OS) const {
262327952Sdim  OS << "SDWA dst: " << *getTargetOperand()
263327952Sdim    << " dst_sel:" << getDstSel()
264327952Sdim    << " dst_unused:" << getDstUnused() << '\n';
265327952Sdim}
266327952Sdim
267327952SdimLLVM_DUMP_METHOD
268327952Sdimvoid SDWADstPreserveOperand::print(raw_ostream& OS) const {
269327952Sdim  OS << "SDWA preserve dst: " << *getTargetOperand()
270327952Sdim    << " dst_sel:" << getDstSel()
271327952Sdim    << " preserve:" << *getPreservedOperand() << '\n';
272327952Sdim}
273327952Sdim
274317017Sdim#endif
275317017Sdim
276317017Sdimstatic void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
277317017Sdim  assert(To.isReg() && From.isReg());
278317017Sdim  To.setReg(From.getReg());
279317017Sdim  To.setSubReg(From.getSubReg());
280317017Sdim  To.setIsUndef(From.isUndef());
281317017Sdim  if (To.isUse()) {
282317017Sdim    To.setIsKill(From.isKill());
283317017Sdim  } else {
284317017Sdim    To.setIsDead(From.isDead());
285317017Sdim  }
286317017Sdim}
287317017Sdim
288317017Sdimstatic bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
289317017Sdim  return LHS.isReg() &&
290317017Sdim         RHS.isReg() &&
291317017Sdim         LHS.getReg() == RHS.getReg() &&
292317017Sdim         LHS.getSubReg() == RHS.getSubReg();
293317017Sdim}
294317017Sdim
295327952Sdimstatic MachineOperand *findSingleRegUse(const MachineOperand *Reg,
296327952Sdim                                        const MachineRegisterInfo *MRI) {
297327952Sdim  if (!Reg->isReg() || !Reg->isDef())
298327952Sdim    return nullptr;
299320397Sdim
300327952Sdim  MachineOperand *ResMO = nullptr;
301327952Sdim  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
302327952Sdim    // If there exist use of subreg of Reg then return nullptr
303327952Sdim    if (!isSameReg(UseMO, *Reg))
304327952Sdim      return nullptr;
305317017Sdim
306327952Sdim    // Check that there is only one instruction that uses Reg
307327952Sdim    if (!ResMO) {
308327952Sdim      ResMO = &UseMO;
309327952Sdim    } else if (ResMO->getParent() != UseMO.getParent()) {
310327952Sdim      return nullptr;
311327952Sdim    }
312327952Sdim  }
313317017Sdim
314327952Sdim  return ResMO;
315327952Sdim}
316317017Sdim
317327952Sdimstatic MachineOperand *findSingleRegDef(const MachineOperand *Reg,
318327952Sdim                                        const MachineRegisterInfo *MRI) {
319327952Sdim  if (!Reg->isReg())
320327952Sdim    return nullptr;
321327952Sdim
322327952Sdim  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
323327952Sdim  if (!DefInstr)
324327952Sdim    return nullptr;
325327952Sdim
326327952Sdim  for (auto &DefMO : DefInstr->defs()) {
327327952Sdim    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
328327952Sdim      return &DefMO;
329327952Sdim  }
330327952Sdim
331327952Sdim  // Ignore implicit defs.
332327952Sdim  return nullptr;
333317017Sdim}
334317017Sdim
335319799Sdimuint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
336319799Sdim                                    const MachineOperand *SrcOp) const {
337317017Sdim  uint64_t Mods = 0;
338319799Sdim  const auto *MI = SrcOp->getParent();
339319799Sdim  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
340319799Sdim    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
341319799Sdim      Mods = Mod->getImm();
342319799Sdim    }
343319799Sdim  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
344319799Sdim    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
345319799Sdim      Mods = Mod->getImm();
346319799Sdim    }
347319799Sdim  }
348317017Sdim  if (Abs || Neg) {
349317017Sdim    assert(!Sext &&
350317017Sdim           "Float and integer src modifiers can't be set simulteniously");
351317017Sdim    Mods |= Abs ? SISrcMods::ABS : 0;
352319799Sdim    Mods ^= Neg ? SISrcMods::NEG : 0;
353317017Sdim  } else if (Sext) {
354317017Sdim    Mods |= SISrcMods::SEXT;
355317017Sdim  }
356317017Sdim
357317017Sdim  return Mods;
358317017Sdim}
359317017Sdim
360317017SdimMachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
361317017Sdim  // For SDWA src operand potential instruction is one that use register
362317017Sdim  // defined by parent instruction
363327952Sdim  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
364327952Sdim  if (!PotentialMO)
365327952Sdim    return nullptr;
366317017Sdim
367327952Sdim  return PotentialMO->getParent();
368317017Sdim}
369317017Sdim
370317017Sdimbool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
371317017Sdim  // Find operand in instruction that matches source operand and replace it with
372317017Sdim  // target operand. Set corresponding src_sel
373341825Sdim  bool IsPreserveSrc = false;
374317017Sdim  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
375317017Sdim  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
376317017Sdim  MachineOperand *SrcMods =
377317017Sdim      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
378319250Sdim  assert(Src && (Src->isReg() || Src->isImm()));
379317017Sdim  if (!isSameReg(*Src, *getReplacedOperand())) {
380341825Sdim    // If this is not src0 then it could be src1
381317017Sdim    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
382317017Sdim    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
383317017Sdim    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
384317017Sdim
385341825Sdim    if (!Src ||
386341825Sdim        !isSameReg(*Src, *getReplacedOperand())) {
387341825Sdim      // It's possible this Src is a tied operand for
388341825Sdim      // UNUSED_PRESERVE, in which case we can either
389341825Sdim      // abandon the peephole attempt, or if legal we can
390341825Sdim      // copy the target operand into the tied slot
391341825Sdim      // if the preserve operation will effectively cause the same
392341825Sdim      // result by overwriting the rest of the dst.
393341825Sdim      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
394341825Sdim      MachineOperand *DstUnused =
395341825Sdim        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
396341825Sdim
397341825Sdim      if (Dst &&
398341825Sdim          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
399341825Sdim        // This will work if the tied src is acessing WORD_0, and the dst is
400341825Sdim        // writing WORD_1. Modifiers don't matter because all the bits that
401341825Sdim        // would be impacted are being overwritten by the dst.
402341825Sdim        // Any other case will not work.
403341825Sdim        SdwaSel DstSel = static_cast<SdwaSel>(
404341825Sdim            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
405341825Sdim        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
406341825Sdim            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
407341825Sdim          IsPreserveSrc = true;
408341825Sdim          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
409341825Sdim                                                   AMDGPU::OpName::vdst);
410341825Sdim          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
411341825Sdim          Src = &MI.getOperand(TiedIdx);
412341825Sdim          SrcSel = nullptr;
413341825Sdim          SrcMods = nullptr;
414341825Sdim        } else {
415341825Sdim          // Not legal to convert this src
416341825Sdim          return false;
417341825Sdim        }
418341825Sdim      }
419341825Sdim    }
420317017Sdim    assert(Src && Src->isReg());
421317017Sdim
422317017Sdim    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
423317017Sdim         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
424327952Sdim         !isSameReg(*Src, *getReplacedOperand())) {
425317017Sdim      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
426317017Sdim      // src2. This is not allowed.
427317017Sdim      return false;
428317017Sdim    }
429317017Sdim
430341825Sdim    assert(isSameReg(*Src, *getReplacedOperand()) &&
431341825Sdim           (IsPreserveSrc || (SrcSel && SrcMods)));
432317017Sdim  }
433317017Sdim  copyRegOperand(*Src, *getTargetOperand());
434341825Sdim  if (!IsPreserveSrc) {
435341825Sdim    SrcSel->setImm(getSrcSel());
436341825Sdim    SrcMods->setImm(getSrcMods(TII, Src));
437341825Sdim  }
438317017Sdim  getTargetOperand()->setIsKill(false);
439317017Sdim  return true;
440317017Sdim}
441317017Sdim
442317017SdimMachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
443317017Sdim  // For SDWA dst operand potential instruction is one that defines register
444317017Sdim  // that this operand uses
445317017Sdim  MachineRegisterInfo *MRI = getMRI();
446317017Sdim  MachineInstr *ParentMI = getParentInst();
447317017Sdim
448327952Sdim  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
449327952Sdim  if (!PotentialMO)
450327952Sdim    return nullptr;
451317017Sdim
452327952Sdim  // Check that ParentMI is the only instruction that uses replaced register
453327952Sdim  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
454327952Sdim    if (&UseInst != ParentMI)
455317017Sdim      return nullptr;
456317017Sdim  }
457317017Sdim
458327952Sdim  return PotentialMO->getParent();
459317017Sdim}
460317017Sdim
461317017Sdimbool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
462317017Sdim  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
463317017Sdim
464317017Sdim  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
465317017Sdim       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
466317017Sdim      getDstSel() != AMDGPU::SDWA::DWORD) {
467317017Sdim    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
468317017Sdim    return false;
469317017Sdim  }
470317017Sdim
471317017Sdim  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
472317017Sdim  assert(Operand &&
473317017Sdim         Operand->isReg() &&
474317017Sdim         isSameReg(*Operand, *getReplacedOperand()));
475317017Sdim  copyRegOperand(*Operand, *getTargetOperand());
476317017Sdim  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
477317017Sdim  assert(DstSel);
478317017Sdim  DstSel->setImm(getDstSel());
479317017Sdim  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
480317017Sdim  assert(DstUnused);
481317017Sdim  DstUnused->setImm(getDstUnused());
482317017Sdim
483317017Sdim  // Remove original instruction  because it would conflict with our new
484317017Sdim  // instruction by register definition
485317017Sdim  getParentInst()->eraseFromParent();
486317017Sdim  return true;
487317017Sdim}
488317017Sdim
489327952Sdimbool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
490327952Sdim                                           const SIInstrInfo *TII) {
491327952Sdim  // MI should be moved right before v_or_b32.
492327952Sdim  // For this we should clear all kill flags on uses of MI src-operands or else
493327952Sdim  // we can encounter problem with use of killed operand.
494327952Sdim  for (MachineOperand &MO : MI.uses()) {
495327952Sdim    if (!MO.isReg())
496327952Sdim      continue;
497327952Sdim    getMRI()->clearKillFlags(MO.getReg());
498327952Sdim  }
499327952Sdim
500327952Sdim  // Move MI before v_or_b32
501327952Sdim  auto MBB = MI.getParent();
502327952Sdim  MBB->remove(&MI);
503327952Sdim  MBB->insert(getParentInst(), &MI);
504327952Sdim
505327952Sdim  // Add Implicit use of preserved register
506327952Sdim  MachineInstrBuilder MIB(*MBB->getParent(), MI);
507327952Sdim  MIB.addReg(getPreservedOperand()->getReg(),
508327952Sdim             RegState::ImplicitKill,
509327952Sdim             getPreservedOperand()->getSubReg());
510327952Sdim
511327952Sdim  // Tie dst to implicit use
512327952Sdim  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
513327952Sdim                 MI.getNumOperands() - 1);
514327952Sdim
515327952Sdim  // Convert MI as any other SDWADstOperand and remove v_or_b32
516327952Sdim  return SDWADstOperand::convertToSDWA(MI, TII);
517327952Sdim}
518327952Sdim
519317017SdimOptional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
520317017Sdim  if (Op.isImm()) {
521317017Sdim    return Op.getImm();
522317017Sdim  }
523317017Sdim
524317017Sdim  // If this is not immediate then it can be copy of immediate value, e.g.:
525327952Sdim  // %1 = S_MOV_B32 255;
526317017Sdim  if (Op.isReg()) {
527317017Sdim    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
528317017Sdim      if (!isSameReg(Op, Def))
529317017Sdim        continue;
530317017Sdim
531317017Sdim      const MachineInstr *DefInst = Def.getParent();
532317017Sdim      if (!TII->isFoldableCopy(*DefInst))
533317017Sdim        return None;
534317017Sdim
535317017Sdim      const MachineOperand &Copied = DefInst->getOperand(1);
536317017Sdim      if (!Copied.isImm())
537317017Sdim        return None;
538317017Sdim
539317017Sdim      return Copied.getImm();
540317017Sdim    }
541317017Sdim  }
542317017Sdim
543317017Sdim  return None;
544317017Sdim}
545317017Sdim
546327952Sdimstd::unique_ptr<SDWAOperand>
547327952SdimSIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
548327952Sdim  unsigned Opcode = MI.getOpcode();
549327952Sdim  switch (Opcode) {
550327952Sdim  case AMDGPU::V_LSHRREV_B32_e32:
551327952Sdim  case AMDGPU::V_ASHRREV_I32_e32:
552327952Sdim  case AMDGPU::V_LSHLREV_B32_e32:
553327952Sdim  case AMDGPU::V_LSHRREV_B32_e64:
554327952Sdim  case AMDGPU::V_ASHRREV_I32_e64:
555327952Sdim  case AMDGPU::V_LSHLREV_B32_e64: {
556327952Sdim    // from: v_lshrrev_b32_e32 v1, 16/24, v0
557327952Sdim    // to SDWA src:v0 src_sel:WORD_1/BYTE_3
558317017Sdim
559327952Sdim    // from: v_ashrrev_i32_e32 v1, 16/24, v0
560327952Sdim    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
561317017Sdim
562327952Sdim    // from: v_lshlrev_b32_e32 v1, 16/24, v0
563327952Sdim    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
564327952Sdim    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
565327952Sdim    auto Imm = foldToImm(*Src0);
566327952Sdim    if (!Imm)
567327952Sdim      break;
568317017Sdim
569327952Sdim    if (*Imm != 16 && *Imm != 24)
570327952Sdim      break;
571317017Sdim
572327952Sdim    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
573327952Sdim    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
574327952Sdim    if (TRI->isPhysicalRegister(Src1->getReg()) ||
575327952Sdim        TRI->isPhysicalRegister(Dst->getReg()))
576327952Sdim      break;
577317017Sdim
578327952Sdim    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
579327952Sdim        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
580327952Sdim      return make_unique<SDWADstOperand>(
581327952Sdim          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
582327952Sdim    } else {
583327952Sdim      return make_unique<SDWASrcOperand>(
584327952Sdim          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
585327952Sdim          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
586327952Sdim          Opcode != AMDGPU::V_LSHRREV_B32_e64);
587327952Sdim    }
588327952Sdim    break;
589327952Sdim  }
590317017Sdim
591327952Sdim  case AMDGPU::V_LSHRREV_B16_e32:
592327952Sdim  case AMDGPU::V_ASHRREV_I16_e32:
593327952Sdim  case AMDGPU::V_LSHLREV_B16_e32:
594327952Sdim  case AMDGPU::V_LSHRREV_B16_e64:
595327952Sdim  case AMDGPU::V_ASHRREV_I16_e64:
596327952Sdim  case AMDGPU::V_LSHLREV_B16_e64: {
597327952Sdim    // from: v_lshrrev_b16_e32 v1, 8, v0
598327952Sdim    // to SDWA src:v0 src_sel:BYTE_1
599317017Sdim
600327952Sdim    // from: v_ashrrev_i16_e32 v1, 8, v0
601327952Sdim    // to SDWA src:v0 src_sel:BYTE_1 sext:1
602317017Sdim
603327952Sdim    // from: v_lshlrev_b16_e32 v1, 8, v0
604327952Sdim    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
605327952Sdim    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
606327952Sdim    auto Imm = foldToImm(*Src0);
607327952Sdim    if (!Imm || *Imm != 8)
608327952Sdim      break;
609317017Sdim
610327952Sdim    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
611327952Sdim    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
612317017Sdim
613327952Sdim    if (TRI->isPhysicalRegister(Src1->getReg()) ||
614327952Sdim        TRI->isPhysicalRegister(Dst->getReg()))
615327952Sdim      break;
616317017Sdim
617327952Sdim    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
618327952Sdim        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
619327952Sdim      return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
620327952Sdim    } else {
621327952Sdim      return make_unique<SDWASrcOperand>(
622327952Sdim            Src1, Dst, BYTE_1, false, false,
623327952Sdim            Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
624327952Sdim            Opcode != AMDGPU::V_LSHRREV_B16_e64);
625327952Sdim    }
626327952Sdim    break;
627327952Sdim  }
628317017Sdim
629327952Sdim  case AMDGPU::V_BFE_I32:
630327952Sdim  case AMDGPU::V_BFE_U32: {
631327952Sdim    // e.g.:
632327952Sdim    // from: v_bfe_u32 v1, v0, 8, 8
633327952Sdim    // to SDWA src:v0 src_sel:BYTE_1
634317017Sdim
635327952Sdim    // offset | width | src_sel
636327952Sdim    // ------------------------
637327952Sdim    // 0      | 8     | BYTE_0
638327952Sdim    // 0      | 16    | WORD_0
639327952Sdim    // 0      | 32    | DWORD ?
640327952Sdim    // 8      | 8     | BYTE_1
641327952Sdim    // 16     | 8     | BYTE_2
642327952Sdim    // 16     | 16    | WORD_1
643327952Sdim    // 24     | 8     | BYTE_3
644317017Sdim
645327952Sdim    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
646327952Sdim    auto Offset = foldToImm(*Src1);
647327952Sdim    if (!Offset)
648327952Sdim      break;
649317017Sdim
650327952Sdim    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
651327952Sdim    auto Width = foldToImm(*Src2);
652327952Sdim    if (!Width)
653327952Sdim      break;
654317017Sdim
655327952Sdim    SdwaSel SrcSel = DWORD;
656317017Sdim
657327952Sdim    if (*Offset == 0 && *Width == 8)
658327952Sdim      SrcSel = BYTE_0;
659327952Sdim    else if (*Offset == 0 && *Width == 16)
660327952Sdim      SrcSel = WORD_0;
661327952Sdim    else if (*Offset == 0 && *Width == 32)
662327952Sdim      SrcSel = DWORD;
663327952Sdim    else if (*Offset == 8 && *Width == 8)
664327952Sdim      SrcSel = BYTE_1;
665327952Sdim    else if (*Offset == 16 && *Width == 8)
666327952Sdim      SrcSel = BYTE_2;
667327952Sdim    else if (*Offset == 16 && *Width == 16)
668327952Sdim      SrcSel = WORD_1;
669327952Sdim    else if (*Offset == 24 && *Width == 8)
670327952Sdim      SrcSel = BYTE_3;
671327952Sdim    else
672327952Sdim      break;
673317017Sdim
674327952Sdim    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
675327952Sdim    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
676320397Sdim
677327952Sdim    if (TRI->isPhysicalRegister(Src0->getReg()) ||
678327952Sdim        TRI->isPhysicalRegister(Dst->getReg()))
679327952Sdim      break;
680317017Sdim
681327952Sdim    return make_unique<SDWASrcOperand>(
682327952Sdim          Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
683327952Sdim  }
684327952Sdim
685327952Sdim  case AMDGPU::V_AND_B32_e32:
686327952Sdim  case AMDGPU::V_AND_B32_e64: {
687327952Sdim    // e.g.:
688327952Sdim    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
689327952Sdim    // to SDWA src:v0 src_sel:WORD_0/BYTE_0
690327952Sdim
691327952Sdim    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
692327952Sdim    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
693327952Sdim    auto ValSrc = Src1;
694327952Sdim    auto Imm = foldToImm(*Src0);
695327952Sdim
696327952Sdim    if (!Imm) {
697327952Sdim      Imm = foldToImm(*Src1);
698327952Sdim      ValSrc = Src0;
699327952Sdim    }
700327952Sdim
701327952Sdim    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
702327952Sdim      break;
703327952Sdim
704327952Sdim    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
705327952Sdim
706341825Sdim    if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
707327952Sdim        TRI->isPhysicalRegister(Dst->getReg()))
708327952Sdim      break;
709327952Sdim
710327952Sdim    return make_unique<SDWASrcOperand>(
711327952Sdim        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
712327952Sdim  }
713327952Sdim
714327952Sdim  case AMDGPU::V_OR_B32_e32:
715327952Sdim  case AMDGPU::V_OR_B32_e64: {
716327952Sdim    // Patterns for dst_unused:UNUSED_PRESERVE.
717327952Sdim    // e.g., from:
718327952Sdim    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
719327952Sdim    //                           src1_sel:WORD_1 src2_sel:WORD1
720327952Sdim    // v_add_f16_e32 v3, v1, v2
721327952Sdim    // v_or_b32_e32 v4, v0, v3
722327952Sdim    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
723327952Sdim
724327952Sdim    // Check if one of operands of v_or_b32 is SDWA instruction
725327952Sdim    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
726327952Sdim    auto CheckOROperandsForSDWA =
727327952Sdim      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
728327952Sdim        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
729327952Sdim          return CheckRetType(None);
730327952Sdim
731327952Sdim        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
732327952Sdim        if (!Op1Def)
733327952Sdim          return CheckRetType(None);
734327952Sdim
735327952Sdim        MachineInstr *Op1Inst = Op1Def->getParent();
736327952Sdim        if (!TII->isSDWA(*Op1Inst))
737327952Sdim          return CheckRetType(None);
738327952Sdim
739327952Sdim        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
740327952Sdim        if (!Op2Def)
741327952Sdim          return CheckRetType(None);
742327952Sdim
743327952Sdim        return CheckRetType(std::make_pair(Op1Def, Op2Def));
744327952Sdim      };
745327952Sdim
746327952Sdim    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
747327952Sdim    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
748327952Sdim    assert(OrSDWA && OrOther);
749327952Sdim    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
750327952Sdim    if (!Res) {
751327952Sdim      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
752327952Sdim      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
753327952Sdim      assert(OrSDWA && OrOther);
754327952Sdim      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
755327952Sdim      if (!Res)
756317017Sdim        break;
757327952Sdim    }
758317017Sdim
759327952Sdim    MachineOperand *OrSDWADef = Res->first;
760327952Sdim    MachineOperand *OrOtherDef = Res->second;
761327952Sdim    assert(OrSDWADef && OrOtherDef);
762317017Sdim
763327952Sdim    MachineInstr *SDWAInst = OrSDWADef->getParent();
764327952Sdim    MachineInstr *OtherInst = OrOtherDef->getParent();
765319799Sdim
766327952Sdim    // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
767327952Sdim    // destination patterns don't overlap. Compatible instruction can be either
768327952Sdim    // regular instruction with compatible bitness or SDWA instruction with
769327952Sdim    // correct dst_sel
770327952Sdim    // SDWAInst | OtherInst bitness / OtherInst dst_sel
771327952Sdim    // -----------------------------------------------------
772327952Sdim    // DWORD    | no                    / no
773327952Sdim    // WORD_0   | no                    / BYTE_2/3, WORD_1
774327952Sdim    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
775327952Sdim    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
776327952Sdim    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
777327952Sdim    // BYTE_2   | 8/16-bit              / BYTE_0/1/3. WORD_0
778327952Sdim    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
779327952Sdim    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
780327952Sdim    // but v_add_f32 is not.
781317017Sdim
782327952Sdim    // TODO: add support for non-SDWA instructions as OtherInst.
783327952Sdim    // For now this only works with SDWA instructions. For regular instructions
784341825Sdim    // there is no way to determine if the instruction writes only 8/16/24-bit
785341825Sdim    // out of full register size and all registers are at min 32-bit wide.
786327952Sdim    if (!TII->isSDWA(*OtherInst))
787327952Sdim      break;
788320397Sdim
789327952Sdim    SdwaSel DstSel = static_cast<SdwaSel>(
790327952Sdim      TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));;
791327952Sdim    SdwaSel OtherDstSel = static_cast<SdwaSel>(
792327952Sdim      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
793317017Sdim
794327952Sdim    bool DstSelAgree = false;
795327952Sdim    switch (DstSel) {
796327952Sdim    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
797327952Sdim                                (OtherDstSel == BYTE_3) ||
798327952Sdim                                (OtherDstSel == WORD_1));
799327952Sdim      break;
800327952Sdim    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
801327952Sdim                                (OtherDstSel == BYTE_1) ||
802327952Sdim                                (OtherDstSel == WORD_0));
803327952Sdim      break;
804327952Sdim    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
805327952Sdim                                (OtherDstSel == BYTE_2) ||
806327952Sdim                                (OtherDstSel == BYTE_3) ||
807327952Sdim                                (OtherDstSel == WORD_1));
808327952Sdim      break;
809327952Sdim    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
810327952Sdim                                (OtherDstSel == BYTE_2) ||
811327952Sdim                                (OtherDstSel == BYTE_3) ||
812327952Sdim                                (OtherDstSel == WORD_1));
813327952Sdim      break;
814327952Sdim    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
815327952Sdim                                (OtherDstSel == BYTE_1) ||
816327952Sdim                                (OtherDstSel == BYTE_3) ||
817327952Sdim                                (OtherDstSel == WORD_0));
818327952Sdim      break;
819327952Sdim    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
820327952Sdim                                (OtherDstSel == BYTE_1) ||
821327952Sdim                                (OtherDstSel == BYTE_2) ||
822327952Sdim                                (OtherDstSel == WORD_0));
823327952Sdim      break;
824327952Sdim    default: DstSelAgree = false;
825327952Sdim    }
826327952Sdim
827327952Sdim    if (!DstSelAgree)
828327952Sdim      break;
829327952Sdim
830327952Sdim    // Also OtherInst dst_unused should be UNUSED_PAD
831327952Sdim    DstUnused OtherDstUnused = static_cast<DstUnused>(
832327952Sdim      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
833327952Sdim    if (OtherDstUnused != DstUnused::UNUSED_PAD)
834327952Sdim      break;
835327952Sdim
836327952Sdim    // Create DstPreserveOperand
837327952Sdim    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
838327952Sdim    assert(OrDst && OrDst->isReg());
839327952Sdim
840327952Sdim    return make_unique<SDWADstPreserveOperand>(
841327952Sdim      OrDst, OrSDWADef, OrOtherDef, DstSel);
842327952Sdim
843327952Sdim  }
844327952Sdim  }
845327952Sdim
846327952Sdim  return std::unique_ptr<SDWAOperand>(nullptr);
847327952Sdim}
848327952Sdim
849341825Sdimvoid SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
850341825Sdim  for (MachineInstr &MI : MBB) {
851341825Sdim    if (auto Operand = matchSDWAOperand(MI)) {
852341825Sdim      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
853341825Sdim      SDWAOperands[&MI] = std::move(Operand);
854341825Sdim      ++NumSDWAPatternsFound;
855317017Sdim    }
856317017Sdim  }
857317017Sdim}
858317017Sdim
859344779Sdim// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
860344779Sdim// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
861344779Sdim// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa.
862344779Sdim//
863344779Sdim// We are transforming from a VOP3 into a VOP2 form of the instruction.
864344779Sdim//   %19:vgpr_32 = V_AND_B32_e32 255,
865344779Sdim//       killed %16:vgpr_32, implicit $exec
866344779Sdim//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
867344779Sdim//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
868344779Sdim//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
869344779Sdim//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
870344779Sdim//
871344779Sdim// becomes
872344779Sdim//   %47:vgpr_32 = V_ADD_I32_sdwa
873344779Sdim//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
874344779Sdim//       implicit-def $vcc, implicit $exec
875344779Sdim//  %48:vgpr_32 = V_ADDC_U32_e32
876344779Sdim//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
877344779Sdimvoid SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
878344779Sdim                                           const GCNSubtarget &ST) const {
879344779Sdim  int Opc = MI.getOpcode();
880344779Sdim  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
881344779Sdim         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
882344779Sdim
883344779Sdim  // Can the candidate MI be shrunk?
884344779Sdim  if (!TII->canShrink(MI, *MRI))
885344779Sdim    return;
886344779Sdim  Opc = AMDGPU::getVOPe32(Opc);
887344779Sdim  // Find the related ADD instruction.
888344779Sdim  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
889344779Sdim  if (!Sdst)
890344779Sdim    return;
891344779Sdim  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
892344779Sdim  if (!NextOp)
893344779Sdim    return;
894344779Sdim  MachineInstr &MISucc = *NextOp->getParent();
895344779Sdim  // Can the successor be shrunk?
896344779Sdim  if (!TII->canShrink(MISucc, *MRI))
897344779Sdim    return;
898344779Sdim  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
899344779Sdim  // Make sure the carry in/out are subsequently unused.
900344779Sdim  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
901344779Sdim  if (!CarryIn)
902344779Sdim    return;
903344779Sdim  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
904344779Sdim  if (!CarryOut)
905344779Sdim    return;
906344779Sdim  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
907344779Sdim    return;
908344779Sdim  // Make sure VCC or its subregs are dead before MI.
909344779Sdim  MachineBasicBlock &MBB = *MI.getParent();
910344779Sdim  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
911344779Sdim  if (Liveness != MachineBasicBlock::LQR_Dead)
912344779Sdim    return;
913344779Sdim  // Check if VCC is referenced in range of (MI,MISucc].
914344779Sdim  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
915344779Sdim       I != E; ++I) {
916344779Sdim    if (I->modifiesRegister(AMDGPU::VCC, TRI))
917344779Sdim      return;
918344779Sdim  }
919344779Sdim  // Make the two new e32 instruction variants.
920344779Sdim  // Replace MI with V_{SUB|ADD}_I32_e32
921344779Sdim  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
922344779Sdim  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
923344779Sdim  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
924344779Sdim  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
925344779Sdim  MI.eraseFromParent();
926344779Sdim  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
927344779Sdim  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
928344779Sdim  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
929344779Sdim  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
930344779Sdim  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
931344779Sdim  MISucc.eraseFromParent();
932344779Sdim}
933344779Sdim
934344779Sdimbool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
935341825Sdim                                         const GCNSubtarget &ST) const {
936327952Sdim  // Check if this is already an SDWA instruction
937327952Sdim  unsigned Opc = MI.getOpcode();
938327952Sdim  if (TII->isSDWA(Opc))
939327952Sdim    return true;
940327952Sdim
941319250Sdim  // Check if this instruction has opcode that supports SDWA
942320397Sdim  if (AMDGPU::getSDWAOp(Opc) == -1)
943320397Sdim    Opc = AMDGPU::getVOPe32(Opc);
944320397Sdim
945327952Sdim  if (AMDGPU::getSDWAOp(Opc) == -1)
946320397Sdim    return false;
947320397Sdim
948320397Sdim  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
949320397Sdim    return false;
950320397Sdim
951320397Sdim  if (TII->isVOPC(Opc)) {
952320397Sdim    if (!ST.hasSDWASdst()) {
953320397Sdim      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
954320397Sdim      if (SDst && SDst->getReg() != AMDGPU::VCC)
955320397Sdim        return false;
956320397Sdim    }
957320397Sdim
958320572Sdim    if (!ST.hasSDWAOutModsVOPC() &&
959320572Sdim        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
960320572Sdim         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
961320397Sdim      return false;
962320397Sdim
963320572Sdim  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
964320572Sdim             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
965320397Sdim    return false;
966320397Sdim  }
967320397Sdim
968320397Sdim  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
969320397Sdim                           Opc == AMDGPU::V_MAC_F32_e32))
970320397Sdim    return false;
971320397Sdim
972341825Sdim  // FIXME: has SDWA but require handling of implicit VCC use
973341825Sdim  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
974341825Sdim    return false;
975341825Sdim
976320397Sdim  return true;
977318681Sdim}
978318681Sdim
979318681Sdimbool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
980318681Sdim                                   const SDWAOperandsVector &SDWAOperands) {
981341825Sdim
982341825Sdim  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
983341825Sdim
984317017Sdim  // Convert to sdwa
985327952Sdim  int SDWAOpcode;
986327952Sdim  unsigned Opcode = MI.getOpcode();
987327952Sdim  if (TII->isSDWA(Opcode)) {
988327952Sdim    SDWAOpcode = Opcode;
989327952Sdim  } else {
990327952Sdim    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
991327952Sdim    if (SDWAOpcode == -1)
992327952Sdim      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
993327952Sdim  }
994317017Sdim  assert(SDWAOpcode != -1);
995317017Sdim
996317017Sdim  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
997317017Sdim
998317017Sdim  // Create SDWA version of instruction MI and initialize its operands
999317017Sdim  MachineInstrBuilder SDWAInst =
1000317017Sdim    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
1001317017Sdim
1002320572Sdim  // Copy dst, if it is present in original then should also be present in SDWA
1003320572Sdim  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1004317017Sdim  if (Dst) {
1005317017Sdim    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
1006317017Sdim    SDWAInst.add(*Dst);
1007320572Sdim  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
1008320397Sdim    assert(Dst &&
1009320397Sdim           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
1010320397Sdim    SDWAInst.add(*Dst);
1011320572Sdim  } else {
1012320572Sdim    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
1013320572Sdim    SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
1014317017Sdim  }
1015317017Sdim
1016317017Sdim  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
1017317017Sdim  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
1018317017Sdim  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1019317017Sdim  assert(
1020317017Sdim    Src0 &&
1021317017Sdim    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
1022317017Sdim    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
1023319799Sdim  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
1024319799Sdim    SDWAInst.addImm(Mod->getImm());
1025319799Sdim  else
1026319799Sdim    SDWAInst.addImm(0);
1027317017Sdim  SDWAInst.add(*Src0);
1028317017Sdim
1029317017Sdim  // Copy src1 if present, initialize src1_modifiers.
1030317017Sdim  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1031317017Sdim  if (Src1) {
1032317017Sdim    assert(
1033317017Sdim      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
1034317017Sdim      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
1035319799Sdim    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
1036319799Sdim      SDWAInst.addImm(Mod->getImm());
1037319799Sdim    else
1038319799Sdim      SDWAInst.addImm(0);
1039317017Sdim    SDWAInst.add(*Src1);
1040317017Sdim  }
1041317017Sdim
1042317017Sdim  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
1043317017Sdim      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
1044317017Sdim    // v_mac_f16/32 has additional src2 operand tied to vdst
1045317017Sdim    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1046317017Sdim    assert(Src2);
1047317017Sdim    SDWAInst.add(*Src2);
1048317017Sdim  }
1049317017Sdim
1050320397Sdim  // Copy clamp if present, initialize otherwise
1051317017Sdim  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
1052320397Sdim  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
1053320397Sdim  if (Clamp) {
1054320397Sdim    SDWAInst.add(*Clamp);
1055320397Sdim  } else {
1056320397Sdim    SDWAInst.addImm(0);
1057320397Sdim  }
1058317017Sdim
1059320397Sdim  // Copy omod if present, initialize otherwise if needed
1060320572Sdim  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
1061320572Sdim    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
1062320572Sdim    if (OMod) {
1063320572Sdim      SDWAInst.add(*OMod);
1064320572Sdim    } else {
1065320572Sdim      SDWAInst.addImm(0);
1066320572Sdim    }
1067320397Sdim  }
1068320397Sdim
1069327952Sdim  // Copy dst_sel if present, initialize otherwise if needed
1070320572Sdim  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
1071327952Sdim    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
1072327952Sdim    if (DstSel) {
1073327952Sdim      SDWAInst.add(*DstSel);
1074327952Sdim    } else {
1075327952Sdim      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1076327952Sdim    }
1077320572Sdim  }
1078320572Sdim
1079327952Sdim  // Copy dst_unused if present, initialize otherwise if needed
1080320572Sdim  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
1081327952Sdim    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1082327952Sdim    if (DstUnused) {
1083327952Sdim      SDWAInst.add(*DstUnused);
1084327952Sdim    } else {
1085327952Sdim      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
1086327952Sdim    }
1087317017Sdim  }
1088317017Sdim
1089327952Sdim  // Copy src0_sel if present, initialize otherwise
1090317017Sdim  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
1091327952Sdim  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
1092327952Sdim  if (Src0Sel) {
1093327952Sdim    SDWAInst.add(*Src0Sel);
1094327952Sdim  } else {
1095327952Sdim    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1096327952Sdim  }
1097317017Sdim
1098327952Sdim  // Copy src1_sel if present, initialize otherwise if needed
1099317017Sdim  if (Src1) {
1100317017Sdim    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
1101327952Sdim    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
1102327952Sdim    if (Src1Sel) {
1103327952Sdim      SDWAInst.add(*Src1Sel);
1104327952Sdim    } else {
1105327952Sdim      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1106327952Sdim    }
1107317017Sdim  }
1108317017Sdim
1109341825Sdim  // Check for a preserved register that needs to be copied.
1110341825Sdim  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1111341825Sdim  if (DstUnused &&
1112341825Sdim      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
1113341825Sdim    // We expect, if we are here, that the instruction was already in it's SDWA form,
1114341825Sdim    // with a tied operand.
1115341825Sdim    assert(Dst && Dst->isTied());
1116341825Sdim    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
1117341825Sdim    // We also expect a vdst, since sdst can't preserve.
1118341825Sdim    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
1119341825Sdim    assert(PreserveDstIdx != -1);
1120341825Sdim
1121341825Sdim    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
1122341825Sdim    auto Tied = MI.getOperand(TiedIdx);
1123341825Sdim
1124341825Sdim    SDWAInst.add(Tied);
1125341825Sdim    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
1126341825Sdim  }
1127341825Sdim
1128341825Sdim  // Apply all sdwa operand patterns.
1129317017Sdim  bool Converted = false;
1130317017Sdim  for (auto &Operand : SDWAOperands) {
1131341825Sdim    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
1132318681Sdim    // There should be no intesection between SDWA operands and potential MIs
1133318681Sdim    // e.g.:
1134318681Sdim    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
1135318681Sdim    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
1136318681Sdim    // v_add_u32 v3, v4, v2
1137318681Sdim    //
1138318681Sdim    // In that example it is possible that we would fold 2nd instruction into 3rd
1139318681Sdim    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
1140318681Sdim    // already destroyed). So if SDWAOperand is also a potential MI then do not
1141318681Sdim    // apply it.
1142318681Sdim    if (PotentialMatches.count(Operand->getParentInst()) == 0)
1143318681Sdim      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1144317017Sdim  }
1145319250Sdim  if (Converted) {
1146319250Sdim    ConvertedInstructions.push_back(SDWAInst);
1147319250Sdim  } else {
1148317017Sdim    SDWAInst->eraseFromParent();
1149317017Sdim    return false;
1150317017Sdim  }
1151317017Sdim
1152341825Sdim  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
1153317017Sdim  ++NumSDWAInstructionsPeepholed;
1154317017Sdim
1155317017Sdim  MI.eraseFromParent();
1156317017Sdim  return true;
1157317017Sdim}
1158317017Sdim
1159319250Sdim// If an instruction was converted to SDWA it should not have immediates or SGPR
1160320397Sdim// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
1161341825Sdimvoid SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
1162341825Sdim                                            const GCNSubtarget &ST) const {
1163319250Sdim  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
1164320397Sdim  unsigned ConstantBusCount = 0;
1165327952Sdim  for (MachineOperand &Op : MI.explicit_uses()) {
1166319250Sdim    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
1167319250Sdim      continue;
1168320397Sdim
1169320397Sdim    unsigned I = MI.getOperandNo(&Op);
1170319250Sdim    if (Desc.OpInfo[I].RegClass == -1 ||
1171319250Sdim       !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
1172319250Sdim      continue;
1173320397Sdim
1174320397Sdim    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
1175320397Sdim        TRI->isSGPRReg(*MRI, Op.getReg())) {
1176320397Sdim      ++ConstantBusCount;
1177320397Sdim      continue;
1178320397Sdim    }
1179320397Sdim
1180319250Sdim    unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1181319250Sdim    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1182319250Sdim                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1183319250Sdim    if (Op.isImm())
1184319250Sdim      Copy.addImm(Op.getImm());
1185319250Sdim    else if (Op.isReg())
1186319250Sdim      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
1187319250Sdim                  Op.getSubReg());
1188319250Sdim    Op.ChangeToRegister(VGPR, false);
1189319250Sdim  }
1190319250Sdim}
1191319250Sdim
1192317017Sdimbool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
1193341825Sdim  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1194317017Sdim
1195327952Sdim  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
1196317017Sdim    return false;
1197317017Sdim
1198317017Sdim  MRI = &MF.getRegInfo();
1199317017Sdim  TRI = ST.getRegisterInfo();
1200317017Sdim  TII = ST.getInstrInfo();
1201320397Sdim
1202318681Sdim  // Find all SDWA operands in MF.
1203327952Sdim  bool Ret = false;
1204341825Sdim  for (MachineBasicBlock &MBB : MF) {
1205341825Sdim    bool Changed = false;
1206341825Sdim    do {
1207344779Sdim      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
1208344779Sdim      // Look for a possible ADD or SUB that resulted from a previously lowered
1209344779Sdim      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
1210344779Sdim      // lowers the pair of instructions into e32 form.
1211341825Sdim      matchSDWAOperands(MBB);
1212344779Sdim      for (const auto &OperandPair : SDWAOperands) {
1213344779Sdim        const auto &Operand = OperandPair.second;
1214344779Sdim        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1215344779Sdim        if (PotentialMI &&
1216344779Sdim           (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
1217344779Sdim            PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
1218344779Sdim          pseudoOpConvertToVOP2(*PotentialMI, ST);
1219344779Sdim      }
1220344779Sdim      SDWAOperands.clear();
1221317017Sdim
1222344779Sdim      // Generate potential match list.
1223344779Sdim      matchSDWAOperands(MBB);
1224344779Sdim
1225341825Sdim      for (const auto &OperandPair : SDWAOperands) {
1226341825Sdim        const auto &Operand = OperandPair.second;
1227341825Sdim        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1228341825Sdim        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
1229341825Sdim          PotentialMatches[PotentialMI].push_back(Operand.get());
1230341825Sdim        }
1231327952Sdim      }
1232317017Sdim
1233341825Sdim      for (auto &PotentialPair : PotentialMatches) {
1234341825Sdim        MachineInstr &PotentialMI = *PotentialPair.first;
1235341825Sdim        convertToSDWA(PotentialMI, PotentialPair.second);
1236341825Sdim      }
1237317017Sdim
1238341825Sdim      PotentialMatches.clear();
1239341825Sdim      SDWAOperands.clear();
1240319250Sdim
1241341825Sdim      Changed = !ConvertedInstructions.empty();
1242319250Sdim
1243341825Sdim      if (Changed)
1244341825Sdim        Ret = true;
1245341825Sdim      while (!ConvertedInstructions.empty())
1246341825Sdim        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
1247341825Sdim    } while (Changed);
1248341825Sdim  }
1249327952Sdim
1250319799Sdim  return Ret;
1251317017Sdim}
1252