1327952Sdim//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2317017Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6317017Sdim//
7317017Sdim//===----------------------------------------------------------------------===//
8317017Sdim//
9317017Sdim/// \file This pass tries to apply several peephole SDWA patterns.
10317017Sdim///
11317017Sdim/// E.g. original:
12327952Sdim///   V_LSHRREV_B32_e32 %0, 16, %1
13327952Sdim///   V_ADD_I32_e32 %2, %0, %3
14327952Sdim///   V_LSHLREV_B32_e32 %4, 16, %2
15317017Sdim///
16317017Sdim/// Replace:
17327952Sdim///   V_ADD_I32_sdwa %4, %1, %3
18317017Sdim///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
19317017Sdim///
20317017Sdim//===----------------------------------------------------------------------===//
21317017Sdim
22317017Sdim#include "AMDGPU.h"
23317017Sdim#include "AMDGPUSubtarget.h"
24317017Sdim#include "SIDefines.h"
25317017Sdim#include "SIInstrInfo.h"
26327952Sdim#include "SIRegisterInfo.h"
27341825Sdim#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
28327952Sdim#include "Utils/AMDGPUBaseInfo.h"
29360784Sdim#include "llvm/ADT/MapVector.h"
30327952Sdim#include "llvm/ADT/None.h"
31327952Sdim#include "llvm/ADT/Optional.h"
32319799Sdim#include "llvm/ADT/STLExtras.h"
33327952Sdim#include "llvm/ADT/SmallVector.h"
34317017Sdim#include "llvm/ADT/Statistic.h"
35327952Sdim#include "llvm/CodeGen/MachineBasicBlock.h"
36327952Sdim#include "llvm/CodeGen/MachineFunction.h"
37317017Sdim#include "llvm/CodeGen/MachineFunctionPass.h"
38327952Sdim#include "llvm/CodeGen/MachineInstr.h"
39317017Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
40327952Sdim#include "llvm/CodeGen/MachineOperand.h"
41327952Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
42327952Sdim#include "llvm/CodeGen/TargetRegisterInfo.h"
43341825Sdim#include "llvm/Config/llvm-config.h"
44327952Sdim#include "llvm/MC/LaneBitmask.h"
45327952Sdim#include "llvm/MC/MCInstrDesc.h"
46327952Sdim#include "llvm/Pass.h"
47327952Sdim#include "llvm/Support/Debug.h"
48327952Sdim#include "llvm/Support/raw_ostream.h"
49327952Sdim#include <algorithm>
50327952Sdim#include <cassert>
51327952Sdim#include <cstdint>
52327952Sdim#include <memory>
53317017Sdim#include <unordered_map>
54317017Sdim
55317017Sdimusing namespace llvm;
56317017Sdim
57317017Sdim#define DEBUG_TYPE "si-peephole-sdwa"
58317017Sdim
59317017SdimSTATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
60317017SdimSTATISTIC(NumSDWAInstructionsPeepholed,
61317017Sdim          "Number of instruction converted to SDWA.");
62317017Sdim
63317017Sdimnamespace {
64317017Sdim
65317017Sdimclass SDWAOperand;
66327952Sdimclass SDWADstOperand;
67317017Sdim
68317017Sdimclass SIPeepholeSDWA : public MachineFunctionPass {
69318681Sdimpublic:
70327952Sdim  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
71318681Sdim
72317017Sdimprivate:
73317017Sdim  MachineRegisterInfo *MRI;
74317017Sdim  const SIRegisterInfo *TRI;
75317017Sdim  const SIInstrInfo *TII;
76317017Sdim
77360784Sdim  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
78360784Sdim  MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
79319250Sdim  SmallVector<MachineInstr *, 8> ConvertedInstructions;
80317017Sdim
81317017Sdim  Optional<int64_t> foldToImm(const MachineOperand &Op) const;
82317017Sdim
83317017Sdimpublic:
84317017Sdim  static char ID;
85317017Sdim
86317017Sdim  SIPeepholeSDWA() : MachineFunctionPass(ID) {
87317017Sdim    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
88317017Sdim  }
89317017Sdim
90317017Sdim  bool runOnMachineFunction(MachineFunction &MF) override;
91341825Sdim  void matchSDWAOperands(MachineBasicBlock &MBB);
92327952Sdim  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
93344779Sdim  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
94344779Sdim  void pseudoOpConvertToVOP2(MachineInstr &MI,
95344779Sdim                             const GCNSubtarget &ST) const;
96317017Sdim  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
97341825Sdim  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
98317017Sdim
99317017Sdim  StringRef getPassName() const override { return "SI Peephole SDWA"; }
100317017Sdim
101317017Sdim  void getAnalysisUsage(AnalysisUsage &AU) const override {
102317017Sdim    AU.setPreservesCFG();
103317017Sdim    MachineFunctionPass::getAnalysisUsage(AU);
104317017Sdim  }
105317017Sdim};
106317017Sdim
107317017Sdimclass SDWAOperand {
108317017Sdimprivate:
109317017Sdim  MachineOperand *Target; // Operand that would be used in converted instruction
110317017Sdim  MachineOperand *Replaced; // Operand that would be replace by Target
111317017Sdim
112317017Sdimpublic:
113317017Sdim  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
114317017Sdim      : Target(TargetOp), Replaced(ReplacedOp) {
115317017Sdim    assert(Target->isReg());
116317017Sdim    assert(Replaced->isReg());
117317017Sdim  }
118317017Sdim
119327952Sdim  virtual ~SDWAOperand() = default;
120317017Sdim
121317017Sdim  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
122317017Sdim  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
123317017Sdim
124317017Sdim  MachineOperand *getTargetOperand() const { return Target; }
125317017Sdim  MachineOperand *getReplacedOperand() const { return Replaced; }
126317017Sdim  MachineInstr *getParentInst() const { return Target->getParent(); }
127327952Sdim
128317017Sdim  MachineRegisterInfo *getMRI() const {
129317017Sdim    return &getParentInst()->getParent()->getParent()->getRegInfo();
130317017Sdim  }
131327952Sdim
132327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
133327952Sdim  virtual void print(raw_ostream& OS) const = 0;
134327952Sdim  void dump() const { print(dbgs()); }
135327952Sdim#endif
136317017Sdim};
137317017Sdim
138317017Sdimusing namespace AMDGPU::SDWA;
139317017Sdim
140317017Sdimclass SDWASrcOperand : public SDWAOperand {
141317017Sdimprivate:
142317017Sdim  SdwaSel SrcSel;
143317017Sdim  bool Abs;
144317017Sdim  bool Neg;
145317017Sdim  bool Sext;
146317017Sdim
147317017Sdimpublic:
148317017Sdim  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
149317017Sdim                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
150317017Sdim                 bool Sext_ = false)
151327952Sdim      : SDWAOperand(TargetOp, ReplacedOp),
152327952Sdim        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
153317017Sdim
154327952Sdim  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
155327952Sdim  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
156317017Sdim
157317017Sdim  SdwaSel getSrcSel() const { return SrcSel; }
158317017Sdim  bool getAbs() const { return Abs; }
159317017Sdim  bool getNeg() const { return Neg; }
160317017Sdim  bool getSext() const { return Sext; }
161317017Sdim
162319799Sdim  uint64_t getSrcMods(const SIInstrInfo *TII,
163319799Sdim                      const MachineOperand *SrcOp) const;
164327952Sdim
165327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
166327952Sdim  void print(raw_ostream& OS) const override;
167327952Sdim#endif
168317017Sdim};
169317017Sdim
170317017Sdimclass SDWADstOperand : public SDWAOperand {
171317017Sdimprivate:
172317017Sdim  SdwaSel DstSel;
173317017Sdim  DstUnused DstUn;
174317017Sdim
175317017Sdimpublic:
176327952Sdim
177317017Sdim  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
178317017Sdim                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
179327952Sdim    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
180317017Sdim
181327952Sdim  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
182327952Sdim  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
183317017Sdim
184317017Sdim  SdwaSel getDstSel() const { return DstSel; }
185317017Sdim  DstUnused getDstUnused() const { return DstUn; }
186327952Sdim
187327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
188327952Sdim  void print(raw_ostream& OS) const override;
189327952Sdim#endif
190317017Sdim};
191317017Sdim
192327952Sdimclass SDWADstPreserveOperand : public SDWADstOperand {
193327952Sdimprivate:
194327952Sdim  MachineOperand *Preserve;
195317017Sdim
196327952Sdimpublic:
197327952Sdim  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
198327952Sdim                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
199327952Sdim      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
200327952Sdim        Preserve(PreserveOp) {}
201327952Sdim
202327952Sdim  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
203327952Sdim
204327952Sdim  MachineOperand *getPreservedOperand() const { return Preserve; }
205327952Sdim
206327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
207327952Sdim  void print(raw_ostream& OS) const override;
208327952Sdim#endif
209327952Sdim};
210327952Sdim
211327952Sdim} // end anonymous namespace
212327952Sdim
213317017SdimINITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
214317017Sdim
215317017Sdimchar SIPeepholeSDWA::ID = 0;
216317017Sdim
217317017Sdimchar &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
218317017Sdim
219317017SdimFunctionPass *llvm::createSIPeepholeSDWAPass() {
220317017Sdim  return new SIPeepholeSDWA();
221317017Sdim}
222317017Sdim
223317017Sdim
224327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
225341825Sdimstatic raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
226317017Sdim  switch(Sel) {
227317017Sdim  case BYTE_0: OS << "BYTE_0"; break;
228317017Sdim  case BYTE_1: OS << "BYTE_1"; break;
229317017Sdim  case BYTE_2: OS << "BYTE_2"; break;
230317017Sdim  case BYTE_3: OS << "BYTE_3"; break;
231317017Sdim  case WORD_0: OS << "WORD_0"; break;
232317017Sdim  case WORD_1: OS << "WORD_1"; break;
233317017Sdim  case DWORD:  OS << "DWORD"; break;
234317017Sdim  }
235317017Sdim  return OS;
236317017Sdim}
237317017Sdim
238317017Sdimstatic raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
239317017Sdim  switch(Un) {
240317017Sdim  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
241317017Sdim  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
242317017Sdim  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
243317017Sdim  }
244317017Sdim  return OS;
245317017Sdim}
246317017Sdim
247327952SdimLLVM_DUMP_METHOD
248327952Sdimvoid SDWASrcOperand::print(raw_ostream& OS) const {
249327952Sdim  OS << "SDWA src: " << *getTargetOperand()
250327952Sdim    << " src_sel:" << getSrcSel()
251327952Sdim    << " abs:" << getAbs() << " neg:" << getNeg()
252327952Sdim    << " sext:" << getSext() << '\n';
253317017Sdim}
254317017Sdim
255327952SdimLLVM_DUMP_METHOD
256327952Sdimvoid SDWADstOperand::print(raw_ostream& OS) const {
257327952Sdim  OS << "SDWA dst: " << *getTargetOperand()
258327952Sdim    << " dst_sel:" << getDstSel()
259327952Sdim    << " dst_unused:" << getDstUnused() << '\n';
260327952Sdim}
261327952Sdim
262327952SdimLLVM_DUMP_METHOD
263327952Sdimvoid SDWADstPreserveOperand::print(raw_ostream& OS) const {
264327952Sdim  OS << "SDWA preserve dst: " << *getTargetOperand()
265327952Sdim    << " dst_sel:" << getDstSel()
266327952Sdim    << " preserve:" << *getPreservedOperand() << '\n';
267327952Sdim}
268327952Sdim
269317017Sdim#endif
270317017Sdim
271317017Sdimstatic void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
272317017Sdim  assert(To.isReg() && From.isReg());
273317017Sdim  To.setReg(From.getReg());
274317017Sdim  To.setSubReg(From.getSubReg());
275317017Sdim  To.setIsUndef(From.isUndef());
276317017Sdim  if (To.isUse()) {
277317017Sdim    To.setIsKill(From.isKill());
278317017Sdim  } else {
279317017Sdim    To.setIsDead(From.isDead());
280317017Sdim  }
281317017Sdim}
282317017Sdim
283317017Sdimstatic bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
284317017Sdim  return LHS.isReg() &&
285317017Sdim         RHS.isReg() &&
286317017Sdim         LHS.getReg() == RHS.getReg() &&
287317017Sdim         LHS.getSubReg() == RHS.getSubReg();
288317017Sdim}
289317017Sdim
290327952Sdimstatic MachineOperand *findSingleRegUse(const MachineOperand *Reg,
291327952Sdim                                        const MachineRegisterInfo *MRI) {
292327952Sdim  if (!Reg->isReg() || !Reg->isDef())
293327952Sdim    return nullptr;
294320397Sdim
295327952Sdim  MachineOperand *ResMO = nullptr;
296327952Sdim  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
297327952Sdim    // If there exist use of subreg of Reg then return nullptr
298327952Sdim    if (!isSameReg(UseMO, *Reg))
299327952Sdim      return nullptr;
300317017Sdim
301327952Sdim    // Check that there is only one instruction that uses Reg
302327952Sdim    if (!ResMO) {
303327952Sdim      ResMO = &UseMO;
304327952Sdim    } else if (ResMO->getParent() != UseMO.getParent()) {
305327952Sdim      return nullptr;
306327952Sdim    }
307327952Sdim  }
308317017Sdim
309327952Sdim  return ResMO;
310327952Sdim}
311317017Sdim
312327952Sdimstatic MachineOperand *findSingleRegDef(const MachineOperand *Reg,
313327952Sdim                                        const MachineRegisterInfo *MRI) {
314327952Sdim  if (!Reg->isReg())
315327952Sdim    return nullptr;
316327952Sdim
317327952Sdim  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
318327952Sdim  if (!DefInstr)
319327952Sdim    return nullptr;
320327952Sdim
321327952Sdim  for (auto &DefMO : DefInstr->defs()) {
322327952Sdim    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
323327952Sdim      return &DefMO;
324327952Sdim  }
325327952Sdim
326327952Sdim  // Ignore implicit defs.
327327952Sdim  return nullptr;
328317017Sdim}
329317017Sdim
330319799Sdimuint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
331319799Sdim                                    const MachineOperand *SrcOp) const {
332317017Sdim  uint64_t Mods = 0;
333319799Sdim  const auto *MI = SrcOp->getParent();
334319799Sdim  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
335319799Sdim    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
336319799Sdim      Mods = Mod->getImm();
337319799Sdim    }
338319799Sdim  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
339319799Sdim    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
340319799Sdim      Mods = Mod->getImm();
341319799Sdim    }
342319799Sdim  }
343317017Sdim  if (Abs || Neg) {
344317017Sdim    assert(!Sext &&
345317017Sdim           "Float and integer src modifiers can't be set simulteniously");
346353358Sdim    Mods |= Abs ? SISrcMods::ABS : 0u;
347353358Sdim    Mods ^= Neg ? SISrcMods::NEG : 0u;
348317017Sdim  } else if (Sext) {
349317017Sdim    Mods |= SISrcMods::SEXT;
350317017Sdim  }
351317017Sdim
352317017Sdim  return Mods;
353317017Sdim}
354317017Sdim
355317017SdimMachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
356317017Sdim  // For SDWA src operand potential instruction is one that use register
357317017Sdim  // defined by parent instruction
358327952Sdim  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
359327952Sdim  if (!PotentialMO)
360327952Sdim    return nullptr;
361317017Sdim
362327952Sdim  return PotentialMO->getParent();
363317017Sdim}
364317017Sdim
365317017Sdimbool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
366317017Sdim  // Find operand in instruction that matches source operand and replace it with
367317017Sdim  // target operand. Set corresponding src_sel
368341825Sdim  bool IsPreserveSrc = false;
369317017Sdim  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
370317017Sdim  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
371317017Sdim  MachineOperand *SrcMods =
372317017Sdim      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
373319250Sdim  assert(Src && (Src->isReg() || Src->isImm()));
374317017Sdim  if (!isSameReg(*Src, *getReplacedOperand())) {
375341825Sdim    // If this is not src0 then it could be src1
376317017Sdim    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
377317017Sdim    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
378317017Sdim    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
379317017Sdim
380341825Sdim    if (!Src ||
381341825Sdim        !isSameReg(*Src, *getReplacedOperand())) {
382341825Sdim      // It's possible this Src is a tied operand for
383341825Sdim      // UNUSED_PRESERVE, in which case we can either
384341825Sdim      // abandon the peephole attempt, or if legal we can
385341825Sdim      // copy the target operand into the tied slot
386341825Sdim      // if the preserve operation will effectively cause the same
387341825Sdim      // result by overwriting the rest of the dst.
388341825Sdim      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
389341825Sdim      MachineOperand *DstUnused =
390341825Sdim        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
391341825Sdim
392341825Sdim      if (Dst &&
393341825Sdim          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
394341825Sdim        // This will work if the tied src is acessing WORD_0, and the dst is
395341825Sdim        // writing WORD_1. Modifiers don't matter because all the bits that
396341825Sdim        // would be impacted are being overwritten by the dst.
397341825Sdim        // Any other case will not work.
398341825Sdim        SdwaSel DstSel = static_cast<SdwaSel>(
399341825Sdim            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
400341825Sdim        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
401341825Sdim            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
402341825Sdim          IsPreserveSrc = true;
403341825Sdim          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
404341825Sdim                                                   AMDGPU::OpName::vdst);
405341825Sdim          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
406341825Sdim          Src = &MI.getOperand(TiedIdx);
407341825Sdim          SrcSel = nullptr;
408341825Sdim          SrcMods = nullptr;
409341825Sdim        } else {
410341825Sdim          // Not legal to convert this src
411341825Sdim          return false;
412341825Sdim        }
413341825Sdim      }
414341825Sdim    }
415317017Sdim    assert(Src && Src->isReg());
416317017Sdim
417353358Sdim    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
418353358Sdim         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
419353358Sdim         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
420317017Sdim         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
421327952Sdim         !isSameReg(*Src, *getReplacedOperand())) {
422317017Sdim      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
423317017Sdim      // src2. This is not allowed.
424317017Sdim      return false;
425317017Sdim    }
426317017Sdim
427341825Sdim    assert(isSameReg(*Src, *getReplacedOperand()) &&
428341825Sdim           (IsPreserveSrc || (SrcSel && SrcMods)));
429317017Sdim  }
430317017Sdim  copyRegOperand(*Src, *getTargetOperand());
431341825Sdim  if (!IsPreserveSrc) {
432341825Sdim    SrcSel->setImm(getSrcSel());
433341825Sdim    SrcMods->setImm(getSrcMods(TII, Src));
434341825Sdim  }
435317017Sdim  getTargetOperand()->setIsKill(false);
436317017Sdim  return true;
437317017Sdim}
438317017Sdim
439317017SdimMachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
440317017Sdim  // For SDWA dst operand potential instruction is one that defines register
441317017Sdim  // that this operand uses
442317017Sdim  MachineRegisterInfo *MRI = getMRI();
443317017Sdim  MachineInstr *ParentMI = getParentInst();
444317017Sdim
445327952Sdim  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
446327952Sdim  if (!PotentialMO)
447327952Sdim    return nullptr;
448317017Sdim
449327952Sdim  // Check that ParentMI is the only instruction that uses replaced register
450327952Sdim  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
451327952Sdim    if (&UseInst != ParentMI)
452317017Sdim      return nullptr;
453317017Sdim  }
454317017Sdim
455327952Sdim  return PotentialMO->getParent();
456317017Sdim}
457317017Sdim
458317017Sdimbool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
459317017Sdim  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
460317017Sdim
461353358Sdim  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
462353358Sdim       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
463353358Sdim       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
464317017Sdim       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
465317017Sdim      getDstSel() != AMDGPU::SDWA::DWORD) {
466317017Sdim    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
467317017Sdim    return false;
468317017Sdim  }
469317017Sdim
470317017Sdim  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
471317017Sdim  assert(Operand &&
472317017Sdim         Operand->isReg() &&
473317017Sdim         isSameReg(*Operand, *getReplacedOperand()));
474317017Sdim  copyRegOperand(*Operand, *getTargetOperand());
475317017Sdim  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
476317017Sdim  assert(DstSel);
477317017Sdim  DstSel->setImm(getDstSel());
478317017Sdim  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
479317017Sdim  assert(DstUnused);
480317017Sdim  DstUnused->setImm(getDstUnused());
481317017Sdim
482317017Sdim  // Remove original instruction  because it would conflict with our new
483317017Sdim  // instruction by register definition
484317017Sdim  getParentInst()->eraseFromParent();
485317017Sdim  return true;
486317017Sdim}
487317017Sdim
488327952Sdimbool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
489327952Sdim                                           const SIInstrInfo *TII) {
490327952Sdim  // MI should be moved right before v_or_b32.
491327952Sdim  // For this we should clear all kill flags on uses of MI src-operands or else
492327952Sdim  // we can encounter problem with use of killed operand.
493327952Sdim  for (MachineOperand &MO : MI.uses()) {
494327952Sdim    if (!MO.isReg())
495327952Sdim      continue;
496327952Sdim    getMRI()->clearKillFlags(MO.getReg());
497327952Sdim  }
498327952Sdim
499327952Sdim  // Move MI before v_or_b32
500327952Sdim  auto MBB = MI.getParent();
501327952Sdim  MBB->remove(&MI);
502327952Sdim  MBB->insert(getParentInst(), &MI);
503327952Sdim
504327952Sdim  // Add Implicit use of preserved register
505327952Sdim  MachineInstrBuilder MIB(*MBB->getParent(), MI);
506327952Sdim  MIB.addReg(getPreservedOperand()->getReg(),
507327952Sdim             RegState::ImplicitKill,
508327952Sdim             getPreservedOperand()->getSubReg());
509327952Sdim
510327952Sdim  // Tie dst to implicit use
511327952Sdim  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
512327952Sdim                 MI.getNumOperands() - 1);
513327952Sdim
514327952Sdim  // Convert MI as any other SDWADstOperand and remove v_or_b32
515327952Sdim  return SDWADstOperand::convertToSDWA(MI, TII);
516327952Sdim}
517327952Sdim
518317017SdimOptional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
519317017Sdim  if (Op.isImm()) {
520317017Sdim    return Op.getImm();
521317017Sdim  }
522317017Sdim
523317017Sdim  // If this is not immediate then it can be copy of immediate value, e.g.:
524327952Sdim  // %1 = S_MOV_B32 255;
525317017Sdim  if (Op.isReg()) {
526317017Sdim    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
527317017Sdim      if (!isSameReg(Op, Def))
528317017Sdim        continue;
529317017Sdim
530317017Sdim      const MachineInstr *DefInst = Def.getParent();
531317017Sdim      if (!TII->isFoldableCopy(*DefInst))
532317017Sdim        return None;
533317017Sdim
534317017Sdim      const MachineOperand &Copied = DefInst->getOperand(1);
535317017Sdim      if (!Copied.isImm())
536317017Sdim        return None;
537317017Sdim
538317017Sdim      return Copied.getImm();
539317017Sdim    }
540317017Sdim  }
541317017Sdim
542317017Sdim  return None;
543317017Sdim}
544317017Sdim
545327952Sdimstd::unique_ptr<SDWAOperand>
546327952SdimSIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
547327952Sdim  unsigned Opcode = MI.getOpcode();
548327952Sdim  switch (Opcode) {
549327952Sdim  case AMDGPU::V_LSHRREV_B32_e32:
550327952Sdim  case AMDGPU::V_ASHRREV_I32_e32:
551327952Sdim  case AMDGPU::V_LSHLREV_B32_e32:
552327952Sdim  case AMDGPU::V_LSHRREV_B32_e64:
553327952Sdim  case AMDGPU::V_ASHRREV_I32_e64:
554327952Sdim  case AMDGPU::V_LSHLREV_B32_e64: {
555327952Sdim    // from: v_lshrrev_b32_e32 v1, 16/24, v0
556327952Sdim    // to SDWA src:v0 src_sel:WORD_1/BYTE_3
557317017Sdim
558327952Sdim    // from: v_ashrrev_i32_e32 v1, 16/24, v0
559327952Sdim    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
560317017Sdim
561327952Sdim    // from: v_lshlrev_b32_e32 v1, 16/24, v0
562327952Sdim    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
563327952Sdim    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
564327952Sdim    auto Imm = foldToImm(*Src0);
565327952Sdim    if (!Imm)
566327952Sdim      break;
567317017Sdim
568327952Sdim    if (*Imm != 16 && *Imm != 24)
569327952Sdim      break;
570317017Sdim
571327952Sdim    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
572327952Sdim    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
573360784Sdim    if (Register::isPhysicalRegister(Src1->getReg()) ||
574360784Sdim        Register::isPhysicalRegister(Dst->getReg()))
575327952Sdim      break;
576317017Sdim
577327952Sdim    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
578327952Sdim        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
579360784Sdim      return std::make_unique<SDWADstOperand>(
580327952Sdim          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
581327952Sdim    } else {
582360784Sdim      return std::make_unique<SDWASrcOperand>(
583327952Sdim          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
584327952Sdim          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
585327952Sdim          Opcode != AMDGPU::V_LSHRREV_B32_e64);
586327952Sdim    }
587327952Sdim    break;
588327952Sdim  }
589317017Sdim
590327952Sdim  case AMDGPU::V_LSHRREV_B16_e32:
591327952Sdim  case AMDGPU::V_ASHRREV_I16_e32:
592327952Sdim  case AMDGPU::V_LSHLREV_B16_e32:
593327952Sdim  case AMDGPU::V_LSHRREV_B16_e64:
594327952Sdim  case AMDGPU::V_ASHRREV_I16_e64:
595327952Sdim  case AMDGPU::V_LSHLREV_B16_e64: {
596327952Sdim    // from: v_lshrrev_b16_e32 v1, 8, v0
597327952Sdim    // to SDWA src:v0 src_sel:BYTE_1
598317017Sdim
599327952Sdim    // from: v_ashrrev_i16_e32 v1, 8, v0
600327952Sdim    // to SDWA src:v0 src_sel:BYTE_1 sext:1
601317017Sdim
602327952Sdim    // from: v_lshlrev_b16_e32 v1, 8, v0
603327952Sdim    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
604327952Sdim    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
605327952Sdim    auto Imm = foldToImm(*Src0);
606327952Sdim    if (!Imm || *Imm != 8)
607327952Sdim      break;
608317017Sdim
609327952Sdim    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
610327952Sdim    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
611317017Sdim
612360784Sdim    if (Register::isPhysicalRegister(Src1->getReg()) ||
613360784Sdim        Register::isPhysicalRegister(Dst->getReg()))
614327952Sdim      break;
615317017Sdim
616327952Sdim    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
617327952Sdim        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
618360784Sdim      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
619327952Sdim    } else {
620360784Sdim      return std::make_unique<SDWASrcOperand>(
621327952Sdim            Src1, Dst, BYTE_1, false, false,
622327952Sdim            Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
623327952Sdim            Opcode != AMDGPU::V_LSHRREV_B16_e64);
624327952Sdim    }
625327952Sdim    break;
626327952Sdim  }
627317017Sdim
628327952Sdim  case AMDGPU::V_BFE_I32:
629327952Sdim  case AMDGPU::V_BFE_U32: {
630327952Sdim    // e.g.:
631327952Sdim    // from: v_bfe_u32 v1, v0, 8, 8
632327952Sdim    // to SDWA src:v0 src_sel:BYTE_1
633317017Sdim
634327952Sdim    // offset | width | src_sel
635327952Sdim    // ------------------------
636327952Sdim    // 0      | 8     | BYTE_0
637327952Sdim    // 0      | 16    | WORD_0
638327952Sdim    // 0      | 32    | DWORD ?
639327952Sdim    // 8      | 8     | BYTE_1
640327952Sdim    // 16     | 8     | BYTE_2
641327952Sdim    // 16     | 16    | WORD_1
642327952Sdim    // 24     | 8     | BYTE_3
643317017Sdim
644327952Sdim    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
645327952Sdim    auto Offset = foldToImm(*Src1);
646327952Sdim    if (!Offset)
647327952Sdim      break;
648317017Sdim
649327952Sdim    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
650327952Sdim    auto Width = foldToImm(*Src2);
651327952Sdim    if (!Width)
652327952Sdim      break;
653317017Sdim
654327952Sdim    SdwaSel SrcSel = DWORD;
655317017Sdim
656327952Sdim    if (*Offset == 0 && *Width == 8)
657327952Sdim      SrcSel = BYTE_0;
658327952Sdim    else if (*Offset == 0 && *Width == 16)
659327952Sdim      SrcSel = WORD_0;
660327952Sdim    else if (*Offset == 0 && *Width == 32)
661327952Sdim      SrcSel = DWORD;
662327952Sdim    else if (*Offset == 8 && *Width == 8)
663327952Sdim      SrcSel = BYTE_1;
664327952Sdim    else if (*Offset == 16 && *Width == 8)
665327952Sdim      SrcSel = BYTE_2;
666327952Sdim    else if (*Offset == 16 && *Width == 16)
667327952Sdim      SrcSel = WORD_1;
668327952Sdim    else if (*Offset == 24 && *Width == 8)
669327952Sdim      SrcSel = BYTE_3;
670327952Sdim    else
671327952Sdim      break;
672317017Sdim
673327952Sdim    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
674327952Sdim    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
675320397Sdim
676360784Sdim    if (Register::isPhysicalRegister(Src0->getReg()) ||
677360784Sdim        Register::isPhysicalRegister(Dst->getReg()))
678327952Sdim      break;
679317017Sdim
680360784Sdim    return std::make_unique<SDWASrcOperand>(
681327952Sdim          Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
682327952Sdim  }
683327952Sdim
684327952Sdim  case AMDGPU::V_AND_B32_e32:
685327952Sdim  case AMDGPU::V_AND_B32_e64: {
686327952Sdim    // e.g.:
687327952Sdim    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
688327952Sdim    // to SDWA src:v0 src_sel:WORD_0/BYTE_0
689327952Sdim
690327952Sdim    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
691327952Sdim    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
692327952Sdim    auto ValSrc = Src1;
693327952Sdim    auto Imm = foldToImm(*Src0);
694327952Sdim
695327952Sdim    if (!Imm) {
696327952Sdim      Imm = foldToImm(*Src1);
697327952Sdim      ValSrc = Src0;
698327952Sdim    }
699327952Sdim
700327952Sdim    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
701327952Sdim      break;
702327952Sdim
703327952Sdim    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
704327952Sdim
705360784Sdim    if (Register::isPhysicalRegister(ValSrc->getReg()) ||
706360784Sdim        Register::isPhysicalRegister(Dst->getReg()))
707327952Sdim      break;
708327952Sdim
709360784Sdim    return std::make_unique<SDWASrcOperand>(
710327952Sdim        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
711327952Sdim  }
712327952Sdim
713327952Sdim  case AMDGPU::V_OR_B32_e32:
714327952Sdim  case AMDGPU::V_OR_B32_e64: {
715327952Sdim    // Patterns for dst_unused:UNUSED_PRESERVE.
716327952Sdim    // e.g., from:
717327952Sdim    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
718327952Sdim    //                           src1_sel:WORD_1 src2_sel:WORD1
719327952Sdim    // v_add_f16_e32 v3, v1, v2
720327952Sdim    // v_or_b32_e32 v4, v0, v3
721327952Sdim    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
722327952Sdim
723327952Sdim    // Check if one of operands of v_or_b32 is SDWA instruction
724327952Sdim    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
725327952Sdim    auto CheckOROperandsForSDWA =
726327952Sdim      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
727327952Sdim        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
728327952Sdim          return CheckRetType(None);
729327952Sdim
730327952Sdim        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
731327952Sdim        if (!Op1Def)
732327952Sdim          return CheckRetType(None);
733327952Sdim
734327952Sdim        MachineInstr *Op1Inst = Op1Def->getParent();
735327952Sdim        if (!TII->isSDWA(*Op1Inst))
736327952Sdim          return CheckRetType(None);
737327952Sdim
738327952Sdim        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
739327952Sdim        if (!Op2Def)
740327952Sdim          return CheckRetType(None);
741327952Sdim
742327952Sdim        return CheckRetType(std::make_pair(Op1Def, Op2Def));
743327952Sdim      };
744327952Sdim
745327952Sdim    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
746327952Sdim    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
747327952Sdim    assert(OrSDWA && OrOther);
748327952Sdim    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
749327952Sdim    if (!Res) {
750327952Sdim      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
751327952Sdim      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
752327952Sdim      assert(OrSDWA && OrOther);
753327952Sdim      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
754327952Sdim      if (!Res)
755317017Sdim        break;
756327952Sdim    }
757317017Sdim
758327952Sdim    MachineOperand *OrSDWADef = Res->first;
759327952Sdim    MachineOperand *OrOtherDef = Res->second;
760327952Sdim    assert(OrSDWADef && OrOtherDef);
761317017Sdim
762327952Sdim    MachineInstr *SDWAInst = OrSDWADef->getParent();
763327952Sdim    MachineInstr *OtherInst = OrOtherDef->getParent();
764319799Sdim
765327952Sdim    // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
766327952Sdim    // destination patterns don't overlap. Compatible instruction can be either
767327952Sdim    // regular instruction with compatible bitness or SDWA instruction with
768327952Sdim    // correct dst_sel
769327952Sdim    // SDWAInst | OtherInst bitness / OtherInst dst_sel
770327952Sdim    // -----------------------------------------------------
771327952Sdim    // DWORD    | no                    / no
772327952Sdim    // WORD_0   | no                    / BYTE_2/3, WORD_1
773327952Sdim    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
774327952Sdim    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
775327952Sdim    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
776327952Sdim    // BYTE_2   | 8/16-bit              / BYTE_0/1/3. WORD_0
777327952Sdim    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
778327952Sdim    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
779327952Sdim    // but v_add_f32 is not.
780317017Sdim
781327952Sdim    // TODO: add support for non-SDWA instructions as OtherInst.
782327952Sdim    // For now this only works with SDWA instructions. For regular instructions
783341825Sdim    // there is no way to determine if the instruction writes only 8/16/24-bit
784341825Sdim    // out of full register size and all registers are at min 32-bit wide.
785327952Sdim    if (!TII->isSDWA(*OtherInst))
786327952Sdim      break;
787320397Sdim
788327952Sdim    SdwaSel DstSel = static_cast<SdwaSel>(
789327952Sdim      TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));;
790327952Sdim    SdwaSel OtherDstSel = static_cast<SdwaSel>(
791327952Sdim      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
792317017Sdim
793327952Sdim    bool DstSelAgree = false;
794327952Sdim    switch (DstSel) {
795327952Sdim    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
796327952Sdim                                (OtherDstSel == BYTE_3) ||
797327952Sdim                                (OtherDstSel == WORD_1));
798327952Sdim      break;
799327952Sdim    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
800327952Sdim                                (OtherDstSel == BYTE_1) ||
801327952Sdim                                (OtherDstSel == WORD_0));
802327952Sdim      break;
803327952Sdim    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
804327952Sdim                                (OtherDstSel == BYTE_2) ||
805327952Sdim                                (OtherDstSel == BYTE_3) ||
806327952Sdim                                (OtherDstSel == WORD_1));
807327952Sdim      break;
808327952Sdim    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
809327952Sdim                                (OtherDstSel == BYTE_2) ||
810327952Sdim                                (OtherDstSel == BYTE_3) ||
811327952Sdim                                (OtherDstSel == WORD_1));
812327952Sdim      break;
813327952Sdim    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
814327952Sdim                                (OtherDstSel == BYTE_1) ||
815327952Sdim                                (OtherDstSel == BYTE_3) ||
816327952Sdim                                (OtherDstSel == WORD_0));
817327952Sdim      break;
818327952Sdim    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
819327952Sdim                                (OtherDstSel == BYTE_1) ||
820327952Sdim                                (OtherDstSel == BYTE_2) ||
821327952Sdim                                (OtherDstSel == WORD_0));
822327952Sdim      break;
823327952Sdim    default: DstSelAgree = false;
824327952Sdim    }
825327952Sdim
826327952Sdim    if (!DstSelAgree)
827327952Sdim      break;
828327952Sdim
829327952Sdim    // Also OtherInst dst_unused should be UNUSED_PAD
830327952Sdim    DstUnused OtherDstUnused = static_cast<DstUnused>(
831327952Sdim      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
832327952Sdim    if (OtherDstUnused != DstUnused::UNUSED_PAD)
833327952Sdim      break;
834327952Sdim
835327952Sdim    // Create DstPreserveOperand
836327952Sdim    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
837327952Sdim    assert(OrDst && OrDst->isReg());
838327952Sdim
839360784Sdim    return std::make_unique<SDWADstPreserveOperand>(
840327952Sdim      OrDst, OrSDWADef, OrOtherDef, DstSel);
841327952Sdim
842327952Sdim  }
843327952Sdim  }
844327952Sdim
845327952Sdim  return std::unique_ptr<SDWAOperand>(nullptr);
846327952Sdim}
847327952Sdim
848360784Sdim#if !defined(NDEBUG)
849360784Sdimstatic raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
850360784Sdim  Operand.print(OS);
851360784Sdim  return OS;
852360784Sdim}
853360784Sdim#endif
854360784Sdim
855341825Sdimvoid SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
856341825Sdim  for (MachineInstr &MI : MBB) {
857341825Sdim    if (auto Operand = matchSDWAOperand(MI)) {
858341825Sdim      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
859341825Sdim      SDWAOperands[&MI] = std::move(Operand);
860341825Sdim      ++NumSDWAPatternsFound;
861317017Sdim    }
862317017Sdim  }
863317017Sdim}
864317017Sdim
865344779Sdim// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
866344779Sdim// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
867344779Sdim// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa.
868344779Sdim//
869344779Sdim// We are transforming from a VOP3 into a VOP2 form of the instruction.
870344779Sdim//   %19:vgpr_32 = V_AND_B32_e32 255,
871344779Sdim//       killed %16:vgpr_32, implicit $exec
872344779Sdim//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
873344779Sdim//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
874344779Sdim//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
875344779Sdim//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
876344779Sdim//
877344779Sdim// becomes
878344779Sdim//   %47:vgpr_32 = V_ADD_I32_sdwa
879344779Sdim//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
880344779Sdim//       implicit-def $vcc, implicit $exec
881344779Sdim//  %48:vgpr_32 = V_ADDC_U32_e32
882344779Sdim//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
883344779Sdimvoid SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
884344779Sdim                                           const GCNSubtarget &ST) const {
885344779Sdim  int Opc = MI.getOpcode();
886344779Sdim  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
887344779Sdim         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
888344779Sdim
889344779Sdim  // Can the candidate MI be shrunk?
890344779Sdim  if (!TII->canShrink(MI, *MRI))
891344779Sdim    return;
892344779Sdim  Opc = AMDGPU::getVOPe32(Opc);
893344779Sdim  // Find the related ADD instruction.
894344779Sdim  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
895344779Sdim  if (!Sdst)
896344779Sdim    return;
897344779Sdim  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
898344779Sdim  if (!NextOp)
899344779Sdim    return;
900344779Sdim  MachineInstr &MISucc = *NextOp->getParent();
901344779Sdim  // Can the successor be shrunk?
902344779Sdim  if (!TII->canShrink(MISucc, *MRI))
903344779Sdim    return;
904344779Sdim  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
905344779Sdim  // Make sure the carry in/out are subsequently unused.
906344779Sdim  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
907344779Sdim  if (!CarryIn)
908344779Sdim    return;
909344779Sdim  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
910344779Sdim  if (!CarryOut)
911344779Sdim    return;
912344779Sdim  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
913344779Sdim    return;
914344779Sdim  // Make sure VCC or its subregs are dead before MI.
915344779Sdim  MachineBasicBlock &MBB = *MI.getParent();
916344779Sdim  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
917344779Sdim  if (Liveness != MachineBasicBlock::LQR_Dead)
918344779Sdim    return;
919344779Sdim  // Check if VCC is referenced in range of (MI,MISucc].
920344779Sdim  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
921344779Sdim       I != E; ++I) {
922344779Sdim    if (I->modifiesRegister(AMDGPU::VCC, TRI))
923344779Sdim      return;
924344779Sdim  }
925344779Sdim  // Make the two new e32 instruction variants.
926344779Sdim  // Replace MI with V_{SUB|ADD}_I32_e32
927344779Sdim  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
928344779Sdim  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
929344779Sdim  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
930344779Sdim  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
931344779Sdim  MI.eraseFromParent();
932344779Sdim  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
933344779Sdim  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
934344779Sdim  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
935344779Sdim  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
936344779Sdim  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
937344779Sdim  MISucc.eraseFromParent();
938344779Sdim}
939344779Sdim
940344779Sdimbool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
941341825Sdim                                         const GCNSubtarget &ST) const {
942327952Sdim  // Check if this is already an SDWA instruction
943327952Sdim  unsigned Opc = MI.getOpcode();
944327952Sdim  if (TII->isSDWA(Opc))
945327952Sdim    return true;
946327952Sdim
947319250Sdim  // Check if this instruction has opcode that supports SDWA
948320397Sdim  if (AMDGPU::getSDWAOp(Opc) == -1)
949320397Sdim    Opc = AMDGPU::getVOPe32(Opc);
950320397Sdim
951327952Sdim  if (AMDGPU::getSDWAOp(Opc) == -1)
952320397Sdim    return false;
953320397Sdim
954320397Sdim  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
955320397Sdim    return false;
956320397Sdim
957320397Sdim  if (TII->isVOPC(Opc)) {
958320397Sdim    if (!ST.hasSDWASdst()) {
959320397Sdim      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
960353358Sdim      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
961353358Sdim                   SDst->getReg() != AMDGPU::VCC_LO))
962320397Sdim        return false;
963320397Sdim    }
964320397Sdim
965320572Sdim    if (!ST.hasSDWAOutModsVOPC() &&
966320572Sdim        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
967320572Sdim         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
968320397Sdim      return false;
969320397Sdim
970320572Sdim  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
971320572Sdim             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
972320397Sdim    return false;
973320397Sdim  }
974320397Sdim
975353358Sdim  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
976353358Sdim                           Opc == AMDGPU::V_FMAC_F32_e32 ||
977353358Sdim                           Opc == AMDGPU::V_MAC_F16_e32 ||
978320397Sdim                           Opc == AMDGPU::V_MAC_F32_e32))
979320397Sdim    return false;
980320397Sdim
981353358Sdim  // Check if target supports this SDWA opcode
982353358Sdim  if (TII->pseudoToMCOpcode(Opc) == -1)
983353358Sdim    return false;
984353358Sdim
985341825Sdim  // FIXME: has SDWA but require handling of implicit VCC use
986341825Sdim  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
987341825Sdim    return false;
988341825Sdim
989320397Sdim  return true;
990318681Sdim}
991318681Sdim
992318681Sdimbool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
993318681Sdim                                   const SDWAOperandsVector &SDWAOperands) {
994341825Sdim
995341825Sdim  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
996341825Sdim
997317017Sdim  // Convert to sdwa
998327952Sdim  int SDWAOpcode;
999327952Sdim  unsigned Opcode = MI.getOpcode();
1000327952Sdim  if (TII->isSDWA(Opcode)) {
1001327952Sdim    SDWAOpcode = Opcode;
1002327952Sdim  } else {
1003327952Sdim    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
1004327952Sdim    if (SDWAOpcode == -1)
1005327952Sdim      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
1006327952Sdim  }
1007317017Sdim  assert(SDWAOpcode != -1);
1008317017Sdim
1009317017Sdim  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
1010317017Sdim
1011317017Sdim  // Create SDWA version of instruction MI and initialize its operands
1012317017Sdim  MachineInstrBuilder SDWAInst =
1013317017Sdim    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
1014317017Sdim
1015320572Sdim  // Copy dst, if it is present in original then should also be present in SDWA
1016320572Sdim  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1017317017Sdim  if (Dst) {
1018317017Sdim    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
1019317017Sdim    SDWAInst.add(*Dst);
1020320572Sdim  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
1021320397Sdim    assert(Dst &&
1022320397Sdim           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
1023320397Sdim    SDWAInst.add(*Dst);
1024320572Sdim  } else {
1025320572Sdim    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
1026353358Sdim    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
1027317017Sdim  }
1028317017Sdim
1029317017Sdim  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
1030317017Sdim  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
1031317017Sdim  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1032317017Sdim  assert(
1033317017Sdim    Src0 &&
1034317017Sdim    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
1035317017Sdim    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
1036319799Sdim  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
1037319799Sdim    SDWAInst.addImm(Mod->getImm());
1038319799Sdim  else
1039319799Sdim    SDWAInst.addImm(0);
1040317017Sdim  SDWAInst.add(*Src0);
1041317017Sdim
1042317017Sdim  // Copy src1 if present, initialize src1_modifiers.
1043317017Sdim  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1044317017Sdim  if (Src1) {
1045317017Sdim    assert(
1046317017Sdim      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
1047317017Sdim      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
1048319799Sdim    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
1049319799Sdim      SDWAInst.addImm(Mod->getImm());
1050319799Sdim    else
1051319799Sdim      SDWAInst.addImm(0);
1052317017Sdim    SDWAInst.add(*Src1);
1053317017Sdim  }
1054317017Sdim
1055353358Sdim  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
1056353358Sdim      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
1057353358Sdim      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
1058317017Sdim      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
1059317017Sdim    // v_mac_f16/32 has additional src2 operand tied to vdst
1060317017Sdim    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1061317017Sdim    assert(Src2);
1062317017Sdim    SDWAInst.add(*Src2);
1063317017Sdim  }
1064317017Sdim
1065320397Sdim  // Copy clamp if present, initialize otherwise
1066317017Sdim  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
1067320397Sdim  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
1068320397Sdim  if (Clamp) {
1069320397Sdim    SDWAInst.add(*Clamp);
1070320397Sdim  } else {
1071320397Sdim    SDWAInst.addImm(0);
1072320397Sdim  }
1073317017Sdim
1074320397Sdim  // Copy omod if present, initialize otherwise if needed
1075320572Sdim  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
1076320572Sdim    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
1077320572Sdim    if (OMod) {
1078320572Sdim      SDWAInst.add(*OMod);
1079320572Sdim    } else {
1080320572Sdim      SDWAInst.addImm(0);
1081320572Sdim    }
1082320397Sdim  }
1083320397Sdim
1084327952Sdim  // Copy dst_sel if present, initialize otherwise if needed
1085320572Sdim  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
1086327952Sdim    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
1087327952Sdim    if (DstSel) {
1088327952Sdim      SDWAInst.add(*DstSel);
1089327952Sdim    } else {
1090327952Sdim      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1091327952Sdim    }
1092320572Sdim  }
1093320572Sdim
1094327952Sdim  // Copy dst_unused if present, initialize otherwise if needed
1095320572Sdim  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
1096327952Sdim    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1097327952Sdim    if (DstUnused) {
1098327952Sdim      SDWAInst.add(*DstUnused);
1099327952Sdim    } else {
1100327952Sdim      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
1101327952Sdim    }
1102317017Sdim  }
1103317017Sdim
1104327952Sdim  // Copy src0_sel if present, initialize otherwise
1105317017Sdim  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
1106327952Sdim  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
1107327952Sdim  if (Src0Sel) {
1108327952Sdim    SDWAInst.add(*Src0Sel);
1109327952Sdim  } else {
1110327952Sdim    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1111327952Sdim  }
1112317017Sdim
1113327952Sdim  // Copy src1_sel if present, initialize otherwise if needed
1114317017Sdim  if (Src1) {
1115317017Sdim    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
1116327952Sdim    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
1117327952Sdim    if (Src1Sel) {
1118327952Sdim      SDWAInst.add(*Src1Sel);
1119327952Sdim    } else {
1120327952Sdim      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1121327952Sdim    }
1122317017Sdim  }
1123317017Sdim
1124341825Sdim  // Check for a preserved register that needs to be copied.
1125341825Sdim  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1126341825Sdim  if (DstUnused &&
1127341825Sdim      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
1128341825Sdim    // We expect, if we are here, that the instruction was already in it's SDWA form,
1129341825Sdim    // with a tied operand.
1130341825Sdim    assert(Dst && Dst->isTied());
1131341825Sdim    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
1132341825Sdim    // We also expect a vdst, since sdst can't preserve.
1133341825Sdim    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
1134341825Sdim    assert(PreserveDstIdx != -1);
1135341825Sdim
1136341825Sdim    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
1137341825Sdim    auto Tied = MI.getOperand(TiedIdx);
1138341825Sdim
1139341825Sdim    SDWAInst.add(Tied);
1140341825Sdim    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
1141341825Sdim  }
1142341825Sdim
1143341825Sdim  // Apply all sdwa operand patterns.
1144317017Sdim  bool Converted = false;
1145317017Sdim  for (auto &Operand : SDWAOperands) {
1146341825Sdim    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
1147318681Sdim    // There should be no intesection between SDWA operands and potential MIs
1148318681Sdim    // e.g.:
1149318681Sdim    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
1150318681Sdim    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
1151318681Sdim    // v_add_u32 v3, v4, v2
1152318681Sdim    //
1153318681Sdim    // In that example it is possible that we would fold 2nd instruction into 3rd
1154318681Sdim    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
1155318681Sdim    // already destroyed). So if SDWAOperand is also a potential MI then do not
1156318681Sdim    // apply it.
1157318681Sdim    if (PotentialMatches.count(Operand->getParentInst()) == 0)
1158318681Sdim      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1159317017Sdim  }
1160319250Sdim  if (Converted) {
1161319250Sdim    ConvertedInstructions.push_back(SDWAInst);
1162319250Sdim  } else {
1163317017Sdim    SDWAInst->eraseFromParent();
1164317017Sdim    return false;
1165317017Sdim  }
1166317017Sdim
1167341825Sdim  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
1168317017Sdim  ++NumSDWAInstructionsPeepholed;
1169317017Sdim
1170317017Sdim  MI.eraseFromParent();
1171317017Sdim  return true;
1172317017Sdim}
1173317017Sdim
1174319250Sdim// If an instruction was converted to SDWA it should not have immediates or SGPR
1175320397Sdim// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
1176341825Sdimvoid SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
1177341825Sdim                                            const GCNSubtarget &ST) const {
1178319250Sdim  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
1179320397Sdim  unsigned ConstantBusCount = 0;
1180327952Sdim  for (MachineOperand &Op : MI.explicit_uses()) {
1181319250Sdim    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
1182319250Sdim      continue;
1183320397Sdim
1184320397Sdim    unsigned I = MI.getOperandNo(&Op);
1185319250Sdim    if (Desc.OpInfo[I].RegClass == -1 ||
1186319250Sdim       !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
1187319250Sdim      continue;
1188320397Sdim
1189320397Sdim    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
1190320397Sdim        TRI->isSGPRReg(*MRI, Op.getReg())) {
1191320397Sdim      ++ConstantBusCount;
1192320397Sdim      continue;
1193320397Sdim    }
1194320397Sdim
1195360784Sdim    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1196319250Sdim    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1197319250Sdim                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1198319250Sdim    if (Op.isImm())
1199319250Sdim      Copy.addImm(Op.getImm());
1200319250Sdim    else if (Op.isReg())
1201319250Sdim      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
1202319250Sdim                  Op.getSubReg());
1203319250Sdim    Op.ChangeToRegister(VGPR, false);
1204319250Sdim  }
1205319250Sdim}
1206319250Sdim
1207317017Sdimbool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
1208341825Sdim  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1209317017Sdim
1210327952Sdim  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
1211317017Sdim    return false;
1212317017Sdim
1213317017Sdim  MRI = &MF.getRegInfo();
1214317017Sdim  TRI = ST.getRegisterInfo();
1215317017Sdim  TII = ST.getInstrInfo();
1216320397Sdim
1217318681Sdim  // Find all SDWA operands in MF.
1218327952Sdim  bool Ret = false;
1219341825Sdim  for (MachineBasicBlock &MBB : MF) {
1220341825Sdim    bool Changed = false;
1221341825Sdim    do {
1222344779Sdim      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
1223344779Sdim      // Look for a possible ADD or SUB that resulted from a previously lowered
1224344779Sdim      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
1225344779Sdim      // lowers the pair of instructions into e32 form.
1226341825Sdim      matchSDWAOperands(MBB);
1227344779Sdim      for (const auto &OperandPair : SDWAOperands) {
1228344779Sdim        const auto &Operand = OperandPair.second;
1229344779Sdim        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1230344779Sdim        if (PotentialMI &&
1231344779Sdim           (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
1232344779Sdim            PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
1233344779Sdim          pseudoOpConvertToVOP2(*PotentialMI, ST);
1234344779Sdim      }
1235344779Sdim      SDWAOperands.clear();
1236317017Sdim
1237344779Sdim      // Generate potential match list.
1238344779Sdim      matchSDWAOperands(MBB);
1239344779Sdim
1240341825Sdim      for (const auto &OperandPair : SDWAOperands) {
1241341825Sdim        const auto &Operand = OperandPair.second;
1242341825Sdim        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1243341825Sdim        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
1244341825Sdim          PotentialMatches[PotentialMI].push_back(Operand.get());
1245341825Sdim        }
1246327952Sdim      }
1247317017Sdim
1248341825Sdim      for (auto &PotentialPair : PotentialMatches) {
1249341825Sdim        MachineInstr &PotentialMI = *PotentialPair.first;
1250341825Sdim        convertToSDWA(PotentialMI, PotentialPair.second);
1251341825Sdim      }
1252317017Sdim
1253341825Sdim      PotentialMatches.clear();
1254341825Sdim      SDWAOperands.clear();
1255319250Sdim
1256341825Sdim      Changed = !ConvertedInstructions.empty();
1257319250Sdim
1258341825Sdim      if (Changed)
1259341825Sdim        Ret = true;
1260341825Sdim      while (!ConvertedInstructions.empty())
1261341825Sdim        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
1262341825Sdim    } while (Changed);
1263341825Sdim  }
1264327952Sdim
1265319799Sdim  return Ret;
1266317017Sdim}
1267