SIPeepholeSDWA.cpp revision 353358
//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_I32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_I32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

22#include "AMDGPU.h"
23#include "AMDGPUSubtarget.h"
24#include "SIDefines.h"
25#include "SIInstrInfo.h"
26#include "SIRegisterInfo.h"
27#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
28#include "Utils/AMDGPUBaseInfo.h"
29#include "llvm/ADT/None.h"
30#include "llvm/ADT/Optional.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SmallVector.h"
33#include "llvm/ADT/Statistic.h"
34#include "llvm/CodeGen/MachineBasicBlock.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineFunctionPass.h"
37#include "llvm/CodeGen/MachineInstr.h"
38#include "llvm/CodeGen/MachineInstrBuilder.h"
39#include "llvm/CodeGen/MachineOperand.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
41#include "llvm/CodeGen/TargetRegisterInfo.h"
42#include "llvm/Config/llvm-config.h"
43#include "llvm/MC/LaneBitmask.h"
44#include "llvm/MC/MCInstrDesc.h"
45#include "llvm/Pass.h"
46#include "llvm/Support/Debug.h"
47#include "llvm/Support/raw_ostream.h"
48#include <algorithm>
49#include <cassert>
50#include <cstdint>
51#include <memory>
52#include <unordered_map>
53
54using namespace llvm;
55
56#define DEBUG_TYPE "si-peephole-sdwa"
57
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that will be used in the converted instruction
  MachineOperand *Replaced; // Operand that will be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
    << " src_sel:" << getSrcSel()
    << " abs:" << getAbs() << " neg:" << getNeg()
    << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

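/// Find the single instruction that uses the value defined by \p Reg. Returns
/// one of its use operands, or nullptr if the value is used through a
/// subregister or by more than one instruction.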
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there is a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

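/// Find the unique explicit definition of the register used by \p Reg.
/// Implicit defs are ignored; returns nullptr if there is no unique def.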
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
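    // NEG is XORed rather than ORed so that folding a negate into an operand
    // that already carries a NEG modifier toggles it off: neg(neg(x)) == x.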
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in MI that matches the replaced operand and replace it
  // with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
         !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced register
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows dst_sel to be DWORD only
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands, or
  // else we can encounter a problem with the use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

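/// If \p Op is an immediate, or a register holding the result of a foldable
/// copy of an immediate (e.g. an S_MOV_B32), return that immediate value.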
522Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
523  if (Op.isImm()) {
524    return Op.getImm();
525  }
526
527  // If this is not immediate then it can be copy of immediate value, e.g.:
528  // %1 = S_MOV_B32 255;
529  if (Op.isReg()) {
530    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
531      if (!isSameReg(Op, Def))
532        continue;
533
534      const MachineInstr *DefInst = Def.getParent();
535      if (!TII->isFoldableCopy(*DefInst))
536        return None;
537
538      const MachineOperand &Copied = DefInst->getOperand(1);
539      if (!Copied.isImm())
540        return None;
541
542      return Copied.getImm();
543    }
544  }
545
546  return None;
547}
548
549std::unique_ptr<SDWAOperand>
550SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
551  unsigned Opcode = MI.getOpcode();
552  switch (Opcode) {
553  case AMDGPU::V_LSHRREV_B32_e32:
554  case AMDGPU::V_ASHRREV_I32_e32:
555  case AMDGPU::V_LSHLREV_B32_e32:
556  case AMDGPU::V_LSHRREV_B32_e64:
557  case AMDGPU::V_ASHRREV_I32_e64:
558  case AMDGPU::V_LSHLREV_B32_e64: {
559    // from: v_lshrrev_b32_e32 v1, 16/24, v0
560    // to SDWA src:v0 src_sel:WORD_1/BYTE_3
561
562    // from: v_ashrrev_i32_e32 v1, 16/24, v0
563    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
564
565    // from: v_lshlrev_b32_e32 v1, 16/24, v0
566    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
567    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
568    auto Imm = foldToImm(*Src0);
569    if (!Imm)
570      break;
571
572    if (*Imm != 16 && *Imm != 24)
573      break;
574
575    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
576    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
577    if (TRI->isPhysicalRegister(Src1->getReg()) ||
578        TRI->isPhysicalRegister(Dst->getReg()))
579      break;
580
581    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
582        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
583      return make_unique<SDWADstOperand>(
584          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
585    } else {
586      return make_unique<SDWASrcOperand>(
587          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
588          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
589          Opcode != AMDGPU::V_LSHRREV_B32_e64);
590    }
591    break;
592  }
593
594  case AMDGPU::V_LSHRREV_B16_e32:
595  case AMDGPU::V_ASHRREV_I16_e32:
596  case AMDGPU::V_LSHLREV_B16_e32:
597  case AMDGPU::V_LSHRREV_B16_e64:
598  case AMDGPU::V_ASHRREV_I16_e64:
599  case AMDGPU::V_LSHLREV_B16_e64: {
600    // from: v_lshrrev_b16_e32 v1, 8, v0
601    // to SDWA src:v0 src_sel:BYTE_1
602
603    // from: v_ashrrev_i16_e32 v1, 8, v0
604    // to SDWA src:v0 src_sel:BYTE_1 sext:1
605
606    // from: v_lshlrev_b16_e32 v1, 8, v0
607    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
608    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
609    auto Imm = foldToImm(*Src0);
610    if (!Imm || *Imm != 8)
611      break;
612
613    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
614    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
615
616    if (TRI->isPhysicalRegister(Src1->getReg()) ||
617        TRI->isPhysicalRegister(Dst->getReg()))
618      break;
619
620    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
621        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
622      return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
623    } else {
624      return make_unique<SDWASrcOperand>(
625            Src1, Dst, BYTE_1, false, false,
626            Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
627            Opcode != AMDGPU::V_LSHRREV_B16_e64);
628    }
629    break;
630  }
631
  case AMDGPU::V_BFE_I32:
  case AMDGPU::V_BFE_U32: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(Src0->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    return make_unique<SDWASrcOperand>(
          Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    return make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(None);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(None);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(None);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(None);

        return CheckRetType(std::make_pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // that their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible bitness
    // or an SDWA instruction with a correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
      TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return make_unique<SDWADstPreserveOperand>(
      OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
// to transform V_ADD_I32_e32 into V_ADD_I32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_I32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32 = V_ADDC_U32_e32
//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
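  // Liveness is only computed within a bounded neighborhood (25 instructions)
  // around MI; if it cannot be determined there, conservatively bail out.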
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check that VCC is not clobbered in the range (MI,MISucc); MISucc itself
  // reads VCC as its carry-in.
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }
  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32
  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
  MI.eraseFromParent();
  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
  MISucc.eraseFromParent();
}

bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                         const GCNSubtarget &ST) const {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of the implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  return true;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst: if it is present in the original instruction, it should also be
  // present in the SDWA form
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
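    // The original instruction has neither vdst nor sdst: a VOPC e32 source
    // implicitly defines VCC, so give the SDWA form an explicit VCC def.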
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All sdwa instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // If we are here, we expect that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is also
    // a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
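    // Only immediates and non-VGPR register operands need to be legalized.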
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
       !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
           (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
            PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate the potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}