1327952Sdim//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// 2317017Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6317017Sdim// 7317017Sdim//===----------------------------------------------------------------------===// 8317017Sdim// 9317017Sdim/// \file This pass tries to apply several peephole SDWA patterns. 10317017Sdim/// 11317017Sdim/// E.g. original: 12327952Sdim/// V_LSHRREV_B32_e32 %0, 16, %1 13327952Sdim/// V_ADD_I32_e32 %2, %0, %3 14327952Sdim/// V_LSHLREV_B32_e32 %4, 16, %2 15317017Sdim/// 16317017Sdim/// Replace: 17327952Sdim/// V_ADD_I32_sdwa %4, %1, %3 18317017Sdim/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 19317017Sdim/// 20317017Sdim//===----------------------------------------------------------------------===// 21317017Sdim 22317017Sdim#include "AMDGPU.h" 23317017Sdim#include "AMDGPUSubtarget.h" 24317017Sdim#include "SIDefines.h" 25317017Sdim#include "SIInstrInfo.h" 26327952Sdim#include "SIRegisterInfo.h" 27341825Sdim#include "MCTargetDesc/AMDGPUMCTargetDesc.h" 28327952Sdim#include "Utils/AMDGPUBaseInfo.h" 29360784Sdim#include "llvm/ADT/MapVector.h" 30327952Sdim#include "llvm/ADT/None.h" 31327952Sdim#include "llvm/ADT/Optional.h" 32319799Sdim#include "llvm/ADT/STLExtras.h" 33327952Sdim#include "llvm/ADT/SmallVector.h" 34317017Sdim#include "llvm/ADT/Statistic.h" 35327952Sdim#include "llvm/CodeGen/MachineBasicBlock.h" 36327952Sdim#include "llvm/CodeGen/MachineFunction.h" 37317017Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 38327952Sdim#include "llvm/CodeGen/MachineInstr.h" 39317017Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 40327952Sdim#include "llvm/CodeGen/MachineOperand.h" 41327952Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 42327952Sdim#include "llvm/CodeGen/TargetRegisterInfo.h" 43341825Sdim#include "llvm/Config/llvm-config.h" 44327952Sdim#include "llvm/MC/LaneBitmask.h" 45327952Sdim#include "llvm/MC/MCInstrDesc.h" 46327952Sdim#include "llvm/Pass.h" 47327952Sdim#include "llvm/Support/Debug.h" 48327952Sdim#include "llvm/Support/raw_ostream.h" 49327952Sdim#include <algorithm> 50327952Sdim#include <cassert> 51327952Sdim#include <cstdint> 52327952Sdim#include <memory> 53317017Sdim#include <unordered_map> 54317017Sdim 55317017Sdimusing namespace llvm; 56317017Sdim 57317017Sdim#define DEBUG_TYPE "si-peephole-sdwa" 58317017Sdim 59317017SdimSTATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found."); 60317017SdimSTATISTIC(NumSDWAInstructionsPeepholed, 61317017Sdim "Number of instruction converted to SDWA."); 62317017Sdim 63317017Sdimnamespace { 64317017Sdim 65317017Sdimclass SDWAOperand; 66327952Sdimclass SDWADstOperand; 67317017Sdim 68317017Sdimclass SIPeepholeSDWA : public MachineFunctionPass { 69318681Sdimpublic: 70327952Sdim using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; 71318681Sdim 72317017Sdimprivate: 73317017Sdim MachineRegisterInfo *MRI; 74317017Sdim const SIRegisterInfo *TRI; 75317017Sdim const SIInstrInfo *TII; 76317017Sdim 77360784Sdim MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; 78360784Sdim MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches; 79319250Sdim SmallVector<MachineInstr *, 8> ConvertedInstructions; 80317017Sdim 81317017Sdim Optional<int64_t> foldToImm(const MachineOperand &Op) const; 82317017Sdim 83317017Sdimpublic: 84317017Sdim static char ID; 85317017Sdim 86317017Sdim SIPeepholeSDWA() : MachineFunctionPass(ID) { 87317017Sdim initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); 88317017Sdim } 89317017Sdim 90317017Sdim bool runOnMachineFunction(MachineFunction &MF) override; 91341825Sdim void matchSDWAOperands(MachineBasicBlock &MBB); 92327952Sdim std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); 93344779Sdim bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; 94344779Sdim void pseudoOpConvertToVOP2(MachineInstr &MI, 95344779Sdim const GCNSubtarget &ST) const; 96317017Sdim bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); 97341825Sdim void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; 98317017Sdim 99317017Sdim StringRef getPassName() const override { return "SI Peephole SDWA"; } 100317017Sdim 101317017Sdim void getAnalysisUsage(AnalysisUsage &AU) const override { 102317017Sdim AU.setPreservesCFG(); 103317017Sdim MachineFunctionPass::getAnalysisUsage(AU); 104317017Sdim } 105317017Sdim}; 106317017Sdim 107317017Sdimclass SDWAOperand { 108317017Sdimprivate: 109317017Sdim MachineOperand *Target; // Operand that would be used in converted instruction 110317017Sdim MachineOperand *Replaced; // Operand that would be replace by Target 111317017Sdim 112317017Sdimpublic: 113317017Sdim SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) 114317017Sdim : Target(TargetOp), Replaced(ReplacedOp) { 115317017Sdim assert(Target->isReg()); 116317017Sdim assert(Replaced->isReg()); 117317017Sdim } 118317017Sdim 119327952Sdim virtual ~SDWAOperand() = default; 120317017Sdim 121317017Sdim virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; 122317017Sdim virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; 123317017Sdim 124317017Sdim MachineOperand *getTargetOperand() const { return Target; } 125317017Sdim MachineOperand *getReplacedOperand() const { return Replaced; } 126317017Sdim MachineInstr *getParentInst() const { return Target->getParent(); } 127327952Sdim 128317017Sdim MachineRegisterInfo *getMRI() const { 129317017Sdim return &getParentInst()->getParent()->getParent()->getRegInfo(); 130317017Sdim } 131327952Sdim 132327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 133327952Sdim virtual void print(raw_ostream& OS) const = 0; 134327952Sdim void dump() const { print(dbgs()); } 135327952Sdim#endif 136317017Sdim}; 137317017Sdim 138317017Sdimusing namespace AMDGPU::SDWA; 139317017Sdim 140317017Sdimclass SDWASrcOperand : public SDWAOperand { 141317017Sdimprivate: 142317017Sdim SdwaSel SrcSel; 143317017Sdim bool Abs; 144317017Sdim bool Neg; 145317017Sdim bool Sext; 146317017Sdim 147317017Sdimpublic: 148317017Sdim SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 149317017Sdim SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, 150317017Sdim bool Sext_ = false) 151327952Sdim : SDWAOperand(TargetOp, ReplacedOp), 152327952Sdim SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} 153317017Sdim 154327952Sdim MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 155327952Sdim bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 156317017Sdim 157317017Sdim SdwaSel getSrcSel() const { return SrcSel; } 158317017Sdim bool getAbs() const { return Abs; } 159317017Sdim bool getNeg() const { return Neg; } 160317017Sdim bool getSext() const { return Sext; } 161317017Sdim 162319799Sdim uint64_t getSrcMods(const SIInstrInfo *TII, 163319799Sdim const MachineOperand *SrcOp) const; 164327952Sdim 165327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 166327952Sdim void print(raw_ostream& OS) const override; 167327952Sdim#endif 168317017Sdim}; 169317017Sdim 170317017Sdimclass SDWADstOperand : public SDWAOperand { 171317017Sdimprivate: 172317017Sdim SdwaSel DstSel; 173317017Sdim DstUnused DstUn; 174317017Sdim 175317017Sdimpublic: 176327952Sdim 177317017Sdim SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 178317017Sdim SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) 179327952Sdim : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} 180317017Sdim 181327952Sdim MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 182327952Sdim bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 183317017Sdim 184317017Sdim SdwaSel getDstSel() const { return DstSel; } 185317017Sdim DstUnused getDstUnused() const { return DstUn; } 186327952Sdim 187327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 188327952Sdim void print(raw_ostream& OS) const override; 189327952Sdim#endif 190317017Sdim}; 191317017Sdim 192327952Sdimclass SDWADstPreserveOperand : public SDWADstOperand { 193327952Sdimprivate: 194327952Sdim MachineOperand *Preserve; 195317017Sdim 196327952Sdimpublic: 197327952Sdim SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 198327952Sdim MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) 199327952Sdim : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), 200327952Sdim Preserve(PreserveOp) {} 201327952Sdim 202327952Sdim bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 203327952Sdim 204327952Sdim MachineOperand *getPreservedOperand() const { return Preserve; } 205327952Sdim 206327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 207327952Sdim void print(raw_ostream& OS) const override; 208327952Sdim#endif 209327952Sdim}; 210327952Sdim 211327952Sdim} // end anonymous namespace 212327952Sdim 213317017SdimINITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false) 214317017Sdim 215317017Sdimchar SIPeepholeSDWA::ID = 0; 216317017Sdim 217317017Sdimchar &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; 218317017Sdim 219317017SdimFunctionPass *llvm::createSIPeepholeSDWAPass() { 220317017Sdim return new SIPeepholeSDWA(); 221317017Sdim} 222317017Sdim 223317017Sdim 224327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 225341825Sdimstatic raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { 226317017Sdim switch(Sel) { 227317017Sdim case BYTE_0: OS << "BYTE_0"; break; 228317017Sdim case BYTE_1: OS << "BYTE_1"; break; 229317017Sdim case BYTE_2: OS << "BYTE_2"; break; 230317017Sdim case BYTE_3: OS << "BYTE_3"; break; 231317017Sdim case WORD_0: OS << "WORD_0"; break; 232317017Sdim case WORD_1: OS << "WORD_1"; break; 233317017Sdim case DWORD: OS << "DWORD"; break; 234317017Sdim } 235317017Sdim return OS; 236317017Sdim} 237317017Sdim 238317017Sdimstatic raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { 239317017Sdim switch(Un) { 240317017Sdim case UNUSED_PAD: OS << "UNUSED_PAD"; break; 241317017Sdim case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; 242317017Sdim case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; 243317017Sdim } 244317017Sdim return OS; 245317017Sdim} 246317017Sdim 247327952SdimLLVM_DUMP_METHOD 248327952Sdimvoid SDWASrcOperand::print(raw_ostream& OS) const { 249327952Sdim OS << "SDWA src: " << *getTargetOperand() 250327952Sdim << " src_sel:" << getSrcSel() 251327952Sdim << " abs:" << getAbs() << " neg:" << getNeg() 252327952Sdim << " sext:" << getSext() << '\n'; 253317017Sdim} 254317017Sdim 255327952SdimLLVM_DUMP_METHOD 256327952Sdimvoid SDWADstOperand::print(raw_ostream& OS) const { 257327952Sdim OS << "SDWA dst: " << *getTargetOperand() 258327952Sdim << " dst_sel:" << getDstSel() 259327952Sdim << " dst_unused:" << getDstUnused() << '\n'; 260327952Sdim} 261327952Sdim 262327952SdimLLVM_DUMP_METHOD 263327952Sdimvoid SDWADstPreserveOperand::print(raw_ostream& OS) const { 264327952Sdim OS << "SDWA preserve dst: " << *getTargetOperand() 265327952Sdim << " dst_sel:" << getDstSel() 266327952Sdim << " preserve:" << *getPreservedOperand() << '\n'; 267327952Sdim} 268327952Sdim 269317017Sdim#endif 270317017Sdim 271317017Sdimstatic void copyRegOperand(MachineOperand &To, const MachineOperand &From) { 272317017Sdim assert(To.isReg() && From.isReg()); 273317017Sdim To.setReg(From.getReg()); 274317017Sdim To.setSubReg(From.getSubReg()); 275317017Sdim To.setIsUndef(From.isUndef()); 276317017Sdim if (To.isUse()) { 277317017Sdim To.setIsKill(From.isKill()); 278317017Sdim } else { 279317017Sdim To.setIsDead(From.isDead()); 280317017Sdim } 281317017Sdim} 282317017Sdim 283317017Sdimstatic bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { 284317017Sdim return LHS.isReg() && 285317017Sdim RHS.isReg() && 286317017Sdim LHS.getReg() == RHS.getReg() && 287317017Sdim LHS.getSubReg() == RHS.getSubReg(); 288317017Sdim} 289317017Sdim 290327952Sdimstatic MachineOperand *findSingleRegUse(const MachineOperand *Reg, 291327952Sdim const MachineRegisterInfo *MRI) { 292327952Sdim if (!Reg->isReg() || !Reg->isDef()) 293327952Sdim return nullptr; 294320397Sdim 295327952Sdim MachineOperand *ResMO = nullptr; 296327952Sdim for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) { 297327952Sdim // If there exist use of subreg of Reg then return nullptr 298327952Sdim if (!isSameReg(UseMO, *Reg)) 299327952Sdim return nullptr; 300317017Sdim 301327952Sdim // Check that there is only one instruction that uses Reg 302327952Sdim if (!ResMO) { 303327952Sdim ResMO = &UseMO; 304327952Sdim } else if (ResMO->getParent() != UseMO.getParent()) { 305327952Sdim return nullptr; 306327952Sdim } 307327952Sdim } 308317017Sdim 309327952Sdim return ResMO; 310327952Sdim} 311317017Sdim 312327952Sdimstatic MachineOperand *findSingleRegDef(const MachineOperand *Reg, 313327952Sdim const MachineRegisterInfo *MRI) { 314327952Sdim if (!Reg->isReg()) 315327952Sdim return nullptr; 316327952Sdim 317327952Sdim MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg()); 318327952Sdim if (!DefInstr) 319327952Sdim return nullptr; 320327952Sdim 321327952Sdim for (auto &DefMO : DefInstr->defs()) { 322327952Sdim if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) 323327952Sdim return &DefMO; 324327952Sdim } 325327952Sdim 326327952Sdim // Ignore implicit defs. 327327952Sdim return nullptr; 328317017Sdim} 329317017Sdim 330319799Sdimuint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, 331319799Sdim const MachineOperand *SrcOp) const { 332317017Sdim uint64_t Mods = 0; 333319799Sdim const auto *MI = SrcOp->getParent(); 334319799Sdim if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) { 335319799Sdim if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { 336319799Sdim Mods = Mod->getImm(); 337319799Sdim } 338319799Sdim } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) { 339319799Sdim if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) { 340319799Sdim Mods = Mod->getImm(); 341319799Sdim } 342319799Sdim } 343317017Sdim if (Abs || Neg) { 344317017Sdim assert(!Sext && 345317017Sdim "Float and integer src modifiers can't be set simulteniously"); 346353358Sdim Mods |= Abs ? SISrcMods::ABS : 0u; 347353358Sdim Mods ^= Neg ? SISrcMods::NEG : 0u; 348317017Sdim } else if (Sext) { 349317017Sdim Mods |= SISrcMods::SEXT; 350317017Sdim } 351317017Sdim 352317017Sdim return Mods; 353317017Sdim} 354317017Sdim 355317017SdimMachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { 356317017Sdim // For SDWA src operand potential instruction is one that use register 357317017Sdim // defined by parent instruction 358327952Sdim MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); 359327952Sdim if (!PotentialMO) 360327952Sdim return nullptr; 361317017Sdim 362327952Sdim return PotentialMO->getParent(); 363317017Sdim} 364317017Sdim 365317017Sdimbool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 366317017Sdim // Find operand in instruction that matches source operand and replace it with 367317017Sdim // target operand. Set corresponding src_sel 368341825Sdim bool IsPreserveSrc = false; 369317017Sdim MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 370317017Sdim MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 371317017Sdim MachineOperand *SrcMods = 372317017Sdim TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 373319250Sdim assert(Src && (Src->isReg() || Src->isImm())); 374317017Sdim if (!isSameReg(*Src, *getReplacedOperand())) { 375341825Sdim // If this is not src0 then it could be src1 376317017Sdim Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 377317017Sdim SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 378317017Sdim SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 379317017Sdim 380341825Sdim if (!Src || 381341825Sdim !isSameReg(*Src, *getReplacedOperand())) { 382341825Sdim // It's possible this Src is a tied operand for 383341825Sdim // UNUSED_PRESERVE, in which case we can either 384341825Sdim // abandon the peephole attempt, or if legal we can 385341825Sdim // copy the target operand into the tied slot 386341825Sdim // if the preserve operation will effectively cause the same 387341825Sdim // result by overwriting the rest of the dst. 388341825Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 389341825Sdim MachineOperand *DstUnused = 390341825Sdim TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 391341825Sdim 392341825Sdim if (Dst && 393341825Sdim DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { 394341825Sdim // This will work if the tied src is acessing WORD_0, and the dst is 395341825Sdim // writing WORD_1. Modifiers don't matter because all the bits that 396341825Sdim // would be impacted are being overwritten by the dst. 397341825Sdim // Any other case will not work. 398341825Sdim SdwaSel DstSel = static_cast<SdwaSel>( 399341825Sdim TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); 400341825Sdim if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && 401341825Sdim getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { 402341825Sdim IsPreserveSrc = true; 403341825Sdim auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 404341825Sdim AMDGPU::OpName::vdst); 405341825Sdim auto TiedIdx = MI.findTiedOperandIdx(DstIdx); 406341825Sdim Src = &MI.getOperand(TiedIdx); 407341825Sdim SrcSel = nullptr; 408341825Sdim SrcMods = nullptr; 409341825Sdim } else { 410341825Sdim // Not legal to convert this src 411341825Sdim return false; 412341825Sdim } 413341825Sdim } 414341825Sdim } 415317017Sdim assert(Src && Src->isReg()); 416317017Sdim 417353358Sdim if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || 418353358Sdim MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || 419353358Sdim MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 420317017Sdim MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 421327952Sdim !isSameReg(*Src, *getReplacedOperand())) { 422317017Sdim // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to 423317017Sdim // src2. This is not allowed. 424317017Sdim return false; 425317017Sdim } 426317017Sdim 427341825Sdim assert(isSameReg(*Src, *getReplacedOperand()) && 428341825Sdim (IsPreserveSrc || (SrcSel && SrcMods))); 429317017Sdim } 430317017Sdim copyRegOperand(*Src, *getTargetOperand()); 431341825Sdim if (!IsPreserveSrc) { 432341825Sdim SrcSel->setImm(getSrcSel()); 433341825Sdim SrcMods->setImm(getSrcMods(TII, Src)); 434341825Sdim } 435317017Sdim getTargetOperand()->setIsKill(false); 436317017Sdim return true; 437317017Sdim} 438317017Sdim 439317017SdimMachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { 440317017Sdim // For SDWA dst operand potential instruction is one that defines register 441317017Sdim // that this operand uses 442317017Sdim MachineRegisterInfo *MRI = getMRI(); 443317017Sdim MachineInstr *ParentMI = getParentInst(); 444317017Sdim 445327952Sdim MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI); 446327952Sdim if (!PotentialMO) 447327952Sdim return nullptr; 448317017Sdim 449327952Sdim // Check that ParentMI is the only instruction that uses replaced register 450327952Sdim for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) { 451327952Sdim if (&UseInst != ParentMI) 452317017Sdim return nullptr; 453317017Sdim } 454317017Sdim 455327952Sdim return PotentialMO->getParent(); 456317017Sdim} 457317017Sdim 458317017Sdimbool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 459317017Sdim // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused 460317017Sdim 461353358Sdim if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || 462353358Sdim MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || 463353358Sdim MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 464317017Sdim MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 465317017Sdim getDstSel() != AMDGPU::SDWA::DWORD) { 466317017Sdim // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD 467317017Sdim return false; 468317017Sdim } 469317017Sdim 470317017Sdim MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 471317017Sdim assert(Operand && 472317017Sdim Operand->isReg() && 473317017Sdim isSameReg(*Operand, *getReplacedOperand())); 474317017Sdim copyRegOperand(*Operand, *getTargetOperand()); 475317017Sdim MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 476317017Sdim assert(DstSel); 477317017Sdim DstSel->setImm(getDstSel()); 478317017Sdim MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 479317017Sdim assert(DstUnused); 480317017Sdim DstUnused->setImm(getDstUnused()); 481317017Sdim 482317017Sdim // Remove original instruction because it would conflict with our new 483317017Sdim // instruction by register definition 484317017Sdim getParentInst()->eraseFromParent(); 485317017Sdim return true; 486317017Sdim} 487317017Sdim 488327952Sdimbool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, 489327952Sdim const SIInstrInfo *TII) { 490327952Sdim // MI should be moved right before v_or_b32. 491327952Sdim // For this we should clear all kill flags on uses of MI src-operands or else 492327952Sdim // we can encounter problem with use of killed operand. 493327952Sdim for (MachineOperand &MO : MI.uses()) { 494327952Sdim if (!MO.isReg()) 495327952Sdim continue; 496327952Sdim getMRI()->clearKillFlags(MO.getReg()); 497327952Sdim } 498327952Sdim 499327952Sdim // Move MI before v_or_b32 500327952Sdim auto MBB = MI.getParent(); 501327952Sdim MBB->remove(&MI); 502327952Sdim MBB->insert(getParentInst(), &MI); 503327952Sdim 504327952Sdim // Add Implicit use of preserved register 505327952Sdim MachineInstrBuilder MIB(*MBB->getParent(), MI); 506327952Sdim MIB.addReg(getPreservedOperand()->getReg(), 507327952Sdim RegState::ImplicitKill, 508327952Sdim getPreservedOperand()->getSubReg()); 509327952Sdim 510327952Sdim // Tie dst to implicit use 511327952Sdim MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst), 512327952Sdim MI.getNumOperands() - 1); 513327952Sdim 514327952Sdim // Convert MI as any other SDWADstOperand and remove v_or_b32 515327952Sdim return SDWADstOperand::convertToSDWA(MI, TII); 516327952Sdim} 517327952Sdim 518317017SdimOptional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { 519317017Sdim if (Op.isImm()) { 520317017Sdim return Op.getImm(); 521317017Sdim } 522317017Sdim 523317017Sdim // If this is not immediate then it can be copy of immediate value, e.g.: 524327952Sdim // %1 = S_MOV_B32 255; 525317017Sdim if (Op.isReg()) { 526317017Sdim for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { 527317017Sdim if (!isSameReg(Op, Def)) 528317017Sdim continue; 529317017Sdim 530317017Sdim const MachineInstr *DefInst = Def.getParent(); 531317017Sdim if (!TII->isFoldableCopy(*DefInst)) 532317017Sdim return None; 533317017Sdim 534317017Sdim const MachineOperand &Copied = DefInst->getOperand(1); 535317017Sdim if (!Copied.isImm()) 536317017Sdim return None; 537317017Sdim 538317017Sdim return Copied.getImm(); 539317017Sdim } 540317017Sdim } 541317017Sdim 542317017Sdim return None; 543317017Sdim} 544317017Sdim 545327952Sdimstd::unique_ptr<SDWAOperand> 546327952SdimSIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { 547327952Sdim unsigned Opcode = MI.getOpcode(); 548327952Sdim switch (Opcode) { 549327952Sdim case AMDGPU::V_LSHRREV_B32_e32: 550327952Sdim case AMDGPU::V_ASHRREV_I32_e32: 551327952Sdim case AMDGPU::V_LSHLREV_B32_e32: 552327952Sdim case AMDGPU::V_LSHRREV_B32_e64: 553327952Sdim case AMDGPU::V_ASHRREV_I32_e64: 554327952Sdim case AMDGPU::V_LSHLREV_B32_e64: { 555327952Sdim // from: v_lshrrev_b32_e32 v1, 16/24, v0 556327952Sdim // to SDWA src:v0 src_sel:WORD_1/BYTE_3 557317017Sdim 558327952Sdim // from: v_ashrrev_i32_e32 v1, 16/24, v0 559327952Sdim // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 560317017Sdim 561327952Sdim // from: v_lshlrev_b32_e32 v1, 16/24, v0 562327952Sdim // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD 563327952Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 564327952Sdim auto Imm = foldToImm(*Src0); 565327952Sdim if (!Imm) 566327952Sdim break; 567317017Sdim 568327952Sdim if (*Imm != 16 && *Imm != 24) 569327952Sdim break; 570317017Sdim 571327952Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 572327952Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 573360784Sdim if (Register::isPhysicalRegister(Src1->getReg()) || 574360784Sdim Register::isPhysicalRegister(Dst->getReg())) 575327952Sdim break; 576317017Sdim 577327952Sdim if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || 578327952Sdim Opcode == AMDGPU::V_LSHLREV_B32_e64) { 579360784Sdim return std::make_unique<SDWADstOperand>( 580327952Sdim Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); 581327952Sdim } else { 582360784Sdim return std::make_unique<SDWASrcOperand>( 583327952Sdim Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, 584327952Sdim Opcode != AMDGPU::V_LSHRREV_B32_e32 && 585327952Sdim Opcode != AMDGPU::V_LSHRREV_B32_e64); 586327952Sdim } 587327952Sdim break; 588327952Sdim } 589317017Sdim 590327952Sdim case AMDGPU::V_LSHRREV_B16_e32: 591327952Sdim case AMDGPU::V_ASHRREV_I16_e32: 592327952Sdim case AMDGPU::V_LSHLREV_B16_e32: 593327952Sdim case AMDGPU::V_LSHRREV_B16_e64: 594327952Sdim case AMDGPU::V_ASHRREV_I16_e64: 595327952Sdim case AMDGPU::V_LSHLREV_B16_e64: { 596327952Sdim // from: v_lshrrev_b16_e32 v1, 8, v0 597327952Sdim // to SDWA src:v0 src_sel:BYTE_1 598317017Sdim 599327952Sdim // from: v_ashrrev_i16_e32 v1, 8, v0 600327952Sdim // to SDWA src:v0 src_sel:BYTE_1 sext:1 601317017Sdim 602327952Sdim // from: v_lshlrev_b16_e32 v1, 8, v0 603327952Sdim // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 604327952Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 605327952Sdim auto Imm = foldToImm(*Src0); 606327952Sdim if (!Imm || *Imm != 8) 607327952Sdim break; 608317017Sdim 609327952Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 610327952Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 611317017Sdim 612360784Sdim if (Register::isPhysicalRegister(Src1->getReg()) || 613360784Sdim Register::isPhysicalRegister(Dst->getReg())) 614327952Sdim break; 615317017Sdim 616327952Sdim if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || 617327952Sdim Opcode == AMDGPU::V_LSHLREV_B16_e64) { 618360784Sdim return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); 619327952Sdim } else { 620360784Sdim return std::make_unique<SDWASrcOperand>( 621327952Sdim Src1, Dst, BYTE_1, false, false, 622327952Sdim Opcode != AMDGPU::V_LSHRREV_B16_e32 && 623327952Sdim Opcode != AMDGPU::V_LSHRREV_B16_e64); 624327952Sdim } 625327952Sdim break; 626327952Sdim } 627317017Sdim 628327952Sdim case AMDGPU::V_BFE_I32: 629327952Sdim case AMDGPU::V_BFE_U32: { 630327952Sdim // e.g.: 631327952Sdim // from: v_bfe_u32 v1, v0, 8, 8 632327952Sdim // to SDWA src:v0 src_sel:BYTE_1 633317017Sdim 634327952Sdim // offset | width | src_sel 635327952Sdim // ------------------------ 636327952Sdim // 0 | 8 | BYTE_0 637327952Sdim // 0 | 16 | WORD_0 638327952Sdim // 0 | 32 | DWORD ? 639327952Sdim // 8 | 8 | BYTE_1 640327952Sdim // 16 | 8 | BYTE_2 641327952Sdim // 16 | 16 | WORD_1 642327952Sdim // 24 | 8 | BYTE_3 643317017Sdim 644327952Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 645327952Sdim auto Offset = foldToImm(*Src1); 646327952Sdim if (!Offset) 647327952Sdim break; 648317017Sdim 649327952Sdim MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 650327952Sdim auto Width = foldToImm(*Src2); 651327952Sdim if (!Width) 652327952Sdim break; 653317017Sdim 654327952Sdim SdwaSel SrcSel = DWORD; 655317017Sdim 656327952Sdim if (*Offset == 0 && *Width == 8) 657327952Sdim SrcSel = BYTE_0; 658327952Sdim else if (*Offset == 0 && *Width == 16) 659327952Sdim SrcSel = WORD_0; 660327952Sdim else if (*Offset == 0 && *Width == 32) 661327952Sdim SrcSel = DWORD; 662327952Sdim else if (*Offset == 8 && *Width == 8) 663327952Sdim SrcSel = BYTE_1; 664327952Sdim else if (*Offset == 16 && *Width == 8) 665327952Sdim SrcSel = BYTE_2; 666327952Sdim else if (*Offset == 16 && *Width == 16) 667327952Sdim SrcSel = WORD_1; 668327952Sdim else if (*Offset == 24 && *Width == 8) 669327952Sdim SrcSel = BYTE_3; 670327952Sdim else 671327952Sdim break; 672317017Sdim 673327952Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 674327952Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 675320397Sdim 676360784Sdim if (Register::isPhysicalRegister(Src0->getReg()) || 677360784Sdim Register::isPhysicalRegister(Dst->getReg())) 678327952Sdim break; 679317017Sdim 680360784Sdim return std::make_unique<SDWASrcOperand>( 681327952Sdim Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32); 682327952Sdim } 683327952Sdim 684327952Sdim case AMDGPU::V_AND_B32_e32: 685327952Sdim case AMDGPU::V_AND_B32_e64: { 686327952Sdim // e.g.: 687327952Sdim // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 688327952Sdim // to SDWA src:v0 src_sel:WORD_0/BYTE_0 689327952Sdim 690327952Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 691327952Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 692327952Sdim auto ValSrc = Src1; 693327952Sdim auto Imm = foldToImm(*Src0); 694327952Sdim 695327952Sdim if (!Imm) { 696327952Sdim Imm = foldToImm(*Src1); 697327952Sdim ValSrc = Src0; 698327952Sdim } 699327952Sdim 700327952Sdim if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) 701327952Sdim break; 702327952Sdim 703327952Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 704327952Sdim 705360784Sdim if (Register::isPhysicalRegister(ValSrc->getReg()) || 706360784Sdim Register::isPhysicalRegister(Dst->getReg())) 707327952Sdim break; 708327952Sdim 709360784Sdim return std::make_unique<SDWASrcOperand>( 710327952Sdim ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); 711327952Sdim } 712327952Sdim 713327952Sdim case AMDGPU::V_OR_B32_e32: 714327952Sdim case AMDGPU::V_OR_B32_e64: { 715327952Sdim // Patterns for dst_unused:UNUSED_PRESERVE. 716327952Sdim // e.g., from: 717327952Sdim // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 718327952Sdim // src1_sel:WORD_1 src2_sel:WORD1 719327952Sdim // v_add_f16_e32 v3, v1, v2 720327952Sdim // v_or_b32_e32 v4, v0, v3 721327952Sdim // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 722327952Sdim 723327952Sdim // Check if one of operands of v_or_b32 is SDWA instruction 724327952Sdim using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>; 725327952Sdim auto CheckOROperandsForSDWA = 726327952Sdim [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { 727327952Sdim if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) 728327952Sdim return CheckRetType(None); 729327952Sdim 730327952Sdim MachineOperand *Op1Def = findSingleRegDef(Op1, MRI); 731327952Sdim if (!Op1Def) 732327952Sdim return CheckRetType(None); 733327952Sdim 734327952Sdim MachineInstr *Op1Inst = Op1Def->getParent(); 735327952Sdim if (!TII->isSDWA(*Op1Inst)) 736327952Sdim return CheckRetType(None); 737327952Sdim 738327952Sdim MachineOperand *Op2Def = findSingleRegDef(Op2, MRI); 739327952Sdim if (!Op2Def) 740327952Sdim return CheckRetType(None); 741327952Sdim 742327952Sdim return CheckRetType(std::make_pair(Op1Def, Op2Def)); 743327952Sdim }; 744327952Sdim 745327952Sdim MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 746327952Sdim MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 747327952Sdim assert(OrSDWA && OrOther); 748327952Sdim auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); 749327952Sdim if (!Res) { 750327952Sdim OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 751327952Sdim OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 752327952Sdim assert(OrSDWA && OrOther); 753327952Sdim Res = CheckOROperandsForSDWA(OrSDWA, OrOther); 754327952Sdim if (!Res) 755317017Sdim break; 756327952Sdim } 757317017Sdim 758327952Sdim MachineOperand *OrSDWADef = Res->first; 759327952Sdim MachineOperand *OrOtherDef = Res->second; 760327952Sdim assert(OrSDWADef && OrOtherDef); 761317017Sdim 762327952Sdim MachineInstr *SDWAInst = OrSDWADef->getParent(); 763327952Sdim MachineInstr *OtherInst = OrOtherDef->getParent(); 764319799Sdim 765327952Sdim // Check that OtherInstr is actually bitwise compatible with SDWAInst = their 766327952Sdim // destination patterns don't overlap. Compatible instruction can be either 767327952Sdim // regular instruction with compatible bitness or SDWA instruction with 768327952Sdim // correct dst_sel 769327952Sdim // SDWAInst | OtherInst bitness / OtherInst dst_sel 770327952Sdim // ----------------------------------------------------- 771327952Sdim // DWORD | no / no 772327952Sdim // WORD_0 | no / BYTE_2/3, WORD_1 773327952Sdim // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0 774327952Sdim // BYTE_0 | no / BYTE_1/2/3, WORD_1 775327952Sdim // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1 776327952Sdim // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0 777327952Sdim // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0 778327952Sdim // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK 779327952Sdim // but v_add_f32 is not. 780317017Sdim 781327952Sdim // TODO: add support for non-SDWA instructions as OtherInst. 782327952Sdim // For now this only works with SDWA instructions. For regular instructions 783341825Sdim // there is no way to determine if the instruction writes only 8/16/24-bit 784341825Sdim // out of full register size and all registers are at min 32-bit wide. 785327952Sdim if (!TII->isSDWA(*OtherInst)) 786327952Sdim break; 787320397Sdim 788327952Sdim SdwaSel DstSel = static_cast<SdwaSel>( 789327952Sdim TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));; 790327952Sdim SdwaSel OtherDstSel = static_cast<SdwaSel>( 791327952Sdim TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); 792317017Sdim 793327952Sdim bool DstSelAgree = false; 794327952Sdim switch (DstSel) { 795327952Sdim case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || 796327952Sdim (OtherDstSel == BYTE_3) || 797327952Sdim (OtherDstSel == WORD_1)); 798327952Sdim break; 799327952Sdim case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 800327952Sdim (OtherDstSel == BYTE_1) || 801327952Sdim (OtherDstSel == WORD_0)); 802327952Sdim break; 803327952Sdim case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || 804327952Sdim (OtherDstSel == BYTE_2) || 805327952Sdim (OtherDstSel == BYTE_3) || 806327952Sdim (OtherDstSel == WORD_1)); 807327952Sdim break; 808327952Sdim case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 809327952Sdim (OtherDstSel == BYTE_2) || 810327952Sdim (OtherDstSel == BYTE_3) || 811327952Sdim (OtherDstSel == WORD_1)); 812327952Sdim break; 813327952Sdim case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || 814327952Sdim (OtherDstSel == BYTE_1) || 815327952Sdim (OtherDstSel == BYTE_3) || 816327952Sdim (OtherDstSel == WORD_0)); 817327952Sdim break; 818327952Sdim case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || 819327952Sdim (OtherDstSel == BYTE_1) || 820327952Sdim (OtherDstSel == BYTE_2) || 821327952Sdim (OtherDstSel == WORD_0)); 822327952Sdim break; 823327952Sdim default: DstSelAgree = false; 824327952Sdim } 825327952Sdim 826327952Sdim if (!DstSelAgree) 827327952Sdim break; 828327952Sdim 829327952Sdim // Also OtherInst dst_unused should be UNUSED_PAD 830327952Sdim DstUnused OtherDstUnused = static_cast<DstUnused>( 831327952Sdim TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused)); 832327952Sdim if (OtherDstUnused != DstUnused::UNUSED_PAD) 833327952Sdim break; 834327952Sdim 835327952Sdim // Create DstPreserveOperand 836327952Sdim MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 837327952Sdim assert(OrDst && OrDst->isReg()); 838327952Sdim 839360784Sdim return std::make_unique<SDWADstPreserveOperand>( 840327952Sdim OrDst, OrSDWADef, OrOtherDef, DstSel); 841327952Sdim 842327952Sdim } 843327952Sdim } 844327952Sdim 845327952Sdim return std::unique_ptr<SDWAOperand>(nullptr); 846327952Sdim} 847327952Sdim 848360784Sdim#if !defined(NDEBUG) 849360784Sdimstatic raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { 850360784Sdim Operand.print(OS); 851360784Sdim return OS; 852360784Sdim} 853360784Sdim#endif 854360784Sdim 855341825Sdimvoid SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { 856341825Sdim for (MachineInstr &MI : MBB) { 857341825Sdim if (auto Operand = matchSDWAOperand(MI)) { 858341825Sdim LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); 859341825Sdim SDWAOperands[&MI] = std::move(Operand); 860341825Sdim ++NumSDWAPatternsFound; 861317017Sdim } 862317017Sdim } 863317017Sdim} 864317017Sdim 865344779Sdim// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and 866344779Sdim// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA 867344779Sdim// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa. 868344779Sdim// 869344779Sdim// We are transforming from a VOP3 into a VOP2 form of the instruction. 870344779Sdim// %19:vgpr_32 = V_AND_B32_e32 255, 871344779Sdim// killed %16:vgpr_32, implicit $exec 872344779Sdim// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64 873344779Sdim// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec 874344779Sdim// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 875344779Sdim// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec 876344779Sdim// 877344779Sdim// becomes 878344779Sdim// %47:vgpr_32 = V_ADD_I32_sdwa 879344779Sdim// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, 880344779Sdim// implicit-def $vcc, implicit $exec 881344779Sdim// %48:vgpr_32 = V_ADDC_U32_e32 882344779Sdim// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec 883344779Sdimvoid SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, 884344779Sdim const GCNSubtarget &ST) const { 885344779Sdim int Opc = MI.getOpcode(); 886344779Sdim assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) && 887344779Sdim "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64"); 888344779Sdim 889344779Sdim // Can the candidate MI be shrunk? 890344779Sdim if (!TII->canShrink(MI, *MRI)) 891344779Sdim return; 892344779Sdim Opc = AMDGPU::getVOPe32(Opc); 893344779Sdim // Find the related ADD instruction. 894344779Sdim const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); 895344779Sdim if (!Sdst) 896344779Sdim return; 897344779Sdim MachineOperand *NextOp = findSingleRegUse(Sdst, MRI); 898344779Sdim if (!NextOp) 899344779Sdim return; 900344779Sdim MachineInstr &MISucc = *NextOp->getParent(); 901344779Sdim // Can the successor be shrunk? 902344779Sdim if (!TII->canShrink(MISucc, *MRI)) 903344779Sdim return; 904344779Sdim int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode()); 905344779Sdim // Make sure the carry in/out are subsequently unused. 906344779Sdim MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2); 907344779Sdim if (!CarryIn) 908344779Sdim return; 909344779Sdim MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst); 910344779Sdim if (!CarryOut) 911344779Sdim return; 912344779Sdim if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg())) 913344779Sdim return; 914344779Sdim // Make sure VCC or its subregs are dead before MI. 915344779Sdim MachineBasicBlock &MBB = *MI.getParent(); 916344779Sdim auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); 917344779Sdim if (Liveness != MachineBasicBlock::LQR_Dead) 918344779Sdim return; 919344779Sdim // Check if VCC is referenced in range of (MI,MISucc]. 920344779Sdim for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator(); 921344779Sdim I != E; ++I) { 922344779Sdim if (I->modifiesRegister(AMDGPU::VCC, TRI)) 923344779Sdim return; 924344779Sdim } 925344779Sdim // Make the two new e32 instruction variants. 926344779Sdim // Replace MI with V_{SUB|ADD}_I32_e32 927344779Sdim auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc)); 928344779Sdim NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst)); 929344779Sdim NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); 930344779Sdim NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)); 931344779Sdim MI.eraseFromParent(); 932344779Sdim // Replace MISucc with V_{SUBB|ADDC}_U32_e32 933344779Sdim auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc)); 934344779Sdim NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst)); 935344779Sdim NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0)); 936344779Sdim NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1)); 937344779Sdim MISucc.eraseFromParent(); 938344779Sdim} 939344779Sdim 940344779Sdimbool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, 941341825Sdim const GCNSubtarget &ST) const { 942327952Sdim // Check if this is already an SDWA instruction 943327952Sdim unsigned Opc = MI.getOpcode(); 944327952Sdim if (TII->isSDWA(Opc)) 945327952Sdim return true; 946327952Sdim 947319250Sdim // Check if this instruction has opcode that supports SDWA 948320397Sdim if (AMDGPU::getSDWAOp(Opc) == -1) 949320397Sdim Opc = AMDGPU::getVOPe32(Opc); 950320397Sdim 951327952Sdim if (AMDGPU::getSDWAOp(Opc) == -1) 952320397Sdim return false; 953320397Sdim 954320397Sdim if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) 955320397Sdim return false; 956320397Sdim 957320397Sdim if (TII->isVOPC(Opc)) { 958320397Sdim if (!ST.hasSDWASdst()) { 959320397Sdim const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); 960353358Sdim if (SDst && (SDst->getReg() != AMDGPU::VCC && 961353358Sdim SDst->getReg() != AMDGPU::VCC_LO)) 962320397Sdim return false; 963320397Sdim } 964320397Sdim 965320572Sdim if (!ST.hasSDWAOutModsVOPC() && 966320572Sdim (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || 967320572Sdim TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) 968320397Sdim return false; 969320397Sdim 970320572Sdim } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || 971320572Sdim !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 972320397Sdim return false; 973320397Sdim } 974320397Sdim 975353358Sdim if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || 976353358Sdim Opc == AMDGPU::V_FMAC_F32_e32 || 977353358Sdim Opc == AMDGPU::V_MAC_F16_e32 || 978320397Sdim Opc == AMDGPU::V_MAC_F32_e32)) 979320397Sdim return false; 980320397Sdim 981353358Sdim // Check if target supports this SDWA opcode 982353358Sdim if (TII->pseudoToMCOpcode(Opc) == -1) 983353358Sdim return false; 984353358Sdim 985341825Sdim // FIXME: has SDWA but require handling of implicit VCC use 986341825Sdim if (Opc == AMDGPU::V_CNDMASK_B32_e32) 987341825Sdim return false; 988341825Sdim 989320397Sdim return true; 990318681Sdim} 991318681Sdim 992318681Sdimbool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, 993318681Sdim const SDWAOperandsVector &SDWAOperands) { 994341825Sdim 995341825Sdim LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); 996341825Sdim 997317017Sdim // Convert to sdwa 998327952Sdim int SDWAOpcode; 999327952Sdim unsigned Opcode = MI.getOpcode(); 1000327952Sdim if (TII->isSDWA(Opcode)) { 1001327952Sdim SDWAOpcode = Opcode; 1002327952Sdim } else { 1003327952Sdim SDWAOpcode = AMDGPU::getSDWAOp(Opcode); 1004327952Sdim if (SDWAOpcode == -1) 1005327952Sdim SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode)); 1006327952Sdim } 1007317017Sdim assert(SDWAOpcode != -1); 1008317017Sdim 1009317017Sdim const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); 1010317017Sdim 1011317017Sdim // Create SDWA version of instruction MI and initialize its operands 1012317017Sdim MachineInstrBuilder SDWAInst = 1013317017Sdim BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); 1014317017Sdim 1015320572Sdim // Copy dst, if it is present in original then should also be present in SDWA 1016320572Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 1017317017Sdim if (Dst) { 1018317017Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); 1019317017Sdim SDWAInst.add(*Dst); 1020320572Sdim } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) { 1021320397Sdim assert(Dst && 1022320397Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); 1023320397Sdim SDWAInst.add(*Dst); 1024320572Sdim } else { 1025320572Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); 1026353358Sdim SDWAInst.addReg(TRI->getVCC(), RegState::Define); 1027317017Sdim } 1028317017Sdim 1029317017Sdim // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and 1030317017Sdim // src0_modifiers (except for v_nop_sdwa, but it can't get here) 1031317017Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 1032317017Sdim assert( 1033317017Sdim Src0 && 1034317017Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 && 1035317017Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1); 1036319799Sdim if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) 1037319799Sdim SDWAInst.addImm(Mod->getImm()); 1038319799Sdim else 1039319799Sdim SDWAInst.addImm(0); 1040317017Sdim SDWAInst.add(*Src0); 1041317017Sdim 1042317017Sdim // Copy src1 if present, initialize src1_modifiers. 1043317017Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 1044317017Sdim if (Src1) { 1045317017Sdim assert( 1046317017Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 && 1047317017Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1); 1048319799Sdim if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)) 1049319799Sdim SDWAInst.addImm(Mod->getImm()); 1050319799Sdim else 1051319799Sdim SDWAInst.addImm(0); 1052317017Sdim SDWAInst.add(*Src1); 1053317017Sdim } 1054317017Sdim 1055353358Sdim if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa || 1056353358Sdim SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa || 1057353358Sdim SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || 1058317017Sdim SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { 1059317017Sdim // v_mac_f16/32 has additional src2 operand tied to vdst 1060317017Sdim MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 1061317017Sdim assert(Src2); 1062317017Sdim SDWAInst.add(*Src2); 1063317017Sdim } 1064317017Sdim 1065320397Sdim // Copy clamp if present, initialize otherwise 1066317017Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); 1067320397Sdim MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); 1068320397Sdim if (Clamp) { 1069320397Sdim SDWAInst.add(*Clamp); 1070320397Sdim } else { 1071320397Sdim SDWAInst.addImm(0); 1072320397Sdim } 1073317017Sdim 1074320397Sdim // Copy omod if present, initialize otherwise if needed 1075320572Sdim if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { 1076320572Sdim MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); 1077320572Sdim if (OMod) { 1078320572Sdim SDWAInst.add(*OMod); 1079320572Sdim } else { 1080320572Sdim SDWAInst.addImm(0); 1081320572Sdim } 1082320397Sdim } 1083320397Sdim 1084327952Sdim // Copy dst_sel if present, initialize otherwise if needed 1085320572Sdim if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) { 1086327952Sdim MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 1087327952Sdim if (DstSel) { 1088327952Sdim SDWAInst.add(*DstSel); 1089327952Sdim } else { 1090327952Sdim SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1091327952Sdim } 1092320572Sdim } 1093320572Sdim 1094327952Sdim // Copy dst_unused if present, initialize otherwise if needed 1095320572Sdim if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) { 1096327952Sdim MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 1097327952Sdim if (DstUnused) { 1098327952Sdim SDWAInst.add(*DstUnused); 1099327952Sdim } else { 1100327952Sdim SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); 1101327952Sdim } 1102317017Sdim } 1103317017Sdim 1104327952Sdim // Copy src0_sel if present, initialize otherwise 1105317017Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1); 1106327952Sdim MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 1107327952Sdim if (Src0Sel) { 1108327952Sdim SDWAInst.add(*Src0Sel); 1109327952Sdim } else { 1110327952Sdim SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1111327952Sdim } 1112317017Sdim 1113327952Sdim // Copy src1_sel if present, initialize otherwise if needed 1114317017Sdim if (Src1) { 1115317017Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1); 1116327952Sdim MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 1117327952Sdim if (Src1Sel) { 1118327952Sdim SDWAInst.add(*Src1Sel); 1119327952Sdim } else { 1120327952Sdim SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1121327952Sdim } 1122317017Sdim } 1123317017Sdim 1124341825Sdim // Check for a preserved register that needs to be copied. 1125341825Sdim auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 1126341825Sdim if (DstUnused && 1127341825Sdim DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { 1128341825Sdim // We expect, if we are here, that the instruction was already in it's SDWA form, 1129341825Sdim // with a tied operand. 1130341825Sdim assert(Dst && Dst->isTied()); 1131341825Sdim assert(Opcode == static_cast<unsigned int>(SDWAOpcode)); 1132341825Sdim // We also expect a vdst, since sdst can't preserve. 1133341825Sdim auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst); 1134341825Sdim assert(PreserveDstIdx != -1); 1135341825Sdim 1136341825Sdim auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx); 1137341825Sdim auto Tied = MI.getOperand(TiedIdx); 1138341825Sdim 1139341825Sdim SDWAInst.add(Tied); 1140341825Sdim SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1); 1141341825Sdim } 1142341825Sdim 1143341825Sdim // Apply all sdwa operand patterns. 1144317017Sdim bool Converted = false; 1145317017Sdim for (auto &Operand : SDWAOperands) { 1146341825Sdim LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); 1147318681Sdim // There should be no intesection between SDWA operands and potential MIs 1148318681Sdim // e.g.: 1149318681Sdim // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 1150318681Sdim // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 1151318681Sdim // v_add_u32 v3, v4, v2 1152318681Sdim // 1153318681Sdim // In that example it is possible that we would fold 2nd instruction into 3rd 1154318681Sdim // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was 1155318681Sdim // already destroyed). So if SDWAOperand is also a potential MI then do not 1156318681Sdim // apply it. 1157318681Sdim if (PotentialMatches.count(Operand->getParentInst()) == 0) 1158318681Sdim Converted |= Operand->convertToSDWA(*SDWAInst, TII); 1159317017Sdim } 1160319250Sdim if (Converted) { 1161319250Sdim ConvertedInstructions.push_back(SDWAInst); 1162319250Sdim } else { 1163317017Sdim SDWAInst->eraseFromParent(); 1164317017Sdim return false; 1165317017Sdim } 1166317017Sdim 1167341825Sdim LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); 1168317017Sdim ++NumSDWAInstructionsPeepholed; 1169317017Sdim 1170317017Sdim MI.eraseFromParent(); 1171317017Sdim return true; 1172317017Sdim} 1173317017Sdim 1174319250Sdim// If an instruction was converted to SDWA it should not have immediates or SGPR 1175320397Sdim// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. 1176341825Sdimvoid SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, 1177341825Sdim const GCNSubtarget &ST) const { 1178319250Sdim const MCInstrDesc &Desc = TII->get(MI.getOpcode()); 1179320397Sdim unsigned ConstantBusCount = 0; 1180327952Sdim for (MachineOperand &Op : MI.explicit_uses()) { 1181319250Sdim if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) 1182319250Sdim continue; 1183320397Sdim 1184320397Sdim unsigned I = MI.getOperandNo(&Op); 1185319250Sdim if (Desc.OpInfo[I].RegClass == -1 || 1186319250Sdim !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) 1187319250Sdim continue; 1188320397Sdim 1189320397Sdim if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && 1190320397Sdim TRI->isSGPRReg(*MRI, Op.getReg())) { 1191320397Sdim ++ConstantBusCount; 1192320397Sdim continue; 1193320397Sdim } 1194320397Sdim 1195360784Sdim Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1196319250Sdim auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1197319250Sdim TII->get(AMDGPU::V_MOV_B32_e32), VGPR); 1198319250Sdim if (Op.isImm()) 1199319250Sdim Copy.addImm(Op.getImm()); 1200319250Sdim else if (Op.isReg()) 1201319250Sdim Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0, 1202319250Sdim Op.getSubReg()); 1203319250Sdim Op.ChangeToRegister(VGPR, false); 1204319250Sdim } 1205319250Sdim} 1206319250Sdim 1207317017Sdimbool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { 1208341825Sdim const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1209317017Sdim 1210327952Sdim if (!ST.hasSDWA() || skipFunction(MF.getFunction())) 1211317017Sdim return false; 1212317017Sdim 1213317017Sdim MRI = &MF.getRegInfo(); 1214317017Sdim TRI = ST.getRegisterInfo(); 1215317017Sdim TII = ST.getInstrInfo(); 1216320397Sdim 1217318681Sdim // Find all SDWA operands in MF. 1218327952Sdim bool Ret = false; 1219341825Sdim for (MachineBasicBlock &MBB : MF) { 1220341825Sdim bool Changed = false; 1221341825Sdim do { 1222344779Sdim // Preprocess the ADD/SUB pairs so they could be SDWA'ed. 1223344779Sdim // Look for a possible ADD or SUB that resulted from a previously lowered 1224344779Sdim // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 1225344779Sdim // lowers the pair of instructions into e32 form. 1226341825Sdim matchSDWAOperands(MBB); 1227344779Sdim for (const auto &OperandPair : SDWAOperands) { 1228344779Sdim const auto &Operand = OperandPair.second; 1229344779Sdim MachineInstr *PotentialMI = Operand->potentialToConvert(TII); 1230344779Sdim if (PotentialMI && 1231344779Sdim (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 || 1232344779Sdim PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64)) 1233344779Sdim pseudoOpConvertToVOP2(*PotentialMI, ST); 1234344779Sdim } 1235344779Sdim SDWAOperands.clear(); 1236317017Sdim 1237344779Sdim // Generate potential match list. 1238344779Sdim matchSDWAOperands(MBB); 1239344779Sdim 1240341825Sdim for (const auto &OperandPair : SDWAOperands) { 1241341825Sdim const auto &Operand = OperandPair.second; 1242341825Sdim MachineInstr *PotentialMI = Operand->potentialToConvert(TII); 1243341825Sdim if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { 1244341825Sdim PotentialMatches[PotentialMI].push_back(Operand.get()); 1245341825Sdim } 1246327952Sdim } 1247317017Sdim 1248341825Sdim for (auto &PotentialPair : PotentialMatches) { 1249341825Sdim MachineInstr &PotentialMI = *PotentialPair.first; 1250341825Sdim convertToSDWA(PotentialMI, PotentialPair.second); 1251341825Sdim } 1252317017Sdim 1253341825Sdim PotentialMatches.clear(); 1254341825Sdim SDWAOperands.clear(); 1255319250Sdim 1256341825Sdim Changed = !ConvertedInstructions.empty(); 1257319250Sdim 1258341825Sdim if (Changed) 1259341825Sdim Ret = true; 1260341825Sdim while (!ConvertedInstructions.empty()) 1261341825Sdim legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); 1262341825Sdim } while (Changed); 1263341825Sdim } 1264327952Sdim 1265319799Sdim return Ret; 1266317017Sdim} 1267