SIPeepholeSDWA.cpp revision 344779
1327952Sdim//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// 2317017Sdim// 3317017Sdim// The LLVM Compiler Infrastructure 4317017Sdim// 5317017Sdim// This file is distributed under the University of Illinois Open Source 6317017Sdim// License. See LICENSE.TXT for details. 7317017Sdim// 8317017Sdim//===----------------------------------------------------------------------===// 9317017Sdim// 10317017Sdim/// \file This pass tries to apply several peephole SDWA patterns. 11317017Sdim/// 12317017Sdim/// E.g. original: 13327952Sdim/// V_LSHRREV_B32_e32 %0, 16, %1 14327952Sdim/// V_ADD_I32_e32 %2, %0, %3 15327952Sdim/// V_LSHLREV_B32_e32 %4, 16, %2 16317017Sdim/// 17317017Sdim/// Replace: 18327952Sdim/// V_ADD_I32_sdwa %4, %1, %3 19317017Sdim/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 20317017Sdim/// 21317017Sdim//===----------------------------------------------------------------------===// 22317017Sdim 23317017Sdim#include "AMDGPU.h" 24317017Sdim#include "AMDGPUSubtarget.h" 25317017Sdim#include "SIDefines.h" 26317017Sdim#include "SIInstrInfo.h" 27327952Sdim#include "SIRegisterInfo.h" 28341825Sdim#include "MCTargetDesc/AMDGPUMCTargetDesc.h" 29327952Sdim#include "Utils/AMDGPUBaseInfo.h" 30327952Sdim#include "llvm/ADT/None.h" 31327952Sdim#include "llvm/ADT/Optional.h" 32319799Sdim#include "llvm/ADT/STLExtras.h" 33327952Sdim#include "llvm/ADT/SmallVector.h" 34317017Sdim#include "llvm/ADT/Statistic.h" 35327952Sdim#include "llvm/CodeGen/MachineBasicBlock.h" 36327952Sdim#include "llvm/CodeGen/MachineFunction.h" 37317017Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 38327952Sdim#include "llvm/CodeGen/MachineInstr.h" 39317017Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 40327952Sdim#include "llvm/CodeGen/MachineOperand.h" 41327952Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 42327952Sdim#include "llvm/CodeGen/TargetRegisterInfo.h" 43341825Sdim#include "llvm/Config/llvm-config.h" 44327952Sdim#include "llvm/MC/LaneBitmask.h" 45327952Sdim#include "llvm/MC/MCInstrDesc.h" 46327952Sdim#include "llvm/Pass.h" 47327952Sdim#include "llvm/Support/Debug.h" 48327952Sdim#include "llvm/Support/raw_ostream.h" 49327952Sdim#include <algorithm> 50327952Sdim#include <cassert> 51327952Sdim#include <cstdint> 52327952Sdim#include <memory> 53317017Sdim#include <unordered_map> 54317017Sdim 55317017Sdimusing namespace llvm; 56317017Sdim 57317017Sdim#define DEBUG_TYPE "si-peephole-sdwa" 58317017Sdim 59317017SdimSTATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found."); 60317017SdimSTATISTIC(NumSDWAInstructionsPeepholed, 61317017Sdim "Number of instruction converted to SDWA."); 62317017Sdim 63317017Sdimnamespace { 64317017Sdim 65317017Sdimclass SDWAOperand; 66327952Sdimclass SDWADstOperand; 67317017Sdim 68317017Sdimclass SIPeepholeSDWA : public MachineFunctionPass { 69318681Sdimpublic: 70327952Sdim using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; 71318681Sdim 72317017Sdimprivate: 73317017Sdim MachineRegisterInfo *MRI; 74317017Sdim const SIRegisterInfo *TRI; 75317017Sdim const SIInstrInfo *TII; 76317017Sdim 77317017Sdim std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; 78318681Sdim std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches; 79319250Sdim SmallVector<MachineInstr *, 8> ConvertedInstructions; 80317017Sdim 81317017Sdim Optional<int64_t> foldToImm(const MachineOperand &Op) const; 82317017Sdim 83317017Sdimpublic: 84317017Sdim static char ID; 85317017Sdim 86317017Sdim SIPeepholeSDWA() : MachineFunctionPass(ID) { 87317017Sdim initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); 88317017Sdim } 89317017Sdim 90317017Sdim bool runOnMachineFunction(MachineFunction &MF) override; 91341825Sdim void matchSDWAOperands(MachineBasicBlock &MBB); 92327952Sdim std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); 93344779Sdim bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; 94344779Sdim void pseudoOpConvertToVOP2(MachineInstr &MI, 95344779Sdim const GCNSubtarget &ST) const; 96317017Sdim bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); 97341825Sdim void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; 98317017Sdim 99317017Sdim StringRef getPassName() const override { return "SI Peephole SDWA"; } 100317017Sdim 101317017Sdim void getAnalysisUsage(AnalysisUsage &AU) const override { 102317017Sdim AU.setPreservesCFG(); 103317017Sdim MachineFunctionPass::getAnalysisUsage(AU); 104317017Sdim } 105317017Sdim}; 106317017Sdim 107317017Sdimclass SDWAOperand { 108317017Sdimprivate: 109317017Sdim MachineOperand *Target; // Operand that would be used in converted instruction 110317017Sdim MachineOperand *Replaced; // Operand that would be replace by Target 111317017Sdim 112317017Sdimpublic: 113317017Sdim SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) 114317017Sdim : Target(TargetOp), Replaced(ReplacedOp) { 115317017Sdim assert(Target->isReg()); 116317017Sdim assert(Replaced->isReg()); 117317017Sdim } 118317017Sdim 119327952Sdim virtual ~SDWAOperand() = default; 120317017Sdim 121317017Sdim virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; 122317017Sdim virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; 123317017Sdim 124317017Sdim MachineOperand *getTargetOperand() const { return Target; } 125317017Sdim MachineOperand *getReplacedOperand() const { return Replaced; } 126317017Sdim MachineInstr *getParentInst() const { return Target->getParent(); } 127327952Sdim 128317017Sdim MachineRegisterInfo *getMRI() const { 129317017Sdim return &getParentInst()->getParent()->getParent()->getRegInfo(); 130317017Sdim } 131327952Sdim 132327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 133327952Sdim virtual void print(raw_ostream& OS) const = 0; 134327952Sdim void dump() const { print(dbgs()); } 135327952Sdim#endif 136317017Sdim}; 137317017Sdim 138317017Sdimusing namespace AMDGPU::SDWA; 139317017Sdim 140317017Sdimclass SDWASrcOperand : public SDWAOperand { 141317017Sdimprivate: 142317017Sdim SdwaSel SrcSel; 143317017Sdim bool Abs; 144317017Sdim bool Neg; 145317017Sdim bool Sext; 146317017Sdim 147317017Sdimpublic: 148317017Sdim SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 149317017Sdim SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, 150317017Sdim bool Sext_ = false) 151327952Sdim : SDWAOperand(TargetOp, ReplacedOp), 152327952Sdim SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} 153317017Sdim 154327952Sdim MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 155327952Sdim bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 156317017Sdim 157317017Sdim SdwaSel getSrcSel() const { return SrcSel; } 158317017Sdim bool getAbs() const { return Abs; } 159317017Sdim bool getNeg() const { return Neg; } 160317017Sdim bool getSext() const { return Sext; } 161317017Sdim 162319799Sdim uint64_t getSrcMods(const SIInstrInfo *TII, 163319799Sdim const MachineOperand *SrcOp) const; 164327952Sdim 165327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 166327952Sdim void print(raw_ostream& OS) const override; 167327952Sdim#endif 168317017Sdim}; 169317017Sdim 170317017Sdimclass SDWADstOperand : public SDWAOperand { 171317017Sdimprivate: 172317017Sdim SdwaSel DstSel; 173317017Sdim DstUnused DstUn; 174317017Sdim 175317017Sdimpublic: 176327952Sdim 177317017Sdim SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 178317017Sdim SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) 179327952Sdim : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} 180317017Sdim 181327952Sdim MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 182327952Sdim bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 183317017Sdim 184317017Sdim SdwaSel getDstSel() const { return DstSel; } 185317017Sdim DstUnused getDstUnused() const { return DstUn; } 186327952Sdim 187327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 188327952Sdim void print(raw_ostream& OS) const override; 189327952Sdim#endif 190317017Sdim}; 191317017Sdim 192327952Sdimclass SDWADstPreserveOperand : public SDWADstOperand { 193327952Sdimprivate: 194327952Sdim MachineOperand *Preserve; 195317017Sdim 196327952Sdimpublic: 197327952Sdim SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 198327952Sdim MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) 199327952Sdim : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), 200327952Sdim Preserve(PreserveOp) {} 201327952Sdim 202327952Sdim bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 203327952Sdim 204327952Sdim MachineOperand *getPreservedOperand() const { return Preserve; } 205327952Sdim 206327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 207327952Sdim void print(raw_ostream& OS) const override; 208327952Sdim#endif 209327952Sdim}; 210327952Sdim 211327952Sdim} // end anonymous namespace 212327952Sdim 213317017SdimINITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false) 214317017Sdim 215317017Sdimchar SIPeepholeSDWA::ID = 0; 216317017Sdim 217317017Sdimchar &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; 218317017Sdim 219317017SdimFunctionPass *llvm::createSIPeepholeSDWAPass() { 220317017Sdim return new SIPeepholeSDWA(); 221317017Sdim} 222317017Sdim 223317017Sdim 224327952Sdim#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 225341825Sdimstatic raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { 226317017Sdim switch(Sel) { 227317017Sdim case BYTE_0: OS << "BYTE_0"; break; 228317017Sdim case BYTE_1: OS << "BYTE_1"; break; 229317017Sdim case BYTE_2: OS << "BYTE_2"; break; 230317017Sdim case BYTE_3: OS << "BYTE_3"; break; 231317017Sdim case WORD_0: OS << "WORD_0"; break; 232317017Sdim case WORD_1: OS << "WORD_1"; break; 233317017Sdim case DWORD: OS << "DWORD"; break; 234317017Sdim } 235317017Sdim return OS; 236317017Sdim} 237317017Sdim 238317017Sdimstatic raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { 239317017Sdim switch(Un) { 240317017Sdim case UNUSED_PAD: OS << "UNUSED_PAD"; break; 241317017Sdim case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; 242317017Sdim case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; 243317017Sdim } 244317017Sdim return OS; 245317017Sdim} 246317017Sdim 247327952Sdimstatic raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { 248327952Sdim Operand.print(OS); 249317017Sdim return OS; 250317017Sdim} 251317017Sdim 252327952SdimLLVM_DUMP_METHOD 253327952Sdimvoid SDWASrcOperand::print(raw_ostream& OS) const { 254327952Sdim OS << "SDWA src: " << *getTargetOperand() 255327952Sdim << " src_sel:" << getSrcSel() 256327952Sdim << " abs:" << getAbs() << " neg:" << getNeg() 257327952Sdim << " sext:" << getSext() << '\n'; 258317017Sdim} 259317017Sdim 260327952SdimLLVM_DUMP_METHOD 261327952Sdimvoid SDWADstOperand::print(raw_ostream& OS) const { 262327952Sdim OS << "SDWA dst: " << *getTargetOperand() 263327952Sdim << " dst_sel:" << getDstSel() 264327952Sdim << " dst_unused:" << getDstUnused() << '\n'; 265327952Sdim} 266327952Sdim 267327952SdimLLVM_DUMP_METHOD 268327952Sdimvoid SDWADstPreserveOperand::print(raw_ostream& OS) const { 269327952Sdim OS << "SDWA preserve dst: " << *getTargetOperand() 270327952Sdim << " dst_sel:" << getDstSel() 271327952Sdim << " preserve:" << *getPreservedOperand() << '\n'; 272327952Sdim} 273327952Sdim 274317017Sdim#endif 275317017Sdim 276317017Sdimstatic void copyRegOperand(MachineOperand &To, const MachineOperand &From) { 277317017Sdim assert(To.isReg() && From.isReg()); 278317017Sdim To.setReg(From.getReg()); 279317017Sdim To.setSubReg(From.getSubReg()); 280317017Sdim To.setIsUndef(From.isUndef()); 281317017Sdim if (To.isUse()) { 282317017Sdim To.setIsKill(From.isKill()); 283317017Sdim } else { 284317017Sdim To.setIsDead(From.isDead()); 285317017Sdim } 286317017Sdim} 287317017Sdim 288317017Sdimstatic bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { 289317017Sdim return LHS.isReg() && 290317017Sdim RHS.isReg() && 291317017Sdim LHS.getReg() == RHS.getReg() && 292317017Sdim LHS.getSubReg() == RHS.getSubReg(); 293317017Sdim} 294317017Sdim 295327952Sdimstatic MachineOperand *findSingleRegUse(const MachineOperand *Reg, 296327952Sdim const MachineRegisterInfo *MRI) { 297327952Sdim if (!Reg->isReg() || !Reg->isDef()) 298327952Sdim return nullptr; 299320397Sdim 300327952Sdim MachineOperand *ResMO = nullptr; 301327952Sdim for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) { 302327952Sdim // If there exist use of subreg of Reg then return nullptr 303327952Sdim if (!isSameReg(UseMO, *Reg)) 304327952Sdim return nullptr; 305317017Sdim 306327952Sdim // Check that there is only one instruction that uses Reg 307327952Sdim if (!ResMO) { 308327952Sdim ResMO = &UseMO; 309327952Sdim } else if (ResMO->getParent() != UseMO.getParent()) { 310327952Sdim return nullptr; 311327952Sdim } 312327952Sdim } 313317017Sdim 314327952Sdim return ResMO; 315327952Sdim} 316317017Sdim 317327952Sdimstatic MachineOperand *findSingleRegDef(const MachineOperand *Reg, 318327952Sdim const MachineRegisterInfo *MRI) { 319327952Sdim if (!Reg->isReg()) 320327952Sdim return nullptr; 321327952Sdim 322327952Sdim MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg()); 323327952Sdim if (!DefInstr) 324327952Sdim return nullptr; 325327952Sdim 326327952Sdim for (auto &DefMO : DefInstr->defs()) { 327327952Sdim if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) 328327952Sdim return &DefMO; 329327952Sdim } 330327952Sdim 331327952Sdim // Ignore implicit defs. 332327952Sdim return nullptr; 333317017Sdim} 334317017Sdim 335319799Sdimuint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, 336319799Sdim const MachineOperand *SrcOp) const { 337317017Sdim uint64_t Mods = 0; 338319799Sdim const auto *MI = SrcOp->getParent(); 339319799Sdim if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) { 340319799Sdim if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { 341319799Sdim Mods = Mod->getImm(); 342319799Sdim } 343319799Sdim } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) { 344319799Sdim if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) { 345319799Sdim Mods = Mod->getImm(); 346319799Sdim } 347319799Sdim } 348317017Sdim if (Abs || Neg) { 349317017Sdim assert(!Sext && 350317017Sdim "Float and integer src modifiers can't be set simulteniously"); 351317017Sdim Mods |= Abs ? SISrcMods::ABS : 0; 352319799Sdim Mods ^= Neg ? SISrcMods::NEG : 0; 353317017Sdim } else if (Sext) { 354317017Sdim Mods |= SISrcMods::SEXT; 355317017Sdim } 356317017Sdim 357317017Sdim return Mods; 358317017Sdim} 359317017Sdim 360317017SdimMachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { 361317017Sdim // For SDWA src operand potential instruction is one that use register 362317017Sdim // defined by parent instruction 363327952Sdim MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); 364327952Sdim if (!PotentialMO) 365327952Sdim return nullptr; 366317017Sdim 367327952Sdim return PotentialMO->getParent(); 368317017Sdim} 369317017Sdim 370317017Sdimbool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 371317017Sdim // Find operand in instruction that matches source operand and replace it with 372317017Sdim // target operand. Set corresponding src_sel 373341825Sdim bool IsPreserveSrc = false; 374317017Sdim MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 375317017Sdim MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 376317017Sdim MachineOperand *SrcMods = 377317017Sdim TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 378319250Sdim assert(Src && (Src->isReg() || Src->isImm())); 379317017Sdim if (!isSameReg(*Src, *getReplacedOperand())) { 380341825Sdim // If this is not src0 then it could be src1 381317017Sdim Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 382317017Sdim SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 383317017Sdim SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 384317017Sdim 385341825Sdim if (!Src || 386341825Sdim !isSameReg(*Src, *getReplacedOperand())) { 387341825Sdim // It's possible this Src is a tied operand for 388341825Sdim // UNUSED_PRESERVE, in which case we can either 389341825Sdim // abandon the peephole attempt, or if legal we can 390341825Sdim // copy the target operand into the tied slot 391341825Sdim // if the preserve operation will effectively cause the same 392341825Sdim // result by overwriting the rest of the dst. 393341825Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 394341825Sdim MachineOperand *DstUnused = 395341825Sdim TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 396341825Sdim 397341825Sdim if (Dst && 398341825Sdim DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { 399341825Sdim // This will work if the tied src is acessing WORD_0, and the dst is 400341825Sdim // writing WORD_1. Modifiers don't matter because all the bits that 401341825Sdim // would be impacted are being overwritten by the dst. 402341825Sdim // Any other case will not work. 403341825Sdim SdwaSel DstSel = static_cast<SdwaSel>( 404341825Sdim TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); 405341825Sdim if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && 406341825Sdim getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { 407341825Sdim IsPreserveSrc = true; 408341825Sdim auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 409341825Sdim AMDGPU::OpName::vdst); 410341825Sdim auto TiedIdx = MI.findTiedOperandIdx(DstIdx); 411341825Sdim Src = &MI.getOperand(TiedIdx); 412341825Sdim SrcSel = nullptr; 413341825Sdim SrcMods = nullptr; 414341825Sdim } else { 415341825Sdim // Not legal to convert this src 416341825Sdim return false; 417341825Sdim } 418341825Sdim } 419341825Sdim } 420317017Sdim assert(Src && Src->isReg()); 421317017Sdim 422317017Sdim if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 423317017Sdim MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 424327952Sdim !isSameReg(*Src, *getReplacedOperand())) { 425317017Sdim // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to 426317017Sdim // src2. This is not allowed. 427317017Sdim return false; 428317017Sdim } 429317017Sdim 430341825Sdim assert(isSameReg(*Src, *getReplacedOperand()) && 431341825Sdim (IsPreserveSrc || (SrcSel && SrcMods))); 432317017Sdim } 433317017Sdim copyRegOperand(*Src, *getTargetOperand()); 434341825Sdim if (!IsPreserveSrc) { 435341825Sdim SrcSel->setImm(getSrcSel()); 436341825Sdim SrcMods->setImm(getSrcMods(TII, Src)); 437341825Sdim } 438317017Sdim getTargetOperand()->setIsKill(false); 439317017Sdim return true; 440317017Sdim} 441317017Sdim 442317017SdimMachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { 443317017Sdim // For SDWA dst operand potential instruction is one that defines register 444317017Sdim // that this operand uses 445317017Sdim MachineRegisterInfo *MRI = getMRI(); 446317017Sdim MachineInstr *ParentMI = getParentInst(); 447317017Sdim 448327952Sdim MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI); 449327952Sdim if (!PotentialMO) 450327952Sdim return nullptr; 451317017Sdim 452327952Sdim // Check that ParentMI is the only instruction that uses replaced register 453327952Sdim for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) { 454327952Sdim if (&UseInst != ParentMI) 455317017Sdim return nullptr; 456317017Sdim } 457317017Sdim 458327952Sdim return PotentialMO->getParent(); 459317017Sdim} 460317017Sdim 461317017Sdimbool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 462317017Sdim // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused 463317017Sdim 464317017Sdim if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 465317017Sdim MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 466317017Sdim getDstSel() != AMDGPU::SDWA::DWORD) { 467317017Sdim // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD 468317017Sdim return false; 469317017Sdim } 470317017Sdim 471317017Sdim MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 472317017Sdim assert(Operand && 473317017Sdim Operand->isReg() && 474317017Sdim isSameReg(*Operand, *getReplacedOperand())); 475317017Sdim copyRegOperand(*Operand, *getTargetOperand()); 476317017Sdim MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 477317017Sdim assert(DstSel); 478317017Sdim DstSel->setImm(getDstSel()); 479317017Sdim MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 480317017Sdim assert(DstUnused); 481317017Sdim DstUnused->setImm(getDstUnused()); 482317017Sdim 483317017Sdim // Remove original instruction because it would conflict with our new 484317017Sdim // instruction by register definition 485317017Sdim getParentInst()->eraseFromParent(); 486317017Sdim return true; 487317017Sdim} 488317017Sdim 489327952Sdimbool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, 490327952Sdim const SIInstrInfo *TII) { 491327952Sdim // MI should be moved right before v_or_b32. 492327952Sdim // For this we should clear all kill flags on uses of MI src-operands or else 493327952Sdim // we can encounter problem with use of killed operand. 494327952Sdim for (MachineOperand &MO : MI.uses()) { 495327952Sdim if (!MO.isReg()) 496327952Sdim continue; 497327952Sdim getMRI()->clearKillFlags(MO.getReg()); 498327952Sdim } 499327952Sdim 500327952Sdim // Move MI before v_or_b32 501327952Sdim auto MBB = MI.getParent(); 502327952Sdim MBB->remove(&MI); 503327952Sdim MBB->insert(getParentInst(), &MI); 504327952Sdim 505327952Sdim // Add Implicit use of preserved register 506327952Sdim MachineInstrBuilder MIB(*MBB->getParent(), MI); 507327952Sdim MIB.addReg(getPreservedOperand()->getReg(), 508327952Sdim RegState::ImplicitKill, 509327952Sdim getPreservedOperand()->getSubReg()); 510327952Sdim 511327952Sdim // Tie dst to implicit use 512327952Sdim MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst), 513327952Sdim MI.getNumOperands() - 1); 514327952Sdim 515327952Sdim // Convert MI as any other SDWADstOperand and remove v_or_b32 516327952Sdim return SDWADstOperand::convertToSDWA(MI, TII); 517327952Sdim} 518327952Sdim 519317017SdimOptional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { 520317017Sdim if (Op.isImm()) { 521317017Sdim return Op.getImm(); 522317017Sdim } 523317017Sdim 524317017Sdim // If this is not immediate then it can be copy of immediate value, e.g.: 525327952Sdim // %1 = S_MOV_B32 255; 526317017Sdim if (Op.isReg()) { 527317017Sdim for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { 528317017Sdim if (!isSameReg(Op, Def)) 529317017Sdim continue; 530317017Sdim 531317017Sdim const MachineInstr *DefInst = Def.getParent(); 532317017Sdim if (!TII->isFoldableCopy(*DefInst)) 533317017Sdim return None; 534317017Sdim 535317017Sdim const MachineOperand &Copied = DefInst->getOperand(1); 536317017Sdim if (!Copied.isImm()) 537317017Sdim return None; 538317017Sdim 539317017Sdim return Copied.getImm(); 540317017Sdim } 541317017Sdim } 542317017Sdim 543317017Sdim return None; 544317017Sdim} 545317017Sdim 546327952Sdimstd::unique_ptr<SDWAOperand> 547327952SdimSIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { 548327952Sdim unsigned Opcode = MI.getOpcode(); 549327952Sdim switch (Opcode) { 550327952Sdim case AMDGPU::V_LSHRREV_B32_e32: 551327952Sdim case AMDGPU::V_ASHRREV_I32_e32: 552327952Sdim case AMDGPU::V_LSHLREV_B32_e32: 553327952Sdim case AMDGPU::V_LSHRREV_B32_e64: 554327952Sdim case AMDGPU::V_ASHRREV_I32_e64: 555327952Sdim case AMDGPU::V_LSHLREV_B32_e64: { 556327952Sdim // from: v_lshrrev_b32_e32 v1, 16/24, v0 557327952Sdim // to SDWA src:v0 src_sel:WORD_1/BYTE_3 558317017Sdim 559327952Sdim // from: v_ashrrev_i32_e32 v1, 16/24, v0 560327952Sdim // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 561317017Sdim 562327952Sdim // from: v_lshlrev_b32_e32 v1, 16/24, v0 563327952Sdim // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD 564327952Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 565327952Sdim auto Imm = foldToImm(*Src0); 566327952Sdim if (!Imm) 567327952Sdim break; 568317017Sdim 569327952Sdim if (*Imm != 16 && *Imm != 24) 570327952Sdim break; 571317017Sdim 572327952Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 573327952Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 574327952Sdim if (TRI->isPhysicalRegister(Src1->getReg()) || 575327952Sdim TRI->isPhysicalRegister(Dst->getReg())) 576327952Sdim break; 577317017Sdim 578327952Sdim if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || 579327952Sdim Opcode == AMDGPU::V_LSHLREV_B32_e64) { 580327952Sdim return make_unique<SDWADstOperand>( 581327952Sdim Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); 582327952Sdim } else { 583327952Sdim return make_unique<SDWASrcOperand>( 584327952Sdim Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, 585327952Sdim Opcode != AMDGPU::V_LSHRREV_B32_e32 && 586327952Sdim Opcode != AMDGPU::V_LSHRREV_B32_e64); 587327952Sdim } 588327952Sdim break; 589327952Sdim } 590317017Sdim 591327952Sdim case AMDGPU::V_LSHRREV_B16_e32: 592327952Sdim case AMDGPU::V_ASHRREV_I16_e32: 593327952Sdim case AMDGPU::V_LSHLREV_B16_e32: 594327952Sdim case AMDGPU::V_LSHRREV_B16_e64: 595327952Sdim case AMDGPU::V_ASHRREV_I16_e64: 596327952Sdim case AMDGPU::V_LSHLREV_B16_e64: { 597327952Sdim // from: v_lshrrev_b16_e32 v1, 8, v0 598327952Sdim // to SDWA src:v0 src_sel:BYTE_1 599317017Sdim 600327952Sdim // from: v_ashrrev_i16_e32 v1, 8, v0 601327952Sdim // to SDWA src:v0 src_sel:BYTE_1 sext:1 602317017Sdim 603327952Sdim // from: v_lshlrev_b16_e32 v1, 8, v0 604327952Sdim // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 605327952Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 606327952Sdim auto Imm = foldToImm(*Src0); 607327952Sdim if (!Imm || *Imm != 8) 608327952Sdim break; 609317017Sdim 610327952Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 611327952Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 612317017Sdim 613327952Sdim if (TRI->isPhysicalRegister(Src1->getReg()) || 614327952Sdim TRI->isPhysicalRegister(Dst->getReg())) 615327952Sdim break; 616317017Sdim 617327952Sdim if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || 618327952Sdim Opcode == AMDGPU::V_LSHLREV_B16_e64) { 619327952Sdim return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); 620327952Sdim } else { 621327952Sdim return make_unique<SDWASrcOperand>( 622327952Sdim Src1, Dst, BYTE_1, false, false, 623327952Sdim Opcode != AMDGPU::V_LSHRREV_B16_e32 && 624327952Sdim Opcode != AMDGPU::V_LSHRREV_B16_e64); 625327952Sdim } 626327952Sdim break; 627327952Sdim } 628317017Sdim 629327952Sdim case AMDGPU::V_BFE_I32: 630327952Sdim case AMDGPU::V_BFE_U32: { 631327952Sdim // e.g.: 632327952Sdim // from: v_bfe_u32 v1, v0, 8, 8 633327952Sdim // to SDWA src:v0 src_sel:BYTE_1 634317017Sdim 635327952Sdim // offset | width | src_sel 636327952Sdim // ------------------------ 637327952Sdim // 0 | 8 | BYTE_0 638327952Sdim // 0 | 16 | WORD_0 639327952Sdim // 0 | 32 | DWORD ? 640327952Sdim // 8 | 8 | BYTE_1 641327952Sdim // 16 | 8 | BYTE_2 642327952Sdim // 16 | 16 | WORD_1 643327952Sdim // 24 | 8 | BYTE_3 644317017Sdim 645327952Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 646327952Sdim auto Offset = foldToImm(*Src1); 647327952Sdim if (!Offset) 648327952Sdim break; 649317017Sdim 650327952Sdim MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 651327952Sdim auto Width = foldToImm(*Src2); 652327952Sdim if (!Width) 653327952Sdim break; 654317017Sdim 655327952Sdim SdwaSel SrcSel = DWORD; 656317017Sdim 657327952Sdim if (*Offset == 0 && *Width == 8) 658327952Sdim SrcSel = BYTE_0; 659327952Sdim else if (*Offset == 0 && *Width == 16) 660327952Sdim SrcSel = WORD_0; 661327952Sdim else if (*Offset == 0 && *Width == 32) 662327952Sdim SrcSel = DWORD; 663327952Sdim else if (*Offset == 8 && *Width == 8) 664327952Sdim SrcSel = BYTE_1; 665327952Sdim else if (*Offset == 16 && *Width == 8) 666327952Sdim SrcSel = BYTE_2; 667327952Sdim else if (*Offset == 16 && *Width == 16) 668327952Sdim SrcSel = WORD_1; 669327952Sdim else if (*Offset == 24 && *Width == 8) 670327952Sdim SrcSel = BYTE_3; 671327952Sdim else 672327952Sdim break; 673317017Sdim 674327952Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 675327952Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 676320397Sdim 677327952Sdim if (TRI->isPhysicalRegister(Src0->getReg()) || 678327952Sdim TRI->isPhysicalRegister(Dst->getReg())) 679327952Sdim break; 680317017Sdim 681327952Sdim return make_unique<SDWASrcOperand>( 682327952Sdim Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32); 683327952Sdim } 684327952Sdim 685327952Sdim case AMDGPU::V_AND_B32_e32: 686327952Sdim case AMDGPU::V_AND_B32_e64: { 687327952Sdim // e.g.: 688327952Sdim // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 689327952Sdim // to SDWA src:v0 src_sel:WORD_0/BYTE_0 690327952Sdim 691327952Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 692327952Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 693327952Sdim auto ValSrc = Src1; 694327952Sdim auto Imm = foldToImm(*Src0); 695327952Sdim 696327952Sdim if (!Imm) { 697327952Sdim Imm = foldToImm(*Src1); 698327952Sdim ValSrc = Src0; 699327952Sdim } 700327952Sdim 701327952Sdim if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) 702327952Sdim break; 703327952Sdim 704327952Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 705327952Sdim 706341825Sdim if (TRI->isPhysicalRegister(ValSrc->getReg()) || 707327952Sdim TRI->isPhysicalRegister(Dst->getReg())) 708327952Sdim break; 709327952Sdim 710327952Sdim return make_unique<SDWASrcOperand>( 711327952Sdim ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); 712327952Sdim } 713327952Sdim 714327952Sdim case AMDGPU::V_OR_B32_e32: 715327952Sdim case AMDGPU::V_OR_B32_e64: { 716327952Sdim // Patterns for dst_unused:UNUSED_PRESERVE. 717327952Sdim // e.g., from: 718327952Sdim // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 719327952Sdim // src1_sel:WORD_1 src2_sel:WORD1 720327952Sdim // v_add_f16_e32 v3, v1, v2 721327952Sdim // v_or_b32_e32 v4, v0, v3 722327952Sdim // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 723327952Sdim 724327952Sdim // Check if one of operands of v_or_b32 is SDWA instruction 725327952Sdim using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>; 726327952Sdim auto CheckOROperandsForSDWA = 727327952Sdim [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { 728327952Sdim if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) 729327952Sdim return CheckRetType(None); 730327952Sdim 731327952Sdim MachineOperand *Op1Def = findSingleRegDef(Op1, MRI); 732327952Sdim if (!Op1Def) 733327952Sdim return CheckRetType(None); 734327952Sdim 735327952Sdim MachineInstr *Op1Inst = Op1Def->getParent(); 736327952Sdim if (!TII->isSDWA(*Op1Inst)) 737327952Sdim return CheckRetType(None); 738327952Sdim 739327952Sdim MachineOperand *Op2Def = findSingleRegDef(Op2, MRI); 740327952Sdim if (!Op2Def) 741327952Sdim return CheckRetType(None); 742327952Sdim 743327952Sdim return CheckRetType(std::make_pair(Op1Def, Op2Def)); 744327952Sdim }; 745327952Sdim 746327952Sdim MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 747327952Sdim MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 748327952Sdim assert(OrSDWA && OrOther); 749327952Sdim auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); 750327952Sdim if (!Res) { 751327952Sdim OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 752327952Sdim OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 753327952Sdim assert(OrSDWA && OrOther); 754327952Sdim Res = CheckOROperandsForSDWA(OrSDWA, OrOther); 755327952Sdim if (!Res) 756317017Sdim break; 757327952Sdim } 758317017Sdim 759327952Sdim MachineOperand *OrSDWADef = Res->first; 760327952Sdim MachineOperand *OrOtherDef = Res->second; 761327952Sdim assert(OrSDWADef && OrOtherDef); 762317017Sdim 763327952Sdim MachineInstr *SDWAInst = OrSDWADef->getParent(); 764327952Sdim MachineInstr *OtherInst = OrOtherDef->getParent(); 765319799Sdim 766327952Sdim // Check that OtherInstr is actually bitwise compatible with SDWAInst = their 767327952Sdim // destination patterns don't overlap. Compatible instruction can be either 768327952Sdim // regular instruction with compatible bitness or SDWA instruction with 769327952Sdim // correct dst_sel 770327952Sdim // SDWAInst | OtherInst bitness / OtherInst dst_sel 771327952Sdim // ----------------------------------------------------- 772327952Sdim // DWORD | no / no 773327952Sdim // WORD_0 | no / BYTE_2/3, WORD_1 774327952Sdim // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0 775327952Sdim // BYTE_0 | no / BYTE_1/2/3, WORD_1 776327952Sdim // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1 777327952Sdim // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0 778327952Sdim // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0 779327952Sdim // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK 780327952Sdim // but v_add_f32 is not. 781317017Sdim 782327952Sdim // TODO: add support for non-SDWA instructions as OtherInst. 783327952Sdim // For now this only works with SDWA instructions. For regular instructions 784341825Sdim // there is no way to determine if the instruction writes only 8/16/24-bit 785341825Sdim // out of full register size and all registers are at min 32-bit wide. 786327952Sdim if (!TII->isSDWA(*OtherInst)) 787327952Sdim break; 788320397Sdim 789327952Sdim SdwaSel DstSel = static_cast<SdwaSel>( 790327952Sdim TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));; 791327952Sdim SdwaSel OtherDstSel = static_cast<SdwaSel>( 792327952Sdim TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); 793317017Sdim 794327952Sdim bool DstSelAgree = false; 795327952Sdim switch (DstSel) { 796327952Sdim case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || 797327952Sdim (OtherDstSel == BYTE_3) || 798327952Sdim (OtherDstSel == WORD_1)); 799327952Sdim break; 800327952Sdim case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 801327952Sdim (OtherDstSel == BYTE_1) || 802327952Sdim (OtherDstSel == WORD_0)); 803327952Sdim break; 804327952Sdim case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || 805327952Sdim (OtherDstSel == BYTE_2) || 806327952Sdim (OtherDstSel == BYTE_3) || 807327952Sdim (OtherDstSel == WORD_1)); 808327952Sdim break; 809327952Sdim case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 810327952Sdim (OtherDstSel == BYTE_2) || 811327952Sdim (OtherDstSel == BYTE_3) || 812327952Sdim (OtherDstSel == WORD_1)); 813327952Sdim break; 814327952Sdim case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || 815327952Sdim (OtherDstSel == BYTE_1) || 816327952Sdim (OtherDstSel == BYTE_3) || 817327952Sdim (OtherDstSel == WORD_0)); 818327952Sdim break; 819327952Sdim case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || 820327952Sdim (OtherDstSel == BYTE_1) || 821327952Sdim (OtherDstSel == BYTE_2) || 822327952Sdim (OtherDstSel == WORD_0)); 823327952Sdim break; 824327952Sdim default: DstSelAgree = false; 825327952Sdim } 826327952Sdim 827327952Sdim if (!DstSelAgree) 828327952Sdim break; 829327952Sdim 830327952Sdim // Also OtherInst dst_unused should be UNUSED_PAD 831327952Sdim DstUnused OtherDstUnused = static_cast<DstUnused>( 832327952Sdim TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused)); 833327952Sdim if (OtherDstUnused != DstUnused::UNUSED_PAD) 834327952Sdim break; 835327952Sdim 836327952Sdim // Create DstPreserveOperand 837327952Sdim MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 838327952Sdim assert(OrDst && OrDst->isReg()); 839327952Sdim 840327952Sdim return make_unique<SDWADstPreserveOperand>( 841327952Sdim OrDst, OrSDWADef, OrOtherDef, DstSel); 842327952Sdim 843327952Sdim } 844327952Sdim } 845327952Sdim 846327952Sdim return std::unique_ptr<SDWAOperand>(nullptr); 847327952Sdim} 848327952Sdim 849341825Sdimvoid SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { 850341825Sdim for (MachineInstr &MI : MBB) { 851341825Sdim if (auto Operand = matchSDWAOperand(MI)) { 852341825Sdim LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); 853341825Sdim SDWAOperands[&MI] = std::move(Operand); 854341825Sdim ++NumSDWAPatternsFound; 855317017Sdim } 856317017Sdim } 857317017Sdim} 858317017Sdim 859344779Sdim// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and 860344779Sdim// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA 861344779Sdim// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa. 862344779Sdim// 863344779Sdim// We are transforming from a VOP3 into a VOP2 form of the instruction. 864344779Sdim// %19:vgpr_32 = V_AND_B32_e32 255, 865344779Sdim// killed %16:vgpr_32, implicit $exec 866344779Sdim// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64 867344779Sdim// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec 868344779Sdim// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 869344779Sdim// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec 870344779Sdim// 871344779Sdim// becomes 872344779Sdim// %47:vgpr_32 = V_ADD_I32_sdwa 873344779Sdim// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, 874344779Sdim// implicit-def $vcc, implicit $exec 875344779Sdim// %48:vgpr_32 = V_ADDC_U32_e32 876344779Sdim// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec 877344779Sdimvoid SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, 878344779Sdim const GCNSubtarget &ST) const { 879344779Sdim int Opc = MI.getOpcode(); 880344779Sdim assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) && 881344779Sdim "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64"); 882344779Sdim 883344779Sdim // Can the candidate MI be shrunk? 884344779Sdim if (!TII->canShrink(MI, *MRI)) 885344779Sdim return; 886344779Sdim Opc = AMDGPU::getVOPe32(Opc); 887344779Sdim // Find the related ADD instruction. 888344779Sdim const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); 889344779Sdim if (!Sdst) 890344779Sdim return; 891344779Sdim MachineOperand *NextOp = findSingleRegUse(Sdst, MRI); 892344779Sdim if (!NextOp) 893344779Sdim return; 894344779Sdim MachineInstr &MISucc = *NextOp->getParent(); 895344779Sdim // Can the successor be shrunk? 896344779Sdim if (!TII->canShrink(MISucc, *MRI)) 897344779Sdim return; 898344779Sdim int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode()); 899344779Sdim // Make sure the carry in/out are subsequently unused. 900344779Sdim MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2); 901344779Sdim if (!CarryIn) 902344779Sdim return; 903344779Sdim MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst); 904344779Sdim if (!CarryOut) 905344779Sdim return; 906344779Sdim if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg())) 907344779Sdim return; 908344779Sdim // Make sure VCC or its subregs are dead before MI. 909344779Sdim MachineBasicBlock &MBB = *MI.getParent(); 910344779Sdim auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); 911344779Sdim if (Liveness != MachineBasicBlock::LQR_Dead) 912344779Sdim return; 913344779Sdim // Check if VCC is referenced in range of (MI,MISucc]. 914344779Sdim for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator(); 915344779Sdim I != E; ++I) { 916344779Sdim if (I->modifiesRegister(AMDGPU::VCC, TRI)) 917344779Sdim return; 918344779Sdim } 919344779Sdim // Make the two new e32 instruction variants. 920344779Sdim // Replace MI with V_{SUB|ADD}_I32_e32 921344779Sdim auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc)); 922344779Sdim NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst)); 923344779Sdim NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); 924344779Sdim NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)); 925344779Sdim MI.eraseFromParent(); 926344779Sdim // Replace MISucc with V_{SUBB|ADDC}_U32_e32 927344779Sdim auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc)); 928344779Sdim NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst)); 929344779Sdim NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0)); 930344779Sdim NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1)); 931344779Sdim MISucc.eraseFromParent(); 932344779Sdim} 933344779Sdim 934344779Sdimbool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, 935341825Sdim const GCNSubtarget &ST) const { 936327952Sdim // Check if this is already an SDWA instruction 937327952Sdim unsigned Opc = MI.getOpcode(); 938327952Sdim if (TII->isSDWA(Opc)) 939327952Sdim return true; 940327952Sdim 941319250Sdim // Check if this instruction has opcode that supports SDWA 942320397Sdim if (AMDGPU::getSDWAOp(Opc) == -1) 943320397Sdim Opc = AMDGPU::getVOPe32(Opc); 944320397Sdim 945327952Sdim if (AMDGPU::getSDWAOp(Opc) == -1) 946320397Sdim return false; 947320397Sdim 948320397Sdim if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) 949320397Sdim return false; 950320397Sdim 951320397Sdim if (TII->isVOPC(Opc)) { 952320397Sdim if (!ST.hasSDWASdst()) { 953320397Sdim const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); 954320397Sdim if (SDst && SDst->getReg() != AMDGPU::VCC) 955320397Sdim return false; 956320397Sdim } 957320397Sdim 958320572Sdim if (!ST.hasSDWAOutModsVOPC() && 959320572Sdim (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || 960320572Sdim TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) 961320397Sdim return false; 962320397Sdim 963320572Sdim } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || 964320572Sdim !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 965320397Sdim return false; 966320397Sdim } 967320397Sdim 968320397Sdim if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 || 969320397Sdim Opc == AMDGPU::V_MAC_F32_e32)) 970320397Sdim return false; 971320397Sdim 972341825Sdim // FIXME: has SDWA but require handling of implicit VCC use 973341825Sdim if (Opc == AMDGPU::V_CNDMASK_B32_e32) 974341825Sdim return false; 975341825Sdim 976320397Sdim return true; 977318681Sdim} 978318681Sdim 979318681Sdimbool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, 980318681Sdim const SDWAOperandsVector &SDWAOperands) { 981341825Sdim 982341825Sdim LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); 983341825Sdim 984317017Sdim // Convert to sdwa 985327952Sdim int SDWAOpcode; 986327952Sdim unsigned Opcode = MI.getOpcode(); 987327952Sdim if (TII->isSDWA(Opcode)) { 988327952Sdim SDWAOpcode = Opcode; 989327952Sdim } else { 990327952Sdim SDWAOpcode = AMDGPU::getSDWAOp(Opcode); 991327952Sdim if (SDWAOpcode == -1) 992327952Sdim SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode)); 993327952Sdim } 994317017Sdim assert(SDWAOpcode != -1); 995317017Sdim 996317017Sdim const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); 997317017Sdim 998317017Sdim // Create SDWA version of instruction MI and initialize its operands 999317017Sdim MachineInstrBuilder SDWAInst = 1000317017Sdim BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); 1001317017Sdim 1002320572Sdim // Copy dst, if it is present in original then should also be present in SDWA 1003320572Sdim MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 1004317017Sdim if (Dst) { 1005317017Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); 1006317017Sdim SDWAInst.add(*Dst); 1007320572Sdim } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) { 1008320397Sdim assert(Dst && 1009320397Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); 1010320397Sdim SDWAInst.add(*Dst); 1011320572Sdim } else { 1012320572Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); 1013320572Sdim SDWAInst.addReg(AMDGPU::VCC, RegState::Define); 1014317017Sdim } 1015317017Sdim 1016317017Sdim // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and 1017317017Sdim // src0_modifiers (except for v_nop_sdwa, but it can't get here) 1018317017Sdim MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 1019317017Sdim assert( 1020317017Sdim Src0 && 1021317017Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 && 1022317017Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1); 1023319799Sdim if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) 1024319799Sdim SDWAInst.addImm(Mod->getImm()); 1025319799Sdim else 1026319799Sdim SDWAInst.addImm(0); 1027317017Sdim SDWAInst.add(*Src0); 1028317017Sdim 1029317017Sdim // Copy src1 if present, initialize src1_modifiers. 1030317017Sdim MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 1031317017Sdim if (Src1) { 1032317017Sdim assert( 1033317017Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 && 1034317017Sdim AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1); 1035319799Sdim if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)) 1036319799Sdim SDWAInst.addImm(Mod->getImm()); 1037319799Sdim else 1038319799Sdim SDWAInst.addImm(0); 1039317017Sdim SDWAInst.add(*Src1); 1040317017Sdim } 1041317017Sdim 1042317017Sdim if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || 1043317017Sdim SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { 1044317017Sdim // v_mac_f16/32 has additional src2 operand tied to vdst 1045317017Sdim MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 1046317017Sdim assert(Src2); 1047317017Sdim SDWAInst.add(*Src2); 1048317017Sdim } 1049317017Sdim 1050320397Sdim // Copy clamp if present, initialize otherwise 1051317017Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); 1052320397Sdim MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); 1053320397Sdim if (Clamp) { 1054320397Sdim SDWAInst.add(*Clamp); 1055320397Sdim } else { 1056320397Sdim SDWAInst.addImm(0); 1057320397Sdim } 1058317017Sdim 1059320397Sdim // Copy omod if present, initialize otherwise if needed 1060320572Sdim if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { 1061320572Sdim MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); 1062320572Sdim if (OMod) { 1063320572Sdim SDWAInst.add(*OMod); 1064320572Sdim } else { 1065320572Sdim SDWAInst.addImm(0); 1066320572Sdim } 1067320397Sdim } 1068320397Sdim 1069327952Sdim // Copy dst_sel if present, initialize otherwise if needed 1070320572Sdim if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) { 1071327952Sdim MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 1072327952Sdim if (DstSel) { 1073327952Sdim SDWAInst.add(*DstSel); 1074327952Sdim } else { 1075327952Sdim SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1076327952Sdim } 1077320572Sdim } 1078320572Sdim 1079327952Sdim // Copy dst_unused if present, initialize otherwise if needed 1080320572Sdim if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) { 1081327952Sdim MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 1082327952Sdim if (DstUnused) { 1083327952Sdim SDWAInst.add(*DstUnused); 1084327952Sdim } else { 1085327952Sdim SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); 1086327952Sdim } 1087317017Sdim } 1088317017Sdim 1089327952Sdim // Copy src0_sel if present, initialize otherwise 1090317017Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1); 1091327952Sdim MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 1092327952Sdim if (Src0Sel) { 1093327952Sdim SDWAInst.add(*Src0Sel); 1094327952Sdim } else { 1095327952Sdim SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1096327952Sdim } 1097317017Sdim 1098327952Sdim // Copy src1_sel if present, initialize otherwise if needed 1099317017Sdim if (Src1) { 1100317017Sdim assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1); 1101327952Sdim MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 1102327952Sdim if (Src1Sel) { 1103327952Sdim SDWAInst.add(*Src1Sel); 1104327952Sdim } else { 1105327952Sdim SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1106327952Sdim } 1107317017Sdim } 1108317017Sdim 1109341825Sdim // Check for a preserved register that needs to be copied. 1110341825Sdim auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 1111341825Sdim if (DstUnused && 1112341825Sdim DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { 1113341825Sdim // We expect, if we are here, that the instruction was already in it's SDWA form, 1114341825Sdim // with a tied operand. 1115341825Sdim assert(Dst && Dst->isTied()); 1116341825Sdim assert(Opcode == static_cast<unsigned int>(SDWAOpcode)); 1117341825Sdim // We also expect a vdst, since sdst can't preserve. 1118341825Sdim auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst); 1119341825Sdim assert(PreserveDstIdx != -1); 1120341825Sdim 1121341825Sdim auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx); 1122341825Sdim auto Tied = MI.getOperand(TiedIdx); 1123341825Sdim 1124341825Sdim SDWAInst.add(Tied); 1125341825Sdim SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1); 1126341825Sdim } 1127341825Sdim 1128341825Sdim // Apply all sdwa operand patterns. 1129317017Sdim bool Converted = false; 1130317017Sdim for (auto &Operand : SDWAOperands) { 1131341825Sdim LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); 1132318681Sdim // There should be no intesection between SDWA operands and potential MIs 1133318681Sdim // e.g.: 1134318681Sdim // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 1135318681Sdim // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 1136318681Sdim // v_add_u32 v3, v4, v2 1137318681Sdim // 1138318681Sdim // In that example it is possible that we would fold 2nd instruction into 3rd 1139318681Sdim // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was 1140318681Sdim // already destroyed). So if SDWAOperand is also a potential MI then do not 1141318681Sdim // apply it. 1142318681Sdim if (PotentialMatches.count(Operand->getParentInst()) == 0) 1143318681Sdim Converted |= Operand->convertToSDWA(*SDWAInst, TII); 1144317017Sdim } 1145319250Sdim if (Converted) { 1146319250Sdim ConvertedInstructions.push_back(SDWAInst); 1147319250Sdim } else { 1148317017Sdim SDWAInst->eraseFromParent(); 1149317017Sdim return false; 1150317017Sdim } 1151317017Sdim 1152341825Sdim LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); 1153317017Sdim ++NumSDWAInstructionsPeepholed; 1154317017Sdim 1155317017Sdim MI.eraseFromParent(); 1156317017Sdim return true; 1157317017Sdim} 1158317017Sdim 1159319250Sdim// If an instruction was converted to SDWA it should not have immediates or SGPR 1160320397Sdim// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. 1161341825Sdimvoid SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, 1162341825Sdim const GCNSubtarget &ST) const { 1163319250Sdim const MCInstrDesc &Desc = TII->get(MI.getOpcode()); 1164320397Sdim unsigned ConstantBusCount = 0; 1165327952Sdim for (MachineOperand &Op : MI.explicit_uses()) { 1166319250Sdim if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) 1167319250Sdim continue; 1168320397Sdim 1169320397Sdim unsigned I = MI.getOperandNo(&Op); 1170319250Sdim if (Desc.OpInfo[I].RegClass == -1 || 1171319250Sdim !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) 1172319250Sdim continue; 1173320397Sdim 1174320397Sdim if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && 1175320397Sdim TRI->isSGPRReg(*MRI, Op.getReg())) { 1176320397Sdim ++ConstantBusCount; 1177320397Sdim continue; 1178320397Sdim } 1179320397Sdim 1180319250Sdim unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1181319250Sdim auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1182319250Sdim TII->get(AMDGPU::V_MOV_B32_e32), VGPR); 1183319250Sdim if (Op.isImm()) 1184319250Sdim Copy.addImm(Op.getImm()); 1185319250Sdim else if (Op.isReg()) 1186319250Sdim Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0, 1187319250Sdim Op.getSubReg()); 1188319250Sdim Op.ChangeToRegister(VGPR, false); 1189319250Sdim } 1190319250Sdim} 1191319250Sdim 1192317017Sdimbool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { 1193341825Sdim const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1194317017Sdim 1195327952Sdim if (!ST.hasSDWA() || skipFunction(MF.getFunction())) 1196317017Sdim return false; 1197317017Sdim 1198317017Sdim MRI = &MF.getRegInfo(); 1199317017Sdim TRI = ST.getRegisterInfo(); 1200317017Sdim TII = ST.getInstrInfo(); 1201320397Sdim 1202318681Sdim // Find all SDWA operands in MF. 1203327952Sdim bool Ret = false; 1204341825Sdim for (MachineBasicBlock &MBB : MF) { 1205341825Sdim bool Changed = false; 1206341825Sdim do { 1207344779Sdim // Preprocess the ADD/SUB pairs so they could be SDWA'ed. 1208344779Sdim // Look for a possible ADD or SUB that resulted from a previously lowered 1209344779Sdim // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 1210344779Sdim // lowers the pair of instructions into e32 form. 1211341825Sdim matchSDWAOperands(MBB); 1212344779Sdim for (const auto &OperandPair : SDWAOperands) { 1213344779Sdim const auto &Operand = OperandPair.second; 1214344779Sdim MachineInstr *PotentialMI = Operand->potentialToConvert(TII); 1215344779Sdim if (PotentialMI && 1216344779Sdim (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 || 1217344779Sdim PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64)) 1218344779Sdim pseudoOpConvertToVOP2(*PotentialMI, ST); 1219344779Sdim } 1220344779Sdim SDWAOperands.clear(); 1221317017Sdim 1222344779Sdim // Generate potential match list. 1223344779Sdim matchSDWAOperands(MBB); 1224344779Sdim 1225341825Sdim for (const auto &OperandPair : SDWAOperands) { 1226341825Sdim const auto &Operand = OperandPair.second; 1227341825Sdim MachineInstr *PotentialMI = Operand->potentialToConvert(TII); 1228341825Sdim if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { 1229341825Sdim PotentialMatches[PotentialMI].push_back(Operand.get()); 1230341825Sdim } 1231327952Sdim } 1232317017Sdim 1233341825Sdim for (auto &PotentialPair : PotentialMatches) { 1234341825Sdim MachineInstr &PotentialMI = *PotentialPair.first; 1235341825Sdim convertToSDWA(PotentialMI, PotentialPair.second); 1236341825Sdim } 1237317017Sdim 1238341825Sdim PotentialMatches.clear(); 1239341825Sdim SDWAOperands.clear(); 1240319250Sdim 1241341825Sdim Changed = !ConvertedInstructions.empty(); 1242319250Sdim 1243341825Sdim if (Changed) 1244341825Sdim Ret = true; 1245341825Sdim while (!ConvertedInstructions.empty()) 1246341825Sdim legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); 1247341825Sdim } while (Changed); 1248341825Sdim } 1249327952Sdim 1250319799Sdim return Ret; 1251317017Sdim} 1252