1234353Sdim//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ---------===// 2218885Sdim// 3218885Sdim// The LLVM Compiler Infrastructure 4218885Sdim// 5218885Sdim// This file is distributed under the University of Illinois Open Source 6218885Sdim// License. See LICENSE.TXT for details. 7218885Sdim// 8218885Sdim//===----------------------------------------------------------------------===// 9218885Sdim// 10218885Sdim// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of 11218885Sdim// multiple and add / sub instructions) when special VMLx hazards are detected. 12218885Sdim// 13218885Sdim//===----------------------------------------------------------------------===// 14218885Sdim 15218885Sdim#include "ARM.h" 16218885Sdim#include "ARMBaseInstrInfo.h" 17221345Sdim#include "ARMSubtarget.h" 18249423Sdim#include "llvm/ADT/SmallPtrSet.h" 19249423Sdim#include "llvm/ADT/Statistic.h" 20249423Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 21218885Sdim#include "llvm/CodeGen/MachineInstr.h" 22218885Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 23218885Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 24218885Sdim#include "llvm/Support/CommandLine.h" 25218885Sdim#include "llvm/Support/Debug.h" 26218885Sdim#include "llvm/Support/raw_ostream.h" 27249423Sdim#include "llvm/Target/TargetRegisterInfo.h" 28218885Sdimusing namespace llvm; 29218885Sdim 30276479Sdim#define DEBUG_TYPE "mlx-expansion" 31276479Sdim 32218885Sdimstatic cl::opt<bool> 33218885SdimForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden); 34218885Sdimstatic cl::opt<unsigned> 35218885SdimExpandLimit("expand-limit", cl::init(~0U), cl::Hidden); 36218885Sdim 37218885SdimSTATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded"); 38218885Sdim 39218885Sdimnamespace { 40218885Sdim struct MLxExpansion : public MachineFunctionPass { 41218885Sdim static char ID; 42218885Sdim MLxExpansion() : MachineFunctionPass(ID) {} 43218885Sdim 44276479Sdim bool runOnMachineFunction(MachineFunction &Fn) override; 45218885Sdim 46276479Sdim const char *getPassName() const override { 47218885Sdim return "ARM MLA / MLS expansion pass"; 48218885Sdim } 49218885Sdim 50218885Sdim private: 51218885Sdim const ARMBaseInstrInfo *TII; 52218885Sdim const TargetRegisterInfo *TRI; 53218885Sdim MachineRegisterInfo *MRI; 54218885Sdim 55243830Sdim bool isLikeA9; 56243830Sdim bool isSwift; 57218885Sdim unsigned MIIdx; 58218885Sdim MachineInstr* LastMIs[4]; 59221345Sdim SmallPtrSet<MachineInstr*, 4> IgnoreStall; 60218885Sdim 61218885Sdim void clearStack(); 62218885Sdim void pushStack(MachineInstr *MI); 63218885Sdim MachineInstr *getAccDefMI(MachineInstr *MI) const; 64218885Sdim unsigned getDefReg(MachineInstr *MI) const; 65243830Sdim bool hasLoopHazard(MachineInstr *MI) const; 66218885Sdim bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const; 67221345Sdim bool FindMLxHazard(MachineInstr *MI); 68218885Sdim void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, 69218885Sdim unsigned MulOpc, unsigned AddSubOpc, 70218885Sdim bool NegAcc, bool HasLane); 71218885Sdim bool ExpandFPMLxInstructions(MachineBasicBlock &MBB); 72218885Sdim }; 73218885Sdim char MLxExpansion::ID = 0; 74218885Sdim} 75218885Sdim 76218885Sdimvoid MLxExpansion::clearStack() { 77276479Sdim std::fill(LastMIs, LastMIs + 4, nullptr); 78218885Sdim MIIdx = 0; 79218885Sdim} 80218885Sdim 81218885Sdimvoid MLxExpansion::pushStack(MachineInstr *MI) { 82218885Sdim LastMIs[MIIdx] = MI; 83218885Sdim if (++MIIdx == 4) 84218885Sdim MIIdx = 0; 85218885Sdim} 86218885Sdim 87218885SdimMachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { 88218885Sdim // Look past COPY and INSERT_SUBREG instructions to find the 89218885Sdim // real definition MI. This is important for _sfp instructions. 90218885Sdim unsigned Reg = MI->getOperand(1).getReg(); 91218885Sdim if (TargetRegisterInfo::isPhysicalRegister(Reg)) 92276479Sdim return nullptr; 93218885Sdim 94218885Sdim MachineBasicBlock *MBB = MI->getParent(); 95218885Sdim MachineInstr *DefMI = MRI->getVRegDef(Reg); 96218885Sdim while (true) { 97218885Sdim if (DefMI->getParent() != MBB) 98218885Sdim break; 99218885Sdim if (DefMI->isCopyLike()) { 100218885Sdim Reg = DefMI->getOperand(1).getReg(); 101218885Sdim if (TargetRegisterInfo::isVirtualRegister(Reg)) { 102218885Sdim DefMI = MRI->getVRegDef(Reg); 103218885Sdim continue; 104218885Sdim } 105218885Sdim } else if (DefMI->isInsertSubreg()) { 106218885Sdim Reg = DefMI->getOperand(2).getReg(); 107218885Sdim if (TargetRegisterInfo::isVirtualRegister(Reg)) { 108218885Sdim DefMI = MRI->getVRegDef(Reg); 109218885Sdim continue; 110218885Sdim } 111218885Sdim } 112218885Sdim break; 113218885Sdim } 114218885Sdim return DefMI; 115218885Sdim} 116218885Sdim 117218885Sdimunsigned MLxExpansion::getDefReg(MachineInstr *MI) const { 118218885Sdim unsigned Reg = MI->getOperand(0).getReg(); 119218885Sdim if (TargetRegisterInfo::isPhysicalRegister(Reg) || 120218885Sdim !MRI->hasOneNonDBGUse(Reg)) 121218885Sdim return Reg; 122218885Sdim 123218885Sdim MachineBasicBlock *MBB = MI->getParent(); 124276479Sdim MachineInstr *UseMI = &*MRI->use_instr_nodbg_begin(Reg); 125218885Sdim if (UseMI->getParent() != MBB) 126218885Sdim return Reg; 127218885Sdim 128218885Sdim while (UseMI->isCopy() || UseMI->isInsertSubreg()) { 129218885Sdim Reg = UseMI->getOperand(0).getReg(); 130218885Sdim if (TargetRegisterInfo::isPhysicalRegister(Reg) || 131218885Sdim !MRI->hasOneNonDBGUse(Reg)) 132218885Sdim return Reg; 133276479Sdim UseMI = &*MRI->use_instr_nodbg_begin(Reg); 134218885Sdim if (UseMI->getParent() != MBB) 135218885Sdim return Reg; 136218885Sdim } 137218885Sdim 138218885Sdim return Reg; 139218885Sdim} 140218885Sdim 141243830Sdim/// hasLoopHazard - Check whether an MLx instruction is chained to itself across 142243830Sdim/// a single-MBB loop. 143243830Sdimbool MLxExpansion::hasLoopHazard(MachineInstr *MI) const { 144243830Sdim unsigned Reg = MI->getOperand(1).getReg(); 145243830Sdim if (TargetRegisterInfo::isPhysicalRegister(Reg)) 146243830Sdim return false; 147243830Sdim 148243830Sdim MachineBasicBlock *MBB = MI->getParent(); 149243830Sdim MachineInstr *DefMI = MRI->getVRegDef(Reg); 150243830Sdim while (true) { 151243830Sdimouter_continue: 152243830Sdim if (DefMI->getParent() != MBB) 153243830Sdim break; 154243830Sdim 155243830Sdim if (DefMI->isPHI()) { 156243830Sdim for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { 157243830Sdim if (DefMI->getOperand(i + 1).getMBB() == MBB) { 158243830Sdim unsigned SrcReg = DefMI->getOperand(i).getReg(); 159243830Sdim if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { 160243830Sdim DefMI = MRI->getVRegDef(SrcReg); 161243830Sdim goto outer_continue; 162243830Sdim } 163243830Sdim } 164243830Sdim } 165243830Sdim } else if (DefMI->isCopyLike()) { 166243830Sdim Reg = DefMI->getOperand(1).getReg(); 167243830Sdim if (TargetRegisterInfo::isVirtualRegister(Reg)) { 168243830Sdim DefMI = MRI->getVRegDef(Reg); 169243830Sdim continue; 170243830Sdim } 171243830Sdim } else if (DefMI->isInsertSubreg()) { 172243830Sdim Reg = DefMI->getOperand(2).getReg(); 173243830Sdim if (TargetRegisterInfo::isVirtualRegister(Reg)) { 174243830Sdim DefMI = MRI->getVRegDef(Reg); 175243830Sdim continue; 176243830Sdim } 177243830Sdim } 178243830Sdim 179243830Sdim break; 180243830Sdim } 181243830Sdim 182243830Sdim return DefMI == MI; 183243830Sdim} 184243830Sdim 185218885Sdimbool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { 186219077Sdim // FIXME: Detect integer instructions properly. 187224145Sdim const MCInstrDesc &MCID = MI->getDesc(); 188224145Sdim unsigned Domain = MCID.TSFlags & ARMII::DomainMask; 189234353Sdim if (MI->mayStore()) 190218885Sdim return false; 191224145Sdim unsigned Opcode = MCID.getOpcode(); 192219077Sdim if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) 193219077Sdim return false; 194219077Sdim if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON)) 195219077Sdim return MI->readsRegister(Reg, TRI); 196218885Sdim return false; 197218885Sdim} 198218885Sdim 199243830Sdimstatic bool isFpMulInstruction(unsigned Opcode) { 200243830Sdim switch (Opcode) { 201243830Sdim case ARM::VMULS: 202243830Sdim case ARM::VMULfd: 203243830Sdim case ARM::VMULfq: 204243830Sdim case ARM::VMULD: 205243830Sdim case ARM::VMULslfd: 206243830Sdim case ARM::VMULslfq: 207243830Sdim return true; 208243830Sdim default: 209243830Sdim return false; 210243830Sdim } 211243830Sdim} 212218885Sdim 213221345Sdimbool MLxExpansion::FindMLxHazard(MachineInstr *MI) { 214218885Sdim if (NumExpand >= ExpandLimit) 215218885Sdim return false; 216218885Sdim 217218885Sdim if (ForceExapnd) 218218885Sdim return true; 219218885Sdim 220218885Sdim MachineInstr *DefMI = getAccDefMI(MI); 221221345Sdim if (TII->isFpMLxInstruction(DefMI->getOpcode())) { 222218885Sdim // r0 = vmla 223218885Sdim // r3 = vmla r0, r1, r2 224218885Sdim // takes 16 - 17 cycles 225218885Sdim // 226218885Sdim // r0 = vmla 227218885Sdim // r4 = vmul r1, r2 228218885Sdim // r3 = vadd r0, r4 229218885Sdim // takes about 14 - 15 cycles even with vmul stalling for 4 cycles. 230221345Sdim IgnoreStall.insert(DefMI); 231218885Sdim return true; 232221345Sdim } 233218885Sdim 234243830Sdim // On Swift, we mostly care about hazards from multiplication instructions 235243830Sdim // writing the accumulator and the pipelining of loop iterations by out-of- 236243830Sdim // order execution. 237243830Sdim if (isSwift) 238243830Sdim return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI); 239243830Sdim 240221345Sdim if (IgnoreStall.count(MI)) 241221345Sdim return false; 242221345Sdim 243218885Sdim // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the 244218885Sdim // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall 245218885Sdim // preserves the in-order retirement of the instructions. 246218885Sdim // Look at the next few instructions, if *most* of them can cause hazards, 247218885Sdim // then the scheduler can't *fix* this, we'd better break up the VMLA. 248243830Sdim unsigned Limit1 = isLikeA9 ? 1 : 4; 249243830Sdim unsigned Limit2 = isLikeA9 ? 1 : 4; 250218885Sdim for (unsigned i = 1; i <= 4; ++i) { 251218885Sdim int Idx = ((int)MIIdx - i + 4) % 4; 252218885Sdim MachineInstr *NextMI = LastMIs[Idx]; 253218885Sdim if (!NextMI) 254218885Sdim continue; 255218885Sdim 256221345Sdim if (TII->canCauseFpMLxStall(NextMI->getOpcode())) { 257221345Sdim if (i <= Limit1) 258221345Sdim return true; 259221345Sdim } 260218885Sdim 261218885Sdim // Look for VMLx RAW hazard. 262221345Sdim if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI)) 263218885Sdim return true; 264218885Sdim } 265218885Sdim 266218885Sdim return false; 267218885Sdim} 268218885Sdim 269218885Sdim/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair 270218885Sdim/// of MUL + ADD / SUB instructions. 271218885Sdimvoid 272218885SdimMLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, 273218885Sdim unsigned MulOpc, unsigned AddSubOpc, 274218885Sdim bool NegAcc, bool HasLane) { 275218885Sdim unsigned DstReg = MI->getOperand(0).getReg(); 276218885Sdim bool DstDead = MI->getOperand(0).isDead(); 277218885Sdim unsigned AccReg = MI->getOperand(1).getReg(); 278218885Sdim unsigned Src1Reg = MI->getOperand(2).getReg(); 279218885Sdim unsigned Src2Reg = MI->getOperand(3).getReg(); 280218885Sdim bool Src1Kill = MI->getOperand(2).isKill(); 281218885Sdim bool Src2Kill = MI->getOperand(3).isKill(); 282218885Sdim unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0; 283218885Sdim unsigned NextOp = HasLane ? 5 : 4; 284218885Sdim ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm(); 285218885Sdim unsigned PredReg = MI->getOperand(++NextOp).getReg(); 286218885Sdim 287224145Sdim const MCInstrDesc &MCID1 = TII->get(MulOpc); 288224145Sdim const MCInstrDesc &MCID2 = TII->get(AddSubOpc); 289239462Sdim const MachineFunction &MF = *MI->getParent()->getParent(); 290239462Sdim unsigned TmpReg = MRI->createVirtualRegister( 291239462Sdim TII->getRegClass(MCID1, 0, TRI, MF)); 292218885Sdim 293234353Sdim MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) 294218885Sdim .addReg(Src1Reg, getKillRegState(Src1Kill)) 295218885Sdim .addReg(Src2Reg, getKillRegState(Src2Kill)); 296218885Sdim if (HasLane) 297218885Sdim MIB.addImm(LaneImm); 298218885Sdim MIB.addImm(Pred).addReg(PredReg); 299218885Sdim 300234353Sdim MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID2) 301218885Sdim .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead)); 302218885Sdim 303218885Sdim if (NegAcc) { 304218885Sdim bool AccKill = MRI->hasOneNonDBGUse(AccReg); 305218885Sdim MIB.addReg(TmpReg, getKillRegState(true)) 306218885Sdim .addReg(AccReg, getKillRegState(AccKill)); 307218885Sdim } else { 308218885Sdim MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true)); 309218885Sdim } 310218885Sdim MIB.addImm(Pred).addReg(PredReg); 311218885Sdim 312218885Sdim DEBUG({ 313218885Sdim dbgs() << "Expanding: " << *MI; 314218885Sdim dbgs() << " to:\n"; 315218885Sdim MachineBasicBlock::iterator MII = MI; 316276479Sdim MII = std::prev(MII); 317218885Sdim MachineInstr &MI2 = *MII; 318276479Sdim MII = std::prev(MII); 319218885Sdim MachineInstr &MI1 = *MII; 320218885Sdim dbgs() << " " << MI1; 321218885Sdim dbgs() << " " << MI2; 322218885Sdim }); 323218885Sdim 324218885Sdim MI->eraseFromParent(); 325218885Sdim ++NumExpand; 326218885Sdim} 327218885Sdim 328218885Sdimbool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { 329218885Sdim bool Changed = false; 330218885Sdim 331218885Sdim clearStack(); 332221345Sdim IgnoreStall.clear(); 333218885Sdim 334218885Sdim unsigned Skip = 0; 335218885Sdim MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); 336218885Sdim while (MII != E) { 337218885Sdim MachineInstr *MI = &*MII; 338218885Sdim 339276479Sdim if (MI->isPosition() || MI->isImplicitDef() || MI->isCopy()) { 340218885Sdim ++MII; 341218885Sdim continue; 342218885Sdim } 343218885Sdim 344224145Sdim const MCInstrDesc &MCID = MI->getDesc(); 345234353Sdim if (MI->isBarrier()) { 346218885Sdim clearStack(); 347218885Sdim Skip = 0; 348218885Sdim ++MII; 349218885Sdim continue; 350218885Sdim } 351218885Sdim 352224145Sdim unsigned Domain = MCID.TSFlags & ARMII::DomainMask; 353218885Sdim if (Domain == ARMII::DomainGeneral) { 354218885Sdim if (++Skip == 2) 355218885Sdim // Assume dual issues of non-VFP / NEON instructions. 356276479Sdim pushStack(nullptr); 357218885Sdim } else { 358218885Sdim Skip = 0; 359218885Sdim 360218885Sdim unsigned MulOpc, AddSubOpc; 361218885Sdim bool NegAcc, HasLane; 362224145Sdim if (!TII->isFpMLxInstruction(MCID.getOpcode(), 363218885Sdim MulOpc, AddSubOpc, NegAcc, HasLane) || 364218885Sdim !FindMLxHazard(MI)) 365218885Sdim pushStack(MI); 366218885Sdim else { 367218885Sdim ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane); 368218885Sdim E = MBB.rend(); // May have changed if MI was the 1st instruction. 369218885Sdim Changed = true; 370218885Sdim continue; 371218885Sdim } 372218885Sdim } 373218885Sdim 374218885Sdim ++MII; 375218885Sdim } 376218885Sdim 377218885Sdim return Changed; 378218885Sdim} 379218885Sdim 380218885Sdimbool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { 381280031Sdim TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo()); 382280031Sdim TRI = Fn.getSubtarget().getRegisterInfo(); 383218885Sdim MRI = &Fn.getRegInfo(); 384288943Sdim const ARMSubtarget *STI = &Fn.getSubtarget<ARMSubtarget>(); 385288943Sdim // Only run this for CortexA9. 386288943Sdim if (!STI->isCortexA9()) 387288943Sdim return false; 388243830Sdim isLikeA9 = STI->isLikeA9() || STI->isSwift(); 389243830Sdim isSwift = STI->isSwift(); 390218885Sdim 391218885Sdim bool Modified = false; 392276479Sdim for (MachineBasicBlock &MBB : Fn) 393218885Sdim Modified |= ExpandFPMLxInstructions(MBB); 394218885Sdim 395218885Sdim return Modified; 396218885Sdim} 397218885Sdim 398218885SdimFunctionPass *llvm::createMLxExpansionPass() { 399218885Sdim return new MLxExpansion(); 400218885Sdim} 401