1235633Sdim//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ---------===//
2218885Sdim//
3218885Sdim//                     The LLVM Compiler Infrastructure
4218885Sdim//
5218885Sdim// This file is distributed under the University of Illinois Open Source
6218885Sdim// License. See LICENSE.TXT for details.
7218885Sdim//
8218885Sdim//===----------------------------------------------------------------------===//
9218885Sdim//
// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
// multiply and add / sub instructions) when special VMLx hazards are detected.
12218885Sdim//
13218885Sdim//===----------------------------------------------------------------------===//
14218885Sdim
15218885Sdim#define DEBUG_TYPE "mlx-expansion"
16218885Sdim#include "ARM.h"
17218885Sdim#include "ARMBaseInstrInfo.h"
18221345Sdim#include "ARMSubtarget.h"
19252723Sdim#include "llvm/ADT/SmallPtrSet.h"
20252723Sdim#include "llvm/ADT/Statistic.h"
21252723Sdim#include "llvm/CodeGen/MachineFunctionPass.h"
22218885Sdim#include "llvm/CodeGen/MachineInstr.h"
23218885Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
24218885Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
25218885Sdim#include "llvm/Support/CommandLine.h"
26218885Sdim#include "llvm/Support/Debug.h"
27218885Sdim#include "llvm/Support/raw_ostream.h"
28252723Sdim#include "llvm/Target/TargetRegisterInfo.h"
29218885Sdimusing namespace llvm;
30218885Sdim
31218885Sdimstatic cl::opt<bool>
32218885SdimForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden);
33218885Sdimstatic cl::opt<unsigned>
34218885SdimExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);
35218885Sdim
36218885SdimSTATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");
37218885Sdim
38218885Sdimnamespace {
39218885Sdim  struct MLxExpansion : public MachineFunctionPass {
40218885Sdim    static char ID;
41218885Sdim    MLxExpansion() : MachineFunctionPass(ID) {}
42218885Sdim
43218885Sdim    virtual bool runOnMachineFunction(MachineFunction &Fn);
44218885Sdim
45218885Sdim    virtual const char *getPassName() const {
46218885Sdim      return "ARM MLA / MLS expansion pass";
47218885Sdim    }
48218885Sdim
49218885Sdim  private:
50218885Sdim    const ARMBaseInstrInfo *TII;
51218885Sdim    const TargetRegisterInfo *TRI;
52218885Sdim    MachineRegisterInfo *MRI;
53218885Sdim
54245431Sdim    bool isLikeA9;
55245431Sdim    bool isSwift;
56218885Sdim    unsigned MIIdx;
57218885Sdim    MachineInstr* LastMIs[4];
58221345Sdim    SmallPtrSet<MachineInstr*, 4> IgnoreStall;
59218885Sdim
60218885Sdim    void clearStack();
61218885Sdim    void pushStack(MachineInstr *MI);
62218885Sdim    MachineInstr *getAccDefMI(MachineInstr *MI) const;
63218885Sdim    unsigned getDefReg(MachineInstr *MI) const;
64245431Sdim    bool hasLoopHazard(MachineInstr *MI) const;
65218885Sdim    bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
66221345Sdim    bool FindMLxHazard(MachineInstr *MI);
67218885Sdim    void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
68218885Sdim                                unsigned MulOpc, unsigned AddSubOpc,
69218885Sdim                                bool NegAcc, bool HasLane);
70218885Sdim    bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
71218885Sdim  };
72218885Sdim  char MLxExpansion::ID = 0;
73218885Sdim}
74218885Sdim
75218885Sdimvoid MLxExpansion::clearStack() {
76218885Sdim  std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0);
77218885Sdim  MIIdx = 0;
78218885Sdim}
79218885Sdim
80218885Sdimvoid MLxExpansion::pushStack(MachineInstr *MI) {
81218885Sdim  LastMIs[MIIdx] = MI;
82218885Sdim  if (++MIIdx == 4)
83218885Sdim    MIIdx = 0;
84218885Sdim}
85218885Sdim
86218885SdimMachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
87218885Sdim  // Look past COPY and INSERT_SUBREG instructions to find the
88218885Sdim  // real definition MI. This is important for _sfp instructions.
89218885Sdim  unsigned Reg = MI->getOperand(1).getReg();
90218885Sdim  if (TargetRegisterInfo::isPhysicalRegister(Reg))
91218885Sdim    return 0;
92218885Sdim
93218885Sdim  MachineBasicBlock *MBB = MI->getParent();
94218885Sdim  MachineInstr *DefMI = MRI->getVRegDef(Reg);
95218885Sdim  while (true) {
96218885Sdim    if (DefMI->getParent() != MBB)
97218885Sdim      break;
98218885Sdim    if (DefMI->isCopyLike()) {
99218885Sdim      Reg = DefMI->getOperand(1).getReg();
100218885Sdim      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
101218885Sdim        DefMI = MRI->getVRegDef(Reg);
102218885Sdim        continue;
103218885Sdim      }
104218885Sdim    } else if (DefMI->isInsertSubreg()) {
105218885Sdim      Reg = DefMI->getOperand(2).getReg();
106218885Sdim      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
107218885Sdim        DefMI = MRI->getVRegDef(Reg);
108218885Sdim        continue;
109218885Sdim      }
110218885Sdim    }
111218885Sdim    break;
112218885Sdim  }
113218885Sdim  return DefMI;
114218885Sdim}
115218885Sdim
116218885Sdimunsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
117218885Sdim  unsigned Reg = MI->getOperand(0).getReg();
118218885Sdim  if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
119218885Sdim      !MRI->hasOneNonDBGUse(Reg))
120218885Sdim    return Reg;
121218885Sdim
122218885Sdim  MachineBasicBlock *MBB = MI->getParent();
123218885Sdim  MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg);
124218885Sdim  if (UseMI->getParent() != MBB)
125218885Sdim    return Reg;
126218885Sdim
127218885Sdim  while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
128218885Sdim    Reg = UseMI->getOperand(0).getReg();
129218885Sdim    if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
130218885Sdim        !MRI->hasOneNonDBGUse(Reg))
131218885Sdim      return Reg;
132218885Sdim    UseMI = &*MRI->use_nodbg_begin(Reg);
133218885Sdim    if (UseMI->getParent() != MBB)
134218885Sdim      return Reg;
135218885Sdim  }
136218885Sdim
137218885Sdim  return Reg;
138218885Sdim}
139218885Sdim
140245431Sdim/// hasLoopHazard - Check whether an MLx instruction is chained to itself across
141245431Sdim/// a single-MBB loop.
142245431Sdimbool MLxExpansion::hasLoopHazard(MachineInstr *MI) const {
143245431Sdim  unsigned Reg = MI->getOperand(1).getReg();
144245431Sdim  if (TargetRegisterInfo::isPhysicalRegister(Reg))
145245431Sdim    return false;
146245431Sdim
147245431Sdim  MachineBasicBlock *MBB = MI->getParent();
148245431Sdim  MachineInstr *DefMI = MRI->getVRegDef(Reg);
149245431Sdim  while (true) {
150245431Sdimouter_continue:
151245431Sdim    if (DefMI->getParent() != MBB)
152245431Sdim      break;
153245431Sdim
154245431Sdim    if (DefMI->isPHI()) {
155245431Sdim      for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
156245431Sdim        if (DefMI->getOperand(i + 1).getMBB() == MBB) {
157245431Sdim          unsigned SrcReg = DefMI->getOperand(i).getReg();
158245431Sdim          if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
159245431Sdim            DefMI = MRI->getVRegDef(SrcReg);
160245431Sdim            goto outer_continue;
161245431Sdim          }
162245431Sdim        }
163245431Sdim      }
164245431Sdim    } else if (DefMI->isCopyLike()) {
165245431Sdim      Reg = DefMI->getOperand(1).getReg();
166245431Sdim      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
167245431Sdim        DefMI = MRI->getVRegDef(Reg);
168245431Sdim        continue;
169245431Sdim      }
170245431Sdim    } else if (DefMI->isInsertSubreg()) {
171245431Sdim      Reg = DefMI->getOperand(2).getReg();
172245431Sdim      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
173245431Sdim        DefMI = MRI->getVRegDef(Reg);
174245431Sdim        continue;
175245431Sdim      }
176245431Sdim    }
177245431Sdim
178245431Sdim    break;
179245431Sdim  }
180245431Sdim
181245431Sdim  return DefMI == MI;
182245431Sdim}
183245431Sdim
184218885Sdimbool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
185219077Sdim  // FIXME: Detect integer instructions properly.
186224145Sdim  const MCInstrDesc &MCID = MI->getDesc();
187224145Sdim  unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
188235633Sdim  if (MI->mayStore())
189218885Sdim    return false;
190224145Sdim  unsigned Opcode = MCID.getOpcode();
191219077Sdim  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
192219077Sdim    return false;
193219077Sdim  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
194219077Sdim    return MI->readsRegister(Reg, TRI);
195218885Sdim  return false;
196218885Sdim}
197218885Sdim
198245431Sdimstatic bool isFpMulInstruction(unsigned Opcode) {
199245431Sdim  switch (Opcode) {
200245431Sdim  case ARM::VMULS:
201245431Sdim  case ARM::VMULfd:
202245431Sdim  case ARM::VMULfq:
203245431Sdim  case ARM::VMULD:
204245431Sdim  case ARM::VMULslfd:
205245431Sdim  case ARM::VMULslfq:
206245431Sdim    return true;
207245431Sdim  default:
208245431Sdim    return false;
209245431Sdim  }
210245431Sdim}
211218885Sdim
212221345Sdimbool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
213218885Sdim  if (NumExpand >= ExpandLimit)
214218885Sdim    return false;
215218885Sdim
216218885Sdim  if (ForceExapnd)
217218885Sdim    return true;
218218885Sdim
219218885Sdim  MachineInstr *DefMI = getAccDefMI(MI);
220221345Sdim  if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
221218885Sdim    // r0 = vmla
222218885Sdim    // r3 = vmla r0, r1, r2
223218885Sdim    // takes 16 - 17 cycles
224218885Sdim    //
225218885Sdim    // r0 = vmla
226218885Sdim    // r4 = vmul r1, r2
227218885Sdim    // r3 = vadd r0, r4
228218885Sdim    // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
229221345Sdim    IgnoreStall.insert(DefMI);
230218885Sdim    return true;
231221345Sdim  }
232218885Sdim
233245431Sdim  // On Swift, we mostly care about hazards from multiplication instructions
234245431Sdim  // writing the accumulator and the pipelining of loop iterations by out-of-
235245431Sdim  // order execution.
236245431Sdim  if (isSwift)
237245431Sdim    return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI);
238245431Sdim
239221345Sdim  if (IgnoreStall.count(MI))
240221345Sdim    return false;
241221345Sdim
242218885Sdim  // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
243218885Sdim  // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
244218885Sdim  // preserves the in-order retirement of the instructions.
245218885Sdim  // Look at the next few instructions, if *most* of them can cause hazards,
246218885Sdim  // then the scheduler can't *fix* this, we'd better break up the VMLA.
247245431Sdim  unsigned Limit1 = isLikeA9 ? 1 : 4;
248245431Sdim  unsigned Limit2 = isLikeA9 ? 1 : 4;
249218885Sdim  for (unsigned i = 1; i <= 4; ++i) {
250218885Sdim    int Idx = ((int)MIIdx - i + 4) % 4;
251218885Sdim    MachineInstr *NextMI = LastMIs[Idx];
252218885Sdim    if (!NextMI)
253218885Sdim      continue;
254218885Sdim
255221345Sdim    if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
256221345Sdim      if (i <= Limit1)
257221345Sdim        return true;
258221345Sdim    }
259218885Sdim
260218885Sdim    // Look for VMLx RAW hazard.
261221345Sdim    if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
262218885Sdim      return true;
263218885Sdim  }
264218885Sdim
265218885Sdim  return false;
266218885Sdim}
267218885Sdim
268218885Sdim/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair
269218885Sdim/// of MUL + ADD / SUB instructions.
270218885Sdimvoid
271218885SdimMLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
272218885Sdim                                     unsigned MulOpc, unsigned AddSubOpc,
273218885Sdim                                     bool NegAcc, bool HasLane) {
274218885Sdim  unsigned DstReg = MI->getOperand(0).getReg();
275218885Sdim  bool DstDead = MI->getOperand(0).isDead();
276218885Sdim  unsigned AccReg = MI->getOperand(1).getReg();
277218885Sdim  unsigned Src1Reg = MI->getOperand(2).getReg();
278218885Sdim  unsigned Src2Reg = MI->getOperand(3).getReg();
279218885Sdim  bool Src1Kill = MI->getOperand(2).isKill();
280218885Sdim  bool Src2Kill = MI->getOperand(3).isKill();
281218885Sdim  unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
282218885Sdim  unsigned NextOp = HasLane ? 5 : 4;
283218885Sdim  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
284218885Sdim  unsigned PredReg = MI->getOperand(++NextOp).getReg();
285218885Sdim
286224145Sdim  const MCInstrDesc &MCID1 = TII->get(MulOpc);
287224145Sdim  const MCInstrDesc &MCID2 = TII->get(AddSubOpc);
288245431Sdim  const MachineFunction &MF = *MI->getParent()->getParent();
289245431Sdim  unsigned TmpReg = MRI->createVirtualRegister(
290245431Sdim                      TII->getRegClass(MCID1, 0, TRI, MF));
291218885Sdim
292235633Sdim  MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg)
293218885Sdim    .addReg(Src1Reg, getKillRegState(Src1Kill))
294218885Sdim    .addReg(Src2Reg, getKillRegState(Src2Kill));
295218885Sdim  if (HasLane)
296218885Sdim    MIB.addImm(LaneImm);
297218885Sdim  MIB.addImm(Pred).addReg(PredReg);
298218885Sdim
299235633Sdim  MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID2)
300218885Sdim    .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead));
301218885Sdim
302218885Sdim  if (NegAcc) {
303218885Sdim    bool AccKill = MRI->hasOneNonDBGUse(AccReg);
304218885Sdim    MIB.addReg(TmpReg, getKillRegState(true))
305218885Sdim       .addReg(AccReg, getKillRegState(AccKill));
306218885Sdim  } else {
307218885Sdim    MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true));
308218885Sdim  }
309218885Sdim  MIB.addImm(Pred).addReg(PredReg);
310218885Sdim
311218885Sdim  DEBUG({
312218885Sdim      dbgs() << "Expanding: " << *MI;
313218885Sdim      dbgs() << "  to:\n";
314218885Sdim      MachineBasicBlock::iterator MII = MI;
315218885Sdim      MII = llvm::prior(MII);
316218885Sdim      MachineInstr &MI2 = *MII;
317218885Sdim      MII = llvm::prior(MII);
318218885Sdim      MachineInstr &MI1 = *MII;
319218885Sdim      dbgs() << "    " << MI1;
320218885Sdim      dbgs() << "    " << MI2;
321218885Sdim   });
322218885Sdim
323218885Sdim  MI->eraseFromParent();
324218885Sdim  ++NumExpand;
325218885Sdim}
326218885Sdim
327218885Sdimbool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
328218885Sdim  bool Changed = false;
329218885Sdim
330218885Sdim  clearStack();
331221345Sdim  IgnoreStall.clear();
332218885Sdim
333218885Sdim  unsigned Skip = 0;
334218885Sdim  MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
335218885Sdim  while (MII != E) {
336218885Sdim    MachineInstr *MI = &*MII;
337218885Sdim
338218885Sdim    if (MI->isLabel() || MI->isImplicitDef() || MI->isCopy()) {
339218885Sdim      ++MII;
340218885Sdim      continue;
341218885Sdim    }
342218885Sdim
343224145Sdim    const MCInstrDesc &MCID = MI->getDesc();
344235633Sdim    if (MI->isBarrier()) {
345218885Sdim      clearStack();
346218885Sdim      Skip = 0;
347218885Sdim      ++MII;
348218885Sdim      continue;
349218885Sdim    }
350218885Sdim
351224145Sdim    unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
352218885Sdim    if (Domain == ARMII::DomainGeneral) {
353218885Sdim      if (++Skip == 2)
354218885Sdim        // Assume dual issues of non-VFP / NEON instructions.
355218885Sdim        pushStack(0);
356218885Sdim    } else {
357218885Sdim      Skip = 0;
358218885Sdim
359218885Sdim      unsigned MulOpc, AddSubOpc;
360218885Sdim      bool NegAcc, HasLane;
361224145Sdim      if (!TII->isFpMLxInstruction(MCID.getOpcode(),
362218885Sdim                                   MulOpc, AddSubOpc, NegAcc, HasLane) ||
363218885Sdim          !FindMLxHazard(MI))
364218885Sdim        pushStack(MI);
365218885Sdim      else {
366218885Sdim        ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane);
367218885Sdim        E = MBB.rend(); // May have changed if MI was the 1st instruction.
368218885Sdim        Changed = true;
369218885Sdim        continue;
370218885Sdim      }
371218885Sdim    }
372218885Sdim
373218885Sdim    ++MII;
374218885Sdim  }
375218885Sdim
376218885Sdim  return Changed;
377218885Sdim}
378218885Sdim
379218885Sdimbool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
380218885Sdim  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
381218885Sdim  TRI = Fn.getTarget().getRegisterInfo();
382218885Sdim  MRI = &Fn.getRegInfo();
383221345Sdim  const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
384245431Sdim  isLikeA9 = STI->isLikeA9() || STI->isSwift();
385245431Sdim  isSwift = STI->isSwift();
386218885Sdim
387218885Sdim  bool Modified = false;
388218885Sdim  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
389218885Sdim       ++MFI) {
390218885Sdim    MachineBasicBlock &MBB = *MFI;
391218885Sdim    Modified |= ExpandFPMLxInstructions(MBB);
392218885Sdim  }
393218885Sdim
394218885Sdim  return Modified;
395218885Sdim}
396218885Sdim
397218885SdimFunctionPass *llvm::createMLxExpansionPass() {
398218885Sdim  return new MLxExpansion();
399218885Sdim}
400