1249259Sdim//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
2249259Sdim//
3249259Sdim//                     The LLVM Compiler Infrastructure
4249259Sdim//
5249259Sdim// This file is distributed under the University of Illinois Open Source
6249259Sdim// License. See LICENSE.TXT for details.
7249259Sdim//
8249259Sdim//===----------------------------------------------------------------------===//
9249259Sdim//
10249259Sdim// The Cortex-A15 processor employs a tracking scheme in its register renaming
11249259Sdim// in order to process each instruction's micro-ops speculatively and
12249259Sdim// out-of-order with appropriate forwarding. The ARM architecture allows VFP
13249259Sdim// instructions to read and write 32-bit S-registers.  Each S-register
14249259Sdim// corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
15249259Sdim//
16249259Sdim// There are several instruction patterns which can be used to provide this
17249259Sdim// capability which can provide higher performance than other, potentially more
18249259Sdim// direct patterns, specifically around when one micro-op reads a D-register
19249259Sdim// operand that has recently been written as one or more S-register results.
20249259Sdim//
// This file defines a pre-regalloc pass which looks for SPR producers that
// are going to be used by DPR (or QPR) consumers and creates the more
// optimized access pattern.
24249259Sdim//
25249259Sdim//===----------------------------------------------------------------------===//
26249259Sdim
27249259Sdim#define DEBUG_TYPE "a15-sd-optimizer"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
#include "ARMISelLowering.h"
#include "ARMTargetMachine.h"

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"

#include <map>
#include <set>
46249259Sdim
47249259Sdimusing namespace llvm;
48249259Sdim
49249259Sdimnamespace {
50249259Sdim  struct A15SDOptimizer : public MachineFunctionPass {
51249259Sdim    static char ID;
52249259Sdim    A15SDOptimizer() : MachineFunctionPass(ID) {}
53249259Sdim
54249259Sdim    virtual bool runOnMachineFunction(MachineFunction &Fn);
55249259Sdim
56249259Sdim    virtual const char *getPassName() const {
57249259Sdim      return "ARM A15 S->D optimizer";
58249259Sdim    }
59249259Sdim
60249259Sdim  private:
61249259Sdim    const ARMBaseInstrInfo *TII;
62249259Sdim    const TargetRegisterInfo *TRI;
63249259Sdim    MachineRegisterInfo *MRI;
64249259Sdim
65249259Sdim    bool runOnInstruction(MachineInstr *MI);
66249259Sdim
67249259Sdim    //
68249259Sdim    // Instruction builder helpers
69249259Sdim    //
70249259Sdim    unsigned createDupLane(MachineBasicBlock &MBB,
71249259Sdim                           MachineBasicBlock::iterator InsertBefore,
72249259Sdim                           DebugLoc DL,
73249259Sdim                           unsigned Reg, unsigned Lane,
74249259Sdim                           bool QPR=false);
75249259Sdim
76249259Sdim    unsigned createExtractSubreg(MachineBasicBlock &MBB,
77249259Sdim                                 MachineBasicBlock::iterator InsertBefore,
78249259Sdim                                 DebugLoc DL,
79249259Sdim                                 unsigned DReg, unsigned Lane,
80249259Sdim                                 const TargetRegisterClass *TRC);
81249259Sdim
82249259Sdim    unsigned createVExt(MachineBasicBlock &MBB,
83249259Sdim                        MachineBasicBlock::iterator InsertBefore,
84249259Sdim                        DebugLoc DL,
85249259Sdim                        unsigned Ssub0, unsigned Ssub1);
86249259Sdim
87249259Sdim    unsigned createRegSequence(MachineBasicBlock &MBB,
88249259Sdim                               MachineBasicBlock::iterator InsertBefore,
89249259Sdim                               DebugLoc DL,
90249259Sdim                               unsigned Reg1, unsigned Reg2);
91249259Sdim
92249259Sdim    unsigned createInsertSubreg(MachineBasicBlock &MBB,
93249259Sdim                                MachineBasicBlock::iterator InsertBefore,
94249259Sdim                                DebugLoc DL, unsigned DReg, unsigned Lane,
95249259Sdim                                unsigned ToInsert);
96249259Sdim
97249259Sdim    unsigned createImplicitDef(MachineBasicBlock &MBB,
98249259Sdim                               MachineBasicBlock::iterator InsertBefore,
99249259Sdim                               DebugLoc DL);
100249259Sdim
101249259Sdim    //
102249259Sdim    // Various property checkers
103249259Sdim    //
104249259Sdim    bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC);
105249259Sdim    bool hasPartialWrite(MachineInstr *MI);
106249259Sdim    SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI);
107249259Sdim    unsigned getDPRLaneFromSPR(unsigned SReg);
108249259Sdim
109249259Sdim    //
110249259Sdim    // Methods used for getting the definitions of partial registers
111249259Sdim    //
112249259Sdim
113249259Sdim    MachineInstr *elideCopies(MachineInstr *MI);
114249259Sdim    void elideCopiesAndPHIs(MachineInstr *MI,
115249259Sdim                            SmallVectorImpl<MachineInstr*> &Outs);
116249259Sdim
117249259Sdim    //
118249259Sdim    // Pattern optimization methods
119249259Sdim    //
120249259Sdim    unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg);
121249259Sdim    unsigned optimizeSDPattern(MachineInstr *MI);
122249259Sdim    unsigned getPrefSPRLane(unsigned SReg);
123249259Sdim
124249259Sdim    //
125249259Sdim    // Sanitizing method - used to make sure if don't leave dead code around.
126249259Sdim    //
127249259Sdim    void eraseInstrWithNoUses(MachineInstr *MI);
128249259Sdim
129249259Sdim    //
130249259Sdim    // A map used to track the changes done by this pass.
131249259Sdim    //
132249259Sdim    std::map<MachineInstr*, unsigned> Replacements;
133249259Sdim    std::set<MachineInstr *> DeadInstr;
134249259Sdim  };
135249259Sdim  char A15SDOptimizer::ID = 0;
136249259Sdim} // end anonymous namespace
137249259Sdim
138249259Sdim// Returns true if this is a use of a SPR register.
139249259Sdimbool A15SDOptimizer::usesRegClass(MachineOperand &MO,
140249259Sdim                                  const TargetRegisterClass *TRC) {
141249259Sdim  if (!MO.isReg())
142249259Sdim    return false;
143249259Sdim  unsigned Reg = MO.getReg();
144249259Sdim
145249259Sdim  if (TargetRegisterInfo::isVirtualRegister(Reg))
146249259Sdim    return MRI->getRegClass(Reg)->hasSuperClassEq(TRC);
147249259Sdim  else
148249259Sdim    return TRC->contains(Reg);
149249259Sdim}
150249259Sdim
151249259Sdimunsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
152249259Sdim  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1,
153249259Sdim                                           &ARM::DPRRegClass);
154249259Sdim  if (DReg != ARM::NoRegister) return ARM::ssub_1;
155249259Sdim  return ARM::ssub_0;
156249259Sdim}
157249259Sdim
158249259Sdim// Get the subreg type that is most likely to be coalesced
159249259Sdim// for an SPR register that will be used in VDUP32d pseudo.
160249259Sdimunsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
161249259Sdim  if (!TRI->isVirtualRegister(SReg))
162249259Sdim    return getDPRLaneFromSPR(SReg);
163249259Sdim
164249259Sdim  MachineInstr *MI = MRI->getVRegDef(SReg);
165249259Sdim  if (!MI) return ARM::ssub_0;
166249259Sdim  MachineOperand *MO = MI->findRegisterDefOperand(SReg);
167249259Sdim
168249259Sdim  assert(MO->isReg() && "Non register operand found!");
169249259Sdim  if (!MO) return ARM::ssub_0;
170249259Sdim
171249259Sdim  if (MI->isCopy() && usesRegClass(MI->getOperand(1),
172249259Sdim                                    &ARM::SPRRegClass)) {
173249259Sdim    SReg = MI->getOperand(1).getReg();
174249259Sdim  }
175249259Sdim
176249259Sdim  if (TargetRegisterInfo::isVirtualRegister(SReg)) {
177249259Sdim    if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
178249259Sdim    return ARM::ssub_0;
179249259Sdim  }
180249259Sdim  return getDPRLaneFromSPR(SReg);
181249259Sdim}
182249259Sdim
183249259Sdim// MI is known to be dead. Figure out what instructions
184249259Sdim// are also made dead by this and mark them for removal.
185249259Sdimvoid A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
186249259Sdim  SmallVector<MachineInstr *, 8> Front;
187249259Sdim  DeadInstr.insert(MI);
188249259Sdim
189249259Sdim  DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
190249259Sdim  Front.push_back(MI);
191249259Sdim
192249259Sdim  while (Front.size() != 0) {
193249259Sdim    MI = Front.back();
194249259Sdim    Front.pop_back();
195249259Sdim
196249259Sdim    // MI is already known to be dead. We need to see
197249259Sdim    // if other instructions can also be removed.
198249259Sdim    for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
199249259Sdim      MachineOperand &MO = MI->getOperand(i);
200249259Sdim      if ((!MO.isReg()) || (!MO.isUse()))
201249259Sdim        continue;
202249259Sdim      unsigned Reg = MO.getReg();
203249259Sdim      if (!TRI->isVirtualRegister(Reg))
204249259Sdim        continue;
205249259Sdim      MachineOperand *Op = MI->findRegisterDefOperand(Reg);
206249259Sdim
207249259Sdim      if (!Op)
208249259Sdim        continue;
209249259Sdim
210249259Sdim      MachineInstr *Def = Op->getParent();
211249259Sdim
212249259Sdim      // We don't need to do anything if we have already marked
213249259Sdim      // this instruction as being dead.
214249259Sdim      if (DeadInstr.find(Def) != DeadInstr.end())
215249259Sdim        continue;
216249259Sdim
217249259Sdim      // Check if all the uses of this instruction are marked as
218249259Sdim      // dead. If so, we can also mark this instruction as being
219249259Sdim      // dead.
220249259Sdim      bool IsDead = true;
221249259Sdim      for (unsigned int j = 0; j < Def->getNumOperands(); ++j) {
222249259Sdim        MachineOperand &MODef = Def->getOperand(j);
223249259Sdim        if ((!MODef.isReg()) || (!MODef.isDef()))
224249259Sdim          continue;
225249259Sdim        unsigned DefReg = MODef.getReg();
226249259Sdim        if (!TRI->isVirtualRegister(DefReg)) {
227249259Sdim          IsDead = false;
228249259Sdim          break;
229249259Sdim        }
230249259Sdim        for (MachineRegisterInfo::use_iterator II = MRI->use_begin(Reg),
231249259Sdim                            EE = MRI->use_end();
232249259Sdim                            II != EE; ++II) {
233249259Sdim          // We don't care about self references.
234249259Sdim          if (&*II == Def)
235249259Sdim            continue;
236249259Sdim          if (DeadInstr.find(&*II) == DeadInstr.end()) {
237249259Sdim            IsDead = false;
238249259Sdim            break;
239249259Sdim          }
240249259Sdim        }
241249259Sdim      }
242249259Sdim
243249259Sdim      if (!IsDead) continue;
244249259Sdim
245249259Sdim      DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
246249259Sdim      DeadInstr.insert(Def);
247249259Sdim    }
248249259Sdim  }
249249259Sdim}
250249259Sdim
251249259Sdim// Creates the more optimized patterns and generally does all the code
252249259Sdim// transformations in this pass.
253249259Sdimunsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
254249259Sdim  if (MI->isCopy()) {
255249259Sdim    return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg());
256249259Sdim  }
257249259Sdim
258249259Sdim  if (MI->isInsertSubreg()) {
259249259Sdim    unsigned DPRReg = MI->getOperand(1).getReg();
260249259Sdim    unsigned SPRReg = MI->getOperand(2).getReg();
261249259Sdim
262249259Sdim    if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
263249259Sdim      MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
264249259Sdim      MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());
265249259Sdim
266249259Sdim      if (DPRMI && SPRMI) {
267249259Sdim        // See if the first operand of this insert_subreg is IMPLICIT_DEF
268249259Sdim        MachineInstr *ECDef = elideCopies(DPRMI);
269249259Sdim        if (ECDef != 0 && ECDef->isImplicitDef()) {
270249259Sdim          // Another corner case - if we're inserting something that is purely
271249259Sdim          // a subreg copy of a DPR, just use that DPR.
272249259Sdim
273249259Sdim          MachineInstr *EC = elideCopies(SPRMI);
274249259Sdim          // Is it a subreg copy of ssub_0?
275249259Sdim          if (EC && EC->isCopy() &&
276249259Sdim              EC->getOperand(1).getSubReg() == ARM::ssub_0) {
277249259Sdim            DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
278249259Sdim
279249259Sdim            // Find the thing we're subreg copying out of - is it of the same
280249259Sdim            // regclass as DPRMI? (i.e. a DPR or QPR).
281249259Sdim            unsigned FullReg = SPRMI->getOperand(1).getReg();
282249259Sdim            const TargetRegisterClass *TRC =
283249259Sdim              MRI->getRegClass(MI->getOperand(1).getReg());
284249259Sdim            if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
285249259Sdim              DEBUG(dbgs() << "Subreg copy is compatible - returning ");
286249259Sdim              DEBUG(dbgs() << PrintReg(FullReg) << "\n");
287249259Sdim              eraseInstrWithNoUses(MI);
288249259Sdim              return FullReg;
289249259Sdim            }
290249259Sdim          }
291249259Sdim
292249259Sdim          return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg());
293249259Sdim        }
294249259Sdim      }
295249259Sdim    }
296249259Sdim    return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
297249259Sdim  }
298249259Sdim
299249259Sdim  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1),
300249259Sdim                                          &ARM::SPRRegClass)) {
301249259Sdim    // See if all bar one of the operands are IMPLICIT_DEF and insert the
302249259Sdim    // optimizer pattern accordingly.
303249259Sdim    unsigned NumImplicit = 0, NumTotal = 0;
304249259Sdim    unsigned NonImplicitReg = ~0U;
305249259Sdim
306249259Sdim    for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) {
307249259Sdim      if (!MI->getOperand(I).isReg())
308249259Sdim        continue;
309249259Sdim      ++NumTotal;
310249259Sdim      unsigned OpReg = MI->getOperand(I).getReg();
311249259Sdim
312249259Sdim      if (!TRI->isVirtualRegister(OpReg))
313249259Sdim        break;
314249259Sdim
315249259Sdim      MachineInstr *Def = MRI->getVRegDef(OpReg);
316249259Sdim      if (!Def)
317249259Sdim        break;
318249259Sdim      if (Def->isImplicitDef())
319249259Sdim        ++NumImplicit;
320249259Sdim      else
321249259Sdim        NonImplicitReg = MI->getOperand(I).getReg();
322249259Sdim    }
323249259Sdim
324249259Sdim    if (NumImplicit == NumTotal - 1)
325249259Sdim      return optimizeAllLanesPattern(MI, NonImplicitReg);
326249259Sdim    else
327249259Sdim      return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
328249259Sdim  }
329249259Sdim
330249259Sdim  assert(0 && "Unhandled update pattern!");
331249259Sdim  return 0;
332249259Sdim}
333249259Sdim
334249259Sdim// Return true if this MachineInstr inserts a scalar (SPR) value into
335249259Sdim// a D or Q register.
336249259Sdimbool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
337249259Sdim  // The only way we can do a partial register update is through a COPY,
338249259Sdim  // INSERT_SUBREG or REG_SEQUENCE.
339249259Sdim  if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
340249259Sdim    return true;
341249259Sdim
342249259Sdim  if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2),
343249259Sdim                                           &ARM::SPRRegClass))
344249259Sdim    return true;
345249259Sdim
346249259Sdim  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
347249259Sdim    return true;
348249259Sdim
349249259Sdim  return false;
350249259Sdim}
351249259Sdim
352249259Sdim// Looks through full copies to get the instruction that defines the input
353249259Sdim// operand for MI.
354249259SdimMachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
355249259Sdim  if (!MI->isFullCopy())
356249259Sdim    return MI;
357249259Sdim  if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
358249259Sdim    return NULL;
359249259Sdim  MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
360249259Sdim  if (!Def)
361249259Sdim    return NULL;
362249259Sdim  return elideCopies(Def);
363249259Sdim}
364249259Sdim
365249259Sdim// Look through full copies and PHIs to get the set of non-copy MachineInstrs
366249259Sdim// that can produce MI.
367249259Sdimvoid A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
368249259Sdim                                        SmallVectorImpl<MachineInstr*> &Outs) {
369249259Sdim   // Looking through PHIs may create loops so we need to track what
370249259Sdim   // instructions we have visited before.
371249259Sdim   std::set<MachineInstr *> Reached;
372249259Sdim   SmallVector<MachineInstr *, 8> Front;
373249259Sdim   Front.push_back(MI);
374249259Sdim   while (Front.size() != 0) {
375249259Sdim     MI = Front.back();
376249259Sdim     Front.pop_back();
377249259Sdim
378249259Sdim     // If we have already explored this MachineInstr, ignore it.
379249259Sdim     if (Reached.find(MI) != Reached.end())
380249259Sdim       continue;
381249259Sdim     Reached.insert(MI);
382249259Sdim     if (MI->isPHI()) {
383249259Sdim       for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
384249259Sdim         unsigned Reg = MI->getOperand(I).getReg();
385249259Sdim         if (!TRI->isVirtualRegister(Reg)) {
386249259Sdim           continue;
387249259Sdim         }
388249259Sdim         MachineInstr *NewMI = MRI->getVRegDef(Reg);
389249259Sdim         if (!NewMI)
390249259Sdim           continue;
391249259Sdim         Front.push_back(NewMI);
392249259Sdim       }
393249259Sdim     } else if (MI->isFullCopy()) {
394249259Sdim       if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
395249259Sdim         continue;
396249259Sdim       MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg());
397249259Sdim       if (!NewMI)
398249259Sdim         continue;
399249259Sdim       Front.push_back(NewMI);
400249259Sdim     } else {
401249259Sdim       DEBUG(dbgs() << "Found partial copy" << *MI <<"\n");
402249259Sdim       Outs.push_back(MI);
403249259Sdim     }
404249259Sdim   }
405249259Sdim}
406249259Sdim
407249259Sdim// Return the DPR virtual registers that are read by this machine instruction
408249259Sdim// (if any).
409249259SdimSmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
410249259Sdim  if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() ||
411249259Sdim      MI->isKill())
412249259Sdim    return SmallVector<unsigned, 8>();
413249259Sdim
414249259Sdim  SmallVector<unsigned, 8> Defs;
415249259Sdim  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
416249259Sdim    MachineOperand &MO = MI->getOperand(i);
417249259Sdim
418249259Sdim    if (!MO.isReg() || !MO.isUse())
419249259Sdim      continue;
420249259Sdim    if (!usesRegClass(MO, &ARM::DPRRegClass) &&
421266759Sdim        !usesRegClass(MO, &ARM::QPRRegClass) &&
422266759Sdim        !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR
423249259Sdim      continue;
424249259Sdim
425249259Sdim    Defs.push_back(MO.getReg());
426249259Sdim  }
427249259Sdim  return Defs;
428249259Sdim}
429249259Sdim
430249259Sdim// Creates a DPR register from an SPR one by using a VDUP.
431249259Sdimunsigned
432249259SdimA15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
433249259Sdim                              MachineBasicBlock::iterator InsertBefore,
434249259Sdim                              DebugLoc DL,
435249259Sdim                              unsigned Reg, unsigned Lane, bool QPR) {
436249259Sdim  unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
437249259Sdim                                                  &ARM::DPRRegClass);
438249259Sdim  AddDefaultPred(BuildMI(MBB,
439249259Sdim                         InsertBefore,
440249259Sdim                         DL,
441249259Sdim                         TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d),
442249259Sdim                         Out)
443249259Sdim                   .addReg(Reg)
444249259Sdim                   .addImm(Lane));
445249259Sdim
446249259Sdim  return Out;
447249259Sdim}
448249259Sdim
449249259Sdim// Creates a SPR register from a DPR by copying the value in lane 0.
450249259Sdimunsigned
451249259SdimA15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB,
452249259Sdim                                    MachineBasicBlock::iterator InsertBefore,
453249259Sdim                                    DebugLoc DL,
454249259Sdim                                    unsigned DReg, unsigned Lane,
455249259Sdim                                    const TargetRegisterClass *TRC) {
456249259Sdim  unsigned Out = MRI->createVirtualRegister(TRC);
457249259Sdim  BuildMI(MBB,
458249259Sdim          InsertBefore,
459249259Sdim          DL,
460249259Sdim          TII->get(TargetOpcode::COPY), Out)
461249259Sdim    .addReg(DReg, 0, Lane);
462249259Sdim
463249259Sdim  return Out;
464249259Sdim}
465249259Sdim
466249259Sdim// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE.
467249259Sdimunsigned
468249259SdimA15SDOptimizer::createRegSequence(MachineBasicBlock &MBB,
469249259Sdim                                  MachineBasicBlock::iterator InsertBefore,
470249259Sdim                                  DebugLoc DL,
471249259Sdim                                  unsigned Reg1, unsigned Reg2) {
472249259Sdim  unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
473249259Sdim  BuildMI(MBB,
474249259Sdim          InsertBefore,
475249259Sdim          DL,
476249259Sdim          TII->get(TargetOpcode::REG_SEQUENCE), Out)
477249259Sdim    .addReg(Reg1)
478249259Sdim    .addImm(ARM::dsub_0)
479249259Sdim    .addReg(Reg2)
480249259Sdim    .addImm(ARM::dsub_1);
481249259Sdim  return Out;
482249259Sdim}
483249259Sdim
484249259Sdim// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1)
485249259Sdim// and merges them into one DPR register.
486249259Sdimunsigned
487249259SdimA15SDOptimizer::createVExt(MachineBasicBlock &MBB,
488249259Sdim                           MachineBasicBlock::iterator InsertBefore,
489249259Sdim                           DebugLoc DL,
490249259Sdim                           unsigned Ssub0, unsigned Ssub1) {
491249259Sdim  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
492249259Sdim  AddDefaultPred(BuildMI(MBB,
493249259Sdim                         InsertBefore,
494249259Sdim                         DL,
495249259Sdim                         TII->get(ARM::VEXTd32), Out)
496249259Sdim                   .addReg(Ssub0)
497249259Sdim                   .addReg(Ssub1)
498249259Sdim                   .addImm(1));
499249259Sdim  return Out;
500249259Sdim}
501249259Sdim
502249259Sdimunsigned
503249259SdimA15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB,
504249259Sdim                                   MachineBasicBlock::iterator InsertBefore,
505249259Sdim                                   DebugLoc DL, unsigned DReg, unsigned Lane,
506249259Sdim                                   unsigned ToInsert) {
507249259Sdim  unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
508249259Sdim  BuildMI(MBB,
509249259Sdim          InsertBefore,
510249259Sdim          DL,
511249259Sdim          TII->get(TargetOpcode::INSERT_SUBREG), Out)
512249259Sdim    .addReg(DReg)
513249259Sdim    .addReg(ToInsert)
514249259Sdim    .addImm(Lane);
515249259Sdim
516249259Sdim  return Out;
517249259Sdim}
518249259Sdim
519249259Sdimunsigned
520249259SdimA15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
521249259Sdim                                  MachineBasicBlock::iterator InsertBefore,
522249259Sdim                                  DebugLoc DL) {
523249259Sdim  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
524249259Sdim  BuildMI(MBB,
525249259Sdim          InsertBefore,
526249259Sdim          DL,
527249259Sdim          TII->get(TargetOpcode::IMPLICIT_DEF), Out);
528249259Sdim  return Out;
529249259Sdim}
530249259Sdim
531249259Sdim// This function inserts instructions in order to optimize interactions between
532249259Sdim// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all
533249259Sdim// lanes, and the using VEXT instructions to recompose the result.
534249259Sdimunsigned
535249259SdimA15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
536249259Sdim  MachineBasicBlock::iterator InsertPt(MI);
537249259Sdim  DebugLoc DL = MI->getDebugLoc();
538249259Sdim  MachineBasicBlock &MBB = *MI->getParent();
539249259Sdim  InsertPt++;
540249259Sdim  unsigned Out;
541249259Sdim
542266759Sdim  // DPair has the same length as QPR and also has two DPRs as subreg.
543266759Sdim  // Treat DPair as QPR.
544266759Sdim  if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) ||
545266759Sdim      MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) {
546249259Sdim    unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg,
547249259Sdim                                         ARM::dsub_0, &ARM::DPRRegClass);
548249259Sdim    unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg,
549249259Sdim                                         ARM::dsub_1, &ARM::DPRRegClass);
550249259Sdim
551249259Sdim    unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0);
552249259Sdim    unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1);
553249259Sdim    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
554249259Sdim
555249259Sdim    unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0);
556249259Sdim    unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1);
557249259Sdim    Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4);
558249259Sdim
559249259Sdim    Out = createRegSequence(MBB, InsertPt, DL, Out, Out2);
560249259Sdim
561249259Sdim  } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) {
562249259Sdim    unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0);
563249259Sdim    unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1);
564249259Sdim    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
565249259Sdim
566249259Sdim  } else {
567249259Sdim    assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) &&
568249259Sdim           "Found unexpected regclass!");
569249259Sdim
570249259Sdim    unsigned PrefLane = getPrefSPRLane(Reg);
571249259Sdim    unsigned Lane;
572249259Sdim    switch (PrefLane) {
573249259Sdim      case ARM::ssub_0: Lane = 0; break;
574249259Sdim      case ARM::ssub_1: Lane = 1; break;
575249259Sdim      default: llvm_unreachable("Unknown preferred lane!");
576249259Sdim    }
577249259Sdim
578266759Sdim    // Treat DPair as QPR
579266759Sdim    bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) ||
580266759Sdim                   usesRegClass(MI->getOperand(0), &ARM::DPairRegClass);
581249259Sdim
582249259Sdim    Out = createImplicitDef(MBB, InsertPt, DL);
583249259Sdim    Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg);
584249259Sdim    Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR);
585249259Sdim    eraseInstrWithNoUses(MI);
586249259Sdim  }
587249259Sdim  return Out;
588249259Sdim}
589249259Sdim
590249259Sdimbool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
591249259Sdim  // We look for instructions that write S registers that are then read as
592249259Sdim  // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and
593249259Sdim  // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or
594249259Sdim  // merge two SPR values to form a DPR register.  In order avoid false
595249259Sdim  // positives we make sure that there is an SPR producer so we look past
596249259Sdim  // COPY and PHI nodes to find it.
597249259Sdim  //
598249259Sdim  // The best code pattern for when an SPR producer is going to be used by a
599249259Sdim  // DPR or QPR consumer depends on whether the other lanes of the
600249259Sdim  // corresponding DPR/QPR are currently defined.
601249259Sdim  //
602249259Sdim  // We can handle these efficiently, depending on the type of
603249259Sdim  // pseudo-instruction that is producing the pattern
604249259Sdim  //
605249259Sdim  //   * COPY:          * VDUP all lanes and merge the results together
606249259Sdim  //                      using VEXTs.
607249259Sdim  //
608249259Sdim  //   * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR
609249259Sdim  //                      lane, and the other lane(s) of the DPR/QPR register
610249259Sdim  //                      that we are inserting in are undefined, use the
611249259Sdim  //                      original DPR/QPR value.
612249259Sdim  //                    * Otherwise, fall back on the same stategy as COPY.
613249259Sdim  //
614249259Sdim  //   * REG_SEQUENCE:  * If all except one of the input operands are
615249259Sdim  //                      IMPLICIT_DEFs, insert the VDUP pattern for just the
616249259Sdim  //                      defined input operand
617249259Sdim  //                    * Otherwise, fall back on the same stategy as COPY.
618249259Sdim  //
619249259Sdim
620249259Sdim  // First, get all the reads of D-registers done by this instruction.
621249259Sdim  SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
622249259Sdim  bool Modified = false;
623249259Sdim
624263509Sdim  for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end();
625249259Sdim     I != E; ++I) {
626249259Sdim    // Follow the def-use chain for this DPR through COPYs, and also through
627249259Sdim    // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
628249259Sdim    // we can end up with multiple defs of this DPR.
629249259Sdim
630249259Sdim    SmallVector<MachineInstr *, 8> DefSrcs;
631249259Sdim    if (!TRI->isVirtualRegister(*I))
632249259Sdim      continue;
633249259Sdim    MachineInstr *Def = MRI->getVRegDef(*I);
634249259Sdim    if (!Def)
635249259Sdim      continue;
636249259Sdim
637249259Sdim    elideCopiesAndPHIs(Def, DefSrcs);
638249259Sdim
639263509Sdim    for (SmallVectorImpl<MachineInstr *>::iterator II = DefSrcs.begin(),
640249259Sdim      EE = DefSrcs.end(); II != EE; ++II) {
641249259Sdim      MachineInstr *MI = *II;
642249259Sdim
643249259Sdim      // If we've already analyzed and replaced this operand, don't do
644249259Sdim      // anything.
645249259Sdim      if (Replacements.find(MI) != Replacements.end())
646249259Sdim        continue;
647249259Sdim
648249259Sdim      // Now, work out if the instruction causes a SPR->DPR dependency.
649249259Sdim      if (!hasPartialWrite(MI))
650249259Sdim        continue;
651249259Sdim
652249259Sdim      // Collect all the uses of this MI's DPR def for updating later.
653249259Sdim      SmallVector<MachineOperand*, 8> Uses;
654249259Sdim      unsigned DPRDefReg = MI->getOperand(0).getReg();
655249259Sdim      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
656249259Sdim             E = MRI->use_end(); I != E; ++I)
657249259Sdim        Uses.push_back(&I.getOperand());
658249259Sdim
659249259Sdim      // We can optimize this.
660249259Sdim      unsigned NewReg = optimizeSDPattern(MI);
661249259Sdim
662249259Sdim      if (NewReg != 0) {
663249259Sdim        Modified = true;
664263509Sdim        for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
665249259Sdim               E = Uses.end(); I != E; ++I) {
666263509Sdim          // Make sure to constrain the register class of the new register to
667263509Sdim          // match what we're replacing. Otherwise we can optimize a DPR_VFP2
668263509Sdim          // reference into a plain DPR, and that will end poorly. NewReg is
669263509Sdim          // always virtual here, so there will always be a matching subclass
670263509Sdim          // to find.
671263509Sdim          MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
672263509Sdim
673249259Sdim          DEBUG(dbgs() << "Replacing operand "
674249259Sdim                       << **I << " with "
675249259Sdim                       << PrintReg(NewReg) << "\n");
676249259Sdim          (*I)->substVirtReg(NewReg, 0, *TRI);
677249259Sdim        }
678249259Sdim      }
679249259Sdim      Replacements[MI] = NewReg;
680249259Sdim    }
681249259Sdim  }
682249259Sdim  return Modified;
683249259Sdim}
684249259Sdim
685249259Sdimbool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
686249259Sdim  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
687249259Sdim  TRI = Fn.getTarget().getRegisterInfo();
688249259Sdim  MRI = &Fn.getRegInfo();
689249259Sdim  bool Modified = false;
690249259Sdim
691249259Sdim  DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n");
692249259Sdim
693249259Sdim  DeadInstr.clear();
694249259Sdim  Replacements.clear();
695249259Sdim
696249259Sdim  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
697249259Sdim       ++MFI) {
698249259Sdim
699249259Sdim    for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end();
700249259Sdim      MI != ME;) {
701249259Sdim      Modified |= runOnInstruction(MI++);
702249259Sdim    }
703249259Sdim
704249259Sdim  }
705249259Sdim
706249259Sdim  for (std::set<MachineInstr *>::iterator I = DeadInstr.begin(),
707249259Sdim                                            E = DeadInstr.end();
708249259Sdim                                            I != E; ++I) {
709249259Sdim    (*I)->eraseFromParent();
710249259Sdim  }
711249259Sdim
712249259Sdim  return Modified;
713249259Sdim}
714249259Sdim
715249259SdimFunctionPass *llvm::createA15SDOptimizerPass() {
716249259Sdim  return new A15SDOptimizer();
717249259Sdim}
718