//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
/// ARMLoadStoreOpt - Post-register allocation pass that combines
/// load / store instructions to form ldm / stm instructions.

namespace {
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}

    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;
    bool isThumb2;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    struct MemOpQueueEntry {
      int Offset;
      unsigned Reg;
      bool isKill;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;
      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
                      MachineBasicBlock::iterator i)
        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
                          const MemOpQueue &MemOps, unsigned DefReg,
                          unsigned RangeBegin, unsigned RangeEnd);

    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl,
                  ArrayRef<std::pair<unsigned, bool> > Regs,
                  ArrayRef<unsigned> ImpDefs);
    void MergeOpsUpdate(MachineBasicBlock &MBB,
                        MemOpQueue &MemOps,
                        unsigned memOpsBegin,
                        unsigned memOpsEnd,
                        unsigned insertAfter,
                        int Offset,
                        unsigned Base,
                        bool BaseKill,
                        int Opcode,
                        ARMCC::CondCodes Pred,
                        unsigned PredReg,
                        unsigned Scratch,
                        DebugLoc dl,
                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVectorImpl<MachineBasicBlock::iterator> &Merges);

    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

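/// getLoadStoreMultipleOpcode - Map a single load / store opcode and an
/// addressing submode to the matching load / store multiple opcode, or 0 if
/// no such instruction exists. Also bumps the relevant statistic.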
static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    }
  case ARM::STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
    }
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    }
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    }
  case ARM::VLDRS:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
    }
  case ARM::VSTRS:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
    }
  case ARM::VLDRD:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
    }
  case ARM::VSTRD:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
    }
  }
}

namespace llvm {
  namespace ARM_AM {

AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA_RET:
  case ARM::LDMIA:
  case ARM::LDMIA_UPD:
  case ARM::STMIA:
  case ARM::STMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA:
  case ARM::VSTMDIA_UPD:
    return ARM_AM::ia;

  case ARM::LDMDA:
  case ARM::LDMDA_UPD:
  case ARM::STMDA:
  case ARM::STMDA_UPD:
    return ARM_AM::da;

  case ARM::LDMDB:
  case ARM::LDMDB_UPD:
  case ARM::STMDB:
  case ARM::STMDB_UPD:
  case ARM::t2LDMDB:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB_UPD:
    return ARM_AM::db;

  case ARM::LDMIB:
  case ARM::LDMIB_UPD:
  case ARM::STMIB:
  case ARM::STMIB_UPD:
    return ARM_AM::ib;
  }
}

  } // end namespace ARM_AM
} // end namespace llvm

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDRi12 || isT2i32Load(Opc);
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STRi12 || isT2i32Store(Opc);
}

/// MergeOps - Create and insert an LDM or STM with Base as the base register
/// and the registers in Regs as the register operands that would be loaded /
/// stored. It returns true if the transformation is done.
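///
/// A typical transformation (registers and offsets illustrative):
///   ldr r1, [r0]
///   ldr r2, [r0, #4]
/// =>
///   ldmia r0, {r1, r2}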
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          ArrayRef<std::pair<unsigned, bool> > Regs,
                          ArrayRef<unsigned> ImpDefs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2;
  if (Offset == 4 && haveIBAndDA)
    Mode = ARM_AM::ib;
  else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA)
    Mode = ARM_AM::da;
  else if (Offset == -4 * (int)NumRegs && isNotVFP)
    // VLDM/VSTM do not support DB mode without also updating the base reg.
    Mode = ARM_AM::db;
  else if (Offset != 0) {
    // Check if this is a supported opcode before we insert instructions to
    // calculate a new base register.
    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;

    // If the starting offset isn't zero, insert an instruction to materialize
    // a new base. But only do so if it is cost effective, i.e. merging more
    // than two loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Otherwise use the scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = !isThumb2 ? ARM::ADDri : ARM::t2ADDri;
    if (Offset < 0) {
      BaseOpc = !isThumb2 ? ARM::SUBri : ARM::t2SUBri;
      Offset = - Offset;
    }
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false;  // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }

  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
                Opcode == ARM::VLDRD);
  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
  if (!Opcode) return false;
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode))
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  // Add implicit defs for super-registers.
  for (unsigned i = 0, e = ImpDefs.size(); i != e; ++i)
    MIB.addReg(ImpDefs[i], RegState::ImplicitDefine);

  return true;
}

/// \brief Find all instructions using a given imp-def within a range.
///
/// We are trying to combine a range of instructions, one of which (located at
/// position RangeBegin) implicitly defines a register. The final LDM/STM will
/// be placed at RangeEnd, and so any uses of this definition between
/// RangeBegin and RangeEnd must be modified to use an undefined value.
///
/// The live range continues until we find a second definition or one of the
/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
/// we must consider all uses and decide which are relevant in a second pass.
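///
/// For example (illustrative): if a VLDRS in the range implicitly defines a
/// D register, and the merged VLDM is inserted below a later use of that D
/// register, the use now precedes its def and must be marked undef.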
void ARMLoadStoreOpt::findUsesOfImpDef(
    SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
    unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
  std::map<unsigned, MachineOperand *> Uses;
  unsigned LastLivePos = RangeEnd;

  // First we find all uses of this register with Position between RangeBegin
  // and RangeEnd, any or all of these could be uses of a definition at
  // RangeBegin. We also record the latest position a definition at RangeBegin
  // would be considered live.
  for (unsigned i = 0; i < MemOps.size(); ++i) {
    MachineInstr &MI = *MemOps[i].MBBI;
    unsigned MIPosition = MemOps[i].Position;
    if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
      continue;

    // If this instruction defines the register, then any later use will be of
    // that definition rather than ours.
    if (MI.definesRegister(DefReg))
      LastLivePos = std::min(LastLivePos, MIPosition);

    MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
    if (!UseOp)
      continue;

    // If this instruction kills the register then (assuming liveness is
    // correct when we start) we don't need to think about anything after here.
    if (UseOp->isKill())
      LastLivePos = std::min(LastLivePos, MIPosition);

    Uses[MIPosition] = UseOp;
  }

  // Now we traverse the list of all uses, and append the ones that actually
  // use our definition to the requested list.
  for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
                                                      E = Uses.end();
       I != E; ++I) {
    // The list is sorted by position, so once we've found one out of range
    // there will be no more to consider.
    if (I->first > LastLivePos)
      break;
    UsesOfImpDefs.push_back(I->second);
  }
}

// MergeOpsUpdate - Call MergeOps and update MemOps and Merges accordingly on
// success.
void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
                                     MemOpQueue &memOps,
                                     unsigned memOpsBegin, unsigned memOpsEnd,
                                     unsigned insertAfter, int Offset,
                                     unsigned Base, bool BaseKill,
                                     int Opcode,
                                     ARMCC::CondCodes Pred, unsigned PredReg,
                                     unsigned Scratch,
                                     DebugLoc dl,
                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
  // First calculate which of the registers should be killed by the merged
  // instruction.
  const unsigned insertPos = memOps[insertAfter].Position;
  SmallSet<unsigned, 4> KilledRegs;
  DenseMap<unsigned, unsigned> Killer;
  for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
    if (i == memOpsBegin) {
      i = memOpsEnd;
      if (i == e)
        break;
    }
    if (memOps[i].Position < insertPos && memOps[i].isKill) {
      unsigned Reg = memOps[i].Reg;
      KilledRegs.insert(Reg);
      Killer[Reg] = i;
    }
  }

  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  SmallVector<unsigned, 8> ImpDefs;
  SmallVector<MachineOperand *, 8> UsesOfImpDefs;
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    unsigned Reg = memOps[i].Reg;
    // If we are inserting the merged operation after an operation that
    // uses the same register, make sure to transfer any kill flag.
    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
    Regs.push_back(std::make_pair(Reg, isKill));

    // Collect any implicit defs of super-registers. They must be preserved.
    for (MIOperands MO(memOps[i].MBBI); MO.isValid(); ++MO) {
      if (!MO->isReg() || !MO->isDef() || !MO->isImplicit() || MO->isDead())
        continue;
      unsigned DefReg = MO->getReg();
      if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
        ImpDefs.push_back(DefReg);

      // There may be other uses of the definition between this instruction and
      // the eventual LDM/STM position. These should be marked undef if the
      // merge takes place.
      findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
                       insertPos);
    }
  }

  // Try to do the merge.
  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
  ++Loc;
  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
                Pred, PredReg, Scratch, dl, Regs, ImpDefs))
    return;

  // Merge succeeded, update records.
  Merges.push_back(prior(Loc));

  // In gathering loads together, we may have moved the imp-def of a register
  // past one of its uses. This is OK, since we know better than the rest of
  // LLVM what's OK with ARM loads and stores; but we still have to adjust the
  // affected uses.
  for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
                                                   E = UsesOfImpDefs.end();
       I != E; ++I)
    (*I)->setIsUndef();

  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    // Remove kill flags from any memops that come before insertPos.
    if (Regs[i-memOpsBegin].second) {
      unsigned Reg = Regs[i-memOpsBegin].first;
      if (KilledRegs.count(Reg)) {
        unsigned j = Killer[Reg];
        int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
        assert(Idx >= 0 && "Cannot find killing operand");
        memOps[j].MBBI->getOperand(Idx).setIsKill(false);
        memOps[j].isKill = false;
      }
      memOps[i].isKill = true;
    }
    MBB.erase(memOps[i].MBBI);
    // Update this memop to refer to the merged instruction.
    // We may need to move kill flags again.
    memOps[i].Merged = true;
    memOps[i].MBBI = Merges.back();
    memOps[i].Position = insertPos;
  }
}

/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
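///
/// Because register lists in LDM / STM must be in ascending order, a run of
/// memops may need more than one multiple. For example (illustrative):
///   ldr r1, [rn]
///   ldr r2, [rn, #4]
///   ldr r0, [rn, #8]
/// =>
///   ldmia rn, {r1, r2}
///   ldr r0, [rn, #8]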
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                         unsigned Base, int Opcode, unsigned Size,
                         ARMCC::CondCodes Pred, unsigned PredReg,
                         unsigned Scratch, MemOpQueue &MemOps,
                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;
  unsigned insertAfter = SIndex;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  const MachineOperand &PMO = Loc->getOperand(0);
  unsigned PReg = PMO.getReg();
  unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
  unsigned Count = 1;
  unsigned Limit = ~0U;

  // vldm / vstm limits are 32 for S variants, 16 for D variants.

  switch (Opcode) {
  default: break;
  case ARM::VSTRS:
    Limit = 32;
    break;
  case ARM::VSTRD:
    Limit = 16;
    break;
  case ARM::VLDRD:
    Limit = 16;
    break;
  case ARM::VLDRS:
    Limit = 32;
    break;
  }

  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
    // Register numbers must be in ascending order. For VFP / NEON load and
    // store multiples, the registers must also be consecutive and within the
    // limit on the number of registers per instruction.
    if (Reg != ARM::SP &&
        NewOffset == Offset + (int)Size &&
        ((isNotVFP && RegNum > PRegNum) ||
         ((Count < Limit) && RegNum == PRegNum+1)) &&
        // On Swift we don't want vldm/vstm to start with an odd register num
        // because Q register unaligned vldm/vstm need more uops.
        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
      Offset += Size;
      PRegNum = RegNum;
      ++Count;
    } else {
      // Can't merge this in. Try merging the earlier ones first.
      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset,
                     Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges);
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    if (MemOps[i].Position > MemOps[insertAfter].Position)
      insertAfter = i;
  }

  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
  return;
}

static bool definesCPSR(MachineInstr *MI) {
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = MI->getOperand(i);
    if (!MO.isReg())
      continue;
    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
      // If the instruction has a live CPSR def, then it's not safe to fold it
      // into a load / store.
      return true;
  }

  return false;
}

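/// isMatchingDecrement - Return true if MI is a SUB that decrements Base by
/// Bytes, matches the given predicate, and is safe to fold into a base
/// update.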
static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                unsigned Bytes, unsigned Limit,
                                ARMCC::CondCodes Pred, unsigned PredReg) {
  unsigned MyPredReg = 0;
  if (!MI)
    return false;

  bool CheckCPSRDef = false;
  switch (MI->getOpcode()) {
  default: return false;
  case ARM::t2SUBri:
  case ARM::SUBri:
    CheckCPSRDef = true;
  // fallthrough
  case ARM::tSUBspi:
    break;
  }

  // Make sure the offset fits in 8 bits.
  if (Bytes == 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
  if (!(MI->getOperand(0).getReg() == Base &&
        MI->getOperand(1).getReg() == Base &&
        (MI->getOperand(2).getImm()*Scale) == Bytes &&
        getInstrPredicate(MI, MyPredReg) == Pred &&
        MyPredReg == PredReg))
    return false;

  return CheckCPSRDef ? !definesCPSR(MI) : true;
}

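/// isMatchingIncrement - Return true if MI is an ADD that increments Base by
/// Bytes, matches the given predicate, and is safe to fold into a base
/// update.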
static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                unsigned Bytes, unsigned Limit,
                                ARMCC::CondCodes Pred, unsigned PredReg) {
  unsigned MyPredReg = 0;
  if (!MI)
    return false;

  bool CheckCPSRDef = false;
  switch (MI->getOpcode()) {
  default: return false;
  case ARM::t2ADDri:
  case ARM::ADDri:
    CheckCPSRDef = true;
  // fallthrough
  case ARM::tADDspi:
    break;
  }

  if (Bytes == 0 || (Limit && Bytes >= Limit))
    // Make sure the offset fits in 8 bits.
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
  if (!(MI->getOperand(0).getReg() == Base &&
        MI->getOperand(1).getReg() == Base &&
        (MI->getOperand(2).getImm()*Scale) == Bytes &&
        getInstrPredicate(MI, MyPredReg) == Pred &&
        MyPredReg == PredReg))
    return false;

  return CheckCPSRDef ? !definesCPSR(MI) : true;
}

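/// getLSMultipleTransferSize - Return the number of bytes transferred by the
/// given memory op: 4 or 8 for single loads / stores, 4 or 8 per list
/// register for load / store multiples, and 0 for unhandled opcodes.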
static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
  case ARM::t2STMIA:
  case ARM::t2STMDB:
  case ARM::VLDMSIA:
  case ARM::VSTMSIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
  case ARM::VLDMDIA:
  case ARM::VSTMDIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
  }
}

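/// getUpdatingLSMultipleOpcode - Return the base-updating (writeback) form of
/// the given load / store multiple opcode for the given addressing submode.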
static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                            ARM_AM::AMSubMode Mode) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA_UPD;
    case ARM_AM::ib: return ARM::LDMIB_UPD;
    case ARM_AM::da: return ARM::LDMDA_UPD;
    case ARM_AM::db: return ARM::LDMDB_UPD;
    }
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA_UPD;
    case ARM_AM::ib: return ARM::STMIB_UPD;
    case ARM_AM::da: return ARM::STMDA_UPD;
    case ARM_AM::db: return ARM::STMDB_UPD;
    }
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
    case ARM_AM::db: return ARM::t2LDMDB_UPD;
    }
  case ARM::t2STMIA:
  case ARM::t2STMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA_UPD;
    case ARM_AM::db: return ARM::t2STMDB_UPD;
    }
  case ARM::VLDMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
    case ARM_AM::db: return ARM::VLDMSDB_UPD;
    }
  case ARM::VLDMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
    case ARM_AM::db: return ARM::VLDMDDB_UPD;
    }
  case ARM::VSTMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
    case ARM_AM::db: return ARM::VSTMSDB_UPD;
    }
  case ARM::VSTMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
    case ARM_AM::db: return ARM::VSTMDDB_UPD;
    }
  }
}

/// MergeBaseUpdateLSMultiple - Fold a preceding/trailing inc/dec of the base
/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(0).getReg();
  bool BaseKill = MI->getOperand(0).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
    if (MI->getOperand(i).getReg() == Base)
      return false;

  bool DoMerge = false;
  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode);

  // Try merging with the previous instruction.
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (Mode == ARM_AM::ia &&
        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::db;
      DoMerge = true;
    } else if (Mode == ARM_AM::ib &&
               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::da;
      DoMerge = true;
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);

  // Transfer the rest of operands.
  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  MBB.erase(MBBI);
  return true;
}

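/// getPreIndexedLoadStoreOpcode - Return the pre-indexed (writeback) form of
/// the given load / store opcode. VLDR/VSTR have no pre-indexed forms, so the
/// single-register updating VLDM/VSTM variants stand in for them.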
static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
                                             ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_PRE_IMM;
  case ARM::STRi12:
    return ARM::STR_PRE_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

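/// getPostIndexedLoadStoreOpcode - Return the post-indexed (writeback) form
/// of the given load / store opcode, with the same VLDM/VSTM stand-ins as the
/// pre-indexed case.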
static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
                                              ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_POST_IMM;
  case ARM::STRi12:
    return ARM::STR_POST_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

/// MergeBaseUpdateLoadStore - Fold a preceding/trailing inc/dec of the base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
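///
/// For example (illustrative):
///
/// ldr rn, [rm]
/// rm := rm + 4;
/// =>
/// ldr rn, [rm], #4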
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
  if (isi32Load(Opcode) || isi32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;
  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);

  // Try merging with the previous instruction.
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
      MBB.erase(PrevMBBI);
    }
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  if (isAM5) {
    // VLDM[SD]_UPD, VSTM[SD]_UPD
    // (There are no base-updating versions of VLDR/VSTR instructions, but the
    // updating load/store-multiple instructions can be used with only one
    // register.)
    MachineOperand &MO = MI->getOperand(0);
    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
      .addReg(Base, getDefRegState(true)) // WB base register
      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
      .addImm(Pred).addReg(PredReg)
      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                            getKillRegState(MO.isKill())));
  } else if (isLd) {
    if (isAM2) {
      // LDR_PRE, LDR_POST
      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
        int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
      } else {
        int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
      }
    } else {
      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  } else {
    MachineOperand &MO = MI->getOperand(0);
    // FIXME: post-indexed stores use am2offset_imm, which still encodes
    // the vestigial zero-reg offset register. When that's fixed, this clause
    // can be removed entirely.
    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
      int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    } else {
      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  }
  MBB.erase(MBBI);

  return true;
}

/// isMemoryOp - Returns true if the instruction is a memory operation that
/// this pass is capable of operating on.
static bool isMemoryOp(const MachineInstr *MI) {
  // When no memory operands are present, conservatively assume unaligned,
  // volatile, unfoldable.
  if (!MI->hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI->memoperands_begin();

  // Don't touch volatile memory accesses - we may be changing their order.
  if (MMO->isVolatile())
    return false;

  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
  // not.
  if (MMO->getAlignment() < 4)
    return false;

  // str <undef> could probably be eliminated entirely, but for now we just
  // want to avoid making a mess of it.
  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
  if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
      MI->getOperand(0).isUndef())
    return false;

  // Likewise don't mess with references to undefined addresses.
  if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
      MI->getOperand(1).isUndef())
    return false;

  int Opcode = MI->getOpcode();
  switch (Opcode) {
  default: break;
  case ARM::VLDRS:
  case ARM::VSTRS:
    return MI->getOperand(1).isReg();
  case ARM::VLDRD:
  case ARM::VSTRD:
    return MI->getOperand(1).isReg();
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return MI->getOperand(1).isReg();
  }
  return false;
}

/// AdvanceRS - Advance the register scavenger to just before the earliest
/// memory op that is being merged.
void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
  unsigned Position = MemOps[0].Position;
  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
    if (MemOps[i].Position < Position) {
      Position = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  if (Loc != MBB.begin())
    RS->forward(prior(Loc));
}

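/// getMemoryOpOffset - Return the signed byte offset of the given memory op,
/// decoding the AM3/AM5 immediate encodings where necessary (AM5 offsets are
/// stored in words).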
static int getMemoryOpOffset(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
    return OffField;

  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
    : ARM_AM::getAM5Offset(OffField) * 4;
  if (isAM3) {
    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else {
    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  }
  return Offset;
}

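/// InsertLDR_STR - Emit a single load or store with the given opcode, offset,
/// and operand flags. Helper for FixInvalidRegPairOp when splitting an
/// LDRD/STRD into two separate ops.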
static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int Offset, bool isDef,
                          DebugLoc dl, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  if (isDef) {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}

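/// FixInvalidRegPairOp - Rewrite an LDRD/STRD whose register pair is not a
/// legal ascending even/odd pair (or that trips the Cortex-M3 LDRD errata)
/// into an LDM/STM or into two single-word ops.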
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg  = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
    // ARM errata 602117: LDRD with base in list may result in incorrect base
    // register when interrupted or faulted.
    bool Errata602117 = EvenReg == BaseReg && STI->isCortexM3();
    if (!Errata602117 &&
        ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum))
      return false;

    MachineBasicBlock::iterator NewBBI = MBBI;
    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill  = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
    if (OddRegNum > EvenRegNum && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to
      // an ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
        : (isT2 ? ARM::t2STMIA : ARM::STMIA);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
      NewBBI = llvm::prior(MBBI);
    } else {
      // Split into two instructions.
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
      // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
      // so adjust and use t2LDRi12 here for that.
      unsigned NewOpc2 = (isLd)
        ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
        : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and the base register is killed, it may have been
      // re-defed by the load; make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = llvm::prior(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is
          // probably on the first one. e.g.
          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        // Never kill the base register in the first instruction.
        if (EvenReg == BaseReg)
          EvenDeadKill = false;
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = llvm::prior(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    MBB.erase(MI);
    MBBI = NewBBI;
    return true;
  }
  return false;
}

/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
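///
/// For example (illustrative):
///   ldr r4, [r0]
///   ldr r5, [r0, #4]
///   ldr r6, [r0, #8]
/// =>
///   ldmia r0, {r4, r5, r6}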
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;
    bool TryMerge = false;
    bool Clobber  = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      const MachineOperand &MO = MBBI->getOperand(0);
      unsigned Reg = MO.getReg();
      bool isKill = MO.isDef() ? false : MO.isKill();
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());

      // Watch out for:
      // r4 := ldr [r0, #8]
      // r4 := ldr [r0, #4]
      //
      // The optimization may reorder the second ldr in front of the first
      // ldr, which violates write-after-write (WAW) dependence. The same
      // applies to str. Try to merge the inst(s) already in MemOps.
      bool Overlap = false;
      for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) {
        if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) {
          Overlap = true;
          break;
        }
      }

      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
        ++NumMemOps;
        Advance = true;
      } else if (!Overlap) {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill,
                                             Position, MBBI));
            ++NumMemOps;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill,
                                                 Position, MBBI));
                ++NumMemOps;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (MBBI->isDebugValue()) {
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block; try merging the memory instructions.
        TryMerge = true;
    } else if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block; try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's needed.
        // First advance to the instruction just before the start of the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // ops that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If the iterator hasn't been advanced and this is not a memory op,
      // skip it. It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}

/// MergeReturnIntoLDM - If this is an exit BB, try merging the return ops
/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it
/// directly restores the value of LR into pc.
///   ldmfd sp!, {..., lr}
///   bx lr
/// or
///   ldmfd sp!, {..., lr}
///   mov pc, lr
/// =>
///   ldmfd sp!, {..., pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET ||
       MBBI->getOpcode() == ARM::tBX_RET ||
       MBBI->getOpcode() == ARM::MOVPCLR)) {
    MachineInstr *PrevMI = prior(MBBI);
    unsigned Opcode = PrevMI->getOpcode();
    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}

bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = TM.getInstrInfo();
  TRI = TM.getRegisterInfo();
  STI = &TM.getSubtarget<ARMSubtarget>();
  RS = new RegScavenger();
  isThumb2 = AFI->isThumb2Function();

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    if (TM.getSubtarget<ARMSubtarget>().hasV5TOps())
      Modified |= MergeReturnIntoLDM(MBB);
  }

  delete RS;
  return Modified;
}


/// ARMPreAllocLoadStoreOpt - Pre-register-allocation pass that moves
/// loads / stores from consecutive locations closer together to make it
/// more likely they will be combined later.
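/// For example (illustrative):
///   ldr r0, [r4]
///   add r7, r7, #4
///   ldr r1, [r4, #4]
/// =>
///   ldr r0, [r4]
///   ldr r1, [r4, #4]
///   add r7, r7, #4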

namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}

    const DataLayout *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVectorImpl<MachineInstr *> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TD  = Fn.getTarget().getDataLayout();
  TII = Fn.getTarget().getInstrInfo();
  TRI = Fn.getTarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
  MF  = &Fn;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI)
    Modified |= RescheduleLoadStoreInstrs(MFI);

  return Modified;
}

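/// IsSafeAndProfitableToMove - Return true if the memory ops in MemOps can
/// be moved next to each other without crossing a call, barrier, conflicting
/// load / store, or a redefinition of Base, and without an excessive
/// increase in register pressure.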
static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias information
  // some day.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (I->isDebugValue() || MemOps.count(&*I))
      continue;
    if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
      return false;
    if (isLd && I->mayStore())
      return false;
    if (!isLd) {
      if (I->mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (I->mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // OK if we are only moving a small number of instructions.
    return true;
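  // Otherwise require that the estimated extra register pressure stays
  // within twice the number of registers used by the memory ops themselves.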
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}


/// Copy the memory operands of Op0 and Op1 into a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
    + (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
    std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd =
    std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

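/// CanFormLdStDWord - Return true if Op0 and Op1, two loads or two stores
/// off the same base register, can be combined into a single LDRD / STRD;
/// on success, fill in the operands for the paired instruction.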
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          int &Offset, unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDRi12)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STRi12)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;
  // Make sure the base address satisfies the i64 ld / st alignment
  // requirement. At the moment, we ignore the memory operand's value.
  // If we want to use AliasAnalysis, we should check it accordingly.
  if (!Op0->hasOneMemOperand() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  const Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 needs 8-byte alignment.
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
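  // For Thumb2 LDRD / STRD the offset is an imm8 scaled by 4, i.e. a
  // multiple of 4 in (-1024, 1024); ARM-mode LDRD / STRD takes an
  // addrmode3 imm8 plus an add/sub bit.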
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
      return false;
    Offset = OffImm;
  } else {
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  Pred = getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}

namespace {
  struct OffsetCompare {
    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
      int LOffset = getMemoryOpOffset(LHS);
      int ROffset = getMemoryOpOffset(RHS);
      assert(LHS == RHS || LOffset != ROffset);
      return LOffset > ROffset;
    }
  };
}

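/// RescheduleOps - Move the loads / stores in Ops (all using the same base
/// register) next to each other when it is safe and profitable, forming
/// LDRD / STRD pairs where possible.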
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVectorImpl<MachineInstr *> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
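  // After the sort, Ops.back() is the op with the lowest offset.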
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores off the same base are now in offset order. Scan them
  // from first to last, ending a run at:
  // 1. Any change in the load / store multiple opcode or transfer size.
  // 2. Any gap in the offsets.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

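      // getLoadStoreMultipleOpcode maps e.g. LDRi12 -> LDMIA; ops that map
      // to different LDM/STM opcodes can't belong to the same run.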
      unsigned LSMOpcode
        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
      if (LastOpcode && LSMOpcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = LSMOpcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative: if the instructions are too far apart, don't
      // move them. We want to limit the increase in register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores: loads are placed
        // at the first op, stores at the last, then advanced past any ops
        // being moved and any debug values.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end()
               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form an LDRD/STRD
        // register pair.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          const MCInstrDesc &MCID = TII->get(NewOpc);
          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
          MRI->constrainRegClass(EvenReg, TRC);
          MRI->constrainRegClass(OddReg, TRC);

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            // FIXME: We're converting from LDRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming LDRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            concatenateMemOperands(MIB, Op0, Op1);
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            // FIXME: We're converting from STRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming STRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            concatenateMemOperands(MIB, Op0, Op1);
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to encourage an even/odd register
          // pair, as required by ARM-mode LDRD / STRD.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}

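/// RescheduleLoadStoreInstrs - Scan MBB one call / terminator-delimited
/// region at a time, bucketing unpredicated loads and stores by base
/// register, then try to reschedule each bucket.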
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      if (MI->isCall() || MI->isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

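      // Number each (non-debug) instruction; RescheduleOps uses these
      // locations to bound how far apart candidate memory ops may be.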
      if (!MI->isDebugValue())
        MI2LocMap[MI] = ++Loc;

      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2LdsMap[Base].push_back(MI);
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2StsMap[Base].push_back(MI);
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination seen earlier).
        // Backtrack: process what we have collected so far and restart the
        // scan from this instruction.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}


/// createARMLoadStoreOptimizationPass - Returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}
