AArch64LoadStoreOptimizer.cpp revision 360784
1//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass that performs load / store related peephole
10// optimizations. This pass should be run after register allocation.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AArch64InstrInfo.h"
15#include "AArch64Subtarget.h"
16#include "MCTargetDesc/AArch64AddressingModes.h"
17#include "llvm/ADT/BitVector.h"
18#include "llvm/ADT/SmallVector.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringRef.h"
21#include "llvm/ADT/iterator_range.h"
22#include "llvm/Analysis/AliasAnalysis.h"
23#include "llvm/CodeGen/MachineBasicBlock.h"
24#include "llvm/CodeGen/MachineFunction.h"
25#include "llvm/CodeGen/MachineFunctionPass.h"
26#include "llvm/CodeGen/MachineInstr.h"
27#include "llvm/CodeGen/MachineInstrBuilder.h"
28#include "llvm/CodeGen/MachineOperand.h"
29#include "llvm/CodeGen/MachineRegisterInfo.h"
30#include "llvm/CodeGen/TargetRegisterInfo.h"
31#include "llvm/IR/DebugLoc.h"
32#include "llvm/MC/MCRegisterInfo.h"
33#include "llvm/Pass.h"
34#include "llvm/Support/CommandLine.h"
35#include "llvm/Support/Debug.h"
36#include "llvm/Support/DebugCounter.h"
37#include "llvm/Support/ErrorHandling.h"
38#include "llvm/Support/raw_ostream.h"
39#include <cassert>
40#include <cstdint>
41#include <functional>
42#include <iterator>
43#include <limits>
44
45using namespace llvm;
46
47#define DEBUG_TYPE "aarch64-ldst-opt"
48
49STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
50STATISTIC(NumPostFolded, "Number of post-index updates folded");
51STATISTIC(NumPreFolded, "Number of pre-index updates folded");
52STATISTIC(NumUnscaledPairCreated,
53          "Number of load/store from unscaled generated");
54STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
55STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
56
57DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
58              "Controls which pairs are considered for renaming");
59
60// The LdStLimit limits how far we search for load/store pairs.
61static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
62                                   cl::init(20), cl::Hidden);
63
64// The UpdateLimit limits how far we search for update instructions when we form
65// pre-/post-index instructions.
66static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
67                                     cl::Hidden);
68
69// Enable register renaming to find additional store pairing opportunities.
70static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
71                                    cl::init(false), cl::Hidden);
72
73#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
74
75namespace {
76
77using LdStPairFlags = struct LdStPairFlags {
78  // If a matching instruction is found, MergeForward is set to true if the
79  // merge is to remove the first instruction and replace the second with
80  // a pair-wise insn, and false if the reverse is true.
81  bool MergeForward = false;
82
83  // SExtIdx gives the index of the result of the load pair that must be
84  // extended. The value of SExtIdx assumes that the paired load produces the
85  // value in this order: (I, returned iterator), i.e., -1 means no value has
86  // to be extended, 0 means I, and 1 means the returned iterator.
87  int SExtIdx = -1;
88
89  // If not none, RenameReg can be used to rename the result register of the
90  // first store in a pair. Currently this only works when merging stores
91  // forward.
92  Optional<MCPhysReg> RenameReg = None;
93
94  LdStPairFlags() = default;
95
96  void setMergeForward(bool V = true) { MergeForward = V; }
97  bool getMergeForward() const { return MergeForward; }
98
99  void setSExtIdx(int V) { SExtIdx = V; }
100  int getSExtIdx() const { return SExtIdx; }
101
102  void setRenameReg(MCPhysReg R) { RenameReg = R; }
103  void clearRenameReg() { RenameReg = None; }
104  Optional<MCPhysReg> getRenameReg() const { return RenameReg; }
105};
106
107struct AArch64LoadStoreOpt : public MachineFunctionPass {
108  static char ID;
109
110  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
111    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
112  }
113
114  AliasAnalysis *AA;
115  const AArch64InstrInfo *TII;
116  const TargetRegisterInfo *TRI;
117  const AArch64Subtarget *Subtarget;
118
119  // Track which register units have been modified and used.
120  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
121  LiveRegUnits DefinedInBB;
122
123  void getAnalysisUsage(AnalysisUsage &AU) const override {
124    AU.addRequired<AAResultsWrapperPass>();
125    MachineFunctionPass::getAnalysisUsage(AU);
126  }
127
128  // Scan the instructions looking for a load/store that can be combined
129  // with the current instruction into a load/store pair.
130  // Return the matching instruction if one is found, else MBB->end().
131  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
132                                               LdStPairFlags &Flags,
133                                               unsigned Limit,
134                                               bool FindNarrowMerge);
135
136  // Scan the instructions looking for a store that writes to the address from
137  // which the current load instruction reads. Return true if one is found.
138  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
139                         MachineBasicBlock::iterator &StoreI);
140
141// Merge the two narrow zero stores indicated into a single wider store instruction.
142  MachineBasicBlock::iterator
143  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
144                        MachineBasicBlock::iterator MergeMI,
145                        const LdStPairFlags &Flags);
146
147  // Merge the two instructions indicated into a single pair-wise instruction.
148  MachineBasicBlock::iterator
149  mergePairedInsns(MachineBasicBlock::iterator I,
150                   MachineBasicBlock::iterator Paired,
151                   const LdStPairFlags &Flags);
152
153  // Promote the load that reads directly from the address stored to.
154  MachineBasicBlock::iterator
155  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
156                       MachineBasicBlock::iterator StoreI);
157
158  // Scan the instruction list to find a base register update that can
159  // be combined with the current instruction (a load or store) using
160  // pre or post indexed addressing with writeback. Scan forwards.
161  MachineBasicBlock::iterator
162  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
163                                int UnscaledOffset, unsigned Limit);
164
165  // Scan the instruction list to find a base register update that can
166  // be combined with the current instruction (a load or store) using
167  // pre or post indexed addressing with writeback. Scan backwards.
168  MachineBasicBlock::iterator
169  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
170
171  // Find an instruction that updates the base register of the ld/st
172  // instruction.
173  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
174                            unsigned BaseReg, int Offset);
175
176  // Merge a pre- or post-index base register update into a ld/st instruction.
177  MachineBasicBlock::iterator
178  mergeUpdateInsn(MachineBasicBlock::iterator I,
179                  MachineBasicBlock::iterator Update, bool IsPreIdx);
180
181  // Find and merge zero store instructions.
182  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
183
184  // Find and pair ldr/str instructions.
185  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
186
187  // Find and promote load instructions which read directly from store.
188  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
189
190// Find and merge a base register update before or after a ld/st instruction.
191  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
192
193  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
194
195  bool runOnMachineFunction(MachineFunction &Fn) override;
196
197  MachineFunctionProperties getRequiredProperties() const override {
198    return MachineFunctionProperties().set(
199        MachineFunctionProperties::Property::NoVRegs);
200  }
201
202  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
203};
204
205char AArch64LoadStoreOpt::ID = 0;
206
207} // end anonymous namespace
208
209INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
210                AARCH64_LOAD_STORE_OPT_NAME, false, false)
211
212static bool isNarrowStore(unsigned Opc) {
213  switch (Opc) {
214  default:
215    return false;
216  case AArch64::STRBBui:
217  case AArch64::STURBBi:
218  case AArch64::STRHHui:
219  case AArch64::STURHHi:
220    return true;
221  }
222}
223
224// These instructions set the memory tag and either keep the memory contents
225// unchanged or set them to zero, ignoring the address part of the source register.
226static bool isTagStore(const MachineInstr &MI) {
227  switch (MI.getOpcode()) {
228  default:
229    return false;
230  case AArch64::STGOffset:
231  case AArch64::STZGOffset:
232  case AArch64::ST2GOffset:
233  case AArch64::STZ2GOffset:
234    return true;
235  }
236}
237
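// Return the non-sign-extending equivalent of Opc (e.g. LDRSWui -> LDRWui);
// pairable opcodes without a sign-extending form map to themselves. If Opc is
// not a handled load/store, *IsValidLdStrOpc is set to false.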
238static unsigned getMatchingNonSExtOpcode(unsigned Opc,
239                                         bool *IsValidLdStrOpc = nullptr) {
240  if (IsValidLdStrOpc)
241    *IsValidLdStrOpc = true;
242  switch (Opc) {
243  default:
244    if (IsValidLdStrOpc)
245      *IsValidLdStrOpc = false;
246    return std::numeric_limits<unsigned>::max();
247  case AArch64::STRDui:
248  case AArch64::STURDi:
249  case AArch64::STRQui:
250  case AArch64::STURQi:
251  case AArch64::STRBBui:
252  case AArch64::STURBBi:
253  case AArch64::STRHHui:
254  case AArch64::STURHHi:
255  case AArch64::STRWui:
256  case AArch64::STURWi:
257  case AArch64::STRXui:
258  case AArch64::STURXi:
259  case AArch64::LDRDui:
260  case AArch64::LDURDi:
261  case AArch64::LDRQui:
262  case AArch64::LDURQi:
263  case AArch64::LDRWui:
264  case AArch64::LDURWi:
265  case AArch64::LDRXui:
266  case AArch64::LDURXi:
267  case AArch64::STRSui:
268  case AArch64::STURSi:
269  case AArch64::LDRSui:
270  case AArch64::LDURSi:
271    return Opc;
272  case AArch64::LDRSWui:
273    return AArch64::LDRWui;
274  case AArch64::LDURSWi:
275    return AArch64::LDURWi;
276  }
277}
278
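// Return the store opcode that accesses twice the width of Opc; used when two
// adjacent narrow zero stores are merged into one wider store.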
279static unsigned getMatchingWideOpcode(unsigned Opc) {
280  switch (Opc) {
281  default:
282    llvm_unreachable("Opcode has no wide equivalent!");
283  case AArch64::STRBBui:
284    return AArch64::STRHHui;
285  case AArch64::STRHHui:
286    return AArch64::STRWui;
287  case AArch64::STURBBi:
288    return AArch64::STURHHi;
289  case AArch64::STURHHi:
290    return AArch64::STURWi;
291  case AArch64::STURWi:
292    return AArch64::STURXi;
293  case AArch64::STRWui:
294    return AArch64::STRXui;
295  }
296}
297
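// Return the paired (LDP/STP) opcode corresponding to a single load/store opcode.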
298static unsigned getMatchingPairOpcode(unsigned Opc) {
299  switch (Opc) {
300  default:
301    llvm_unreachable("Opcode has no pairwise equivalent!");
302  case AArch64::STRSui:
303  case AArch64::STURSi:
304    return AArch64::STPSi;
305  case AArch64::STRDui:
306  case AArch64::STURDi:
307    return AArch64::STPDi;
308  case AArch64::STRQui:
309  case AArch64::STURQi:
310    return AArch64::STPQi;
311  case AArch64::STRWui:
312  case AArch64::STURWi:
313    return AArch64::STPWi;
314  case AArch64::STRXui:
315  case AArch64::STURXi:
316    return AArch64::STPXi;
317  case AArch64::LDRSui:
318  case AArch64::LDURSi:
319    return AArch64::LDPSi;
320  case AArch64::LDRDui:
321  case AArch64::LDURDi:
322    return AArch64::LDPDi;
323  case AArch64::LDRQui:
324  case AArch64::LDURQi:
325    return AArch64::LDPQi;
326  case AArch64::LDRWui:
327  case AArch64::LDURWi:
328    return AArch64::LDPWi;
329  case AArch64::LDRXui:
330  case AArch64::LDURXi:
331    return AArch64::LDPXi;
332  case AArch64::LDRSWui:
333  case AArch64::LDURSWi:
334    return AArch64::LDPSWi;
335  }
336}
337
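// Return whether StoreInst writes at least as many bytes as LoadInst reads,
// using the same scaled/unscaled addressing form, so the loaded value could be
// taken from the stored value.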
338static unsigned isMatchingStore(MachineInstr &LoadInst,
339                                MachineInstr &StoreInst) {
340  unsigned LdOpc = LoadInst.getOpcode();
341  unsigned StOpc = StoreInst.getOpcode();
342  switch (LdOpc) {
343  default:
344    llvm_unreachable("Unsupported load instruction!");
345  case AArch64::LDRBBui:
346    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
347           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
348  case AArch64::LDURBBi:
349    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
350           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
351  case AArch64::LDRHHui:
352    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
353           StOpc == AArch64::STRXui;
354  case AArch64::LDURHHi:
355    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
356           StOpc == AArch64::STURXi;
357  case AArch64::LDRWui:
358    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
359  case AArch64::LDURWi:
360    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
361  case AArch64::LDRXui:
362    return StOpc == AArch64::STRXui;
363  case AArch64::LDURXi:
364    return StOpc == AArch64::STURXi;
365  }
366}
367
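// Return the pre-indexed (writeback) equivalent of Opc.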
368static unsigned getPreIndexedOpcode(unsigned Opc) {
369  // FIXME: We don't currently support creating pre-indexed loads/stores when
370  // the load or store is the unscaled version.  If we decide to perform such an
371  // optimization in the future the cases for the unscaled loads/stores will
372  // need to be added here.
373  switch (Opc) {
374  default:
375    llvm_unreachable("Opcode has no pre-indexed equivalent!");
376  case AArch64::STRSui:
377    return AArch64::STRSpre;
378  case AArch64::STRDui:
379    return AArch64::STRDpre;
380  case AArch64::STRQui:
381    return AArch64::STRQpre;
382  case AArch64::STRBBui:
383    return AArch64::STRBBpre;
384  case AArch64::STRHHui:
385    return AArch64::STRHHpre;
386  case AArch64::STRWui:
387    return AArch64::STRWpre;
388  case AArch64::STRXui:
389    return AArch64::STRXpre;
390  case AArch64::LDRSui:
391    return AArch64::LDRSpre;
392  case AArch64::LDRDui:
393    return AArch64::LDRDpre;
394  case AArch64::LDRQui:
395    return AArch64::LDRQpre;
396  case AArch64::LDRBBui:
397    return AArch64::LDRBBpre;
398  case AArch64::LDRHHui:
399    return AArch64::LDRHHpre;
400  case AArch64::LDRWui:
401    return AArch64::LDRWpre;
402  case AArch64::LDRXui:
403    return AArch64::LDRXpre;
404  case AArch64::LDRSWui:
405    return AArch64::LDRSWpre;
406  case AArch64::LDPSi:
407    return AArch64::LDPSpre;
408  case AArch64::LDPSWi:
409    return AArch64::LDPSWpre;
410  case AArch64::LDPDi:
411    return AArch64::LDPDpre;
412  case AArch64::LDPQi:
413    return AArch64::LDPQpre;
414  case AArch64::LDPWi:
415    return AArch64::LDPWpre;
416  case AArch64::LDPXi:
417    return AArch64::LDPXpre;
418  case AArch64::STPSi:
419    return AArch64::STPSpre;
420  case AArch64::STPDi:
421    return AArch64::STPDpre;
422  case AArch64::STPQi:
423    return AArch64::STPQpre;
424  case AArch64::STPWi:
425    return AArch64::STPWpre;
426  case AArch64::STPXi:
427    return AArch64::STPXpre;
428  case AArch64::STGOffset:
429    return AArch64::STGPreIndex;
430  case AArch64::STZGOffset:
431    return AArch64::STZGPreIndex;
432  case AArch64::ST2GOffset:
433    return AArch64::ST2GPreIndex;
434  case AArch64::STZ2GOffset:
435    return AArch64::STZ2GPreIndex;
436  case AArch64::STGPi:
437    return AArch64::STGPpre;
438  }
439}
440
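// Return the post-indexed (writeback) equivalent of Opc.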
441static unsigned getPostIndexedOpcode(unsigned Opc) {
442  switch (Opc) {
443  default:
444    llvm_unreachable("Opcode has no post-indexed equivalent!");
445  case AArch64::STRSui:
446  case AArch64::STURSi:
447    return AArch64::STRSpost;
448  case AArch64::STRDui:
449  case AArch64::STURDi:
450    return AArch64::STRDpost;
451  case AArch64::STRQui:
452  case AArch64::STURQi:
453    return AArch64::STRQpost;
454  case AArch64::STRBBui:
455    return AArch64::STRBBpost;
456  case AArch64::STRHHui:
457    return AArch64::STRHHpost;
458  case AArch64::STRWui:
459  case AArch64::STURWi:
460    return AArch64::STRWpost;
461  case AArch64::STRXui:
462  case AArch64::STURXi:
463    return AArch64::STRXpost;
464  case AArch64::LDRSui:
465  case AArch64::LDURSi:
466    return AArch64::LDRSpost;
467  case AArch64::LDRDui:
468  case AArch64::LDURDi:
469    return AArch64::LDRDpost;
470  case AArch64::LDRQui:
471  case AArch64::LDURQi:
472    return AArch64::LDRQpost;
473  case AArch64::LDRBBui:
474    return AArch64::LDRBBpost;
475  case AArch64::LDRHHui:
476    return AArch64::LDRHHpost;
477  case AArch64::LDRWui:
478  case AArch64::LDURWi:
479    return AArch64::LDRWpost;
480  case AArch64::LDRXui:
481  case AArch64::LDURXi:
482    return AArch64::LDRXpost;
483  case AArch64::LDRSWui:
484    return AArch64::LDRSWpost;
485  case AArch64::LDPSi:
486    return AArch64::LDPSpost;
487  case AArch64::LDPSWi:
488    return AArch64::LDPSWpost;
489  case AArch64::LDPDi:
490    return AArch64::LDPDpost;
491  case AArch64::LDPQi:
492    return AArch64::LDPQpost;
493  case AArch64::LDPWi:
494    return AArch64::LDPWpost;
495  case AArch64::LDPXi:
496    return AArch64::LDPXpost;
497  case AArch64::STPSi:
498    return AArch64::STPSpost;
499  case AArch64::STPDi:
500    return AArch64::STPDpost;
501  case AArch64::STPQi:
502    return AArch64::STPQpost;
503  case AArch64::STPWi:
504    return AArch64::STPWpost;
505  case AArch64::STPXi:
506    return AArch64::STPXpost;
507  case AArch64::STGOffset:
508    return AArch64::STGPostIndex;
509  case AArch64::STZGOffset:
510    return AArch64::STZGPostIndex;
511  case AArch64::ST2GOffset:
512    return AArch64::ST2GPostIndex;
513  case AArch64::STZ2GOffset:
514    return AArch64::STZ2GPostIndex;
515  case AArch64::STGPi:
516    return AArch64::STGPpost;
517  }
518}
519
520static bool isPairedLdSt(const MachineInstr &MI) {
521  switch (MI.getOpcode()) {
522  default:
523    return false;
524  case AArch64::LDPSi:
525  case AArch64::LDPSWi:
526  case AArch64::LDPDi:
527  case AArch64::LDPQi:
528  case AArch64::LDPWi:
529  case AArch64::LDPXi:
530  case AArch64::STPSi:
531  case AArch64::STPDi:
532  case AArch64::STPQi:
533  case AArch64::STPWi:
534  case AArch64::STPXi:
535  case AArch64::STGPi:
536    return true;
537  }
538}
539
540// Returns the scale and offset range of pre/post indexed variants of MI.
541static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
542                                       int &MinOffset, int &MaxOffset) {
543  bool IsPaired = isPairedLdSt(MI);
544  bool IsTagStore = isTagStore(MI);
545  // ST*G and all paired ldst have the same scale in pre/post-indexed variants
546  // as in the "unsigned offset" variant.
547  // All other pre/post indexed ldst instructions are unscaled.
548  Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
549
550  if (IsPaired) {
551    MinOffset = -64;
552    MaxOffset = 63;
553  } else {
554    MinOffset = -256;
555    MaxOffset = 255;
556  }
557}
558
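// Return the transfer register operand (Rt) of a load/store; for paired
// instructions, PairedRegOp selects the first (0) or second (1) register.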
559static MachineOperand &getLdStRegOp(MachineInstr &MI,
560                                    unsigned PairedRegOp = 0) {
561  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
562  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
563  return MI.getOperand(Idx);
564}
565
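// Return the base register operand of a load/store.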
566static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
567  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
568  return MI.getOperand(Idx);
569}
570
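// Return the immediate offset operand of a load/store.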
571static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
572  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
573  return MI.getOperand(Idx);
574}
575
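// Return true if the bytes read by LoadInst lie entirely within the bytes
// written by StoreInst, comparing unscaled byte offsets.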
576static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
577                                  MachineInstr &StoreInst,
578                                  const AArch64InstrInfo *TII) {
579  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
580  int LoadSize = TII->getMemScale(LoadInst);
581  int StoreSize = TII->getMemScale(StoreInst);
582  int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
583                             ? getLdStOffsetOp(StoreInst).getImm()
584                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
585  int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
586                             ? getLdStOffsetOp(LoadInst).getImm()
587                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
588  return (UnscaledStOffset <= UnscaledLdOffset) &&
589         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
590}
591
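// Return true if MI is a 32-bit or narrower store of WZR, i.e. a zero store
// that can be merged with an adjacent zero store into a wider one.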
592static bool isPromotableZeroStoreInst(MachineInstr &MI) {
593  unsigned Opc = MI.getOpcode();
594  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
595          isNarrowStore(Opc)) &&
596         getLdStRegOp(MI).getReg() == AArch64::WZR;
597}
598
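// Return true if MI is a load opcode handled by the load-from-store
// forwarding optimization.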
599static bool isPromotableLoadFromStore(MachineInstr &MI) {
600  switch (MI.getOpcode()) {
601  default:
602    return false;
603  // Scaled instructions.
604  case AArch64::LDRBBui:
605  case AArch64::LDRHHui:
606  case AArch64::LDRWui:
607  case AArch64::LDRXui:
608  // Unscaled instructions.
609  case AArch64::LDURBBi:
610  case AArch64::LDURHHi:
611  case AArch64::LDURWi:
612  case AArch64::LDURXi:
613    return true;
614  }
615}
616
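// Return true if MI is a load/store whose base register update may be folded
// into a pre-/post-indexed form (reg+imm addressing only).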
617static bool isMergeableLdStUpdate(MachineInstr &MI) {
618  unsigned Opc = MI.getOpcode();
619  switch (Opc) {
620  default:
621    return false;
622  // Scaled instructions.
623  case AArch64::STRSui:
624  case AArch64::STRDui:
625  case AArch64::STRQui:
626  case AArch64::STRXui:
627  case AArch64::STRWui:
628  case AArch64::STRHHui:
629  case AArch64::STRBBui:
630  case AArch64::LDRSui:
631  case AArch64::LDRDui:
632  case AArch64::LDRQui:
633  case AArch64::LDRXui:
634  case AArch64::LDRWui:
635  case AArch64::LDRHHui:
636  case AArch64::LDRBBui:
637  case AArch64::STGOffset:
638  case AArch64::STZGOffset:
639  case AArch64::ST2GOffset:
640  case AArch64::STZ2GOffset:
641  case AArch64::STGPi:
642  // Unscaled instructions.
643  case AArch64::STURSi:
644  case AArch64::STURDi:
645  case AArch64::STURQi:
646  case AArch64::STURWi:
647  case AArch64::STURXi:
648  case AArch64::LDURSi:
649  case AArch64::LDURDi:
650  case AArch64::LDURQi:
651  case AArch64::LDURWi:
652  case AArch64::LDURXi:
653  // Paired instructions.
654  case AArch64::LDPSi:
655  case AArch64::LDPSWi:
656  case AArch64::LDPDi:
657  case AArch64::LDPQi:
658  case AArch64::LDPWi:
659  case AArch64::LDPXi:
660  case AArch64::STPSi:
661  case AArch64::STPDi:
662  case AArch64::STPQi:
663  case AArch64::STPWi:
664  case AArch64::STPXi:
665    // Make sure this is a reg+imm (as opposed to an address reloc).
666    if (!getLdStOffsetOp(MI).isImm())
667      return false;
668
669    return true;
670  }
671}
672
673MachineBasicBlock::iterator
674AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
675                                           MachineBasicBlock::iterator MergeMI,
676                                           const LdStPairFlags &Flags) {
677  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
678         "Expected promotable zero stores.");
679
680  MachineBasicBlock::iterator NextI = I;
681  ++NextI;
682  // If NextI is the second of the two instructions to be merged, we need
683// to skip one further. Whichever way we merge, the iterator will be invalidated,
684  // and we don't need to scan the new instruction, as it's a pairwise
685  // instruction, which we're not considering for further action anyway.
686  if (NextI == MergeMI)
687    ++NextI;
688
689  unsigned Opc = I->getOpcode();
690  bool IsScaled = !TII->isUnscaledLdSt(Opc);
691  int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
692
693  bool MergeForward = Flags.getMergeForward();
694  // Insert our new paired instruction after whichever of the paired
695  // instructions MergeForward indicates.
696  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
697// MergeForward also determines from which instruction we copy the base register
698// operand, so the new instruction gets flags compatible with the input code.
699  const MachineOperand &BaseRegOp =
700      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
701
702  // Which register is Rt and which is Rt2 depends on the offset order.
703  MachineInstr *RtMI;
704  if (getLdStOffsetOp(*I).getImm() ==
705      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
706    RtMI = &*MergeMI;
707  else
708    RtMI = &*I;
709
710  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
711  // Change the scaled offset from small to large type.
712  if (IsScaled) {
713    assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
714    OffsetImm /= 2;
715  }
716
717  // Construct the new instruction.
718  DebugLoc DL = I->getDebugLoc();
719  MachineBasicBlock *MBB = I->getParent();
720  MachineInstrBuilder MIB;
721  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
722            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
723            .add(BaseRegOp)
724            .addImm(OffsetImm)
725            .cloneMergedMemRefs({&*I, &*MergeMI})
726            .setMIFlags(I->mergeFlagsWith(*MergeMI));
727  (void)MIB;
728
729  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
730  LLVM_DEBUG(I->print(dbgs()));
731  LLVM_DEBUG(dbgs() << "    ");
732  LLVM_DEBUG(MergeMI->print(dbgs()));
733  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
734  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
735  LLVM_DEBUG(dbgs() << "\n");
736
737  // Erase the old instructions.
738  I->eraseFromParent();
739  MergeMI->eraseFromParent();
740  return NextI;
741}
742
743// Apply Fn to all instructions between MI and the beginning of the block, until
744// a def for DefReg is reached. Returns true iff Fn returns true for all
745// visited instructions. Stops after visiting Limit instructions.
746static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
747                              const TargetRegisterInfo *TRI, unsigned Limit,
748                              std::function<bool(MachineInstr &, bool)> &Fn) {
749  auto MBB = MI.getParent();
750  for (MachineBasicBlock::reverse_iterator I = MI.getReverseIterator(),
751                                           E = MBB->rend();
752       I != E; I++) {
753    if (!Limit)
754      return false;
755    --Limit;
756
757    bool isDef = any_of(I->operands(), [DefReg, TRI](MachineOperand &MOP) {
758      return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
759             TRI->regsOverlap(MOP.getReg(), DefReg);
760    });
761    if (!Fn(*I, isDef))
762      return false;
763    if (isDef)
764      break;
765  }
766  return true;
767}
768
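// Update Units to reflect MI: drop register units whose last use (kill) is in
// MI, then add units for every remaining register operand of MI (defs and
// non-killed uses).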
769static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
770                                   const TargetRegisterInfo *TRI) {
771
772  for (const MachineOperand &MOP : phys_regs_and_masks(MI))
773    if (MOP.isReg() && MOP.isKill())
774      Units.removeReg(MOP.getReg());
775
776  for (const MachineOperand &MOP : phys_regs_and_masks(MI))
777    if (MOP.isReg() && !MOP.isKill())
778      Units.addReg(MOP.getReg());
779}
780
781MachineBasicBlock::iterator
782AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
783                                      MachineBasicBlock::iterator Paired,
784                                      const LdStPairFlags &Flags) {
785  MachineBasicBlock::iterator NextI = I;
786  ++NextI;
787  // If NextI is the second of the two instructions to be merged, we need
788// to skip one further. Whichever way we merge, the iterator will be invalidated,
789  // and we don't need to scan the new instruction, as it's a pairwise
790  // instruction, which we're not considering for further action anyway.
791  if (NextI == Paired)
792    ++NextI;
793
794  int SExtIdx = Flags.getSExtIdx();
795  unsigned Opc =
796      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
797  bool IsUnscaled = TII->isUnscaledLdSt(Opc);
798  int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
799
800  bool MergeForward = Flags.getMergeForward();
801
802  Optional<MCPhysReg> RenameReg = Flags.getRenameReg();
803  if (MergeForward && RenameReg) {
804    MCRegister RegToRename = getLdStRegOp(*I).getReg();
805    DefinedInBB.addReg(*RenameReg);
806
807    // Return the sub/super register for RenameReg, matching the size of
808    // OriginalReg.
809    auto GetMatchingSubReg = [this,
810                              RenameReg](MCPhysReg OriginalReg) -> MCPhysReg {
811      for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
812        if (TRI->getMinimalPhysRegClass(OriginalReg) ==
813            TRI->getMinimalPhysRegClass(SubOrSuper))
814          return SubOrSuper;
815      llvm_unreachable("Should have found matching sub or super register!");
816    };
817
818    std::function<bool(MachineInstr &, bool)> UpdateMIs =
819        [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
820          if (IsDef) {
821            bool SeenDef = false;
822            for (auto &MOP : MI.operands()) {
823              // Rename the first explicit definition and all implicit
824              // definitions matching RegToRename.
825              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
826                  (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
827                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
828                assert((MOP.isImplicit() ||
829                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
830                       "Need renamable operands");
831                MOP.setReg(GetMatchingSubReg(MOP.getReg()));
832                SeenDef = true;
833              }
834            }
835          } else {
836            for (auto &MOP : MI.operands()) {
837              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
838                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
839                assert((MOP.isImplicit() ||
840                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
841                           "Need renamable operands");
842                MOP.setReg(GetMatchingSubReg(MOP.getReg()));
843              }
844            }
845          }
846          LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n");
847          return true;
848        };
849    forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);
850
851#if !defined(NDEBUG)
852    // Make sure the register used for renaming is not used between the paired
853    // instructions. That would trash the content before the new paired
854    // instruction.
855    for (auto &MI :
856         iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
857             std::next(I), std::next(Paired)))
858      assert(all_of(MI.operands(),
859                    [this, &RenameReg](const MachineOperand &MOP) {
860                      return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
861                             !TRI->regsOverlap(MOP.getReg(), *RenameReg);
862                    }) &&
863             "Rename register used between paired instruction, trashing the "
864             "content");
865#endif
866  }
867
868  // Insert our new paired instruction after whichever of the paired
869  // instructions MergeForward indicates.
870  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
871// MergeForward also determines from which instruction we copy the base register
872// operand, so the new instruction gets flags compatible with the input code.
873  const MachineOperand &BaseRegOp =
874      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
875
876  int Offset = getLdStOffsetOp(*I).getImm();
877  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
878  bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
879  if (IsUnscaled != PairedIsUnscaled) {
880    // We're trying to pair instructions that differ in how they are scaled.  If
881    // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
882    // the opposite (i.e., make Paired's offset unscaled).
883    int MemSize = TII->getMemScale(*Paired);
884    if (PairedIsUnscaled) {
885      // If the unscaled offset isn't a multiple of the MemSize, we can't
886      // pair the operations together.
887      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
888             "Offset should be a multiple of the stride!");
889      PairedOffset /= MemSize;
890    } else {
891      PairedOffset *= MemSize;
892    }
893  }
894
895  // Which register is Rt and which is Rt2 depends on the offset order.
896  MachineInstr *RtMI, *Rt2MI;
897  if (Offset == PairedOffset + OffsetStride) {
898    RtMI = &*Paired;
899    Rt2MI = &*I;
900    // Here we swapped the assumption made for SExtIdx.
901    // I.e., we turn ldp I, Paired into ldp Paired, I.
902    // Update the index accordingly.
903    if (SExtIdx != -1)
904      SExtIdx = (SExtIdx + 1) % 2;
905  } else {
906    RtMI = &*I;
907    Rt2MI = &*Paired;
908  }
909  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
910  // Scale the immediate offset, if necessary.
911  if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
912    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
913           "Unscaled offset cannot be scaled.");
914    OffsetImm /= TII->getMemScale(*RtMI);
915  }
916
917  // Construct the new instruction.
918  MachineInstrBuilder MIB;
919  DebugLoc DL = I->getDebugLoc();
920  MachineBasicBlock *MBB = I->getParent();
921  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
922  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
923  // Kill flags may become invalid when moving stores for pairing.
924  if (RegOp0.isUse()) {
925    if (!MergeForward) {
926      // Clear kill flags on store if moving upwards. Example:
927      //   STRWui %w0, ...
928      //   USE %w1
929      //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
930      RegOp0.setIsKill(false);
931      RegOp1.setIsKill(false);
932    } else {
933      // Clear kill flags of the first stores register. Example:
934      //   STRWui %w1, ...
935      //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
936      //   STRW %w0
937      Register Reg = getLdStRegOp(*I).getReg();
938      for (MachineInstr &MI : make_range(std::next(I), Paired))
939        MI.clearRegisterKills(Reg, TRI);
940    }
941  }
942  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
943            .add(RegOp0)
944            .add(RegOp1)
945            .add(BaseRegOp)
946            .addImm(OffsetImm)
947            .cloneMergedMemRefs({&*I, &*Paired})
948            .setMIFlags(I->mergeFlagsWith(*Paired));
949
950  (void)MIB;
951
952  LLVM_DEBUG(
953      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
954  LLVM_DEBUG(I->print(dbgs()));
955  LLVM_DEBUG(dbgs() << "    ");
956  LLVM_DEBUG(Paired->print(dbgs()));
957  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
958  if (SExtIdx != -1) {
959    // Generate the sign extension for the proper result of the ldp.
960    // I.e., with X1, that would be:
961    // %w1 = KILL %w1, implicit-def %x1
962    // %x1 = SBFMXri killed %x1, 0, 31
963    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
964    // Right now, DstMO has the extended register, since it comes from an
965    // extended opcode.
966    Register DstRegX = DstMO.getReg();
967    // Get the W variant of that register.
968    Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
969    // Update the result of LDP to use the W instead of the X variant.
970    DstMO.setReg(DstRegW);
971    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
972    LLVM_DEBUG(dbgs() << "\n");
973    // Make the machine verifier happy by providing a definition for
974    // the X register.
975    // Insert this definition right after the generated LDP, i.e., before
976    // InsertionPoint.
977    MachineInstrBuilder MIBKill =
978        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
979            .addReg(DstRegW)
980            .addReg(DstRegX, RegState::Define);
981    MIBKill->getOperand(2).setImplicit();
982    // Create the sign extension.
983    MachineInstrBuilder MIBSXTW =
984        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
985            .addReg(DstRegX)
986            .addImm(0)
987            .addImm(31);
988    (void)MIBSXTW;
989    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
990    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
991  } else {
992    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
993  }
994  LLVM_DEBUG(dbgs() << "\n");
995
996  if (MergeForward)
997    for (const MachineOperand &MOP : phys_regs_and_masks(*I))
998      if (MOP.isReg() && MOP.isKill())
999        DefinedInBB.addReg(MOP.getReg());
1000
1001  // Erase the old instructions.
1002  I->eraseFromParent();
1003  Paired->eraseFromParent();
1004
1005  return NextI;
1006}
1007
1008MachineBasicBlock::iterator
1009AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1010                                          MachineBasicBlock::iterator StoreI) {
1011  MachineBasicBlock::iterator NextI = LoadI;
1012  ++NextI;
1013
1014  int LoadSize = TII->getMemScale(*LoadI);
1015  int StoreSize = TII->getMemScale(*StoreI);
1016  Register LdRt = getLdStRegOp(*LoadI).getReg();
1017  const MachineOperand &StMO = getLdStRegOp(*StoreI);
1018  Register StRt = getLdStRegOp(*StoreI).getReg();
1019  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
1020
1021  assert((IsStoreXReg ||
1022          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1023         "Unexpected RegClass");
1024
1025  MachineInstr *BitExtMI;
1026  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1027    // Remove the load if the destination register of the load is the same
1028    // register as the stored value.
1029    if (StRt == LdRt && LoadSize == 8) {
1030      for (MachineInstr &MI : make_range(StoreI->getIterator(),
1031                                         LoadI->getIterator())) {
1032        if (MI.killsRegister(StRt, TRI)) {
1033          MI.clearRegisterKills(StRt, TRI);
1034          break;
1035        }
1036      }
1037      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
1038      LLVM_DEBUG(LoadI->print(dbgs()));
1039      LLVM_DEBUG(dbgs() << "\n");
1040      LoadI->eraseFromParent();
1041      return NextI;
1042    }
1043    // Replace the load with a mov if the load and store have the same size.
1044    BitExtMI =
1045        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1046                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
1047            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1048            .add(StMO)
1049            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1050            .setMIFlags(LoadI->getFlags());
1051  } else {
1052    // FIXME: Currently we disable this transformation in big-endian targets as
1053    // performance and correctness are verified only in little-endian.
1054    if (!Subtarget->isLittleEndian())
1055      return NextI;
1056    bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
1057    assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
1058           "Unsupported ld/st match");
1059    assert(LoadSize <= StoreSize && "Invalid load size");
1060    int UnscaledLdOffset = IsUnscaled
1061                               ? getLdStOffsetOp(*LoadI).getImm()
1062                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
1063    int UnscaledStOffset = IsUnscaled
1064                               ? getLdStOffsetOp(*StoreI).getImm()
1065                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
1066    int Width = LoadSize * 8;
1067    unsigned DestReg =
1068        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1069                          LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
1070                    : LdRt;
1071
1072    assert((UnscaledLdOffset >= UnscaledStOffset &&
1073            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1074           "Invalid offset");
1075
1076    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1077    int Imms = Immr + Width - 1;
1078    if (UnscaledLdOffset == UnscaledStOffset) {
1079      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1080                                | ((Immr) << 6)               // immr
1081                                | ((Imms) << 0)               // imms
1082          ;
1083
1084      BitExtMI =
1085          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1086                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1087                  DestReg)
1088              .add(StMO)
1089              .addImm(AndMaskEncoded)
1090              .setMIFlags(LoadI->getFlags());
1091    } else {
1092      BitExtMI =
1093          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1094                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1095                  DestReg)
1096              .add(StMO)
1097              .addImm(Immr)
1098              .addImm(Imms)
1099              .setMIFlags(LoadI->getFlags());
1100    }
1101  }
1102
1103  // Clear kill flags between store and load.
1104  for (MachineInstr &MI : make_range(StoreI->getIterator(),
1105                                     BitExtMI->getIterator()))
1106    if (MI.killsRegister(StRt, TRI)) {
1107      MI.clearRegisterKills(StRt, TRI);
1108      break;
1109    }
1110
1111  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
1112  LLVM_DEBUG(StoreI->print(dbgs()));
1113  LLVM_DEBUG(dbgs() << "    ");
1114  LLVM_DEBUG(LoadI->print(dbgs()));
1115  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
1116  LLVM_DEBUG(StoreI->print(dbgs()));
1117  LLVM_DEBUG(dbgs() << "    ");
1118  LLVM_DEBUG((BitExtMI)->print(dbgs()));
1119  LLVM_DEBUG(dbgs() << "\n");
1120
1121  // Erase the old instructions.
1122  LoadI->eraseFromParent();
1123  return NextI;
1124}
1125
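// Return true if Offset (converted to an element count for unscaled inputs)
// fits the 7-bit signed immediate field of a paired load/store.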
1126static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1127  // Convert the byte offset used by unscaled instructions into an "element"
1128  // offset used by the scaled pair load/store instructions.
1129  if (IsUnscaled) {
1130    // If the byte-offset isn't a multiple of the stride, there's no point
1131    // trying to match it.
1132    if (Offset % OffsetStride)
1133      return false;
1134    Offset /= OffsetStride;
1135  }
1136  return Offset <= 63 && Offset >= -64;
1137}
1138
1139// Do alignment, specialized to a power of 2 and for signed ints,
1140// avoiding having to do a C-style cast from uint64_t to int when
1141// using alignTo from include/llvm/Support/MathExtras.h.
1142// FIXME: Move this function to include/MathExtras.h?
1143static int alignTo(int Num, int PowOf2) {
1144  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1145}
1146
1147static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
1148                     AliasAnalysis *AA) {
1149  // One of the instructions must modify memory.
1150  if (!MIa.mayStore() && !MIb.mayStore())
1151    return false;
1152
1153  // Both instructions must be memory operations.
1154  if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
1155    return false;
1156
1157  return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
1158}
1159
1160static bool mayAlias(MachineInstr &MIa,
1161                     SmallVectorImpl<MachineInstr *> &MemInsns,
1162                     AliasAnalysis *AA) {
1163  for (MachineInstr *MIb : MemInsns)
1164    if (mayAlias(MIa, *MIb, AA))
1165      return true;
1166
1167  return false;
1168}
1169
1170bool AArch64LoadStoreOpt::findMatchingStore(
1171    MachineBasicBlock::iterator I, unsigned Limit,
1172    MachineBasicBlock::iterator &StoreI) {
1173  MachineBasicBlock::iterator B = I->getParent()->begin();
1174  MachineBasicBlock::iterator MBBI = I;
1175  MachineInstr &LoadMI = *I;
1176  Register BaseReg = getLdStBaseOp(LoadMI).getReg();
1177
1178  // If the load is the first instruction in the block, there's obviously
1179  // not any matching store.
1180  if (MBBI == B)
1181    return false;
1182
1183  // Track which register units have been modified and used between the first
1184  // insn and the second insn.
1185  ModifiedRegUnits.clear();
1186  UsedRegUnits.clear();
1187
1188  unsigned Count = 0;
1189  do {
1190    --MBBI;
1191    MachineInstr &MI = *MBBI;
1192
1193    // Don't count transient instructions towards the search limit since there
1194    // may be different numbers of them if e.g. debug information is present.
1195    if (!MI.isTransient())
1196      ++Count;
1197
1198    // If the load instruction reads directly from the address to which the
1199    // store instruction writes and the stored value is not modified, we can
1200    // promote the load. Since we do not handle stores with pre-/post-index,
1201    // it's unnecessary to check if BaseReg is modified by the store itself.
1202    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1203        BaseReg == getLdStBaseOp(MI).getReg() &&
1204        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1205        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1206      StoreI = MBBI;
1207      return true;
1208    }
1209
1210    if (MI.isCall())
1211      return false;
1212
1213    // Update modified / uses register units.
1214    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1215
1216    // Otherwise, if the base register is modified, we have no match, so
1217    // return early.
1218    if (!ModifiedRegUnits.available(BaseReg))
1219      return false;
1220
1221    // If we encounter a store aliased with the load, return early.
1222    if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
1223      return false;
1224  } while (MBBI != B && Count < Limit);
1225  return false;
1226}
1227
1228// Returns true if FirstMI and MI are candidates for merging or pairing.
1229// Otherwise, returns false.
1230static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1231                                       LdStPairFlags &Flags,
1232                                       const AArch64InstrInfo *TII) {
1233  // If this is volatile or if pairing is suppressed, not a candidate.
1234  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1235    return false;
1236
1237  // We should have already checked FirstMI for pair suppression and volatility.
1238  assert(!FirstMI.hasOrderedMemoryRef() &&
1239         !TII->isLdStPairSuppressed(FirstMI) &&
1240         "FirstMI shouldn't get here if either of these checks are true.");
1241
1242  unsigned OpcA = FirstMI.getOpcode();
1243  unsigned OpcB = MI.getOpcode();
1244
1245  // Opcodes match: nothing more to check.
1246  if (OpcA == OpcB)
1247    return true;
1248
1249  // Try to match a sign-extended load/store with a zero-extended load/store.
1250  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1251  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1252  assert(IsValidLdStrOpc &&
1253         "Given Opc should be a Load or Store with an immediate");
1254  // OpcA will be the first instruction in the pair.
1255  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1256    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1257    return true;
1258  }
1259
1260  // If the second instruction isn't even a mergable/pairable load/store, bail
1261  // out.
1262  if (!PairIsValidLdStrOpc)
1263    return false;
1264
1265  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1266  // offsets.
1267  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1268    return false;
1269
1270  // Try to match an unscaled load/store with a scaled load/store.
1271  return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
1272         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1273
1274  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1275}
1276
1277static bool
1278canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1279                 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1280                 const TargetRegisterInfo *TRI) {
1281  if (!FirstMI.mayStore())
1282    return false;
1283
1284  // Check if we can find an unused register which we can use to rename
1285  // the register used by the first load/store.
1286  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
1287  MachineFunction &MF = *FirstMI.getParent()->getParent();
1288  if (!RegClass || !MF.getRegInfo().tracksLiveness())
1289    return false;
1290
1291  auto RegToRename = getLdStRegOp(FirstMI).getReg();
1292  // For now, we only rename if the store operand gets killed at the store.
1293  if (!getLdStRegOp(FirstMI).isKill() &&
1294      !any_of(FirstMI.operands(),
1295              [TRI, RegToRename](const MachineOperand &MOP) {
1296                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1297                       MOP.isImplicit() && MOP.isKill() &&
1298                       TRI->regsOverlap(RegToRename, MOP.getReg());
1299              })) {
1300    LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI << "\n");
1301    return false;
1302  }
1303  auto canRenameMOP = [](const MachineOperand &MOP) {
1304    return MOP.isImplicit() ||
1305           (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1306  };
1307
1308  bool FoundDef = false;
1309
1310  // For each instruction between FirstMI and the previous def for RegToRename,
1311  // we
1312  // * check if we can rename RegToRename in this instruction
1313  // * collect the registers used and required register classes for RegToRename.
1314  std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1315                                                           bool IsDef) {
1316    LLVM_DEBUG(dbgs() << "Checking " << MI << "\n");
1317    // Currently we do not try to rename across frame-setup instructions.
1318    if (MI.getFlag(MachineInstr::FrameSetup)) {
1319      LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions currently ("
1320                        << MI << ")\n");
1321      return false;
1322    }
1323
1324    UsedInBetween.accumulate(MI);
1325
1326    // For a definition, check that we can rename the definition and exit the
1327    // loop.
1328    FoundDef = IsDef;
1329
1330    // For defs, check if we can rename the first def of RegToRename.
1331    if (FoundDef) {
1332      // For some pseudo instructions, we might not generate code in the end
1333      // (e.g. KILL) and we would end up without a correct def for the rename
1334      // register.
1335      // TODO: This might be overly conservative and we could handle those cases
1336      // in multiple ways:
1337      //       1. Insert an extra copy, to materialize the def.
1338//       2. Skip pseudo-defs until we find a non-pseudo def.
1339      if (MI.isPseudo()) {
1340        LLVM_DEBUG(dbgs() << "  Cannot rename pseudo instruction " << MI
1341                          << "\n");
1342        return false;
1343      }
1344
1345      for (auto &MOP : MI.operands()) {
1346        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1347            !TRI->regsOverlap(MOP.getReg(), RegToRename))
1348          continue;
1349        if (!canRenameMOP(MOP)) {
1350          LLVM_DEBUG(dbgs()
1351                     << "  Cannot rename " << MOP << " in " << MI << "\n");
1352          return false;
1353        }
1354        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1355      }
1356      return true;
1357    } else {
1358      for (auto &MOP : MI.operands()) {
1359        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1360            !TRI->regsOverlap(MOP.getReg(), RegToRename))
1361          continue;
1362
1363        if (!canRenameMOP(MOP)) {
1364          LLVM_DEBUG(dbgs()
1365                     << "  Cannot rename " << MOP << " in " << MI << "\n");
1366          return false;
1367        }
1368        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1369      }
1370    }
1371    return true;
1372  };
1373
1374  if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
1375    return false;
1376
1377  if (!FoundDef) {
1378    LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
1379    return false;
1380  }
1381  return true;
1382}
1383
1384// Check if we can find a physical register for renaming. This register must:
1385// * not be defined up to FirstMI (checked using DefinedInBB),
1386// * not be used between MI and the defining instruction of the register to
1387//   rename (checked using UsedInBetween), and
1388// * be available in all used register classes (checked using RequiredClasses).
1389static Optional<MCPhysReg> tryToFindRegisterToRename(
1390    MachineInstr &FirstMI, MachineInstr &MI, LiveRegUnits &DefinedInBB,
1391    LiveRegUnits &UsedInBetween,
1392    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1393    const TargetRegisterInfo *TRI) {
1394  auto &MF = *FirstMI.getParent()->getParent();
1395  MachineRegisterInfo &RegInfo = MF.getRegInfo();
1396
1397  // Checks if any sub- or super-register of PR is callee saved.
1398  auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1399    return any_of(TRI->sub_and_superregs_inclusive(PR),
1400                  [&MF, TRI](MCPhysReg SubOrSuper) {
1401                    return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
1402                  });
1403  };
1404
1405  // Check if PR or one of its sub- or super-registers can be used for all
1406  // required register classes.
1407  auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1408    return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
1409      return any_of(TRI->sub_and_superregs_inclusive(PR),
1410                    [C, TRI](MCPhysReg SubOrSuper) {
1411                      return C == TRI->getMinimalPhysRegClass(SubOrSuper);
1412                    });
1413    });
1414  };
1415
1416  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
1417  for (const MCPhysReg &PR : *RegClass) {
1418    if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
1419        !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1420        CanBeUsedForAllClasses(PR)) {
1421      DefinedInBB.addReg(PR);
1422      LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1423                        << "\n");
1424      return {PR};
1425    }
1426  }
1427  LLVM_DEBUG(dbgs() << "No rename register found from "
1428                    << TRI->getRegClassName(RegClass) << "\n");
1429  return None;
1430}
1431
1432/// Scan the instructions looking for a load/store that can be combined with the
1433/// current instruction into a wider equivalent or a load/store pair.
1434MachineBasicBlock::iterator
1435AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1436                                      LdStPairFlags &Flags, unsigned Limit,
1437                                      bool FindNarrowMerge) {
1438  MachineBasicBlock::iterator E = I->getParent()->end();
1439  MachineBasicBlock::iterator MBBI = I;
1440  MachineBasicBlock::iterator MBBIWithRenameReg;
1441  MachineInstr &FirstMI = *I;
1442  ++MBBI;
1443
1444  bool MayLoad = FirstMI.mayLoad();
1445  bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
1446  Register Reg = getLdStRegOp(FirstMI).getReg();
1447  Register BaseReg = getLdStBaseOp(FirstMI).getReg();
1448  int Offset = getLdStOffsetOp(FirstMI).getImm();
1449  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
1450  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1451
1452  Optional<bool> MaybeCanRename = None;
1453  if (!EnableRenaming)
1454    MaybeCanRename = {false};
1455
1456  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
1457  LiveRegUnits UsedInBetween;
1458  UsedInBetween.init(*TRI);
1459
1460  Flags.clearRenameReg();
1461
1462  // Track which register units have been modified and used between the first
1463  // insn (inclusive) and the second insn.
1464  ModifiedRegUnits.clear();
1465  UsedRegUnits.clear();
1466
1467  // Remember any instructions that read/write memory between FirstMI and MI.
1468  SmallVector<MachineInstr *, 4> MemInsns;
1469
1470  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1471    MachineInstr &MI = *MBBI;
1472
1473    UsedInBetween.accumulate(MI);
1474
1475    // Don't count transient instructions towards the search limit since there
1476    // may be different numbers of them if e.g. debug information is present.
1477    if (!MI.isTransient())
1478      ++Count;
1479
1480    Flags.setSExtIdx(-1);
1481    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1482        getLdStOffsetOp(MI).isImm()) {
1483      assert(MI.mayLoadOrStore() && "Expected memory operation.");
1484      // If we've found another instruction with the same opcode, check to see
1485      // if the base and offset are compatible with our starting instruction.
1486      // These instructions all have scaled immediate operands, so we just
1487      // check for +1/-1. Make sure to check the new instruction offset is
1488      // actually an immediate and not a symbolic reference destined for
1489      // a relocation.
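      // For instance (illustrative), a scaled "ldr x0, [x2, #8]" (immediate
      // operand 1) and "ldr x1, [x2, #16]" (immediate operand 2) have
      // immediates that differ by one and are candidates for an ldp.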
1490      Register MIBaseReg = getLdStBaseOp(MI).getReg();
1491      int MIOffset = getLdStOffsetOp(MI).getImm();
1492      bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
1493      if (IsUnscaled != MIIsUnscaled) {
1494        // We're trying to pair instructions that differ in how they are scaled.
1495        // If FirstMI is scaled then scale the offset of MI accordingly.
1496        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
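        // For example (illustrative), when FirstMI is a scaled LDRXui and MI
        // is an unscaled LDURXi with immediate 16, MI's offset becomes
        // 16 / 8 == 2, so both offsets are compared in the same scaled units.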
1497        int MemSize = TII->getMemScale(MI);
1498        if (MIIsUnscaled) {
1499          // If the unscaled offset isn't a multiple of the MemSize, we can't
1500          // pair the operations together: bail and keep looking.
1501          if (MIOffset % MemSize) {
1502            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1503                                              UsedRegUnits, TRI);
1504            MemInsns.push_back(&MI);
1505            continue;
1506          }
1507          MIOffset /= MemSize;
1508        } else {
1509          MIOffset *= MemSize;
1510        }
1511      }
1512
1513      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
1514                                   (Offset + OffsetStride == MIOffset))) {
1515        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1516        if (FindNarrowMerge) {
1517          // If the alignment requirements of the scaled wide load/store
1518          // instruction can't express the offset of the scaled narrow input,
1519          // bail and keep looking. For promotable zero stores, allow only when
1520          // the stored value is the same (i.e., WZR).
1521          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1522              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1523            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1524                                              UsedRegUnits, TRI);
1525            MemInsns.push_back(&MI);
1526            continue;
1527          }
1528        } else {
1529          // Pairwise instructions have a 7-bit signed offset field. Single
1530          // insns have a 12-bit unsigned offset field.  If the resultant
1531          // immediate offset of merging these instructions is out of range for
1532          // a pairwise instruction, bail and keep looking.
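          // For 64-bit registers, for example, the LDP/STP immediate covers
          // [-64, 63] scaled by 8, i.e. byte offsets -512 to 504.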
1533          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1534            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1535                                              UsedRegUnits, TRI);
1536            MemInsns.push_back(&MI);
1537            continue;
1538          }
1539          // If the alignment requirements of the paired (scaled) instruction
1540          // can't express the offset of the unscaled input, bail and keep
1541          // looking.
1542          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1543            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1544                                              UsedRegUnits, TRI);
1545            MemInsns.push_back(&MI);
1546            continue;
1547          }
1548        }
1549        // If the destination register of the loads is the same register, bail
1550        // and keep looking. A load-pair instruction with both destination
1551        // registers the same is UNPREDICTABLE and will result in an exception.
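        // For example, we must not turn
        //   ldr x0, [x1]
        //   ldr x0, [x1, #8]
        // into "ldp x0, x0, [x1]".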
1552        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
1553          LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
1554                                            TRI);
1555          MemInsns.push_back(&MI);
1556          continue;
1557        }
1558
1559        // If the Rt of the second instruction was not modified or used between
1560        // the two instructions and none of the instructions between the second
1561        // and first alias with the second, we can combine the second into the
1562        // first.
1563        if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
1564            !(MI.mayLoad() &&
1565              !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
1566            !mayAlias(MI, MemInsns, AA)) {
1567
1568          Flags.setMergeForward(false);
1569          Flags.clearRenameReg();
1570          return MBBI;
1571        }
1572
1573        // Likewise, if the Rt of the first instruction is not modified or used
1574        // between the two instructions and none of the instructions between the
1575        // first and the second alias with the first, we can combine the first
1576        // into the second.
1577        if (!(MayLoad &&
1578              !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
1579            !mayAlias(FirstMI, MemInsns, AA)) {
1580
1581          if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
1582            Flags.setMergeForward(true);
1583            Flags.clearRenameReg();
1584            return MBBI;
1585          }
1586
1587          if (DebugCounter::shouldExecute(RegRenamingCounter)) {
1588            if (!MaybeCanRename)
1589              MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
1590                                                 RequiredClasses, TRI)};
1591
1592            if (*MaybeCanRename) {
1593              Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename(
1594                  FirstMI, MI, DefinedInBB, UsedInBetween, RequiredClasses,
1595                  TRI);
1596              if (MaybeRenameReg) {
1597                Flags.setRenameReg(*MaybeRenameReg);
1598                Flags.setMergeForward(true);
1599                MBBIWithRenameReg = MBBI;
1600              }
1601            }
1602          }
1603        }
1604        // Unable to combine these instructions due to interference in between.
1605        // Keep looking.
1606      }
1607    }
1608
1609    if (Flags.getRenameReg())
1610      return MBBIWithRenameReg;
1611
1612    // The instruction wasn't a matching load or store. Stop searching if we
1613    // encounter a call instruction that might modify memory.
1614    if (MI.isCall())
1615      return E;
1616
1617    // Update modified / uses register units.
1618    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1619
1620    // Otherwise, if the base register is modified, we have no match, so
1621    // return early.
1622    if (!ModifiedRegUnits.available(BaseReg))
1623      return E;
1624
1625    // Update list of instructions that read/write memory.
1626    if (MI.mayLoadOrStore())
1627      MemInsns.push_back(&MI);
1628  }
1629  return E;
1630}
1631
1632MachineBasicBlock::iterator
1633AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1634                                     MachineBasicBlock::iterator Update,
1635                                     bool IsPreIdx) {
1636  assert((Update->getOpcode() == AArch64::ADDXri ||
1637          Update->getOpcode() == AArch64::SUBXri) &&
1638         "Unexpected base register update instruction to merge!");
1639  MachineBasicBlock::iterator NextI = I;
1640  // Return the instruction following the merged instruction, which is
1641  // the instruction following our unmerged load. Unless that's the add/sub
1642  // instruction we're merging, in which case it's the one after that.
1643  if (++NextI == Update)
1644    ++NextI;
1645
1646  int Value = Update->getOperand(2).getImm();
1647  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1648         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1649  if (Update->getOpcode() == AArch64::SUBXri)
1650    Value = -Value;
1651
1652  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1653                             : getPostIndexedOpcode(I->getOpcode());
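  // The writeback immediate of the new instruction is in the scaled units of
  // the pre/post-indexed form, hence the Value / Scale below. For example
  // (illustrative), folding "add x2, x2, #16" into "ldp x0, x1, [x2]" yields
  // "ldp x0, x1, [x2], #16" with an encoded immediate of 16 / 8 == 2.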
1654  MachineInstrBuilder MIB;
1655  int Scale, MinOffset, MaxOffset;
1656  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
1657  if (!isPairedLdSt(*I)) {
1658    // Non-paired instruction.
1659    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1660              .add(getLdStRegOp(*Update))
1661              .add(getLdStRegOp(*I))
1662              .add(getLdStBaseOp(*I))
1663              .addImm(Value / Scale)
1664              .setMemRefs(I->memoperands())
1665              .setMIFlags(I->mergeFlagsWith(*Update));
1666  } else {
1667    // Paired instruction.
1668    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1669              .add(getLdStRegOp(*Update))
1670              .add(getLdStRegOp(*I, 0))
1671              .add(getLdStRegOp(*I, 1))
1672              .add(getLdStBaseOp(*I))
1673              .addImm(Value / Scale)
1674              .setMemRefs(I->memoperands())
1675              .setMIFlags(I->mergeFlagsWith(*Update));
1676  }
1677  (void)MIB;
1678
1679  if (IsPreIdx) {
1680    ++NumPreFolded;
1681    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
1682  } else {
1683    ++NumPostFolded;
1684    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
1685  }
1686  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
1687  LLVM_DEBUG(I->print(dbgs()));
1688  LLVM_DEBUG(dbgs() << "    ");
1689  LLVM_DEBUG(Update->print(dbgs()));
1690  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
1691  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1692  LLVM_DEBUG(dbgs() << "\n");
1693
1694  // Erase the old instructions for the block.
1695  I->eraseFromParent();
1696  Update->eraseFromParent();
1697
1698  return NextI;
1699}
1700
1701bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1702                                               MachineInstr &MI,
1703                                               unsigned BaseReg, int Offset) {
1704  switch (MI.getOpcode()) {
1705  default:
1706    break;
1707  case AArch64::SUBXri:
1708  case AArch64::ADDXri:
1709    // Make sure it's a vanilla immediate operand, not a relocation or
1710    // anything else we can't handle.
1711    if (!MI.getOperand(2).isImm())
1712      break;
1713    // Watch out for 1 << 12 shifted value.
1714    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1715      break;
1716
1717    // The update instruction source and destination register must be the
1718    // same as the load/store base register.
1719    if (MI.getOperand(0).getReg() != BaseReg ||
1720        MI.getOperand(1).getReg() != BaseReg)
1721      break;
1722
1723    int UpdateOffset = MI.getOperand(2).getImm();
1724    if (MI.getOpcode() == AArch64::SUBXri)
1725      UpdateOffset = -UpdateOffset;
1726
1727    // The immediate must be a multiple of the scaling factor of the pre/post
1728    // indexed instruction.
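    // For example (illustrative), a pre/post-indexed LDP of 64-bit registers
    // scales its immediate by 8, so only updates that are a multiple of 8 in
    // the byte range [-512, 504] can be folded.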
1729    int Scale, MinOffset, MaxOffset;
1730    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
1731    if (UpdateOffset % Scale != 0)
1732      break;
1733
1734    // Scaled offset must fit in the instruction immediate.
1735    int ScaledOffset = UpdateOffset / Scale;
1736    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
1737      break;
1738
1739    // If we have a non-zero Offset, check that it matches the amount we're
1740    // adding to the register; a zero Offset matches any update amount.
1741    if (!Offset || Offset == UpdateOffset)
1742      return true;
1743    break;
1744  }
1745  return false;
1746}
1747
1748MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1749    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1750  MachineBasicBlock::iterator E = I->getParent()->end();
1751  MachineInstr &MemMI = *I;
1752  MachineBasicBlock::iterator MBBI = I;
1753
1754  Register BaseReg = getLdStBaseOp(MemMI).getReg();
1755  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);
1756
1757  // Scan forward looking for post-index opportunities.  Updating instructions
1758  // can't be formed if the memory instruction doesn't have the offset we're
1759  // looking for.
1760  if (MIUnscaledOffset != UnscaledOffset)
1761    return E;
1762
1763  // If the base register overlaps a source/destination register, we can't
1764  // merge the update. This does not apply to tag store instructions, which
1765  // ignore the address part of the source register.
1766  // Nor does it apply to STGPi which, unlike normal stores, has no
1767  // unpredictable behavior in this case and always performs writeback after
1768  // reading the source register value.
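  // For example, "ldr x0, [x0]" followed by "add x0, x0, #8" must not become
  // "ldr x0, [x0], #8", where the written-back base register would overlap
  // the destination register.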
1769  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
1770    bool IsPairedInsn = isPairedLdSt(MemMI);
1771    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1772      Register DestReg = getLdStRegOp(MemMI, i).getReg();
1773      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1774        return E;
1775    }
1776  }
1777
1778  // Track which register units have been modified and used between the first
1779  // insn (inclusive) and the second insn.
1780  ModifiedRegUnits.clear();
1781  UsedRegUnits.clear();
1782  ++MBBI;
1783  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1784    MachineInstr &MI = *MBBI;
1785
1786    // Don't count transient instructions towards the search limit since there
1787    // may be different numbers of them if e.g. debug information is present.
1788    if (!MI.isTransient())
1789      ++Count;
1790
1791    // If we found a match, return it.
1792    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1793      return MBBI;
1794
1795    // Update the status of what the instruction clobbered and used.
1796    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1797
1798    // Otherwise, if the base register is used or modified, we have no match, so
1799    // return early.
1800    if (!ModifiedRegUnits.available(BaseReg) ||
1801        !UsedRegUnits.available(BaseReg))
1802      return E;
1803  }
1804  return E;
1805}
1806
1807MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1808    MachineBasicBlock::iterator I, unsigned Limit) {
1809  MachineBasicBlock::iterator B = I->getParent()->begin();
1810  MachineBasicBlock::iterator E = I->getParent()->end();
1811  MachineInstr &MemMI = *I;
1812  MachineBasicBlock::iterator MBBI = I;
1813
1814  Register BaseReg = getLdStBaseOp(MemMI).getReg();
1815  int Offset = getLdStOffsetOp(MemMI).getImm();
1816
1817  // If the load/store is the first instruction in the block, there's obviously
1818  // not any matching update. Ditto if the memory offset isn't zero.
1819  if (MBBI == B || Offset != 0)
1820    return E;
1821  // If the base register overlaps a destination register, we can't
1822  // merge the update.
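  // For example, "add x0, x0, #8" followed by "ldr x0, [x0]" must not become
  // "ldr x0, [x0, #8]!".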
1823  if (!isTagStore(MemMI)) {
1824    bool IsPairedInsn = isPairedLdSt(MemMI);
1825    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1826      Register DestReg = getLdStRegOp(MemMI, i).getReg();
1827      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1828        return E;
1829    }
1830  }
1831
1832  // Track which register units have been modified and used between the first
1833  // insn (inclusive) and the second insn.
1834  ModifiedRegUnits.clear();
1835  UsedRegUnits.clear();
1836  unsigned Count = 0;
1837  do {
1838    --MBBI;
1839    MachineInstr &MI = *MBBI;
1840
1841    // Don't count transient instructions towards the search limit since there
1842    // may be different numbers of them if e.g. debug information is present.
1843    if (!MI.isTransient())
1844      ++Count;
1845
1846    // If we found a match, return it.
1847    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
1848      return MBBI;
1849
1850    // Update the status of what the instruction clobbered and used.
1851    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1852
1853    // Otherwise, if the base register is used or modified, we have no match, so
1854    // return early.
1855    if (!ModifiedRegUnits.available(BaseReg) ||
1856        !UsedRegUnits.available(BaseReg))
1857      return E;
1858  } while (MBBI != B && Count < Limit);
1859  return E;
1860}
1861
1862bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
1863    MachineBasicBlock::iterator &MBBI) {
1864  MachineInstr &MI = *MBBI;
1865  // If this is a volatile load, don't mess with it.
1866  // If this is a volatile or ordered load, don't mess with it.
1867    return false;
1868
1869  // Make sure this is a reg+imm.
1870  // FIXME: It is possible to extend it to handle reg+reg cases.
1871  if (!getLdStOffsetOp(MI).isImm())
1872    return false;
1873
1874  // Look backward up to LdStLimit instructions.
1875  MachineBasicBlock::iterator StoreI;
1876  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
1877    ++NumLoadsFromStoresPromoted;
1878    // Promote the load. Keeping the iterator straight is a
1879    // pain, so we let the merge routine tell us what the next instruction
1880    // is after it's done mucking about.
1881    MBBI = promoteLoadFromStore(MBBI, StoreI);
1882    return true;
1883  }
1884  return false;
1885}
1886
1887// Merge adjacent zero stores into a wider store.
1888bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
1889    MachineBasicBlock::iterator &MBBI) {
1890  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
1891  MachineInstr &MI = *MBBI;
1892  MachineBasicBlock::iterator E = MI.getParent()->end();
1893
1894  if (!TII->isCandidateToMergeOrPair(MI))
1895    return false;
1896
1897  // Look ahead up to LdStLimit instructions for a mergeable instruction.
1898  LdStPairFlags Flags;
1899  MachineBasicBlock::iterator MergeMI =
1900      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
1901  if (MergeMI != E) {
1902    ++NumZeroStoresPromoted;
1903
1904    // Keeping the iterator straight is a pain, so we let the merge routine tell
1905    // us what the next instruction is after it's done mucking about.
1906    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
1907    return true;
1908  }
1909  return false;
1910}
1911
1912// Find loads and stores that can be merged into a single load or store pair
1913// instruction.
1914bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
1915  MachineInstr &MI = *MBBI;
1916  MachineBasicBlock::iterator E = MI.getParent()->end();
1917
1918  if (!TII->isCandidateToMergeOrPair(MI))
1919    return false;
1920
1921  // Early exit if the offset cannot possibly match: the pair immediate has
1922  // 6 bits of positive range, plus we allow an extra one in case we find a
1923  // later insn that matches with Offset-1.
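  // For example (illustrative), a scaled LDRXui with immediate 64 is itself
  // outside the pair range [-64, 63], but it can still pair with a later load
  // at immediate 63, so we test Offset - 1 rather than Offset.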
1924  bool IsUnscaled = TII->isUnscaledLdSt(MI);
1925  int Offset = getLdStOffsetOp(MI).getImm();
1926  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
1927  // Allow one more for offset.
1928  if (Offset > 0)
1929    Offset -= OffsetStride;
1930  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
1931    return false;
1932
1933  // Look ahead up to LdStLimit instructions for a pairable instruction.
1934  LdStPairFlags Flags;
1935  MachineBasicBlock::iterator Paired =
1936      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
1937  if (Paired != E) {
1938    ++NumPairCreated;
1939    if (TII->isUnscaledLdSt(MI))
1940      ++NumUnscaledPairCreated;
1941    // Keeping the iterator straight is a pain, so we let the merge routine tell
1942    // us what the next instruction is after it's done mucking about.
1943    auto Prev = std::prev(MBBI);
1944    MBBI = mergePairedInsns(MBBI, Paired, Flags);
1945    // Collect liveness info for instructions between Prev and the new position
1946    // MBBI.
1947    for (auto I = std::next(Prev); I != MBBI; I++)
1948      updateDefinedRegisters(*I, DefinedInBB, TRI);
1949
1950    return true;
1951  }
1952  return false;
1953}
1954
1955bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
1956    MachineBasicBlock::iterator &MBBI) {
1957  MachineInstr &MI = *MBBI;
1958  MachineBasicBlock::iterator E = MI.getParent()->end();
1959  MachineBasicBlock::iterator Update;
1960
1961  // Look forward to try to form a post-index instruction. For example,
1962  // ldr x0, [x20]
1963  // add x20, x20, #32
1964  //   merged into:
1965  // ldr x0, [x20], #32
1966  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
1967  if (Update != E) {
1968    // Merge the update into the ld/st.
1969    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
1970    return true;
1971  }
1972
1973  // Don't know how to handle unscaled pre/post-index versions below, so bail.
1974  if (TII->isUnscaledLdSt(MI.getOpcode()))
1975    return false;
1976
1977  // Look back to try to find a pre-index instruction. For example,
1978  // add x0, x0, #8
1979  // ldr x1, [x0]
1980  //   merged into:
1981  // ldr x1, [x0, #8]!
1982  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
1983  if (Update != E) {
1984    // Merge the update into the ld/st.
1985    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1986    return true;
1987  }
1988
1989  // The immediate in the load/store is scaled by the size of the memory
1990  // operation. The immediate in the add we're looking for,
1991  // however, is not, so adjust here.
1992  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
1993
1994  // Look forward to try to find a pre-index instruction. For example,
1995  // ldr x1, [x0, #64]
1996  // add x0, x0, #64
1997  //   merged into:
1998  // ldr x1, [x0, #64]!
1999  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2000  if (Update != E) {
2001    // Merge the update into the ld/st.
2002    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2003    return true;
2004  }
2005
2006  return false;
2007}
2008
2009bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2010                                        bool EnableNarrowZeroStOpt) {
2011
2012  bool Modified = false;
2013  // Four transformations to do here:
2014  // 1) Find loads that directly read from stores and promote them by
2015  //    replacing with mov instructions. If the store is wider than the load,
2016  //    the load will be replaced with a bitfield extract.
2017  //      e.g.,
2018  //        str w1, [x0, #4]
2019  //        ldrh w2, [x0, #6]
2020  //        ; becomes
2021  //        str w1, [x0, #4]
2022  //        lsr w2, w1, #16
2023  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2024       MBBI != E;) {
2025    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2026      Modified = true;
2027    else
2028      ++MBBI;
2029  }
2030  // 2) Merge adjacent zero stores into a wider store.
2031  //      e.g.,
2032  //        strh wzr, [x0]
2033  //        strh wzr, [x0, #2]
2034  //        ; becomes
2035  //        str wzr, [x0]
2036  //      e.g.,
2037  //        str wzr, [x0]
2038  //        str wzr, [x0, #4]
2039  //        ; becomes
2040  //        str xzr, [x0]
2041  if (EnableNarrowZeroStOpt)
2042    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2043         MBBI != E;) {
2044      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
2045        Modified = true;
2046      else
2047        ++MBBI;
2048    }
2049  // 3) Find loads and stores that can be merged into a single load or store
2050  //    pair instruction.
2051  //      e.g.,
2052  //        ldr x0, [x2]
2053  //        ldr x1, [x2, #8]
2054  //        ; becomes
2055  //        ldp x0, x1, [x2]
2056
2057  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
2058    DefinedInBB.clear();
2059    DefinedInBB.addLiveIns(MBB);
2060  }
2061
2062  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2063       MBBI != E;) {
2064    // Track currently live registers up to this point, to help with
2065    // searching for a rename register on demand.
2066    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
2067    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
2068      Modified = true;
2069    else
2070      ++MBBI;
2071  }
2072  // 4) Find base register updates that can be merged into the load or store
2073  //    as a base-reg writeback.
2074  //      e.g.,
2075  //        ldr x0, [x2]
2076  //        add x2, x2, #4
2077  //        ; becomes
2078  //        ldr x0, [x2], #4
2079  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2080       MBBI != E;) {
2081    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
2082      Modified = true;
2083    else
2084      ++MBBI;
2085  }
2086
2087  return Modified;
2088}
2089
2090bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2091  if (skipFunction(Fn.getFunction()))
2092    return false;
2093
2094  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
2095  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
2096  TRI = Subtarget->getRegisterInfo();
2097  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2098
2099  // Resize the modified and used register unit trackers.  We do this once
2100  // per function and then clear the register units each time we optimize a load
2101  // or store.
2102  ModifiedRegUnits.init(*TRI);
2103  UsedRegUnits.init(*TRI);
2104  DefinedInBB.init(*TRI);
2105
2106  bool Modified = false;
2107  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
2108  for (auto &MBB : Fn) {
2109    auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
2110    Modified |= M;
2111  }
2112
2113  return Modified;
2114}
2115
2116// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2117// stores near one another?  Note: The pre-RA instruction scheduler already has
2118// hooks to try and schedule pairable loads/stores together to improve pairing
2119// opportunities.  Thus, a pre-RA pairing pass may not be worth the effort.
2120
2121// FIXME: When pairing store instructions it's very possible for this pass to
2122// hoist a store with a KILL marker above another use (without a KILL marker).
2123// The resulting IR is invalid, but nothing uses the KILL markers after this
2124// pass, so it's never caused a problem in practice.
2125
2126/// createAArch64LoadStoreOptimizationPass - returns an instance of the
2127/// load / store optimization pass.
2128FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
2129  return new AArch64LoadStoreOpt();
2130}
2131