1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11//  ds_read_b32 v0, v2 offset:16
12//  ds_read_b32 v1, v2 offset:32
13// ==>
14//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17//  s_buffer_load_dword s4, s[0:3], 4
18//  s_buffer_load_dword s5, s[0:3], 8
19// ==>
20//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from nearby instructions that
// allows the access to use a 13-bit constant offset, which is then promoted
// into the instruction's immediate offset field.
26// E.g.
27//  s_movk_i32 s0, 0x1800
28//  v_add_co_u32_e32 v0, vcc, s0, v2
29//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31//  s_movk_i32 s0, 0x1000
32//  v_add_co_u32_e32 v5, vcc, s0, v2
33//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34//  global_load_dwordx2 v[5:6], v[5:6], off
35//  global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37//  s_movk_i32 s0, 0x1000
38//  v_add_co_u32_e32 v5, vcc, s0, v2
39//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40//  global_load_dwordx2 v[5:6], v[5:6], off
41//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
// - This currently misses stores of constants because the load of the
//   constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
48//
// - Live interval recomputation seems inefficient. The pass currently matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
52//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
60#include "AMDGPU.h"
61#include "GCNSubtarget.h"
62#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63#include "llvm/Analysis/AliasAnalysis.h"
64#include "llvm/CodeGen/MachineFunctionPass.h"
65#include "llvm/InitializePasses.h"
66
67using namespace llvm;
68
69#define DEBUG_TYPE "si-load-store-opt"
70
71namespace {
72enum InstClassEnum {
73  UNKNOWN,
74  DS_READ,
75  DS_WRITE,
76  S_BUFFER_LOAD_IMM,
77  S_BUFFER_LOAD_SGPR_IMM,
78  S_LOAD_IMM,
79  BUFFER_LOAD,
80  BUFFER_STORE,
81  MIMG,
82  TBUFFER_LOAD,
83  TBUFFER_STORE,
84  GLOBAL_LOAD_SADDR,
85  GLOBAL_STORE_SADDR,
86  FLAT_LOAD,
87  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo; they are only ever returned by
               // getCommonInstClass.
91};
92
93struct AddressRegs {
94  unsigned char NumVAddrs = 0;
95  bool SBase = false;
96  bool SRsrc = false;
97  bool SOffset = false;
98  bool SAddr = false;
99  bool VAddr = false;
100  bool Addr = false;
101  bool SSamp = false;
102};
103
104// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105const unsigned MaxAddressRegs = 12 + 1 + 1;
106
107class SILoadStoreOptimizer : public MachineFunctionPass {
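  // State for one mergeable memory instruction: its iterator plus the decoded
  // offset, width, format, dmask and address operands used to decide whether
  // and how it can be combined with another instruction.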
108  struct CombineInfo {
109    MachineBasicBlock::iterator I;
110    unsigned EltSize;
111    unsigned Offset;
112    unsigned Width;
113    unsigned Format;
114    unsigned BaseOff;
115    unsigned DMask;
116    InstClassEnum InstClass;
117    unsigned CPol = 0;
118    bool IsAGPR;
119    bool UseST64;
120    int AddrIdx[MaxAddressRegs];
121    const MachineOperand *AddrReg[MaxAddressRegs];
122    unsigned NumAddresses;
123    unsigned Order;
124
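    // Return true if \p CI uses the same base address as this instruction:
    // every address operand matches as an identical immediate, or as the same
    // register and subregister.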
125    bool hasSameBaseAddress(const CombineInfo &CI) {
126      if (NumAddresses != CI.NumAddresses)
127        return false;
128
129      const MachineInstr &MI = *CI.I;
130      for (unsigned i = 0; i < NumAddresses; i++) {
131        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132
133        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136            return false;
137          }
138          continue;
139        }
140
141        // Check same base pointer. Be careful of subregisters, which can occur
142        // with vectors of pointers.
143        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
146        }
147      }
148      return true;
149    }
150
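    // Return true if the address operands are candidates for merging: each is
    // an immediate, or a register (other than unsupported physical registers)
    // with more than one non-debug use.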
151    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152      for (unsigned i = 0; i < NumAddresses; ++i) {
153        const MachineOperand *AddrOp = AddrReg[i];
154        // Immediates are always OK.
155        if (AddrOp->isImm())
156          continue;
157
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
161        if (!AddrOp->isReg())
162          return false;
163
164        // TODO: We should be able to merge instructions with other physical reg
165        // addresses too.
166        if (AddrOp->getReg().isPhysical() &&
167            AddrOp->getReg() != AMDGPU::SGPR_NULL)
168          return false;
169
170        // If an address has only one use then there will be no other
171        // instructions with the same address, so we can't merge this one.
172        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173          return false;
174      }
175      return true;
176    }
177
178    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179
    // Compare by offset, or by DMask for MIMG instructions (address order).
181    bool operator<(const CombineInfo& Other) const {
182      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183    }
184  };
185
186  struct BaseRegisters {
187    Register LoReg;
188    Register HiReg;
189
190    unsigned LoSubReg = 0;
191    unsigned HiSubReg = 0;
192  };
193
194  struct MemAddress {
195    BaseRegisters Base;
196    int64_t Offset = 0;
197  };
198
199  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200
201private:
202  const GCNSubtarget *STM = nullptr;
203  const SIInstrInfo *TII = nullptr;
204  const SIRegisterInfo *TRI = nullptr;
205  MachineRegisterInfo *MRI = nullptr;
206  AliasAnalysis *AA = nullptr;
207  bool OptimizeAgain;
208
209  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210                           const DenseSet<Register> &ARegUses,
211                           const MachineInstr &A, const MachineInstr &B) const;
212  static bool dmasksCanBeCombined(const CombineInfo &CI,
213                                  const SIInstrInfo &TII,
214                                  const CombineInfo &Paired);
215  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216                                   CombineInfo &Paired, bool Modify = false);
217  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218                        const CombineInfo &Paired);
219  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221                                                     const CombineInfo &Paired);
222  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
223                                                    const CombineInfo &Paired);
224  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
225
226  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
227
228  unsigned read2Opcode(unsigned EltSize) const;
229  unsigned read2ST64Opcode(unsigned EltSize) const;
230  MachineBasicBlock::iterator
231  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
232                 MachineBasicBlock::iterator InsertBefore);
233
234  unsigned write2Opcode(unsigned EltSize) const;
235  unsigned write2ST64Opcode(unsigned EltSize) const;
236  MachineBasicBlock::iterator
237  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
238                  MachineBasicBlock::iterator InsertBefore);
239  MachineBasicBlock::iterator
240  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
241                 MachineBasicBlock::iterator InsertBefore);
242  MachineBasicBlock::iterator
243  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
244                       MachineBasicBlock::iterator InsertBefore);
245  MachineBasicBlock::iterator
246  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
247                      MachineBasicBlock::iterator InsertBefore);
248  MachineBasicBlock::iterator
249  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
250                       MachineBasicBlock::iterator InsertBefore);
251  MachineBasicBlock::iterator
252  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
253                       MachineBasicBlock::iterator InsertBefore);
254  MachineBasicBlock::iterator
255  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
256                        MachineBasicBlock::iterator InsertBefore);
257  MachineBasicBlock::iterator
258  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
259                    MachineBasicBlock::iterator InsertBefore);
260  MachineBasicBlock::iterator
261  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
262                     MachineBasicBlock::iterator InsertBefore);
263
264  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
265                           int32_t NewOffset) const;
266  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
267  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
268  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
269  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
273  bool promoteConstantOffsetToImm(MachineInstr &CI,
274                                  MemInfoMap &Visited,
275                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(
      const CombineInfo &CI,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;
278
279  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
280      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
281      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
282      std::list<std::list<CombineInfo>> &MergeableInsts) const;
283
284  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
285                                                     const CombineInfo &Paired);
286
287  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
288                                          const CombineInfo &Paired);
289
290public:
291  static char ID;
292
293  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
294    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
295  }
296
297  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
298                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);
300
301  bool runOnMachineFunction(MachineFunction &MF) override;
302
303  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
304
305  void getAnalysisUsage(AnalysisUsage &AU) const override {
306    AU.setPreservesCFG();
307    AU.addRequired<AAResultsWrapperPass>();
308
309    MachineFunctionPass::getAnalysisUsage(AU);
310  }
311
312  MachineFunctionProperties getRequiredProperties() const override {
313    return MachineFunctionProperties()
314      .set(MachineFunctionProperties::Property::IsSSA);
315  }
316};
317
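// Return the access width of \p MI in elements (dwords for most opcodes,
// dmask components for image instructions), as recorded in CombineInfo::Width.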
318static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
319  const unsigned Opc = MI.getOpcode();
320
321  if (TII.isMUBUF(Opc)) {
322    // FIXME: Handle d16 correctly
323    return AMDGPU::getMUBUFElements(Opc);
324  }
325  if (TII.isImage(MI)) {
326    uint64_t DMaskImm =
327        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
328    return llvm::popcount(DMaskImm);
329  }
330  if (TII.isMTBUF(Opc)) {
331    return AMDGPU::getMTBUFElements(Opc);
332  }
333
334  switch (Opc) {
335  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
336  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
337  case AMDGPU::S_LOAD_DWORD_IMM:
338  case AMDGPU::GLOBAL_LOAD_DWORD:
339  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
340  case AMDGPU::GLOBAL_STORE_DWORD:
341  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
342  case AMDGPU::FLAT_LOAD_DWORD:
343  case AMDGPU::FLAT_STORE_DWORD:
344    return 1;
345  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
346  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347  case AMDGPU::S_LOAD_DWORDX2_IMM:
348  case AMDGPU::GLOBAL_LOAD_DWORDX2:
349  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350  case AMDGPU::GLOBAL_STORE_DWORDX2:
351  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352  case AMDGPU::FLAT_LOAD_DWORDX2:
353  case AMDGPU::FLAT_STORE_DWORDX2:
354    return 2;
355  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
356  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
357  case AMDGPU::S_LOAD_DWORDX3_IMM:
358  case AMDGPU::GLOBAL_LOAD_DWORDX3:
359  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
360  case AMDGPU::GLOBAL_STORE_DWORDX3:
361  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
362  case AMDGPU::FLAT_LOAD_DWORDX3:
363  case AMDGPU::FLAT_STORE_DWORDX3:
364    return 3;
365  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
366  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
367  case AMDGPU::S_LOAD_DWORDX4_IMM:
368  case AMDGPU::GLOBAL_LOAD_DWORDX4:
369  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
370  case AMDGPU::GLOBAL_STORE_DWORDX4:
371  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
372  case AMDGPU::FLAT_LOAD_DWORDX4:
373  case AMDGPU::FLAT_STORE_DWORDX4:
374    return 4;
375  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
376  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
377  case AMDGPU::S_LOAD_DWORDX8_IMM:
378    return 8;
379  case AMDGPU::DS_READ_B32:      [[fallthrough]];
380  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
381  case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
382  case AMDGPU::DS_WRITE_B32_gfx9:
383    return 1;
384  case AMDGPU::DS_READ_B64:      [[fallthrough]];
385  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
386  case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
387  case AMDGPU::DS_WRITE_B64_gfx9:
388    return 2;
389  default:
390    return 0;
391  }
392}
393
394/// Maps instruction opcode to enum InstClassEnum.
395static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
396  switch (Opc) {
397  default:
398    if (TII.isMUBUF(Opc)) {
399      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
400      default:
401        return UNKNOWN;
402      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
403      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
404      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
405      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
406      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
407      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
408      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
409      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
410        return BUFFER_LOAD;
411      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
412      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
413      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
414      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
415      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
416      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
417      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
418      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
419        return BUFFER_STORE;
420      }
421    }
422    if (TII.isImage(Opc)) {
423      // Ignore instructions encoded without vaddr.
424      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
425          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
426        return UNKNOWN;
427      // Ignore BVH instructions
428      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
429        return UNKNOWN;
430      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
431      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
432          TII.isGather4(Opc))
433        return UNKNOWN;
434      return MIMG;
435    }
436    if (TII.isMTBUF(Opc)) {
437      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
438      default:
439        return UNKNOWN;
440      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
441      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
442      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
443      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
444      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
445      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
446      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
447      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
448      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
449      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
450      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
451      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
452      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
453      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
454      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
455      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
456        return TBUFFER_LOAD;
457      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
458      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
459      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
460      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
461      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
462      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
463      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
464      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
465        return TBUFFER_STORE;
466      }
467    }
468    return UNKNOWN;
469  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
470  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
471  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
472  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
473  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
474    return S_BUFFER_LOAD_IMM;
475  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
476  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
477  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
478  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
479  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
480    return S_BUFFER_LOAD_SGPR_IMM;
481  case AMDGPU::S_LOAD_DWORD_IMM:
482  case AMDGPU::S_LOAD_DWORDX2_IMM:
483  case AMDGPU::S_LOAD_DWORDX3_IMM:
484  case AMDGPU::S_LOAD_DWORDX4_IMM:
485  case AMDGPU::S_LOAD_DWORDX8_IMM:
486    return S_LOAD_IMM;
487  case AMDGPU::DS_READ_B32:
488  case AMDGPU::DS_READ_B32_gfx9:
489  case AMDGPU::DS_READ_B64:
490  case AMDGPU::DS_READ_B64_gfx9:
491    return DS_READ;
492  case AMDGPU::DS_WRITE_B32:
493  case AMDGPU::DS_WRITE_B32_gfx9:
494  case AMDGPU::DS_WRITE_B64:
495  case AMDGPU::DS_WRITE_B64_gfx9:
496    return DS_WRITE;
497  case AMDGPU::GLOBAL_LOAD_DWORD:
498  case AMDGPU::GLOBAL_LOAD_DWORDX2:
499  case AMDGPU::GLOBAL_LOAD_DWORDX3:
500  case AMDGPU::GLOBAL_LOAD_DWORDX4:
501  case AMDGPU::FLAT_LOAD_DWORD:
502  case AMDGPU::FLAT_LOAD_DWORDX2:
503  case AMDGPU::FLAT_LOAD_DWORDX3:
504  case AMDGPU::FLAT_LOAD_DWORDX4:
505    return FLAT_LOAD;
506  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
507  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
508  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
509  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
510    return GLOBAL_LOAD_SADDR;
511  case AMDGPU::GLOBAL_STORE_DWORD:
512  case AMDGPU::GLOBAL_STORE_DWORDX2:
513  case AMDGPU::GLOBAL_STORE_DWORDX3:
514  case AMDGPU::GLOBAL_STORE_DWORDX4:
515  case AMDGPU::FLAT_STORE_DWORD:
516  case AMDGPU::FLAT_STORE_DWORDX2:
517  case AMDGPU::FLAT_STORE_DWORDX3:
518  case AMDGPU::FLAT_STORE_DWORDX4:
519    return FLAT_STORE;
520  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
521  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
522  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
523  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
524    return GLOBAL_STORE_SADDR;
525  }
526}
527
/// Determines instruction subclass from opcode. Only instructions of the same
/// subclass can be merged together. The merged instruction may have a
/// different subclass but must have the same class.
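/// For example, two BUFFER_LOAD_DWORD_OFFEN instructions (class BUFFER_LOAD)
/// may merge into a BUFFER_LOAD_DWORDX2_OFFEN, which has a different subclass
/// but the same class.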
531static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
532  switch (Opc) {
533  default:
534    if (TII.isMUBUF(Opc))
535      return AMDGPU::getMUBUFBaseOpcode(Opc);
536    if (TII.isImage(Opc)) {
537      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
538      assert(Info);
539      return Info->BaseOpcode;
540    }
541    if (TII.isMTBUF(Opc))
542      return AMDGPU::getMTBUFBaseOpcode(Opc);
543    return -1;
544  case AMDGPU::DS_READ_B32:
545  case AMDGPU::DS_READ_B32_gfx9:
546  case AMDGPU::DS_READ_B64:
547  case AMDGPU::DS_READ_B64_gfx9:
548  case AMDGPU::DS_WRITE_B32:
549  case AMDGPU::DS_WRITE_B32_gfx9:
550  case AMDGPU::DS_WRITE_B64:
551  case AMDGPU::DS_WRITE_B64_gfx9:
552    return Opc;
553  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
554  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
555  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
556  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
557  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
558    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
559  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
560  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
561  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
562  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
563  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
564    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
565  case AMDGPU::S_LOAD_DWORD_IMM:
566  case AMDGPU::S_LOAD_DWORDX2_IMM:
567  case AMDGPU::S_LOAD_DWORDX3_IMM:
568  case AMDGPU::S_LOAD_DWORDX4_IMM:
569  case AMDGPU::S_LOAD_DWORDX8_IMM:
570    return AMDGPU::S_LOAD_DWORD_IMM;
571  case AMDGPU::GLOBAL_LOAD_DWORD:
572  case AMDGPU::GLOBAL_LOAD_DWORDX2:
573  case AMDGPU::GLOBAL_LOAD_DWORDX3:
574  case AMDGPU::GLOBAL_LOAD_DWORDX4:
575  case AMDGPU::FLAT_LOAD_DWORD:
576  case AMDGPU::FLAT_LOAD_DWORDX2:
577  case AMDGPU::FLAT_LOAD_DWORDX3:
578  case AMDGPU::FLAT_LOAD_DWORDX4:
579    return AMDGPU::FLAT_LOAD_DWORD;
580  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
581  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
582  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
583  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
584    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
585  case AMDGPU::GLOBAL_STORE_DWORD:
586  case AMDGPU::GLOBAL_STORE_DWORDX2:
587  case AMDGPU::GLOBAL_STORE_DWORDX3:
588  case AMDGPU::GLOBAL_STORE_DWORDX4:
589  case AMDGPU::FLAT_STORE_DWORD:
590  case AMDGPU::FLAT_STORE_DWORDX2:
591  case AMDGPU::FLAT_STORE_DWORDX3:
592  case AMDGPU::FLAT_STORE_DWORDX4:
593    return AMDGPU::FLAT_STORE_DWORD;
594  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
595  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
596  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
597  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
598    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
599  }
600}
601
// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions, return the original class unmodified.
608InstClassEnum
609SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
610                                         const CombineInfo &Paired) {
611  assert(CI.InstClass == Paired.InstClass);
612
613  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
614      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
615    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
616
617  return CI.InstClass;
618}
619
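// Determine which address operands (vaddr, srsrc, soffset, sbase, saddr, addr,
// ssamp) are present for the given opcode.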
620static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
621  AddressRegs Result;
622
623  if (TII.isMUBUF(Opc)) {
624    if (AMDGPU::getMUBUFHasVAddr(Opc))
625      Result.VAddr = true;
626    if (AMDGPU::getMUBUFHasSrsrc(Opc))
627      Result.SRsrc = true;
628    if (AMDGPU::getMUBUFHasSoffset(Opc))
629      Result.SOffset = true;
630
631    return Result;
632  }
633
634  if (TII.isImage(Opc)) {
635    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
636    if (VAddr0Idx >= 0) {
637      int RsrcName =
638          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
639      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
640      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
641    } else {
642      Result.VAddr = true;
643    }
644    Result.SRsrc = true;
645    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
646    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
647      Result.SSamp = true;
648
649    return Result;
650  }
651  if (TII.isMTBUF(Opc)) {
652    if (AMDGPU::getMTBUFHasVAddr(Opc))
653      Result.VAddr = true;
654    if (AMDGPU::getMTBUFHasSrsrc(Opc))
655      Result.SRsrc = true;
656    if (AMDGPU::getMTBUFHasSoffset(Opc))
657      Result.SOffset = true;
658
659    return Result;
660  }
661
662  switch (Opc) {
663  default:
664    return Result;
665  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
666  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
667  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
668  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
669  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
670    Result.SOffset = true;
671    [[fallthrough]];
672  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
673  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
674  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
675  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
676  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
677  case AMDGPU::S_LOAD_DWORD_IMM:
678  case AMDGPU::S_LOAD_DWORDX2_IMM:
679  case AMDGPU::S_LOAD_DWORDX3_IMM:
680  case AMDGPU::S_LOAD_DWORDX4_IMM:
681  case AMDGPU::S_LOAD_DWORDX8_IMM:
682    Result.SBase = true;
683    return Result;
684  case AMDGPU::DS_READ_B32:
685  case AMDGPU::DS_READ_B64:
686  case AMDGPU::DS_READ_B32_gfx9:
687  case AMDGPU::DS_READ_B64_gfx9:
688  case AMDGPU::DS_WRITE_B32:
689  case AMDGPU::DS_WRITE_B64:
690  case AMDGPU::DS_WRITE_B32_gfx9:
691  case AMDGPU::DS_WRITE_B64_gfx9:
692    Result.Addr = true;
693    return Result;
694  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
695  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
696  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
697  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
698  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
699  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
700  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
701  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
702    Result.SAddr = true;
703    [[fallthrough]];
704  case AMDGPU::GLOBAL_LOAD_DWORD:
705  case AMDGPU::GLOBAL_LOAD_DWORDX2:
706  case AMDGPU::GLOBAL_LOAD_DWORDX3:
707  case AMDGPU::GLOBAL_LOAD_DWORDX4:
708  case AMDGPU::GLOBAL_STORE_DWORD:
709  case AMDGPU::GLOBAL_STORE_DWORDX2:
710  case AMDGPU::GLOBAL_STORE_DWORDX3:
711  case AMDGPU::GLOBAL_STORE_DWORDX4:
712  case AMDGPU::FLAT_LOAD_DWORD:
713  case AMDGPU::FLAT_LOAD_DWORDX2:
714  case AMDGPU::FLAT_LOAD_DWORDX3:
715  case AMDGPU::FLAT_LOAD_DWORDX4:
716  case AMDGPU::FLAT_STORE_DWORD:
717  case AMDGPU::FLAT_STORE_DWORDX2:
718  case AMDGPU::FLAT_STORE_DWORDX3:
719  case AMDGPU::FLAT_STORE_DWORDX4:
720    Result.VAddr = true;
721    return Result;
722  }
723}
724
725void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
726                                              const SILoadStoreOptimizer &LSO) {
727  I = MI;
728  unsigned Opc = MI->getOpcode();
729  InstClass = getInstClass(Opc, *LSO.TII);
730
731  if (InstClass == UNKNOWN)
732    return;
733
734  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
735
736  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
747  case S_BUFFER_LOAD_IMM:
748  case S_BUFFER_LOAD_SGPR_IMM:
749  case S_LOAD_IMM:
750    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
751    break;
752  default:
753    EltSize = 4;
754    break;
755  }
756
757  if (InstClass == MIMG) {
758    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
759    // Offset is not considered for MIMG instructions.
760    Offset = 0;
761  } else {
762    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
763    Offset = I->getOperand(OffsetIdx).getImm();
764  }
765
766  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
767    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
768
769  Width = getOpcodeWidth(*I, *LSO.TII);
770
771  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
772    Offset &= 0xffff;
773  } else if (InstClass != MIMG) {
774    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
775  }
776
777  AddressRegs Regs = getRegs(Opc, *LSO.TII);
778  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
779
780  NumAddresses = 0;
781  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
782    AddrIdx[NumAddresses++] =
783        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
784  if (Regs.Addr)
785    AddrIdx[NumAddresses++] =
786        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
787  if (Regs.SBase)
788    AddrIdx[NumAddresses++] =
789        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
790  if (Regs.SRsrc)
791    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
792        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
793  if (Regs.SOffset)
794    AddrIdx[NumAddresses++] =
795        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
796  if (Regs.SAddr)
797    AddrIdx[NumAddresses++] =
798        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
799  if (Regs.VAddr)
800    AddrIdx[NumAddresses++] =
801        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
802  if (Regs.SSamp)
803    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
804        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
805  assert(NumAddresses <= MaxAddressRegs);
806
807  for (unsigned J = 0; J < NumAddresses; J++)
808    AddrReg[J] = &I->getOperand(AddrIdx[J]);
809}
810
811} // end anonymous namespace.
812
813INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
814                      "SI Load Store Optimizer", false, false)
815INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
816INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
817                    false, false)
818
819char SILoadStoreOptimizer::ID = 0;
820
821char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
822
823FunctionPass *llvm::createSILoadStoreOptimizerPass() {
824  return new SILoadStoreOptimizer();
825}
826
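// Collect the registers defined and read by \p MI into \p RegDefs and
// \p RegUses respectively.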
827static void addDefsUsesToList(const MachineInstr &MI,
828                              DenseSet<Register> &RegDefs,
829                              DenseSet<Register> &RegUses) {
830  for (const auto &Op : MI.operands()) {
831    if (!Op.isReg())
832      continue;
833    if (Op.isDef())
834      RegDefs.insert(Op.getReg());
835    if (Op.readsReg())
836      RegUses.insert(Op.getReg());
837  }
838}
839
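// Return true if instructions \p A and \p B can be reordered: they must not
// alias as memory operations, and \p B must neither access a register that
// \p A defines nor define a register that \p A reads. \p ARegDefs and
// \p ARegUses hold the defs and uses of \p A.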
840bool SILoadStoreOptimizer::canSwapInstructions(
841    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
842    const MachineInstr &A, const MachineInstr &B) const {
843  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
844      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
845    return false;
846  for (const auto &BOp : B.operands()) {
847    if (!BOp.isReg())
848      continue;
849    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
850      return false;
851    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
852      return false;
853  }
854  return true;
855}
856
// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
859MachineMemOperand *
860SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
861                                               const CombineInfo &Paired) {
862  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
863  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
864
865  unsigned Size = MMOa->getSize() + MMOb->getSize();
866
  // The base pointer for the combined operation is the same as the leading
  // operation's pointer.
869  if (Paired < CI)
870    std::swap(MMOa, MMOb);
871
872  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
873  // If merging FLAT and GLOBAL set address space to FLAT.
874  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
875    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
876
877  MachineFunction *MF = CI.I->getMF();
878  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
879}
880
881bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
882                                               const SIInstrInfo &TII,
883                                               const CombineInfo &Paired) {
884  assert(CI.InstClass == MIMG);
885
886  // Ignore instructions with tfe/lwe set.
887  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
888  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
889
890  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
891    return false;
892
893  // Check other optional immediate operands for equality.
894  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
895                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
896                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};
897
898  for (auto op : OperandsToMatch) {
899    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
900    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
901      return false;
902    if (Idx != -1 &&
903        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
904      return false;
905  }
906
907  // Check DMask for overlaps.
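  // For example, DMask 0x3 (components 0-1) and 0xc (components 2-3) can be
  // combined, but 0x3 and 0x6 cannot since both use component 1.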
908  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
909  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
910
911  if (!MaxMask)
912    return false;
913
914  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
915  if ((1u << AllowedBitsForMin) <= MinMask)
916    return false;
917
918  return true;
919}
920
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
924  if (ComponentCount > 4)
925    return 0;
926
927  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
928      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
929  if (!OldFormatInfo)
930    return 0;
931
932  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
933      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
934                                           ComponentCount,
935                                           OldFormatInfo->NumFormat, STI);
936
937  if (!NewFormatInfo)
938    return 0;
939
940  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
941         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
942
943  return NewFormatInfo->Format;
944}
945
946// Return the value in the inclusive range [Lo,Hi] that is aligned to the
947// highest power of two. Note that the result is well defined for all inputs
948// including corner cases like:
949// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
951// - if Lo > Hi, return 0 (as if the range wrapped around)
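// For example, mostAlignedValueInRange(5, 13) == 8 and
// mostAlignedValueInRange(1, 7) == 4.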
952static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
953  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
954}
955
956bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
957                                                const GCNSubtarget &STI,
958                                                CombineInfo &Paired,
959                                                bool Modify) {
960  assert(CI.InstClass != MIMG);
961
962  // XXX - Would the same offset be OK? Is there any reason this would happen or
963  // be useful?
964  if (CI.Offset == Paired.Offset)
965    return false;
966
967  // This won't be valid if the offset isn't aligned.
968  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
969    return false;
970
971  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
972
973    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
974        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
975    if (!Info0)
976      return false;
977    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
978        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
979    if (!Info1)
980      return false;
981
982    if (Info0->BitsPerComp != Info1->BitsPerComp ||
983        Info0->NumFormat != Info1->NumFormat)
984      return false;
985
986    // TODO: Should be possible to support more formats, but if format loads
987    // are not dword-aligned, the merged load might not be valid.
988    if (Info0->BitsPerComp != 32)
989      return false;
990
    if (!getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI))
992      return false;
993  }
994
995  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
996  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
997  CI.UseST64 = false;
998  CI.BaseOff = 0;
999
1000  // Handle all non-DS instructions.
1001  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
1004      return false;
1005    if (CI.CPol != Paired.CPol)
1006      return false;
1007    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1008        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1009      // Reject cases like:
1010      //   dword + dwordx2 -> dwordx3
1011      //   dword + dwordx3 -> dwordx4
1012      // If we tried to combine these cases, we would fail to extract a subreg
1013      // for the result of the second load due to SGPR alignment requirements.
1014      if (CI.Width != Paired.Width &&
1015          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1016        return false;
1017    }
1018    return true;
1019  }
1020
1021  // If the offset in elements doesn't fit in 8-bits, we might be able to use
1022  // the stride 64 versions.
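  // For example, element offsets 128 and 6400 are both multiples of 64 and
  // become ST64 offsets 2 and 100 below.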
1023  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1024      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1025    if (Modify) {
1026      CI.Offset = EltOffset0 / 64;
1027      Paired.Offset = EltOffset1 / 64;
1028      CI.UseST64 = true;
1029    }
1030    return true;
1031  }
1032
1033  // Check if the new offsets fit in the reduced 8-bit range.
1034  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1035    if (Modify) {
1036      CI.Offset = EltOffset0;
1037      Paired.Offset = EltOffset1;
1038    }
1039    return true;
1040  }
1041
1042  // Try to shift base address to decrease offsets.
1043  uint32_t Min = std::min(EltOffset0, EltOffset1);
1044  uint32_t Max = std::max(EltOffset0, EltOffset1);
1045
1046  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1047  if (((Max - Min) & ~Mask) == 0) {
1048    if (Modify) {
1049      // From the range of values we could use for BaseOff, choose the one that
1050      // is aligned to the highest power of two, to maximise the chance that
1051      // the same offset can be reused for other load/store pairs.
1052      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1053      // Copy the low bits of the offsets, so that when we adjust them by
1054      // subtracting BaseOff they will be multiples of 64.
1055      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1056      CI.BaseOff = BaseOff * CI.EltSize;
1057      CI.Offset = (EltOffset0 - BaseOff) / 64;
1058      Paired.Offset = (EltOffset1 - BaseOff) / 64;
1059      CI.UseST64 = true;
1060    }
1061    return true;
1062  }
1063
1064  if (isUInt<8>(Max - Min)) {
1065    if (Modify) {
1066      // From the range of values we could use for BaseOff, choose the one that
1067      // is aligned to the highest power of two, to maximise the chance that
1068      // the same offset can be reused for other load/store pairs.
1069      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1070      CI.BaseOff = BaseOff * CI.EltSize;
1071      CI.Offset = EltOffset0 - BaseOff;
1072      Paired.Offset = EltOffset1 - BaseOff;
1073    }
1074    return true;
1075  }
1076
1077  return false;
1078}
1079
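// Return true if the combined width (in dwords) can be encoded by a single
// merged opcode on this subtarget.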
1080bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1081                                     const CombineInfo &CI,
1082                                     const CombineInfo &Paired) {
1083  const unsigned Width = (CI.Width + Paired.Width);
1084  switch (CI.InstClass) {
1085  default:
1086    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1087  case S_BUFFER_LOAD_IMM:
1088  case S_BUFFER_LOAD_SGPR_IMM:
1089  case S_LOAD_IMM:
1090    switch (Width) {
1091    default:
1092      return false;
1093    case 2:
1094    case 4:
1095    case 8:
1096      return true;
1097    case 3:
1098      return STM.hasScalarDwordx3Loads();
1099    }
1100  }
1101}
1102
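// Return the register class of the data operand (vdst/vdata for VMEM, data0
// for DS writes, sdst/sdata for SMEM), or nullptr if there is none.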
1103const TargetRegisterClass *
1104SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1105  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1106    return TRI->getRegClassForReg(*MRI, Dst->getReg());
1107  }
1108  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1109    return TRI->getRegClassForReg(*MRI, Src->getReg());
1110  }
1111  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1112    return TRI->getRegClassForReg(*MRI, Src->getReg());
1113  }
1114  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1115    return TRI->getRegClassForReg(*MRI, Dst->getReg());
1116  }
1117  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1118    return TRI->getRegClassForReg(*MRI, Src->getReg());
1119  }
1120  return nullptr;
1121}
1122
1123/// This function assumes that CI comes before Paired in a basic block. Return
1124/// an insertion point for the merged instruction or nullptr on failure.
1125SILoadStoreOptimizer::CombineInfo *
1126SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1127                                           CombineInfo &Paired) {
1128  // If another instruction has already been merged into CI, it may now be a
1129  // type that we can't do any further merging into.
1130  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1131    return nullptr;
1132  assert(CI.InstClass == Paired.InstClass);
1133
1134  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1135      getInstSubclass(Paired.I->getOpcode(), *TII))
1136    return nullptr;
1137
1138  // Check both offsets (or masks for MIMG) can be combined and fit in the
1139  // reduced range.
1140  if (CI.InstClass == MIMG) {
1141    if (!dmasksCanBeCombined(CI, *TII, Paired))
1142      return nullptr;
1143  } else {
1144    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1145      return nullptr;
1146  }
1147
1148  DenseSet<Register> RegDefs;
1149  DenseSet<Register> RegUses;
1150  CombineInfo *Where;
1151  if (CI.I->mayLoad()) {
1152    // Try to hoist Paired up to CI.
1153    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1154    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1155      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1156        return nullptr;
1157    }
1158    Where = &CI;
1159  } else {
1160    // Try to sink CI down to Paired.
1161    addDefsUsesToList(*CI.I, RegDefs, RegUses);
1162    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1163      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1164        return nullptr;
1165    }
1166    Where = &Paired;
1167  }
1168
  // Call offsetsCanBeCombined with Modify = true so that the offsets are
  // correct for the new instruction. This should return true, because this
  // function should only be called on CombineInfo objects that have already
  // been confirmed to be mergeable.
1173  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1174    offsetsCanBeCombined(CI, *STM, Paired, true);
1175  return Where;
1176}
1177
1178unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1179  if (STM->ldsRequiresM0Init())
1180    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1181  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1182}
1183
1184unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1185  if (STM->ldsRequiresM0Init())
1186    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1187
1188  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1189                        : AMDGPU::DS_READ2ST64_B64_gfx9;
1190}
1191
1192MachineBasicBlock::iterator
1193SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1194                                     MachineBasicBlock::iterator InsertBefore) {
1195  MachineBasicBlock *MBB = CI.I->getParent();
1196
1197  // Be careful, since the addresses could be subregisters themselves in weird
1198  // cases, like vectors of pointers.
1199  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1200
1201  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1202  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1203
1204  unsigned NewOffset0 = CI.Offset;
1205  unsigned NewOffset1 = Paired.Offset;
1206  unsigned Opc =
1207      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1208
1209  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1210  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1211
1212  if (NewOffset0 > NewOffset1) {
1213    // Canonicalize the merged instruction so the smaller offset comes first.
1214    std::swap(NewOffset0, NewOffset1);
1215    std::swap(SubRegIdx0, SubRegIdx1);
1216  }
1217
1218  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1219         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1220
1221  const MCInstrDesc &Read2Desc = TII->get(Opc);
1222
1223  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1224  Register DestReg = MRI->createVirtualRegister(SuperRC);
1225
1226  DebugLoc DL = CI.I->getDebugLoc();
1227
1228  Register BaseReg = AddrReg->getReg();
1229  unsigned BaseSubReg = AddrReg->getSubReg();
1230  unsigned BaseRegFlags = 0;
1231  if (CI.BaseOff) {
1232    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1233    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1234        .addImm(CI.BaseOff);
1235
1236    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1237    BaseRegFlags = RegState::Kill;
1238
1239    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1240        .addReg(ImmReg)
1241        .addReg(AddrReg->getReg(), 0, BaseSubReg)
1242        .addImm(0); // clamp bit
1243    BaseSubReg = 0;
1244  }
1245
1246  MachineInstrBuilder Read2 =
1247      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1248          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1249          .addImm(NewOffset0)                        // offset0
1250          .addImm(NewOffset1)                        // offset1
1251          .addImm(0)                                 // gds
1252          .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1253
1254  (void)Read2;
1255
1256  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1257
1258  // Copy to the old destination registers.
1259  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1260      .add(*Dest0) // Copy to same destination including flags and sub reg.
1261      .addReg(DestReg, 0, SubRegIdx0);
1262  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1263      .add(*Dest1)
1264      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1265
1266  CI.I->eraseFromParent();
1267  Paired.I->eraseFromParent();
1268
1269  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1270  return Read2;
1271}
1272
1273unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1274  if (STM->ldsRequiresM0Init())
1275    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1276  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1277                        : AMDGPU::DS_WRITE2_B64_gfx9;
1278}
1279
1280unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1281  if (STM->ldsRequiresM0Init())
1282    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1283                          : AMDGPU::DS_WRITE2ST64_B64;
1284
1285  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1286                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1287}
1288
1289MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1290    CombineInfo &CI, CombineInfo &Paired,
1291    MachineBasicBlock::iterator InsertBefore) {
1292  MachineBasicBlock *MBB = CI.I->getParent();
1293
  // Be sure to use .add() and not .addReg() with these. We want to be sure we
  // preserve the subregister index and any register flags set on them.
1296  const MachineOperand *AddrReg =
1297      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1298  const MachineOperand *Data0 =
1299      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1300  const MachineOperand *Data1 =
1301      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1302
1303  unsigned NewOffset0 = CI.Offset;
1304  unsigned NewOffset1 = Paired.Offset;
1305  unsigned Opc =
1306      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1307
1308  if (NewOffset0 > NewOffset1) {
1309    // Canonicalize the merged instruction so the smaller offset comes first.
1310    std::swap(NewOffset0, NewOffset1);
1311    std::swap(Data0, Data1);
1312  }
1313
1314  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1315         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1316
1317  const MCInstrDesc &Write2Desc = TII->get(Opc);
1318  DebugLoc DL = CI.I->getDebugLoc();
1319
1320  Register BaseReg = AddrReg->getReg();
1321  unsigned BaseSubReg = AddrReg->getSubReg();
1322  unsigned BaseRegFlags = 0;
1323  if (CI.BaseOff) {
1324    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1325    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1326        .addImm(CI.BaseOff);
1327
1328    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1329    BaseRegFlags = RegState::Kill;
1330
1331    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1332        .addReg(ImmReg)
1333        .addReg(AddrReg->getReg(), 0, BaseSubReg)
1334        .addImm(0); // clamp bit
1335    BaseSubReg = 0;
1336  }
1337
1338  MachineInstrBuilder Write2 =
1339      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1340          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1341          .add(*Data0)                               // data0
1342          .add(*Data1)                               // data1
1343          .addImm(NewOffset0)                        // offset0
1344          .addImm(NewOffset1)                        // offset1
1345          .addImm(0)                                 // gds
1346          .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1347
1348  CI.I->eraseFromParent();
1349  Paired.I->eraseFromParent();
1350
1351  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1352  return Write2;
1353}
1354
1355MachineBasicBlock::iterator
1356SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1357                                     MachineBasicBlock::iterator InsertBefore) {
1358  MachineBasicBlock *MBB = CI.I->getParent();
1359  DebugLoc DL = CI.I->getDebugLoc();
1360  const unsigned Opcode = getNewOpcode(CI, Paired);
1361
1362  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1363
1364  Register DestReg = MRI->createVirtualRegister(SuperRC);
1365  unsigned MergedDMask = CI.DMask | Paired.DMask;
1366  unsigned DMaskIdx =
1367      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1368
1369  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1370  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1371    if (I == DMaskIdx)
1372      MIB.addImm(MergedDMask);
1373    else
1374      MIB.add((*CI.I).getOperand(I));
1375  }
1376
1377  // It shouldn't be possible to get this far if the two instructions
1378  // don't have a single memoperand, because MachineInstr::mayAlias()
1379  // will return true if this is the case.
1380  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1381
1382  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1383
1384  unsigned SubRegIdx0, SubRegIdx1;
1385  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1386
1387  // Copy to the old destination registers.
1388  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1389  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1390  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1391
1392  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1393      .add(*Dest0) // Copy to same destination including flags and sub reg.
1394      .addReg(DestReg, 0, SubRegIdx0);
1395  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1396      .add(*Dest1)
1397      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1398
1399  CI.I->eraseFromParent();
1400  Paired.I->eraseFromParent();
1401  return New;
1402}
1403
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

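// Merge two MUBUF buffer loads that share the same resource and base into one
// wider load, then copy the halves of the merged result back to the original
// vdata registers.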
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
        .addImm(MergedOffset) // offset
        .addImm(CI.CPol)      // cpol
        .addImm(0)            // swz
        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

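// Same as mergeBufferLoadPair, but for MTBUF (typed buffer) loads: the merged
// instruction also carries a buffer format recomputed for the combined
// component count.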
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

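// Merge two MTBUF stores: the two vdata sources are combined into one wider
// register with a REG_SEQUENCE, and a single store with the joined format is
// emitted at the lower of the two offsets.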
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

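// Merge two FLAT/GLOBAL loads into a single wider load and copy the halves of
// the result back to the original vdst registers. Illustrative sketch only
// (operand lists simplified), e.g. for two adjacent dword loads:
//   %a:vgpr_32 = GLOBAL_LOAD_DWORD %vaddr, 0, 0
//   %b:vgpr_32 = GLOBAL_LOAD_DWORD %vaddr, 4, 0
// becomes
//   %ab:vreg_64 = GLOBAL_LOAD_DWORDX2 %vaddr, 0, 0
//   %a:vgpr_32 = COPY %ab.sub0
//   %b:vgpr_32 = COPY %ab.sub1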
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
       .addImm(std::min(CI.Offset, Paired.Offset))
       .addImm(CI.CPol)
       .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

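// Merge two FLAT/GLOBAL stores: build the combined source with a REG_SEQUENCE
// and emit one wider store at the lower of the two offsets.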
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
    MIB.addImm(std::min(CI.Offset, Paired.Offset))
       .addImm(CI.CPol)
       .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

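// Return the opcode of the wider instruction that replaces the CI/Paired pair,
// based on their common instruction class and the combined width in dwords.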
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3:
      return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case S_BUFFER_LOAD_SGPR_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3:
      return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3:
      return AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:
      return AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

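// Return the sub-register indices that the two original results (or sources)
// occupy within the merged register. The access that comes first in the
// CombineInfo ordering takes the low sub-registers; the other access follows
// it.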
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
       AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
       AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
       AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
       AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::pair(Idx0, Idx1);
}

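// Pick the register class for the merged value: an SGPR class sized by the
// combined dword count for scalar loads, otherwise an AGPR or VGPR class wide
// enough for the combined access, matching the class of the original data
// register.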
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 3:
      return &AMDGPU::SGPR_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

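// Merge two MUBUF buffer stores: combine the two vdata sources with a
// REG_SEQUENCE and emit a single wider store at the lower offset.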
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
        .addImm(std::min(CI.Offset, Paired.Offset)) // offset
        .addImm(CI.CPol)                            // cpol
        .addImm(0)                                  // swz
        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

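// Return Val as an immediate operand if it is a legal inline constant,
// otherwise materialize it with an S_MOV_B32 and return the resulting
// register operand.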
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), Reg)
      .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
    createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
      .addReg(CarryReg, RegState::Define)
      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
      .add(OffsetLo)
      .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
      .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
      .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
      .add(OffsetHi)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32-bit base registers, subregisters
//  - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

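// Try to fold part of the constant offset feeding MI's 64-bit address
// computation into the instruction's immediate offset field. The base of a
// nearby "anchor" instruction with the same base registers is reused so that
// several accesses can share one recomputed base. Returns true if MI was
// rewritten.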
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (that has the
  // same base registers) with the highest 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr,       -4096)
  //   load2 = load(addr,       -2048)
  //   load3 = load(addr,       0)
  //   load4 = load(addr,       2048)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
    static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               <<  AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

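// Append CI to the list whose entries share its instruction class, AGPR-ness
// and base address; start a new list if no such list exists yet.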
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

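// Scan [Begin, End) and group instructions we know how to merge into lists,
// one list per base address, attempting constant-offset promotion along the
// way. The scan stops early at instructions that act as barriers. Returns the
// iterator where scanning stopped and whether anything was modified.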
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we create such an instruction the verifier
      //        will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of Mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo>> &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

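// Walk an offset-sorted list of CombineInfos and merge adjacent pairs with the
// class-specific merge routines. OptimizeListAgain is set when a merged result
// is still narrower than the widest available opcode and may be merged again.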
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I);
       Next != MergeList.end(); Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}
