1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17//  s_buffer_load_dword s4, s[0:3], 4
18//  s_buffer_load_dword s5, s[0:3], 8
19// ==>
20//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from nearby instructions that
// allows it to have a 13-bit constant offset and then promotes that 13-bit
// offset to the immediate.
26// E.g.
27//  s_movk_i32 s0, 0x1800
28//  v_add_co_u32_e32 v0, vcc, s0, v2
29//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31//  s_movk_i32 s0, 0x1000
32//  v_add_co_u32_e32 v5, vcc, s0, v2
33//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34//  global_load_dwordx2 v[5:6], v[5:6], off
35//  global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37//  s_movk_i32 s0, 0x1000
38//  v_add_co_u32_e32 v5, vcc, s0, v2
39//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40//  global_load_dwordx2 v[5:6], v[5:6], off
41//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46//   the constant into the data register is placed between the stores, although
47//   this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50//   one pair, and recomputes live intervals and moves on to the next pair. It
51//   would be better to compute a list of all merges that need to occur.
52//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together that their differences do
//   fit in 8 bits, we can add to the base pointer and use the new, reduced
//   offsets, as sketched below.
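//   A minimal sketch of that idea (hypothetical byte offsets, b32 elements):
//   four loads at byte offsets 1200, 1204, 1208 and 1212 have element offsets
//   300..303, which do not fit in 8 bits; adding 1200 to the base register
//   once would let all four be encoded with element offsets 0..3.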
57//
58//===----------------------------------------------------------------------===//
59
60#include "AMDGPU.h"
61#include "AMDGPUSubtarget.h"
62#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63#include "SIInstrInfo.h"
64#include "SIRegisterInfo.h"
65#include "Utils/AMDGPUBaseInfo.h"
66#include "llvm/ADT/ArrayRef.h"
67#include "llvm/ADT/SmallVector.h"
68#include "llvm/ADT/StringRef.h"
69#include "llvm/Analysis/AliasAnalysis.h"
70#include "llvm/CodeGen/MachineBasicBlock.h"
71#include "llvm/CodeGen/MachineFunction.h"
72#include "llvm/CodeGen/MachineFunctionPass.h"
73#include "llvm/CodeGen/MachineInstr.h"
74#include "llvm/CodeGen/MachineInstrBuilder.h"
75#include "llvm/CodeGen/MachineOperand.h"
76#include "llvm/CodeGen/MachineRegisterInfo.h"
77#include "llvm/IR/DebugLoc.h"
78#include "llvm/InitializePasses.h"
79#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
81#include "llvm/Support/MathExtras.h"
82#include "llvm/Support/raw_ostream.h"
83#include <algorithm>
84#include <cassert>
85#include <cstdlib>
86#include <iterator>
87#include <utility>
88
89using namespace llvm;
90
91#define DEBUG_TYPE "si-load-store-opt"
92
93namespace {
94enum InstClassEnum {
95  UNKNOWN,
96  DS_READ,
97  DS_WRITE,
98  S_BUFFER_LOAD_IMM,
99  BUFFER_LOAD,
100  BUFFER_STORE,
101  MIMG,
102  TBUFFER_LOAD,
103  TBUFFER_STORE,
104};
105
106struct AddressRegs {
107  unsigned char NumVAddrs = 0;
108  bool SBase = false;
109  bool SRsrc = false;
110  bool SOffset = false;
111  bool VAddr = false;
112  bool Addr = false;
113  bool SSamp = false;
114};
115
116// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
117const unsigned MaxAddressRegs = 12 + 1 + 1;
118
119class SILoadStoreOptimizer : public MachineFunctionPass {
120  struct CombineInfo {
121    MachineBasicBlock::iterator I;
122    unsigned EltSize;
123    unsigned Offset;
124    unsigned Width;
125    unsigned Format;
126    unsigned BaseOff;
127    unsigned DMask;
128    InstClassEnum InstClass;
129    bool GLC;
130    bool SLC;
131    bool DLC;
132    bool UseST64;
133    int AddrIdx[MaxAddressRegs];
134    const MachineOperand *AddrReg[MaxAddressRegs];
135    unsigned NumAddresses;
136    unsigned Order;
137
138    bool hasSameBaseAddress(const MachineInstr &MI) {
139      for (unsigned i = 0; i < NumAddresses; i++) {
140        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
141
142        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
143          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
144              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
145            return false;
146          }
147          continue;
148        }
149
150        // Check same base pointer. Be careful of subregisters, which can occur
151        // with vectors of pointers.
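        // For example, two address operands that use different subregisters
        // of the same vector-of-pointers register are treated as having
        // different base addresses.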
152        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
153            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
155        }
156      }
157      return true;
158    }
159
160    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
161      for (unsigned i = 0; i < NumAddresses; ++i) {
162        const MachineOperand *AddrOp = AddrReg[i];
163        // Immediates are always OK.
164        if (AddrOp->isImm())
165          continue;
166
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
170        if (!AddrOp->isReg())
171          return false;
172
        // TODO: We should be able to merge physical reg addresses.
174        if (Register::isPhysicalRegister(AddrOp->getReg()))
175          return false;
176
        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
179        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
180          return false;
181      }
182      return true;
183    }
184
185    void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
186               const GCNSubtarget &STM);
187  };
188
189  struct BaseRegisters {
190    Register LoReg;
191    Register HiReg;
192
193    unsigned LoSubReg = 0;
194    unsigned HiSubReg = 0;
195  };
196
197  struct MemAddress {
198    BaseRegisters Base;
199    int64_t Offset = 0;
200  };
201
202  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
203
204private:
205  const GCNSubtarget *STM = nullptr;
206  const SIInstrInfo *TII = nullptr;
207  const SIRegisterInfo *TRI = nullptr;
208  MachineRegisterInfo *MRI = nullptr;
209  AliasAnalysis *AA = nullptr;
210  bool OptimizeAgain;
211
212  static bool dmasksCanBeCombined(const CombineInfo &CI,
213                                  const SIInstrInfo &TII,
214                                  const CombineInfo &Paired);
215  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216                                   CombineInfo &Paired, bool Modify = false);
217  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218                        const CombineInfo &Paired);
219  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221                                                     const CombineInfo &Paired);
222  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
223                                                    const CombineInfo &Paired);
224
225  bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
226                            SmallVectorImpl<MachineInstr *> &InstsToMove);
227
228  unsigned read2Opcode(unsigned EltSize) const;
229  unsigned read2ST64Opcode(unsigned EltSize) const;
230  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
231                                             CombineInfo &Paired,
232                  const SmallVectorImpl<MachineInstr *> &InstsToMove);
233
234  unsigned write2Opcode(unsigned EltSize) const;
235  unsigned write2ST64Opcode(unsigned EltSize) const;
236  MachineBasicBlock::iterator
237  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
238                  const SmallVectorImpl<MachineInstr *> &InstsToMove);
239  MachineBasicBlock::iterator
240  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
241                 const SmallVectorImpl<MachineInstr *> &InstsToMove);
242  MachineBasicBlock::iterator
243  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
244                          const SmallVectorImpl<MachineInstr *> &InstsToMove);
245  MachineBasicBlock::iterator
246  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
247                      const SmallVectorImpl<MachineInstr *> &InstsToMove);
248  MachineBasicBlock::iterator
249  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
250                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
251  MachineBasicBlock::iterator
252  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
253                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
254  MachineBasicBlock::iterator
255  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
256                        const SmallVectorImpl<MachineInstr *> &InstsToMove);
257
258  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
259                           int32_t NewOffset) const;
260  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
261  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
262  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
263  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
264  /// Promotes constant offset to the immediate by adjusting the base. It
265  /// tries to use a base from the nearby instructions that allows it to have
266  /// a 13bit constant offset which gets promoted to the immediate.
267  bool promoteConstantOffsetToImm(MachineInstr &CI,
268                                  MemInfoMap &Visited,
269                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
270  void addInstToMergeableList(const CombineInfo &CI,
271                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
272
273  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
274      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
275      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
276      std::list<std::list<CombineInfo>> &MergeableInsts) const;
277
278public:
279  static char ID;
280
281  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
282    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
283  }
284
285  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
286                                     bool &OptimizeListAgain);
287  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
288
289  bool runOnMachineFunction(MachineFunction &MF) override;
290
291  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
292
293  void getAnalysisUsage(AnalysisUsage &AU) const override {
294    AU.setPreservesCFG();
295    AU.addRequired<AAResultsWrapperPass>();
296
297    MachineFunctionPass::getAnalysisUsage(AU);
298  }
299
300  MachineFunctionProperties getRequiredProperties() const override {
301    return MachineFunctionProperties()
302      .set(MachineFunctionProperties::Property::IsSSA);
303  }
304};
305
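// Returns the merge width of \p MI: the number of elements it accesses, or
// the number of enabled dmask components for MIMG. For example, an image
// load with dmask 0b1011 has a width of 3, and BUFFER_LOAD_DWORDX2 opcodes
// have a width of 2.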
306static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
307  const unsigned Opc = MI.getOpcode();
308
309  if (TII.isMUBUF(Opc)) {
310    // FIXME: Handle d16 correctly
311    return AMDGPU::getMUBUFElements(Opc);
312  }
313  if (TII.isMIMG(MI)) {
314    uint64_t DMaskImm =
315        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
316    return countPopulation(DMaskImm);
317  }
318  if (TII.isMTBUF(Opc)) {
319    return AMDGPU::getMTBUFElements(Opc);
320  }
321
322  switch (Opc) {
323  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
324    return 1;
325  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
326    return 2;
327  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
328    return 4;
329  default:
330    return 0;
331  }
332}
333
334/// Maps instruction opcode to enum InstClassEnum.
335static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
336  switch (Opc) {
337  default:
338    if (TII.isMUBUF(Opc)) {
339      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
340      default:
341        return UNKNOWN;
342      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
343      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
344      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
345      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
346        return BUFFER_LOAD;
347      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
348      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
349      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
350      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
351        return BUFFER_STORE;
352      }
353    }
354    if (TII.isMIMG(Opc)) {
355      // Ignore instructions encoded without vaddr.
356      if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
357          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
358        return UNKNOWN;
359      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
360      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
361          TII.isGather4(Opc))
362        return UNKNOWN;
363      return MIMG;
364    }
365    if (TII.isMTBUF(Opc)) {
366      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
367      default:
368        return UNKNOWN;
369      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
370      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
371      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
372      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
373        return TBUFFER_LOAD;
374      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
375      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
376      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
377      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
378        return TBUFFER_STORE;
379      }
380    }
381    return UNKNOWN;
382  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
383  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
384  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
385    return S_BUFFER_LOAD_IMM;
386  case AMDGPU::DS_READ_B32:
387  case AMDGPU::DS_READ_B32_gfx9:
388  case AMDGPU::DS_READ_B64:
389  case AMDGPU::DS_READ_B64_gfx9:
390    return DS_READ;
391  case AMDGPU::DS_WRITE_B32:
392  case AMDGPU::DS_WRITE_B32_gfx9:
393  case AMDGPU::DS_WRITE_B64:
394  case AMDGPU::DS_WRITE_B64_gfx9:
395    return DS_WRITE;
396  }
397}
398
399/// Determines instruction subclass from opcode. Only instructions
400/// of the same subclass can be merged together.
401static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
402  switch (Opc) {
403  default:
404    if (TII.isMUBUF(Opc))
405      return AMDGPU::getMUBUFBaseOpcode(Opc);
406    if (TII.isMIMG(Opc)) {
407      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
408      assert(Info);
409      return Info->BaseOpcode;
410    }
411    if (TII.isMTBUF(Opc))
412      return AMDGPU::getMTBUFBaseOpcode(Opc);
413    return -1;
414  case AMDGPU::DS_READ_B32:
415  case AMDGPU::DS_READ_B32_gfx9:
416  case AMDGPU::DS_READ_B64:
417  case AMDGPU::DS_READ_B64_gfx9:
418  case AMDGPU::DS_WRITE_B32:
419  case AMDGPU::DS_WRITE_B32_gfx9:
420  case AMDGPU::DS_WRITE_B64:
421  case AMDGPU::DS_WRITE_B64_gfx9:
422    return Opc;
423  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
424  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
425  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
426    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
427  }
428}
429
430static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
431  AddressRegs Result;
432
433  if (TII.isMUBUF(Opc)) {
434    if (AMDGPU::getMUBUFHasVAddr(Opc))
435      Result.VAddr = true;
436    if (AMDGPU::getMUBUFHasSrsrc(Opc))
437      Result.SRsrc = true;
438    if (AMDGPU::getMUBUFHasSoffset(Opc))
439      Result.SOffset = true;
440
441    return Result;
442  }
443
444  if (TII.isMIMG(Opc)) {
445    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
446    if (VAddr0Idx >= 0) {
447      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
448      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
449    } else {
450      Result.VAddr = true;
451    }
452    Result.SRsrc = true;
453    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
454    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
455      Result.SSamp = true;
456
457    return Result;
458  }
459  if (TII.isMTBUF(Opc)) {
460    if (AMDGPU::getMTBUFHasVAddr(Opc))
461      Result.VAddr = true;
462    if (AMDGPU::getMTBUFHasSrsrc(Opc))
463      Result.SRsrc = true;
464    if (AMDGPU::getMTBUFHasSoffset(Opc))
465      Result.SOffset = true;
466
467    return Result;
468  }
469
470  switch (Opc) {
471  default:
472    return Result;
473  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
474  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
475  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
476    Result.SBase = true;
477    return Result;
478  case AMDGPU::DS_READ_B32:
479  case AMDGPU::DS_READ_B64:
480  case AMDGPU::DS_READ_B32_gfx9:
481  case AMDGPU::DS_READ_B64_gfx9:
482  case AMDGPU::DS_WRITE_B32:
483  case AMDGPU::DS_WRITE_B64:
484  case AMDGPU::DS_WRITE_B32_gfx9:
485  case AMDGPU::DS_WRITE_B64_gfx9:
486    Result.Addr = true;
487    return Result;
488  }
489}
490
491void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
492                                              const SIInstrInfo &TII,
493                                              const GCNSubtarget &STM) {
494  I = MI;
495  unsigned Opc = MI->getOpcode();
496  InstClass = getInstClass(Opc, TII);
497
498  if (InstClass == UNKNOWN)
499    return;
500
501  switch (InstClass) {
502  case DS_READ:
503   EltSize =
504          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
505                                                                          : 4;
506   break;
507  case DS_WRITE:
508    EltSize =
509          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
510                                                                            : 4;
511    break;
512  case S_BUFFER_LOAD_IMM:
513    EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
514    break;
515  default:
516    EltSize = 4;
517    break;
518  }
519
520  if (InstClass == MIMG) {
521    DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
522    // Offset is not considered for MIMG instructions.
523    Offset = 0;
524  } else {
525    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
526    Offset = I->getOperand(OffsetIdx).getImm();
527  }
528
529  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
530    Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
531
532  Width = getOpcodeWidth(*I, TII);
533
534  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
535    Offset &= 0xffff;
536  } else if (InstClass != MIMG) {
537    GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
538    if (InstClass != S_BUFFER_LOAD_IMM) {
539      SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
540    }
541    DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
542  }
543
544  AddressRegs Regs = getRegs(Opc, TII);
545
546  NumAddresses = 0;
547  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
548    AddrIdx[NumAddresses++] =
549        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
550  if (Regs.Addr)
551    AddrIdx[NumAddresses++] =
552        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
553  if (Regs.SBase)
554    AddrIdx[NumAddresses++] =
555        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
556  if (Regs.SRsrc)
557    AddrIdx[NumAddresses++] =
558        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
559  if (Regs.SOffset)
560    AddrIdx[NumAddresses++] =
561        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
562  if (Regs.VAddr)
563    AddrIdx[NumAddresses++] =
564        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
565  if (Regs.SSamp)
566    AddrIdx[NumAddresses++] =
567        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
568  assert(NumAddresses <= MaxAddressRegs);
569
570  for (unsigned J = 0; J < NumAddresses; J++)
571    AddrReg[J] = &I->getOperand(AddrIdx[J]);
572}
573
574} // end anonymous namespace.
575
576INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
577                      "SI Load Store Optimizer", false, false)
578INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
579INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
580                    false, false)
581
582char SILoadStoreOptimizer::ID = 0;
583
584char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
585
586FunctionPass *llvm::createSILoadStoreOptimizerPass() {
587  return new SILoadStoreOptimizer();
588}
589
590static void moveInstsAfter(MachineBasicBlock::iterator I,
591                           ArrayRef<MachineInstr *> InstsToMove) {
592  MachineBasicBlock *MBB = I->getParent();
593  ++I;
594  for (MachineInstr *MI : InstsToMove) {
595    MI->removeFromParent();
596    MBB->insert(I, MI);
597  }
598}
599
600static void addDefsUsesToList(const MachineInstr &MI,
601                              DenseSet<Register> &RegDefs,
602                              DenseSet<Register> &PhysRegUses) {
603  for (const MachineOperand &Op : MI.operands()) {
604    if (Op.isReg()) {
605      if (Op.isDef())
606        RegDefs.insert(Op.getReg());
607      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
608        PhysRegUses.insert(Op.getReg());
609    }
610  }
611}
612
613static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
614                                      MachineBasicBlock::iterator B,
615                                      AliasAnalysis *AA) {
616  // RAW or WAR - cannot reorder
617  // WAW - cannot reorder
618  // RAR - safe to reorder
619  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
620}
621
622// Add MI and its defs to the lists if MI reads one of the defs that are
623// already in the list. Returns true in that case.
624static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
625                                  DenseSet<Register> &PhysRegUses,
626                                  SmallVectorImpl<MachineInstr *> &Insts) {
627  for (MachineOperand &Use : MI.operands()) {
628    // If one of the defs is read, then there is a use of Def between I and the
629    // instruction that I will potentially be merged with. We will need to move
630    // this instruction after the merged instructions.
631    //
632    // Similarly, if there is a def which is read by an instruction that is to
633    // be moved for merging, then we need to move the def-instruction as well.
634    // This can only happen for physical registers such as M0; virtual
635    // registers are in SSA form.
636    if (Use.isReg() &&
637        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
638         (Use.isDef() && RegDefs.count(Use.getReg())) ||
639         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
640          PhysRegUses.count(Use.getReg())))) {
641      Insts.push_back(&MI);
642      addDefsUsesToList(MI, RegDefs, PhysRegUses);
643      return true;
644    }
645  }
646
647  return false;
648}
649
650static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
651                                    ArrayRef<MachineInstr *> InstsToMove,
652                                    AliasAnalysis *AA) {
653  assert(MemOp.mayLoadOrStore());
654
655  for (MachineInstr *InstToMove : InstsToMove) {
656    if (!InstToMove->mayLoadOrStore())
657      continue;
658    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
659      return false;
660  }
661  return true;
662}
663
// This function assumes that \p A and \p B are identical except for
// size and offset, and that they reference adjacent memory.
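// For example, combining a 4-byte memory operand at offset 16 with a 4-byte
// memory operand at offset 20 yields a single 8-byte memory operand at
// offset 16.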
666static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
667                                                   const MachineMemOperand *A,
668                                                   const MachineMemOperand *B) {
669  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
670  unsigned Size = A->getSize() + B->getSize();
671  // This function adds the offset parameter to the existing offset for A,
672  // so we pass 0 here as the offset and then manually set it to the correct
673  // value after the call.
674  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
675  MMO->setOffset(MinOffset);
676  return MMO;
677}
678
679bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
680                                               const SIInstrInfo &TII,
681                                               const CombineInfo &Paired) {
682  assert(CI.InstClass == MIMG);
683
684  // Ignore instructions with tfe/lwe set.
685  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
686  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
687
688  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
689    return false;
690
691  // Check other optional immediate operands for equality.
692  unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
693                                AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
694                                AMDGPU::OpName::da,  AMDGPU::OpName::r128,
695                                AMDGPU::OpName::a16, AMDGPU::OpName::dlc};
696
697  for (auto op : OperandsToMatch) {
698    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
699    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
700      return false;
701    if (Idx != -1 &&
702        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
703      return false;
704  }
705
706  // Check DMask for overlaps.
707  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
708  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
709
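  // For example, dmasks 0b0011 and 0b1100 can be combined because the smaller
  // mask lies entirely below the lowest set bit of the larger one, whereas
  // dmasks 0b0011 and 0b0110 are rejected because they overlap in bit 1.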
710  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
711  if ((1u << AllowedBitsForMin) <= MinMask)
712    return false;
713
714  return true;
715}
716
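// Returns a buffer format with the same component width and numeric format
// as \p OldFormat but with \p ComponentCount components, or 0 if the
// subtarget has no such format. For example, when merging two
// tbuffer_load_format_x loads of one 32-bit component each, this looks up
// the matching two-component format; if none exists, the merge is abandoned.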
717static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
718                                       unsigned ComponentCount,
719                                       const GCNSubtarget &STI) {
720  if (ComponentCount > 4)
721    return 0;
722
723  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
724      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
725  if (!OldFormatInfo)
726    return 0;
727
728  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
729      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
730                                           ComponentCount,
731                                           OldFormatInfo->NumFormat, STI);
732
733  if (!NewFormatInfo)
734    return 0;
735
736  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
737         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
738
739  return NewFormatInfo->Format;
740}
741
742bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
743                                                const GCNSubtarget &STI,
744                                                CombineInfo &Paired,
745                                                bool Modify) {
746  assert(CI.InstClass != MIMG);
747
748  // XXX - Would the same offset be OK? Is there any reason this would happen or
749  // be useful?
750  if (CI.Offset == Paired.Offset)
751    return false;
752
753  // This won't be valid if the offset isn't aligned.
754  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
755    return false;
756
757  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
758
759    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
760        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
761    if (!Info0)
762      return false;
763    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
764        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
765    if (!Info1)
766      return false;
767
768    if (Info0->BitsPerComp != Info1->BitsPerComp ||
769        Info0->NumFormat != Info1->NumFormat)
770      return false;
771
772    // TODO: Should be possible to support more formats, but if format loads
773    // are not dword-aligned, the merged load might not be valid.
774    if (Info0->BitsPerComp != 32)
775      return false;
776
777    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
778      return false;
779  }
780
781  unsigned EltOffset0 = CI.Offset / CI.EltSize;
782  unsigned EltOffset1 = Paired.Offset / CI.EltSize;
783  CI.UseST64 = false;
784  CI.BaseOff = 0;
785
  // Handle all non-DS instructions.
787  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
788    return (EltOffset0 + CI.Width == EltOffset1 ||
789            EltOffset1 + Paired.Width == EltOffset0) &&
790           CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
791           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
792  }
793
  // Handle DS instructions.
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
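  // For example (b32, EltSize = 4): byte offsets 16384 and 16640 give element
  // offsets 4096 and 4160. Neither fits in 8 bits, but both are multiples of
  // 64, so the ST64 forms can encode them as 64 and 65.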
797  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
798      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
799    if (Modify) {
800      CI.Offset = EltOffset0 / 64;
801      Paired.Offset = EltOffset1 / 64;
802      CI.UseST64 = true;
803    }
804    return true;
805  }
806
807  // Check if the new offsets fit in the reduced 8-bit range.
808  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
809    if (Modify) {
810      CI.Offset = EltOffset0;
811      Paired.Offset = EltOffset1;
812    }
813    return true;
814  }
815
816  // Try to shift base address to decrease offsets.
817  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
818  CI.BaseOff = std::min(CI.Offset, Paired.Offset);
819
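  // For example (b32): byte offsets 1200 and 1280 give element offsets 300
  // and 320, which do not fit in 8 bits, but their difference (20) does.
  // With BaseOff = 1200, the re-encoded offsets become 0 and 20, and the
  // merge functions materialize the base adjustment with an extra add.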
820  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
821    if (Modify) {
822      CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
823      Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
824      CI.UseST64 = true;
825    }
826    return true;
827  }
828
829  if (isUInt<8>(OffsetDiff)) {
830    if (Modify) {
831      CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
832      Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
833    }
834    return true;
835  }
836
837  return false;
838}
839
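// Returns true if the combined width of \p CI and \p Paired can be encoded in
// a single merged instruction. For example, S_BUFFER_LOAD pairs may only be
// widened to 2 or 4 dwords, and a merged width of 3 in the other classes is
// allowed only when the subtarget has dwordx3 loads and stores.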
840bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
841                                     const CombineInfo &CI,
842                                     const CombineInfo &Paired) {
843  const unsigned Width = (CI.Width + Paired.Width);
844  switch (CI.InstClass) {
845  default:
846    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
847  case S_BUFFER_LOAD_IMM:
848    switch (Width) {
849    default:
850      return false;
851    case 2:
852    case 4:
853      return true;
854    }
855  }
856}
857
858/// This function assumes that CI comes before Paired in a basic block.
859bool SILoadStoreOptimizer::checkAndPrepareMerge(
860    CombineInfo &CI, CombineInfo &Paired,
861    SmallVectorImpl<MachineInstr *> &InstsToMove) {
862
  // Check that both offsets (or dmasks for MIMG) can be combined and fit in
  // the reduced range.
865  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
866    return false;
867
868  if (CI.InstClass != MIMG &&
869      (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
870    return false;
871
872  const unsigned Opc = CI.I->getOpcode();
873  const InstClassEnum InstClass = getInstClass(Opc, *TII);
874
875  if (InstClass == UNKNOWN) {
876    return false;
877  }
878  const unsigned InstSubclass = getInstSubclass(Opc, *TII);
879
880  // Do not merge VMEM buffer instructions with "swizzled" bit set.
881  int Swizzled =
882      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
883  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
884    return false;
885
886  DenseSet<Register> RegDefsToMove;
887  DenseSet<Register> PhysRegUsesToMove;
888  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
889
890  MachineBasicBlock::iterator E = std::next(Paired.I);
891  MachineBasicBlock::iterator MBBI = std::next(CI.I);
892  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
893  for (; MBBI != E; ++MBBI) {
894
895    if (MBBI == MBBE) {
896      // CombineInfo::Order is a hint on the instruction ordering within the
897      // basic block. This hint suggests that CI precedes Paired, which is
      // true most of the time. However, moveInstsAfter(), while processing a
      // previous merge list, may have changed this order if it moved an
      // instruction that also exists in some other merge list.
      // In this case it must be dependent, so give up on this pair.
902      return false;
903    }
904
905    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
906        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
907      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
909      // 1. It is safe to move I down past MBBI.
910      // 2. It is safe to move MBBI down past the instruction that I will
911      //    be merged into.
912
913      if (MBBI->hasUnmodeledSideEffects()) {
914        // We can't re-order this instruction with respect to other memory
915        // operations, so we fail both conditions mentioned above.
916        return false;
917      }
918
919      if (MBBI->mayLoadOrStore() &&
920          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
921           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
922        // We fail condition #1, but we may still be able to satisfy condition
923        // #2.  Add this instruction to the move list and then we will check
924        // if condition #2 holds once we have selected the matching instruction.
925        InstsToMove.push_back(&*MBBI);
926        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
927        continue;
928      }
929
      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will
      // need to be moved down as well.
933      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
934                            InstsToMove);
935      continue;
936    }
937
    // Don't merge volatile or ordered (e.g. atomic) memory accesses.
939    if (MBBI->hasOrderedMemoryRef())
940      return false;
941
942    int Swizzled =
943        AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
944    if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
945      return false;
946
947    // Handle a case like
948    //   DS_WRITE_B32 addr, v, idx0
949    //   w = DS_READ_B32 addr, idx0
950    //   DS_WRITE_B32 addr, f(w), idx1
951    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
952    // merging of the two writes.
953    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
954                              InstsToMove))
955      continue;
956
957    if (&*MBBI == &*Paired.I) {
958      // We need to go through the list of instructions that we plan to
959      // move and make sure they are all safe to move down past the merged
960      // instruction.
961      if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
962
963        // Call offsetsCanBeCombined with modify = true so that the offsets are
964        // correct for the new instruction.  This should return true, because
965        // this function should only be called on CombineInfo objects that
966        // have already been confirmed to be mergeable.
967        if (CI.InstClass != MIMG)
968          offsetsCanBeCombined(CI, *STM, Paired, true);
969        return true;
970      }
971      return false;
972    }
973
    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
979    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
980        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
981      break;
982  }
983  return false;
984}
985
986unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
987  if (STM->ldsRequiresM0Init())
988    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
989  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
990}
991
992unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
993  if (STM->ldsRequiresM0Init())
994    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
995
996  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
997                        : AMDGPU::DS_READ2ST64_B64_gfx9;
998}
999
1000MachineBasicBlock::iterator
1001SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1002    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1003  MachineBasicBlock *MBB = CI.I->getParent();
1004
1005  // Be careful, since the addresses could be subregisters themselves in weird
1006  // cases, like vectors of pointers.
1007  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1008
1009  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1010  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1011
1012  unsigned NewOffset0 = CI.Offset;
1013  unsigned NewOffset1 = Paired.Offset;
1014  unsigned Opc =
1015      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1016
1017  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1018  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1019
1020  if (NewOffset0 > NewOffset1) {
1021    // Canonicalize the merged instruction so the smaller offset comes first.
1022    std::swap(NewOffset0, NewOffset1);
1023    std::swap(SubRegIdx0, SubRegIdx1);
1024  }
1025
1026  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1027         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1028
1029  const MCInstrDesc &Read2Desc = TII->get(Opc);
1030
1031  const TargetRegisterClass *SuperRC =
1032      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
1033  Register DestReg = MRI->createVirtualRegister(SuperRC);
1034
1035  DebugLoc DL = CI.I->getDebugLoc();
1036
1037  Register BaseReg = AddrReg->getReg();
1038  unsigned BaseSubReg = AddrReg->getSubReg();
1039  unsigned BaseRegFlags = 0;
1040  if (CI.BaseOff) {
1041    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1042    BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1043        .addImm(CI.BaseOff);
1044
1045    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1046    BaseRegFlags = RegState::Kill;
1047
1048    TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1049        .addReg(ImmReg)
1050        .addReg(AddrReg->getReg(), 0, BaseSubReg)
1051        .addImm(0); // clamp bit
1052    BaseSubReg = 0;
1053  }
1054
1055  MachineInstrBuilder Read2 =
1056      BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
1057          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1058          .addImm(NewOffset0)                        // offset0
1059          .addImm(NewOffset1)                        // offset1
1060          .addImm(0)                                 // gds
1061          .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1062
1063  (void)Read2;
1064
1065  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1066
1067  // Copy to the old destination registers.
1068  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1069      .add(*Dest0) // Copy to same destination including flags and sub reg.
1070      .addReg(DestReg, 0, SubRegIdx0);
1071  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1072                            .add(*Dest1)
1073                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
1074
1075  moveInstsAfter(Copy1, InstsToMove);
1076
1077  CI.I->eraseFromParent();
1078  Paired.I->eraseFromParent();
1079
1080  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1081  return Read2;
1082}
1083
1084unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1085  if (STM->ldsRequiresM0Init())
1086    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1087  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1088                        : AMDGPU::DS_WRITE2_B64_gfx9;
1089}
1090
1091unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1092  if (STM->ldsRequiresM0Init())
1093    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1094                          : AMDGPU::DS_WRITE2ST64_B64;
1095
1096  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1097                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1098}
1099
1100MachineBasicBlock::iterator
1101SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
1102                                      const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1103  MachineBasicBlock *MBB = CI.I->getParent();
1104
  // Be sure to use .add(), and not .addReg(), with these. We want to be sure
  // we preserve the subregister index and any register flags set on them.
1107  const MachineOperand *AddrReg =
1108      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1109  const MachineOperand *Data0 =
1110      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1111  const MachineOperand *Data1 =
1112      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1113
1114  unsigned NewOffset0 = CI.Offset;
1115  unsigned NewOffset1 = Paired.Offset;
1116  unsigned Opc =
1117      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1118
1119  if (NewOffset0 > NewOffset1) {
1120    // Canonicalize the merged instruction so the smaller offset comes first.
1121    std::swap(NewOffset0, NewOffset1);
1122    std::swap(Data0, Data1);
1123  }
1124
1125  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1126         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1127
1128  const MCInstrDesc &Write2Desc = TII->get(Opc);
1129  DebugLoc DL = CI.I->getDebugLoc();
1130
1131  Register BaseReg = AddrReg->getReg();
1132  unsigned BaseSubReg = AddrReg->getSubReg();
1133  unsigned BaseRegFlags = 0;
1134  if (CI.BaseOff) {
1135    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1136    BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1137        .addImm(CI.BaseOff);
1138
1139    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1140    BaseRegFlags = RegState::Kill;
1141
1142    TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1143        .addReg(ImmReg)
1144        .addReg(AddrReg->getReg(), 0, BaseSubReg)
1145        .addImm(0); // clamp bit
1146    BaseSubReg = 0;
1147  }
1148
1149  MachineInstrBuilder Write2 =
1150      BuildMI(*MBB, Paired.I, DL, Write2Desc)
1151          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1152          .add(*Data0)                               // data0
1153          .add(*Data1)                               // data1
1154          .addImm(NewOffset0)                        // offset0
1155          .addImm(NewOffset1)                        // offset1
1156          .addImm(0)                                 // gds
1157          .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1158
1159  moveInstsAfter(Write2, InstsToMove);
1160
1161  CI.I->eraseFromParent();
1162  Paired.I->eraseFromParent();
1163
1164  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1165  return Write2;
1166}
1167
1168MachineBasicBlock::iterator
1169SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1170                           const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1171  MachineBasicBlock *MBB = CI.I->getParent();
1172  DebugLoc DL = CI.I->getDebugLoc();
1173  const unsigned Opcode = getNewOpcode(CI, Paired);
1174
1175  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1176
1177  Register DestReg = MRI->createVirtualRegister(SuperRC);
1178  unsigned MergedDMask = CI.DMask | Paired.DMask;
1179  unsigned DMaskIdx =
1180      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1181
1182  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1183  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1184    if (I == DMaskIdx)
1185      MIB.addImm(MergedDMask);
1186    else
1187      MIB.add((*CI.I).getOperand(I));
1188  }
1189
1190  // It shouldn't be possible to get this far if the two instructions
1191  // don't have a single memoperand, because MachineInstr::mayAlias()
1192  // will return true if this is the case.
1193  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1194
1195  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1196  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1197
1198  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1199
1200  unsigned SubRegIdx0, SubRegIdx1;
1201  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1202
1203  // Copy to the old destination registers.
1204  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1205  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1206  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1207
1208  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1209      .add(*Dest0) // Copy to same destination including flags and sub reg.
1210      .addReg(DestReg, 0, SubRegIdx0);
1211  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1212                            .add(*Dest1)
1213                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
1214
1215  moveInstsAfter(Copy1, InstsToMove);
1216
1217  CI.I->eraseFromParent();
1218  Paired.I->eraseFromParent();
1219  return New;
1220}
1221
1222MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1223    CombineInfo &CI, CombineInfo &Paired,
1224    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1225  MachineBasicBlock *MBB = CI.I->getParent();
1226  DebugLoc DL = CI.I->getDebugLoc();
1227  const unsigned Opcode = getNewOpcode(CI, Paired);
1228
1229  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1230
1231  Register DestReg = MRI->createVirtualRegister(SuperRC);
1232  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1233
1234  // It shouldn't be possible to get this far if the two instructions
1235  // don't have a single memoperand, because MachineInstr::mayAlias()
1236  // will return true if this is the case.
1237  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1238
1239  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1240  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1241
1242  MachineInstr *New =
1243    BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
1244        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1245        .addImm(MergedOffset) // offset
1246        .addImm(CI.GLC)      // glc
1247        .addImm(CI.DLC)      // dlc
1248        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1249
1250  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1251  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1252  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1253
1254  // Copy to the old destination registers.
1255  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1256  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1257  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1258
1259  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1260      .add(*Dest0) // Copy to same destination including flags and sub reg.
1261      .addReg(DestReg, 0, SubRegIdx0);
1262  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1263                            .add(*Dest1)
1264                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
1265
1266  moveInstsAfter(Copy1, InstsToMove);
1267
1268  CI.I->eraseFromParent();
1269  Paired.I->eraseFromParent();
1270  return New;
1271}
1272
1273MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1274    CombineInfo &CI, CombineInfo &Paired,
1275    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1276  MachineBasicBlock *MBB = CI.I->getParent();
1277  DebugLoc DL = CI.I->getDebugLoc();
1278
1279  const unsigned Opcode = getNewOpcode(CI, Paired);
1280
1281  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1282
  // Create the new, merged destination register.
1284  Register DestReg = MRI->createVirtualRegister(SuperRC);
1285  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1286
1287  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1288
1289  AddressRegs Regs = getRegs(Opcode, *TII);
1290
1291  if (Regs.VAddr)
1292    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1293
1294  // It shouldn't be possible to get this far if the two instructions
1295  // don't have a single memoperand, because MachineInstr::mayAlias()
1296  // will return true if this is the case.
1297  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1298
1299  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1300  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1301
1302  MachineInstr *New =
1303    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1304        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1305        .addImm(MergedOffset) // offset
1306        .addImm(CI.GLC)      // glc
1307        .addImm(CI.SLC)      // slc
1308        .addImm(0)            // tfe
1309        .addImm(CI.DLC)      // dlc
1310        .addImm(0)            // swz
1311        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1312
1313  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1314  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1315  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1316
1317  // Copy to the old destination registers.
1318  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1319  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1320  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1321
1322  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1323      .add(*Dest0) // Copy to same destination including flags and sub reg.
1324      .addReg(DestReg, 0, SubRegIdx0);
1325  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1326                            .add(*Dest1)
1327                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
1328
1329  moveInstsAfter(Copy1, InstsToMove);
1330
1331  CI.I->eraseFromParent();
1332  Paired.I->eraseFromParent();
1333  return New;
1334}
1335
1336MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1337    CombineInfo &CI, CombineInfo &Paired,
1338    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1339  MachineBasicBlock *MBB = CI.I->getParent();
1340  DebugLoc DL = CI.I->getDebugLoc();
1341
1342  const unsigned Opcode = getNewOpcode(CI, Paired);
1343
1344  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1345
  // Create the new, merged destination register.
1347  Register DestReg = MRI->createVirtualRegister(SuperRC);
1348  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1349
1350  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1351
1352  AddressRegs Regs = getRegs(Opcode, *TII);
1353
1354  if (Regs.VAddr)
1355    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1356
1357  unsigned JoinedFormat =
1358      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1359
1360  // It shouldn't be possible to get this far if the two instructions
1361  // don't have a single memoperand, because MachineInstr::mayAlias()
1362  // will return true if this is the case.
1363  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1364
1365  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1366  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1367
1368  MachineInstr *New =
1369      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1370          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1371          .addImm(MergedOffset) // offset
1372          .addImm(JoinedFormat) // format
1373          .addImm(CI.GLC)      // glc
1374          .addImm(CI.SLC)      // slc
1375          .addImm(0)            // tfe
1376          .addImm(CI.DLC)      // dlc
1377          .addImm(0)            // swz
1378          .addMemOperand(
1379              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1380
1381  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1382  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1383  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1384
1385  // Copy to the old destination registers.
1386  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1387  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1388  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1389
1390  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1391      .add(*Dest0) // Copy to same destination including flags and sub reg.
1392      .addReg(DestReg, 0, SubRegIdx0);
1393  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1394                            .add(*Dest1)
1395                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
1396
1397  moveInstsAfter(Copy1, InstsToMove);
1398
1399  CI.I->eraseFromParent();
1400  Paired.I->eraseFromParent();
1401  return New;
1402}
1403
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.GLC)                             // glc
          .addImm(CI.SLC)                             // slc
          .addImm(0)                                  // tfe
          .addImm(CI.DLC)                             // dlc
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

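// Return the opcode of the instruction that reads or writes the combined
// width of CI and Paired. Returns 0 when no S_BUFFER_LOAD variant of the
// requested width exists.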
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (CI.InstClass) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  case MIMG:
    assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

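// Return the pair of subregister indices addressing the two original values
// within the merged super-register. The instruction with the lower offset
// (or, for MIMG, the lower dmask) gets the low subregisters. For example,
// merging a one-dword access with a following two-dword access yields
// (AMDGPU::sub0, AMDGPU::sub1_sub2).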
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
    return std::make_pair(0, 0);

  bool ReverseOrder;
  if (CI.InstClass == MIMG) {
    assert((countPopulation(CI.DMask | Paired.DMask) ==
            CI.Width + Paired.Width) &&
           "No overlaps");
    ReverseOrder = CI.DMask > Paired.DMask;
  } else
    ReverseOrder = CI.Offset > Paired.Offset;

  static const unsigned Idxs[4][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
      {AMDGPU::sub3, 0, 0, 0},
  };
  unsigned Idx0;
  unsigned Idx1;

  assert(CI.Width >= 1 && CI.Width <= 3);
  assert(Paired.Width >= 1 && Paired.Width <= 3);

  if (ReverseOrder) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::make_pair(Idx0, Idx1);
}

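// Return the register class of the super-register holding the merged value:
// an SGPR class for S_BUFFER_LOAD_IMM, otherwise a VGPR class sized for the
// combined dword count.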
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  } else {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    }
  }
}

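// Merge two adjacent buffer stores into a single, wider buffer store,
// combining the two source values into a super-register with a REG_SEQUENCE
// and using the smaller of the two offsets for the merged store.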
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
        .addImm(std::min(CI.Offset, Paired.Offset)) // offset
        .addImm(CI.GLC)                             // glc
        .addImm(CI.SLC)                             // slc
        .addImm(0)                                  // tfe
        .addImm(CI.DLC)                             // dlc
        .addImm(0)                                  // swz
        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

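// Return Val as an immediate operand if it is an inline constant; otherwise
// materialize it into a fresh SGPR with S_MOV_B32 and return a register
// operand.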
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), Reg)
    .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
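
  // Emit the 64-bit add as two 32-bit halves: V_ADD_I32 produces the low
  // dword and a carry, V_ADDC_U32 consumes the carry for the high dword, and
  // a REG_SEQUENCE recombines the halves into the new 64-bit base.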
  MachineInstr *LoHalf =
    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
      .addReg(CarryReg, RegState::Define)
      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
      .add(OffsetLo)
      .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
    .add(OffsetHi)
    .addReg(CarryReg, RegState::Kill)
    .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

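// Return the constant behind Op: the immediate itself, or the immediate
// moved by the single S_MOV_B32 that defines the register. Returns None
// otherwise.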
Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

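// Try to fold the constant that was added into MI's address into the
// instruction's immediate offset by reusing the base computation of a nearby
// instruction with the same base registers (the anchor). Returns true if MI
// (and possibly other instructions sharing the anchor's base) was rewritten.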
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the highest
  // 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new
  // base from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and
  // &a + 8192 have a 13-bit distance from &a + 4096. The heuristic picks
  // &a + 8192 as the new base (anchor) because it is the farthest away and
  // can therefore presumably accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr,       -4096)
  //   load2 = load(addr,       -2048)
  //   load3 = load(addr,       0)
  //   load4 = load(addr,       2048)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
    static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               <<  AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

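// Append CI to the list whose entries share its instruction class and base
// address, or start a new list if none matches.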
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

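// Scan [Begin, End), promoting constant offsets where possible and grouping
// mergeable instructions into per-base-address lists. The scan stops early at
// an instruction with an ordered memory reference; the iterator where it
// stopped is returned together with whether anything was modified.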
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile. We also won't be able to merge across this,
    // so break the search. We can look after this barrier for separate
    // merges.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Discard any list with fewer than two entries, since a merge needs
  // at least two instructions, and sort the remaining lists by offset so that
  // instructions which can be merged end up adjacent to each other.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

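// Walk a list of instructions that share a base address and merge each
// adjacent pair that checkAndPrepareMerge accepts. OptimizeListAgain is set
// when a merged result is still narrow enough that another round of merging
// might pair it again.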
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}