//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
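
// Illustrative note: LLVM_MARK_AS_BITMASK_ENUM makes the usual bitwise
// operators available on SIMemOp, so flags compose directly, e.g.
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE;
//   bool HasLoad = (Op & SIMemOp::LOAD) != SIMemOp::NONE;
// which is the idiom the insertWait() implementations below rely on.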

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}
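
// Usage sketch (assuming a MUBUF/FLAT instruction that carries a "glc"
// operand): the cache-control subclasses below wrap this helper, e.g.
//   enableNamedBit<AMDGPU::OpName::glc>(MI);
// It returns false when the opcode has no such operand, so callers can apply
// it unconditionally.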

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }
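
  // Note: the defaulted arguments above encode the most conservative
  // assumption (sequentially consistent, system scope, all address spaces).
  // The SIMemOpAccess getters below return a default-constructed SIMemOpInfo
  // when an instruction has no memory operands to describe it.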

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is non-temporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to the LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure any caches associated with
  /// address spaces \p AddrSpace for memory scopes up to memory scope
  /// \p Scope are invalidated. Returns true iff any instructions are
  /// inserted.
  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions of kind \p Op
  /// associated with address spaces \p AddrSpace have completed as
  /// observed by other memory instructions executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
  /// ordering is between address spaces. Returns true iff any
  /// instructions are inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Virtual destructor to allow derived classes to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
  bool CuMode = false;

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
    SIGfx7CacheControl(ST), CuMode(CuMode) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if the current function is modified, false
  /// otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  return None;
}

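// A flat access may end up addressing global, LDS or scratch memory, so
// FLAT_ADDRESS conservatively maps to the SIAtomicAddrSpace::FLAT union
// defined above; address spaces this pass does not model map to OTHER.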
SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
                     IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
}

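// Subtarget dispatch: SOUTHERN_ISLANDS (gfx6) uses the base cache control,
// later pre-GFX10 generations (gfx7 through gfx9) use SIGfx7CacheControl, and
// GFX10 gets its own control that also records whether the subtarget runs in
// CU mode (ST.isCuModeEnabled()).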
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L1 cache.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}

bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If there is no cross address space ordering then an LDS waitcnt is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. It is required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If there is no cross address space ordering then a GDS waitcnt is
      // not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. It is required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

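  // Encoding note: encodeWaitcnt() packs the vmcnt/expcnt/lgkmcnt fields into
  // a single S_WAITCNT immediate. A field set to 0 forces a wait until that
  // counter drains; a field set to its full bit mask (getVmcntBitMask /
  // getLgkmcntBitMask) leaves that counter unconstrained. expcnt is always
  // passed its full mask, so it is never waited on here.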
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
                             ? AMDGPU::BUFFER_WBINVL1
                             : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(Flush));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP, so the per-CU L0 needs to be bypassed. In CU mode all
      // waves of a work-group are on the same CU, and so the L0 does not
      // need to be bypassed.
      if (!CuMode) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  Changed |= enableSLCBit(MI);
  /// TODO: For store (non-rmw atomic) instructions also enableGLCBit(MI).

  return Changed;
}

bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                                SIAtomicScope Scope,
                                                SIAtomicAddrSpace AddrSpace,
                                                Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP, so the per-CU L0 needs to be invalidated. In CU mode all
      // waves of a work-group are on the same CU, and so the L0 does not
      // need to be invalidated.
      if (!CuMode) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP, so operations must be waited on to ensure they are
      // visible to waves in the other CU, as the L0 is per CU. In CU mode
      // all waves of a work-group are on the same CU and share the same L0.
      if (!CuMode) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If there is no cross address space ordering then an LDS waitcnt is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. It is required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If there is no cross address space ordering then a GDS waitcnt is
      // not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. It is required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

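  // gfx10 tracks outstanding stores with a separate "vscnt" counter, so store
  // completion is waited on with S_WAITCNT_VSCNT (null SGPR source, immediate
  // 0) rather than being folded into the S_WAITCNT immediate above.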
  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

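// Illustrative expansion, assuming a gfx6 global atomic load with seq_cst
// ordering at agent scope: the load has its glc bit set, an s_waitcnt
// (vmcnt(0), plus lgkmcnt(0) when cross address space ordering is needed) is
// emitted before it, and another s_waitcnt followed by a buffer_wbinvl1 cache
// invalidate is emitted after it. Weaker orderings drop the corresponding
// pieces, as implemented below.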
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding a waitcnt before a S_BARRIER.
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
