//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_LSUNIT_H
#define LLVM_MCA_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At the dispatch stage, instructions are mapped by the LSUnit to
/// MemoryGroups. A memory group identifier is then stored as a "token" in
/// field Instruction::LSUTokenID of each dispatched instruction. That token is
/// used internally by the LSUnit to track memory dependencies.
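///
/// For illustration only, a minimal sketch of how an order dependency between
/// two groups may be wired up (in practice, groups are created and linked by
/// the LSUnit implementations, not by external clients):
///
/// \code
///   MemoryGroup Stores, Loads;
///   Stores.addInstruction();  // One store tracked by the first group.
///   Loads.addInstruction();   // One load tracked by the second group.
///   // Loads must wait for Stores because of an order dependency.
///   Stores.addSuccessor(&Loads, /*IsDataDependent=*/false);
///   assert(Loads.isWaiting() && "Loads should be blocked on Stores!");
/// \endcode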
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // There is no need to add an order dependency if all instructions from
    // this group have already been issued.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }

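  // Informally, a group moves through the following states, derived from the
  // counters above:
  //   Waiting   -> at least one predecessor group has not started executing.
  //   Pending   -> all predecessors have issued; at least one still executes.
  //   Ready     -> all predecessor groups have fully executed.
  //   Executing -> every not-yet-executed instruction of the group has issued.
  //   Executed  -> every instruction of the group has finished executing.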
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction for this group.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted() {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (!isExecuted())
      return;

    // Notify data dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
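  ///
  /// For illustration only, a sketch of the expected protocol (in llvm-mca,
  /// the actual caller is the dispatch stage of the pipeline):
  ///
  /// \code
  ///   // LSU is a concrete LSUnitBase; IR is the dispatched instruction.
  ///   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  ///     unsigned TokenID = LSU.dispatch(IR);
  ///     IR.getInstruction()->setLSUTokenID(TokenID);
  ///   }
  /// \endcode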
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until
  // commitment. By default, we conservatively assume that the LDQ receives a
  // load at dispatch. Loads leave the LDQ at the retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias with store
/// operations. Under this assumption, younger loads are always allowed to
/// pass older stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// the cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means, it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
///  - A younger load cannot pass a load barrier.
///  - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means, all the older loads/stores have
/// already been executed.
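///
/// As an illustrative example (not normative), consider the sequence:
///
/// \code
///   ld1 = load  [A]
///   st1 = store [B]
///   ld2 = load  [C]
/// \endcode
///
/// Rule 3 prevents st1 from passing ld1. Rule 4 allows ld2 to pass st1 only
/// when the unit is constructed with AssumeNoAlias set to true; otherwise ld2
/// is ordered after st1. If st1 were instead a full memory barrier (a mayLoad
/// and mayStore instruction with unmodeled side effects), both ld2 and any
/// younger store would have to wait for it to become the oldest entry in the
/// queues.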
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that is both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that is both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, the rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
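  ///
  /// For example (an illustrative sketch, not taken from the implementation),
  /// dispatching the sequence "load; load; store; load" with 'NoAlias' unset
  /// would place the first two loads in the same memory group (rule 3), the
  /// store in a new group ordered after the load group (rule 4), and the last
  /// load in another new group ordered after the store group (rule 2).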
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_LSUNIT_H