1//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the bottleneck analysis view.
11///
12/// This view internally observes backend pressure increase events in order to
13/// identify problematic data dependencies and processor resource interferences.
14///
15/// Example of bottleneck analysis report for a dot-product on X86 btver2:
16///
17/// Cycles with backend pressure increase [ 40.76% ]
18/// Throughput Bottlenecks:
19///   Resource Pressure       [ 39.34% ]
20///   - JFPA  [ 39.34% ]
21///   - JFPU0  [ 39.34% ]
22///   Data Dependencies:      [ 1.42% ]
23///   - Register Dependencies [ 1.42% ]
24///   - Memory Dependencies   [ 0.00% ]
25///
26/// According to the example, backend pressure increased during the 40.76% of
27/// the simulated cycles.  In particular, the major cause of backend pressure
28/// increases was the contention on floating point adder JFPA accessible from
29/// pipeline resource JFPU0.
30///
31/// At the end of each cycle, if pressure on the simulated out-of-order buffers
32/// has increased, a backend pressure event is reported.
33/// In particular, this occurs when there is a delta between the number of uOps
34/// dispatched and the number of uOps issued to the underlying pipelines.
35///
36/// The bottleneck analysis view is also responsible for identifying and printing
37/// the most "critical" sequence of dependent instructions according to the
38/// simulated run.
39///
40/// Below is the critical sequence computed for the dot-product example on
41/// btver2:
42///
43///              Instruction                     Dependency Information
44/// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
45/// |
46/// |    < loop carried >
47/// |
48/// |      0.    vmulps	 %xmm0, %xmm0, %xmm2
49/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
50/// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
51/// |
52/// |    < loop carried >
53/// |
54/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
55///
56///
57/// The algorithm that computes the critical sequence is very similar to a
58/// critical path analysis.
59///
60/// A dependency graph is used internally to track dependencies between nodes.
61/// Nodes of the graph represent instructions from the input assembly sequence,
62/// and edges of the graph represent data dependencies or processor resource
63/// interferences.
64///
65/// Edges are dynamically 'discovered' by observing instruction state transitions
66/// and backend pressure increase events. Edges are internally ranked based on
67/// their "criticality". A dependency is considered to be critical if it takes a
68/// long time to execute, and if it contributes to backend pressure increases.
69/// Criticality is internally measured in terms of cycles; it is computed for
70/// every edge in the graph as a function of the edge latency and the number of
71/// backend pressure increase cycles contributed by that edge.
72///
73/// At the end of simulation, costs are propagated to nodes through the edges of
74/// the graph, and the most expensive path connecting the root-set (a
75/// set of nodes with no predecessors) to a leaf node is reported as critical
76/// sequence.
77//
78//===----------------------------------------------------------------------===//
79
80#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
81#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
82
83#include "Views/View.h"
84#include "llvm/ADT/DenseMap.h"
85#include "llvm/ADT/SmallVector.h"
86#include "llvm/MC/MCInstPrinter.h"
87#include "llvm/MC/MCSchedule.h"
88#include "llvm/MC/MCSubtargetInfo.h"
89#include "llvm/Support/raw_ostream.h"
90
91namespace llvm {
92namespace mca {
93
94class PressureTracker {
95  const MCSchedModel &SM;
96
97  // Resource pressure distribution. There is an element for every processor
98  // resource declared by the scheduling model. Quantities are number of cycles.
99  SmallVector<unsigned, 4> ResourcePressureDistribution;
100
101  // Each processor resource is associated with a so-called processor resource
102  // mask. This vector allows to correlate processor resource IDs with processor
103  // resource masks. There is exactly one element per each processor resource
104  // declared by the scheduling model.
105  SmallVector<uint64_t, 4> ProcResID2Mask;
106
107  // Maps processor resource state indices (returned by calls to
108  // `getResourceStateIndex(Mask)` to processor resource identifiers.
109  SmallVector<unsigned, 4> ResIdx2ProcResID;
110
111  // Maps Processor Resource identifiers to ResourceUsers indices.
112  SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
113
114  // Identifies the last user of a processor resource unit.
115  // This vector is updated on every instruction issued event.
116  // There is one entry for every processor resource unit declared by the
117  // processor model. An all_ones value is treated like an invalid instruction
118  // identifier.
119  using User = std::pair<unsigned, unsigned>;
120  SmallVector<User, 4> ResourceUsers;
121
122  struct InstructionPressureInfo {
123    unsigned RegisterPressureCycles;
124    unsigned MemoryPressureCycles;
125    unsigned ResourcePressureCycles;
126  };
127  DenseMap<unsigned, InstructionPressureInfo> IPI;
128
129  void updateResourcePressureDistribution(uint64_t CumulativeMask);
130
131  User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
132    unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
133    return ResourceUsers[Index + UnitID];
134  }
135
136public:
137  PressureTracker(const MCSchedModel &Model);
138
139  ArrayRef<unsigned> getResourcePressureDistribution() const {
140    return ResourcePressureDistribution;
141  }
142
143  void getResourceUsers(uint64_t ResourceMask,
144                        SmallVectorImpl<User> &Users) const;
145
146  unsigned getRegisterPressureCycles(unsigned IID) const {
147    assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
148    const InstructionPressureInfo &Info = IPI.find(IID)->second;
149    return Info.RegisterPressureCycles;
150  }
151
152  unsigned getMemoryPressureCycles(unsigned IID) const {
153    assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
154    const InstructionPressureInfo &Info = IPI.find(IID)->second;
155    return Info.MemoryPressureCycles;
156  }
157
158  unsigned getResourcePressureCycles(unsigned IID) const {
159    assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
160    const InstructionPressureInfo &Info = IPI.find(IID)->second;
161    return Info.ResourcePressureCycles;
162  }
163
164  const char *resolveResourceName(uint64_t ResourceMask) const {
165    unsigned Index = getResourceStateIndex(ResourceMask);
166    unsigned ProcResID = ResIdx2ProcResID[Index];
167    const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
168    return PRDesc.Name;
169  }
170
171  void onInstructionDispatched(unsigned IID);
172  void onInstructionExecuted(unsigned IID);
173
174  void handlePressureEvent(const HWPressureEvent &Event);
175  void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
176};
177
178// A dependency edge.
179struct DependencyEdge {
180  enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
181
182  // Dependency edge descriptor.
183  //
184  // It specifies the dependency type, as well as the edge cost in cycles.
185  struct Dependency {
186    DependencyType Type;
187    uint64_t ResourceOrRegID;
188    uint64_t Cost;
189  };
190  Dependency Dep;
191
192  unsigned FromIID;
193  unsigned ToIID;
194
195  // Used by the bottleneck analysis to compute the interference
196  // probability for processor resources.
197  unsigned Frequency;
198};
199
200// A dependency graph used by the bottleneck analysis to describe data
201// dependencies and processor resource interferences between instructions.
202//
203// There is a node (an instance of struct DGNode) for every instruction in the
204// input assembly sequence. Edges of the graph represent dependencies between
205// instructions.
206//
207// Each edge of the graph is associated with a cost value which is used
208// internally to rank dependency based on their impact on the runtime
209// performance (see field DependencyEdge::Dependency::Cost). In general, the
210// higher the cost of an edge, the higher the impact on performance.
211//
212// The cost of a dependency is a function of both the latency and the number of
213// cycles where the dependency has been seen as critical (i.e. contributing to
214// back-pressure increases).
215//
216// Loop carried dependencies are carefully expanded by the bottleneck analysis
217// to guarantee that the graph stays acyclic. To this end, extra nodes are
218// pre-allocated at construction time to describe instructions from "past and
219// future" iterations. The graph is kept acyclic mainly because it simplifies the
220// complexity of the algorithm that computes the critical sequence.
221class DependencyGraph {
222  struct DGNode {
223    unsigned NumPredecessors;
224    unsigned NumVisitedPredecessors;
225    uint64_t Cost;
226    unsigned Depth;
227
228    DependencyEdge CriticalPredecessor;
229    SmallVector<DependencyEdge, 8> OutgoingEdges;
230  };
231  SmallVector<DGNode, 16> Nodes;
232
233  DependencyGraph(const DependencyGraph &) = delete;
234  DependencyGraph &operator=(const DependencyGraph &) = delete;
235
236  void addDependency(unsigned From, unsigned To,
237                     DependencyEdge::Dependency &&DE);
238
239  void pruneEdges(unsigned Iterations);
240  void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
241  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
242
243#ifndef NDEBUG
244  void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
245                          MCInstPrinter &MCIP) const;
246#endif
247
248public:
249  DependencyGraph(unsigned Size) : Nodes(Size) {}
250
251  void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
252                      unsigned Cost) {
253    addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
254  }
255
256  void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
257    addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
258  }
259
260  void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
261                      unsigned Cost) {
262    addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
263  }
264
265  // Called by the bottleneck analysis at the end of simulation to propagate
266  // costs through the edges of the graph, and compute a critical path.
267  void finalizeGraph(unsigned Iterations) {
268    SmallVector<unsigned, 16> RootSet;
269    pruneEdges(Iterations);
270    initializeRootSet(RootSet);
271    propagateThroughEdges(RootSet, Iterations);
272  }
273
274  // Returns a sequence of edges representing the critical sequence based on the
275  // simulated run. It assumes that the graph has already been finalized (i.e.
276  // method `finalizeGraph()` has already been called on this graph).
277  void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
278
279#ifndef NDEBUG
280  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
281#endif
282};
283
284/// A view that collects and prints a few performance numbers.
285class BottleneckAnalysis : public View {
286  const MCSubtargetInfo &STI;
287  MCInstPrinter &MCIP;
288  PressureTracker Tracker;
289  DependencyGraph DG;
290
291  ArrayRef<MCInst> Source;
292  unsigned Iterations;
293  unsigned TotalCycles;
294
295  bool PressureIncreasedBecauseOfResources;
296  bool PressureIncreasedBecauseOfRegisterDependencies;
297  bool PressureIncreasedBecauseOfMemoryDependencies;
298  // True if throughput was affected by dispatch stalls.
299  bool SeenStallCycles;
300
301  struct BackPressureInfo {
302    // Cycles where backpressure increased.
303    unsigned PressureIncreaseCycles;
304    // Cycles where backpressure increased because of pipeline pressure.
305    unsigned ResourcePressureCycles;
306    // Cycles where backpressure increased because of data dependencies.
307    unsigned DataDependencyCycles;
308    // Cycles where backpressure increased because of register dependencies.
309    unsigned RegisterDependencyCycles;
310    // Cycles where backpressure increased because of memory dependencies.
311    unsigned MemoryDependencyCycles;
312  };
313  BackPressureInfo BPI;
314
315  // Used to populate the dependency graph DG.
316  void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
317  void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
318  void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
319
320  // Prints a bottleneck message to OS.
321  void printBottleneckHints(raw_ostream &OS) const;
322  void printCriticalSequence(raw_ostream &OS) const;
323
324public:
325  BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
326                     ArrayRef<MCInst> Sequence, unsigned Iterations);
327
328  void onCycleEnd() override;
329  void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
330  void onEvent(const HWPressureEvent &Event) override;
331  void onEvent(const HWInstructionEvent &Event) override;
332
333  void printView(raw_ostream &OS) const override;
334
335#ifndef NDEBUG
336  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
337#endif
338};
339
340} // namespace mca
341} // namespace llvm
342
343#endif
344