1//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8/// \file 9/// 10/// This file implements the bottleneck analysis view. 11/// 12/// This view internally observes backend pressure increase events in order to 13/// identify problematic data dependencies and processor resource interferences. 14/// 15/// Example of bottleneck analysis report for a dot-product on X86 btver2: 16/// 17/// Cycles with backend pressure increase [ 40.76% ] 18/// Throughput Bottlenecks: 19/// Resource Pressure [ 39.34% ] 20/// - JFPA [ 39.34% ] 21/// - JFPU0 [ 39.34% ] 22/// Data Dependencies: [ 1.42% ] 23/// - Register Dependencies [ 1.42% ] 24/// - Memory Dependencies [ 0.00% ] 25/// 26/// According to the example, backend pressure increased during the 40.76% of 27/// the simulated cycles. In particular, the major cause of backend pressure 28/// increases was the contention on floating point adder JFPA accessible from 29/// pipeline resource JFPU0. 30/// 31/// At the end of each cycle, if pressure on the simulated out-of-order buffers 32/// has increased, a backend pressure event is reported. 33/// In particular, this occurs when there is a delta between the number of uOps 34/// dispatched and the number of uOps issued to the underlying pipelines. 35/// 36/// The bottleneck analysis view is also responsible for identifying and printing 37/// the most "critical" sequence of dependent instructions according to the 38/// simulated run. 39/// 40/// Below is the critical sequence computed for the dot-product example on 41/// btver2: 42/// 43/// Instruction Dependency Information 44/// +----< 2. vhaddps %xmm3, %xmm3, %xmm4 45/// | 46/// | < loop carried > 47/// | 48/// | 0. vmulps %xmm0, %xmm0, %xmm2 49/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] 50/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3 51/// | 52/// | < loop carried > 53/// | 54/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] 55/// 56/// 57/// The algorithm that computes the critical sequence is very similar to a 58/// critical path analysis. 59/// 60/// A dependency graph is used internally to track dependencies between nodes. 61/// Nodes of the graph represent instructions from the input assembly sequence, 62/// and edges of the graph represent data dependencies or processor resource 63/// interferences. 64/// 65/// Edges are dynamically 'discovered' by observing instruction state transitions 66/// and backend pressure increase events. Edges are internally ranked based on 67/// their "criticality". A dependency is considered to be critical if it takes a 68/// long time to execute, and if it contributes to backend pressure increases. 69/// Criticality is internally measured in terms of cycles; it is computed for 70/// every edge in the graph as a function of the edge latency and the number of 71/// backend pressure increase cycles contributed by that edge. 72/// 73/// At the end of simulation, costs are propagated to nodes through the edges of 74/// the graph, and the most expensive path connecting the root-set (a 75/// set of nodes with no predecessors) to a leaf node is reported as critical 76/// sequence. 77// 78//===----------------------------------------------------------------------===// 79 80#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H 81#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H 82 83#include "Views/View.h" 84#include "llvm/ADT/DenseMap.h" 85#include "llvm/ADT/SmallVector.h" 86#include "llvm/MC/MCInstPrinter.h" 87#include "llvm/MC/MCSchedule.h" 88#include "llvm/MC/MCSubtargetInfo.h" 89#include "llvm/Support/raw_ostream.h" 90 91namespace llvm { 92namespace mca { 93 94class PressureTracker { 95 const MCSchedModel &SM; 96 97 // Resource pressure distribution. There is an element for every processor 98 // resource declared by the scheduling model. Quantities are number of cycles. 99 SmallVector<unsigned, 4> ResourcePressureDistribution; 100 101 // Each processor resource is associated with a so-called processor resource 102 // mask. This vector allows to correlate processor resource IDs with processor 103 // resource masks. There is exactly one element per each processor resource 104 // declared by the scheduling model. 105 SmallVector<uint64_t, 4> ProcResID2Mask; 106 107 // Maps processor resource state indices (returned by calls to 108 // `getResourceStateIndex(Mask)` to processor resource identifiers. 109 SmallVector<unsigned, 4> ResIdx2ProcResID; 110 111 // Maps Processor Resource identifiers to ResourceUsers indices. 112 SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex; 113 114 // Identifies the last user of a processor resource unit. 115 // This vector is updated on every instruction issued event. 116 // There is one entry for every processor resource unit declared by the 117 // processor model. An all_ones value is treated like an invalid instruction 118 // identifier. 119 using User = std::pair<unsigned, unsigned>; 120 SmallVector<User, 4> ResourceUsers; 121 122 struct InstructionPressureInfo { 123 unsigned RegisterPressureCycles; 124 unsigned MemoryPressureCycles; 125 unsigned ResourcePressureCycles; 126 }; 127 DenseMap<unsigned, InstructionPressureInfo> IPI; 128 129 void updateResourcePressureDistribution(uint64_t CumulativeMask); 130 131 User getResourceUser(unsigned ProcResID, unsigned UnitID) const { 132 unsigned Index = ProcResID2ResourceUsersIndex[ProcResID]; 133 return ResourceUsers[Index + UnitID]; 134 } 135 136public: 137 PressureTracker(const MCSchedModel &Model); 138 139 ArrayRef<unsigned> getResourcePressureDistribution() const { 140 return ResourcePressureDistribution; 141 } 142 143 void getResourceUsers(uint64_t ResourceMask, 144 SmallVectorImpl<User> &Users) const; 145 146 unsigned getRegisterPressureCycles(unsigned IID) const { 147 assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!"); 148 const InstructionPressureInfo &Info = IPI.find(IID)->second; 149 return Info.RegisterPressureCycles; 150 } 151 152 unsigned getMemoryPressureCycles(unsigned IID) const { 153 assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!"); 154 const InstructionPressureInfo &Info = IPI.find(IID)->second; 155 return Info.MemoryPressureCycles; 156 } 157 158 unsigned getResourcePressureCycles(unsigned IID) const { 159 assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!"); 160 const InstructionPressureInfo &Info = IPI.find(IID)->second; 161 return Info.ResourcePressureCycles; 162 } 163 164 const char *resolveResourceName(uint64_t ResourceMask) const { 165 unsigned Index = getResourceStateIndex(ResourceMask); 166 unsigned ProcResID = ResIdx2ProcResID[Index]; 167 const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID); 168 return PRDesc.Name; 169 } 170 171 void onInstructionDispatched(unsigned IID); 172 void onInstructionExecuted(unsigned IID); 173 174 void handlePressureEvent(const HWPressureEvent &Event); 175 void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event); 176}; 177 178// A dependency edge. 179struct DependencyEdge { 180 enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE }; 181 182 // Dependency edge descriptor. 183 // 184 // It specifies the dependency type, as well as the edge cost in cycles. 185 struct Dependency { 186 DependencyType Type; 187 uint64_t ResourceOrRegID; 188 uint64_t Cost; 189 }; 190 Dependency Dep; 191 192 unsigned FromIID; 193 unsigned ToIID; 194 195 // Used by the bottleneck analysis to compute the interference 196 // probability for processor resources. 197 unsigned Frequency; 198}; 199 200// A dependency graph used by the bottleneck analysis to describe data 201// dependencies and processor resource interferences between instructions. 202// 203// There is a node (an instance of struct DGNode) for every instruction in the 204// input assembly sequence. Edges of the graph represent dependencies between 205// instructions. 206// 207// Each edge of the graph is associated with a cost value which is used 208// internally to rank dependency based on their impact on the runtime 209// performance (see field DependencyEdge::Dependency::Cost). In general, the 210// higher the cost of an edge, the higher the impact on performance. 211// 212// The cost of a dependency is a function of both the latency and the number of 213// cycles where the dependency has been seen as critical (i.e. contributing to 214// back-pressure increases). 215// 216// Loop carried dependencies are carefully expanded by the bottleneck analysis 217// to guarantee that the graph stays acyclic. To this end, extra nodes are 218// pre-allocated at construction time to describe instructions from "past and 219// future" iterations. The graph is kept acyclic mainly because it simplifies the 220// complexity of the algorithm that computes the critical sequence. 221class DependencyGraph { 222 struct DGNode { 223 unsigned NumPredecessors; 224 unsigned NumVisitedPredecessors; 225 uint64_t Cost; 226 unsigned Depth; 227 228 DependencyEdge CriticalPredecessor; 229 SmallVector<DependencyEdge, 8> OutgoingEdges; 230 }; 231 SmallVector<DGNode, 16> Nodes; 232 233 DependencyGraph(const DependencyGraph &) = delete; 234 DependencyGraph &operator=(const DependencyGraph &) = delete; 235 236 void addDependency(unsigned From, unsigned To, 237 DependencyEdge::Dependency &&DE); 238 239 void pruneEdges(unsigned Iterations); 240 void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const; 241 void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations); 242 243#ifndef NDEBUG 244 void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE, 245 MCInstPrinter &MCIP) const; 246#endif 247 248public: 249 DependencyGraph(unsigned Size) : Nodes(Size) {} 250 251 void addRegisterDep(unsigned From, unsigned To, unsigned RegID, 252 unsigned Cost) { 253 addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost}); 254 } 255 256 void addMemoryDep(unsigned From, unsigned To, unsigned Cost) { 257 addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost}); 258 } 259 260 void addResourceDep(unsigned From, unsigned To, uint64_t Mask, 261 unsigned Cost) { 262 addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost}); 263 } 264 265 // Called by the bottleneck analysis at the end of simulation to propagate 266 // costs through the edges of the graph, and compute a critical path. 267 void finalizeGraph(unsigned Iterations) { 268 SmallVector<unsigned, 16> RootSet; 269 pruneEdges(Iterations); 270 initializeRootSet(RootSet); 271 propagateThroughEdges(RootSet, Iterations); 272 } 273 274 // Returns a sequence of edges representing the critical sequence based on the 275 // simulated run. It assumes that the graph has already been finalized (i.e. 276 // method `finalizeGraph()` has already been called on this graph). 277 void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const; 278 279#ifndef NDEBUG 280 void dump(raw_ostream &OS, MCInstPrinter &MCIP) const; 281#endif 282}; 283 284/// A view that collects and prints a few performance numbers. 285class BottleneckAnalysis : public View { 286 const MCSubtargetInfo &STI; 287 MCInstPrinter &MCIP; 288 PressureTracker Tracker; 289 DependencyGraph DG; 290 291 ArrayRef<MCInst> Source; 292 unsigned Iterations; 293 unsigned TotalCycles; 294 295 bool PressureIncreasedBecauseOfResources; 296 bool PressureIncreasedBecauseOfRegisterDependencies; 297 bool PressureIncreasedBecauseOfMemoryDependencies; 298 // True if throughput was affected by dispatch stalls. 299 bool SeenStallCycles; 300 301 struct BackPressureInfo { 302 // Cycles where backpressure increased. 303 unsigned PressureIncreaseCycles; 304 // Cycles where backpressure increased because of pipeline pressure. 305 unsigned ResourcePressureCycles; 306 // Cycles where backpressure increased because of data dependencies. 307 unsigned DataDependencyCycles; 308 // Cycles where backpressure increased because of register dependencies. 309 unsigned RegisterDependencyCycles; 310 // Cycles where backpressure increased because of memory dependencies. 311 unsigned MemoryDependencyCycles; 312 }; 313 BackPressureInfo BPI; 314 315 // Used to populate the dependency graph DG. 316 void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy); 317 void addMemoryDep(unsigned From, unsigned To, unsigned Cy); 318 void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy); 319 320 // Prints a bottleneck message to OS. 321 void printBottleneckHints(raw_ostream &OS) const; 322 void printCriticalSequence(raw_ostream &OS) const; 323 324public: 325 BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP, 326 ArrayRef<MCInst> Sequence, unsigned Iterations); 327 328 void onCycleEnd() override; 329 void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; } 330 void onEvent(const HWPressureEvent &Event) override; 331 void onEvent(const HWInstructionEvent &Event) override; 332 333 void printView(raw_ostream &OS) const override; 334 335#ifndef NDEBUG 336 void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); } 337#endif 338}; 339 340} // namespace mca 341} // namespace llvm 342 343#endif 344