1//==- BlockFrequencyInfoImpl.h - Block Frequency Implementation --*- C++ -*-==// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// Shared implementation of BlockFrequency for IR and Machine Instructions. 10// See the documentation below for BlockFrequencyInfoImpl for details. 11// 12//===----------------------------------------------------------------------===// 13 14#ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H 15#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H 16 17#include "llvm/ADT/BitVector.h" 18#include "llvm/ADT/DenseMap.h" 19#include "llvm/ADT/DenseSet.h" 20#include "llvm/ADT/GraphTraits.h" 21#include "llvm/ADT/PostOrderIterator.h" 22#include "llvm/ADT/SmallPtrSet.h" 23#include "llvm/ADT/SmallVector.h" 24#include "llvm/ADT/SparseBitVector.h" 25#include "llvm/ADT/Twine.h" 26#include "llvm/ADT/iterator_range.h" 27#include "llvm/IR/BasicBlock.h" 28#include "llvm/IR/Function.h" 29#include "llvm/IR/ValueHandle.h" 30#include "llvm/Support/BlockFrequency.h" 31#include "llvm/Support/BranchProbability.h" 32#include "llvm/Support/CommandLine.h" 33#include "llvm/Support/DOTGraphTraits.h" 34#include "llvm/Support/Debug.h" 35#include "llvm/Support/Format.h" 36#include "llvm/Support/ScaledNumber.h" 37#include "llvm/Support/raw_ostream.h" 38#include <algorithm> 39#include <cassert> 40#include <cstddef> 41#include <cstdint> 42#include <deque> 43#include <iterator> 44#include <limits> 45#include <list> 46#include <optional> 47#include <queue> 48#include <string> 49#include <utility> 50#include <vector> 51 52#define DEBUG_TYPE "block-freq" 53 54namespace llvm { 55extern llvm::cl::opt<bool> CheckBFIUnknownBlockQueries; 56 57extern llvm::cl::opt<bool> UseIterativeBFIInference; 58extern llvm::cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock; 59extern llvm::cl::opt<double> IterativeBFIPrecision; 60 61class BranchProbabilityInfo; 62class Function; 63class Loop; 64class LoopInfo; 65class MachineBasicBlock; 66class MachineBranchProbabilityInfo; 67class MachineFunction; 68class MachineLoop; 69class MachineLoopInfo; 70 71namespace bfi_detail { 72 73struct IrreducibleGraph; 74 75// This is part of a workaround for a GCC 4.7 crash on lambdas. 76template <class BT> struct BlockEdgesAdder; 77 78/// Mass of a block. 79/// 80/// This class implements a sort of fixed-point fraction always between 0.0 and 81/// 1.0. getMass() == std::numeric_limits<uint64_t>::max() indicates a value of 82/// 1.0. 83/// 84/// Masses can be added and subtracted. Simple saturation arithmetic is used, 85/// so arithmetic operations never overflow or underflow. 86/// 87/// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses 88/// an inexpensive floating-point algorithm that's off-by-one (almost, but not 89/// quite, maximum precision). 90/// 91/// Masses can be scaled by \a BranchProbability at maximum precision. 92class BlockMass { 93 uint64_t Mass = 0; 94 95public: 96 BlockMass() = default; 97 explicit BlockMass(uint64_t Mass) : Mass(Mass) {} 98 99 static BlockMass getEmpty() { return BlockMass(); } 100 101 static BlockMass getFull() { 102 return BlockMass(std::numeric_limits<uint64_t>::max()); 103 } 104 105 uint64_t getMass() const { return Mass; } 106 107 bool isFull() const { return Mass == std::numeric_limits<uint64_t>::max(); } 108 bool isEmpty() const { return !Mass; } 109 110 bool operator!() const { return isEmpty(); } 111 112 /// Add another mass. 113 /// 114 /// Adds another mass, saturating at \a isFull() rather than overflowing. 115 BlockMass &operator+=(BlockMass X) { 116 uint64_t Sum = Mass + X.Mass; 117 Mass = Sum < Mass ? std::numeric_limits<uint64_t>::max() : Sum; 118 return *this; 119 } 120 121 /// Subtract another mass. 122 /// 123 /// Subtracts another mass, saturating at \a isEmpty() rather than 124 /// undeflowing. 125 BlockMass &operator-=(BlockMass X) { 126 uint64_t Diff = Mass - X.Mass; 127 Mass = Diff > Mass ? 0 : Diff; 128 return *this; 129 } 130 131 BlockMass &operator*=(BranchProbability P) { 132 Mass = P.scale(Mass); 133 return *this; 134 } 135 136 bool operator==(BlockMass X) const { return Mass == X.Mass; } 137 bool operator!=(BlockMass X) const { return Mass != X.Mass; } 138 bool operator<=(BlockMass X) const { return Mass <= X.Mass; } 139 bool operator>=(BlockMass X) const { return Mass >= X.Mass; } 140 bool operator<(BlockMass X) const { return Mass < X.Mass; } 141 bool operator>(BlockMass X) const { return Mass > X.Mass; } 142 143 /// Convert to scaled number. 144 /// 145 /// Convert to \a ScaledNumber. \a isFull() gives 1.0, while \a isEmpty() 146 /// gives slightly above 0.0. 147 ScaledNumber<uint64_t> toScaled() const; 148 149 void dump() const; 150 raw_ostream &print(raw_ostream &OS) const; 151}; 152 153inline BlockMass operator+(BlockMass L, BlockMass R) { 154 return BlockMass(L) += R; 155} 156inline BlockMass operator-(BlockMass L, BlockMass R) { 157 return BlockMass(L) -= R; 158} 159inline BlockMass operator*(BlockMass L, BranchProbability R) { 160 return BlockMass(L) *= R; 161} 162inline BlockMass operator*(BranchProbability L, BlockMass R) { 163 return BlockMass(R) *= L; 164} 165 166inline raw_ostream &operator<<(raw_ostream &OS, BlockMass X) { 167 return X.print(OS); 168} 169 170} // end namespace bfi_detail 171 172/// Base class for BlockFrequencyInfoImpl 173/// 174/// BlockFrequencyInfoImplBase has supporting data structures and some 175/// algorithms for BlockFrequencyInfoImplBase. Only algorithms that depend on 176/// the block type (or that call such algorithms) are skipped here. 177/// 178/// Nevertheless, the majority of the overall algorithm documentation lives with 179/// BlockFrequencyInfoImpl. See there for details. 180class BlockFrequencyInfoImplBase { 181public: 182 using Scaled64 = ScaledNumber<uint64_t>; 183 using BlockMass = bfi_detail::BlockMass; 184 185 /// Representative of a block. 186 /// 187 /// This is a simple wrapper around an index into the reverse-post-order 188 /// traversal of the blocks. 189 /// 190 /// Unlike a block pointer, its order has meaning (location in the 191 /// topological sort) and it's class is the same regardless of block type. 192 struct BlockNode { 193 using IndexType = uint32_t; 194 195 IndexType Index; 196 197 BlockNode() : Index(std::numeric_limits<uint32_t>::max()) {} 198 BlockNode(IndexType Index) : Index(Index) {} 199 200 bool operator==(const BlockNode &X) const { return Index == X.Index; } 201 bool operator!=(const BlockNode &X) const { return Index != X.Index; } 202 bool operator<=(const BlockNode &X) const { return Index <= X.Index; } 203 bool operator>=(const BlockNode &X) const { return Index >= X.Index; } 204 bool operator<(const BlockNode &X) const { return Index < X.Index; } 205 bool operator>(const BlockNode &X) const { return Index > X.Index; } 206 207 bool isValid() const { return Index <= getMaxIndex(); } 208 209 static size_t getMaxIndex() { 210 return std::numeric_limits<uint32_t>::max() - 1; 211 } 212 }; 213 214 /// Stats about a block itself. 215 struct FrequencyData { 216 Scaled64 Scaled; 217 uint64_t Integer; 218 }; 219 220 /// Data about a loop. 221 /// 222 /// Contains the data necessary to represent a loop as a pseudo-node once it's 223 /// packaged. 224 struct LoopData { 225 using ExitMap = SmallVector<std::pair<BlockNode, BlockMass>, 4>; 226 using NodeList = SmallVector<BlockNode, 4>; 227 using HeaderMassList = SmallVector<BlockMass, 1>; 228 229 LoopData *Parent; ///< The parent loop. 230 bool IsPackaged = false; ///< Whether this has been packaged. 231 uint32_t NumHeaders = 1; ///< Number of headers. 232 ExitMap Exits; ///< Successor edges (and weights). 233 NodeList Nodes; ///< Header and the members of the loop. 234 HeaderMassList BackedgeMass; ///< Mass returned to each loop header. 235 BlockMass Mass; 236 Scaled64 Scale; 237 238 LoopData(LoopData *Parent, const BlockNode &Header) 239 : Parent(Parent), Nodes(1, Header), BackedgeMass(1) {} 240 241 template <class It1, class It2> 242 LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther, 243 It2 LastOther) 244 : Parent(Parent), Nodes(FirstHeader, LastHeader) { 245 NumHeaders = Nodes.size(); 246 Nodes.insert(Nodes.end(), FirstOther, LastOther); 247 BackedgeMass.resize(NumHeaders); 248 } 249 250 bool isHeader(const BlockNode &Node) const { 251 if (isIrreducible()) 252 return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders, 253 Node); 254 return Node == Nodes[0]; 255 } 256 257 BlockNode getHeader() const { return Nodes[0]; } 258 bool isIrreducible() const { return NumHeaders > 1; } 259 260 HeaderMassList::difference_type getHeaderIndex(const BlockNode &B) { 261 assert(isHeader(B) && "this is only valid on loop header blocks"); 262 if (isIrreducible()) 263 return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) - 264 Nodes.begin(); 265 return 0; 266 } 267 268 NodeList::const_iterator members_begin() const { 269 return Nodes.begin() + NumHeaders; 270 } 271 272 NodeList::const_iterator members_end() const { return Nodes.end(); } 273 iterator_range<NodeList::const_iterator> members() const { 274 return make_range(members_begin(), members_end()); 275 } 276 }; 277 278 /// Index of loop information. 279 struct WorkingData { 280 BlockNode Node; ///< This node. 281 LoopData *Loop = nullptr; ///< The loop this block is inside. 282 BlockMass Mass; ///< Mass distribution from the entry block. 283 284 WorkingData(const BlockNode &Node) : Node(Node) {} 285 286 bool isLoopHeader() const { return Loop && Loop->isHeader(Node); } 287 288 bool isDoubleLoopHeader() const { 289 return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() && 290 Loop->Parent->isHeader(Node); 291 } 292 293 LoopData *getContainingLoop() const { 294 if (!isLoopHeader()) 295 return Loop; 296 if (!isDoubleLoopHeader()) 297 return Loop->Parent; 298 return Loop->Parent->Parent; 299 } 300 301 /// Resolve a node to its representative. 302 /// 303 /// Get the node currently representing Node, which could be a containing 304 /// loop. 305 /// 306 /// This function should only be called when distributing mass. As long as 307 /// there are no irreducible edges to Node, then it will have complexity 308 /// O(1) in this context. 309 /// 310 /// In general, the complexity is O(L), where L is the number of loop 311 /// headers Node has been packaged into. Since this method is called in 312 /// the context of distributing mass, L will be the number of loop headers 313 /// an early exit edge jumps out of. 314 BlockNode getResolvedNode() const { 315 auto *L = getPackagedLoop(); 316 return L ? L->getHeader() : Node; 317 } 318 319 LoopData *getPackagedLoop() const { 320 if (!Loop || !Loop->IsPackaged) 321 return nullptr; 322 auto *L = Loop; 323 while (L->Parent && L->Parent->IsPackaged) 324 L = L->Parent; 325 return L; 326 } 327 328 /// Get the appropriate mass for a node. 329 /// 330 /// Get appropriate mass for Node. If Node is a loop-header (whose loop 331 /// has been packaged), returns the mass of its pseudo-node. If it's a 332 /// node inside a packaged loop, it returns the loop's mass. 333 BlockMass &getMass() { 334 if (!isAPackage()) 335 return Mass; 336 if (!isADoublePackage()) 337 return Loop->Mass; 338 return Loop->Parent->Mass; 339 } 340 341 /// Has ContainingLoop been packaged up? 342 bool isPackaged() const { return getResolvedNode() != Node; } 343 344 /// Has Loop been packaged up? 345 bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; } 346 347 /// Has Loop been packaged up twice? 348 bool isADoublePackage() const { 349 return isDoubleLoopHeader() && Loop->Parent->IsPackaged; 350 } 351 }; 352 353 /// Unscaled probability weight. 354 /// 355 /// Probability weight for an edge in the graph (including the 356 /// successor/target node). 357 /// 358 /// All edges in the original function are 32-bit. However, exit edges from 359 /// loop packages are taken from 64-bit exit masses, so we need 64-bits of 360 /// space in general. 361 /// 362 /// In addition to the raw weight amount, Weight stores the type of the edge 363 /// in the current context (i.e., the context of the loop being processed). 364 /// Is this a local edge within the loop, an exit from the loop, or a 365 /// backedge to the loop header? 366 struct Weight { 367 enum DistType { Local, Exit, Backedge }; 368 DistType Type = Local; 369 BlockNode TargetNode; 370 uint64_t Amount = 0; 371 372 Weight() = default; 373 Weight(DistType Type, BlockNode TargetNode, uint64_t Amount) 374 : Type(Type), TargetNode(TargetNode), Amount(Amount) {} 375 }; 376 377 /// Distribution of unscaled probability weight. 378 /// 379 /// Distribution of unscaled probability weight to a set of successors. 380 /// 381 /// This class collates the successor edge weights for later processing. 382 /// 383 /// \a DidOverflow indicates whether \a Total did overflow while adding to 384 /// the distribution. It should never overflow twice. 385 struct Distribution { 386 using WeightList = SmallVector<Weight, 4>; 387 388 WeightList Weights; ///< Individual successor weights. 389 uint64_t Total = 0; ///< Sum of all weights. 390 bool DidOverflow = false; ///< Whether \a Total did overflow. 391 392 Distribution() = default; 393 394 void addLocal(const BlockNode &Node, uint64_t Amount) { 395 add(Node, Amount, Weight::Local); 396 } 397 398 void addExit(const BlockNode &Node, uint64_t Amount) { 399 add(Node, Amount, Weight::Exit); 400 } 401 402 void addBackedge(const BlockNode &Node, uint64_t Amount) { 403 add(Node, Amount, Weight::Backedge); 404 } 405 406 /// Normalize the distribution. 407 /// 408 /// Combines multiple edges to the same \a Weight::TargetNode and scales 409 /// down so that \a Total fits into 32-bits. 410 /// 411 /// This is linear in the size of \a Weights. For the vast majority of 412 /// cases, adjacent edge weights are combined by sorting WeightList and 413 /// combining adjacent weights. However, for very large edge lists an 414 /// auxiliary hash table is used. 415 void normalize(); 416 417 private: 418 void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type); 419 }; 420 421 /// Data about each block. This is used downstream. 422 std::vector<FrequencyData> Freqs; 423 424 /// Whether each block is an irreducible loop header. 425 /// This is used downstream. 426 SparseBitVector<> IsIrrLoopHeader; 427 428 /// Loop data: see initializeLoops(). 429 std::vector<WorkingData> Working; 430 431 /// Indexed information about loops. 432 std::list<LoopData> Loops; 433 434 /// Virtual destructor. 435 /// 436 /// Need a virtual destructor to mask the compiler warning about 437 /// getBlockName(). 438 virtual ~BlockFrequencyInfoImplBase() = default; 439 440 /// Add all edges out of a packaged loop to the distribution. 441 /// 442 /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each 443 /// successor edge. 444 /// 445 /// \return \c true unless there's an irreducible backedge. 446 bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop, 447 Distribution &Dist); 448 449 /// Add an edge to the distribution. 450 /// 451 /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the 452 /// edge is local/exit/backedge is in the context of LoopHead. Otherwise, 453 /// every edge should be a local edge (since all the loops are packaged up). 454 /// 455 /// \return \c true unless aborted due to an irreducible backedge. 456 bool addToDist(Distribution &Dist, const LoopData *OuterLoop, 457 const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight); 458 459 /// Analyze irreducible SCCs. 460 /// 461 /// Separate irreducible SCCs from \c G, which is an explicit graph of \c 462 /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr). 463 /// Insert them into \a Loops before \c Insert. 464 /// 465 /// \return the \c LoopData nodes representing the irreducible SCCs. 466 iterator_range<std::list<LoopData>::iterator> 467 analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop, 468 std::list<LoopData>::iterator Insert); 469 470 /// Update a loop after packaging irreducible SCCs inside of it. 471 /// 472 /// Update \c OuterLoop. Before finding irreducible control flow, it was 473 /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a 474 /// LoopData::BackedgeMass need to be reset. Also, nodes that were packaged 475 /// up need to be removed from \a OuterLoop::Nodes. 476 void updateLoopWithIrreducible(LoopData &OuterLoop); 477 478 /// Distribute mass according to a distribution. 479 /// 480 /// Distributes the mass in Source according to Dist. If LoopHead.isValid(), 481 /// backedges and exits are stored in its entry in Loops. 482 /// 483 /// Mass is distributed in parallel from two copies of the source mass. 484 void distributeMass(const BlockNode &Source, LoopData *OuterLoop, 485 Distribution &Dist); 486 487 /// Compute the loop scale for a loop. 488 void computeLoopScale(LoopData &Loop); 489 490 /// Adjust the mass of all headers in an irreducible loop. 491 /// 492 /// Initially, irreducible loops are assumed to distribute their mass 493 /// equally among its headers. This can lead to wrong frequency estimates 494 /// since some headers may be executed more frequently than others. 495 /// 496 /// This adjusts header mass distribution so it matches the weights of 497 /// the backedges going into each of the loop headers. 498 void adjustLoopHeaderMass(LoopData &Loop); 499 500 void distributeIrrLoopHeaderMass(Distribution &Dist); 501 502 /// Package up a loop. 503 void packageLoop(LoopData &Loop); 504 505 /// Unwrap loops. 506 void unwrapLoops(); 507 508 /// Finalize frequency metrics. 509 /// 510 /// Calculates final frequencies and cleans up no-longer-needed data 511 /// structures. 512 void finalizeMetrics(); 513 514 /// Clear all memory. 515 void clear(); 516 517 virtual std::string getBlockName(const BlockNode &Node) const; 518 std::string getLoopName(const LoopData &Loop) const; 519 520 virtual raw_ostream &print(raw_ostream &OS) const { return OS; } 521 void dump() const { print(dbgs()); } 522 523 Scaled64 getFloatingBlockFreq(const BlockNode &Node) const; 524 525 BlockFrequency getBlockFreq(const BlockNode &Node) const; 526 std::optional<uint64_t> 527 getBlockProfileCount(const Function &F, const BlockNode &Node, 528 bool AllowSynthetic = false) const; 529 std::optional<uint64_t> 530 getProfileCountFromFreq(const Function &F, BlockFrequency Freq, 531 bool AllowSynthetic = false) const; 532 bool isIrrLoopHeader(const BlockNode &Node); 533 534 void setBlockFreq(const BlockNode &Node, BlockFrequency Freq); 535 536 BlockFrequency getEntryFreq() const { 537 assert(!Freqs.empty()); 538 return BlockFrequency(Freqs[0].Integer); 539 } 540}; 541 542void printBlockFreqImpl(raw_ostream &OS, BlockFrequency EntryFreq, 543 BlockFrequency Freq); 544 545namespace bfi_detail { 546 547template <class BlockT> struct TypeMap {}; 548template <> struct TypeMap<BasicBlock> { 549 using BlockT = BasicBlock; 550 using BlockKeyT = AssertingVH<const BasicBlock>; 551 using FunctionT = Function; 552 using BranchProbabilityInfoT = BranchProbabilityInfo; 553 using LoopT = Loop; 554 using LoopInfoT = LoopInfo; 555}; 556template <> struct TypeMap<MachineBasicBlock> { 557 using BlockT = MachineBasicBlock; 558 using BlockKeyT = const MachineBasicBlock *; 559 using FunctionT = MachineFunction; 560 using BranchProbabilityInfoT = MachineBranchProbabilityInfo; 561 using LoopT = MachineLoop; 562 using LoopInfoT = MachineLoopInfo; 563}; 564 565template <class BlockT, class BFIImplT> 566class BFICallbackVH; 567 568/// Get the name of a MachineBasicBlock. 569/// 570/// Get the name of a MachineBasicBlock. It's templated so that including from 571/// CodeGen is unnecessary (that would be a layering issue). 572/// 573/// This is used mainly for debug output. The name is similar to 574/// MachineBasicBlock::getFullName(), but skips the name of the function. 575template <class BlockT> std::string getBlockName(const BlockT *BB) { 576 assert(BB && "Unexpected nullptr"); 577 auto MachineName = "BB" + Twine(BB->getNumber()); 578 if (BB->getBasicBlock()) 579 return (MachineName + "[" + BB->getName() + "]").str(); 580 return MachineName.str(); 581} 582/// Get the name of a BasicBlock. 583template <> inline std::string getBlockName(const BasicBlock *BB) { 584 assert(BB && "Unexpected nullptr"); 585 return BB->getName().str(); 586} 587 588/// Graph of irreducible control flow. 589/// 590/// This graph is used for determining the SCCs in a loop (or top-level 591/// function) that has irreducible control flow. 592/// 593/// During the block frequency algorithm, the local graphs are defined in a 594/// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock 595/// graphs for most edges, but getting others from \a LoopData::ExitMap. The 596/// latter only has successor information. 597/// 598/// \a IrreducibleGraph makes this graph explicit. It's in a form that can use 599/// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator), 600/// and it explicitly lists predecessors and successors. The initialization 601/// that relies on \c MachineBasicBlock is defined in the header. 602struct IrreducibleGraph { 603 using BFIBase = BlockFrequencyInfoImplBase; 604 605 BFIBase &BFI; 606 607 using BlockNode = BFIBase::BlockNode; 608 struct IrrNode { 609 BlockNode Node; 610 unsigned NumIn = 0; 611 std::deque<const IrrNode *> Edges; 612 613 IrrNode(const BlockNode &Node) : Node(Node) {} 614 615 using iterator = std::deque<const IrrNode *>::const_iterator; 616 617 iterator pred_begin() const { return Edges.begin(); } 618 iterator succ_begin() const { return Edges.begin() + NumIn; } 619 iterator pred_end() const { return succ_begin(); } 620 iterator succ_end() const { return Edges.end(); } 621 }; 622 BlockNode Start; 623 const IrrNode *StartIrr = nullptr; 624 std::vector<IrrNode> Nodes; 625 SmallDenseMap<uint32_t, IrrNode *, 4> Lookup; 626 627 /// Construct an explicit graph containing irreducible control flow. 628 /// 629 /// Construct an explicit graph of the control flow in \c OuterLoop (or the 630 /// top-level function, if \c OuterLoop is \c nullptr). Uses \c 631 /// addBlockEdges to add block successors that have not been packaged into 632 /// loops. 633 /// 634 /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected 635 /// user of this. 636 template <class BlockEdgesAdder> 637 IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop, 638 BlockEdgesAdder addBlockEdges) : BFI(BFI) { 639 initialize(OuterLoop, addBlockEdges); 640 } 641 642 template <class BlockEdgesAdder> 643 void initialize(const BFIBase::LoopData *OuterLoop, 644 BlockEdgesAdder addBlockEdges); 645 void addNodesInLoop(const BFIBase::LoopData &OuterLoop); 646 void addNodesInFunction(); 647 648 void addNode(const BlockNode &Node) { 649 Nodes.emplace_back(Node); 650 BFI.Working[Node.Index].getMass() = BlockMass::getEmpty(); 651 } 652 653 void indexNodes(); 654 template <class BlockEdgesAdder> 655 void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop, 656 BlockEdgesAdder addBlockEdges); 657 void addEdge(IrrNode &Irr, const BlockNode &Succ, 658 const BFIBase::LoopData *OuterLoop); 659}; 660 661template <class BlockEdgesAdder> 662void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop, 663 BlockEdgesAdder addBlockEdges) { 664 if (OuterLoop) { 665 addNodesInLoop(*OuterLoop); 666 for (auto N : OuterLoop->Nodes) 667 addEdges(N, OuterLoop, addBlockEdges); 668 } else { 669 addNodesInFunction(); 670 for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) 671 addEdges(Index, OuterLoop, addBlockEdges); 672 } 673 StartIrr = Lookup[Start.Index]; 674} 675 676template <class BlockEdgesAdder> 677void IrreducibleGraph::addEdges(const BlockNode &Node, 678 const BFIBase::LoopData *OuterLoop, 679 BlockEdgesAdder addBlockEdges) { 680 auto L = Lookup.find(Node.Index); 681 if (L == Lookup.end()) 682 return; 683 IrrNode &Irr = *L->second; 684 const auto &Working = BFI.Working[Node.Index]; 685 686 if (Working.isAPackage()) 687 for (const auto &I : Working.Loop->Exits) 688 addEdge(Irr, I.first, OuterLoop); 689 else 690 addBlockEdges(*this, Irr, OuterLoop); 691} 692 693} // end namespace bfi_detail 694 695/// Shared implementation for block frequency analysis. 696/// 697/// This is a shared implementation of BlockFrequencyInfo and 698/// MachineBlockFrequencyInfo, and calculates the relative frequencies of 699/// blocks. 700/// 701/// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block, 702/// which is called the header. A given loop, L, can have sub-loops, which are 703/// loops within the subgraph of L that exclude its header. (A "trivial" SCC 704/// consists of a single block that does not have a self-edge.) 705/// 706/// In addition to loops, this algorithm has limited support for irreducible 707/// SCCs, which are SCCs with multiple entry blocks. Irreducible SCCs are 708/// discovered on the fly, and modelled as loops with multiple headers. 709/// 710/// The headers of irreducible sub-SCCs consist of its entry blocks and all 711/// nodes that are targets of a backedge within it (excluding backedges within 712/// true sub-loops). Block frequency calculations act as if a block is 713/// inserted that intercepts all the edges to the headers. All backedges and 714/// entries point to this block. Its successors are the headers, which split 715/// the frequency evenly. 716/// 717/// This algorithm leverages BlockMass and ScaledNumber to maintain precision, 718/// separates mass distribution from loop scaling, and dithers to eliminate 719/// probability mass loss. 720/// 721/// The implementation is split between BlockFrequencyInfoImpl, which knows the 722/// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and 723/// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a 724/// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in 725/// reverse-post order. This gives two advantages: it's easy to compare the 726/// relative ordering of two nodes, and maps keyed on BlockT can be represented 727/// by vectors. 728/// 729/// This algorithm is O(V+E), unless there is irreducible control flow, in 730/// which case it's O(V*E) in the worst case. 731/// 732/// These are the main stages: 733/// 734/// 0. Reverse post-order traversal (\a initializeRPOT()). 735/// 736/// Run a single post-order traversal and save it (in reverse) in RPOT. 737/// All other stages make use of this ordering. Save a lookup from BlockT 738/// to BlockNode (the index into RPOT) in Nodes. 739/// 740/// 1. Loop initialization (\a initializeLoops()). 741/// 742/// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of 743/// the algorithm. In particular, store the immediate members of each loop 744/// in reverse post-order. 745/// 746/// 2. Calculate mass and scale in loops (\a computeMassInLoops()). 747/// 748/// For each loop (bottom-up), distribute mass through the DAG resulting 749/// from ignoring backedges and treating sub-loops as a single pseudo-node. 750/// Track the backedge mass distributed to the loop header, and use it to 751/// calculate the loop scale (number of loop iterations). Immediate 752/// members that represent sub-loops will already have been visited and 753/// packaged into a pseudo-node. 754/// 755/// Distributing mass in a loop is a reverse-post-order traversal through 756/// the loop. Start by assigning full mass to the Loop header. For each 757/// node in the loop: 758/// 759/// - Fetch and categorize the weight distribution for its successors. 760/// If this is a packaged-subloop, the weight distribution is stored 761/// in \a LoopData::Exits. Otherwise, fetch it from 762/// BranchProbabilityInfo. 763/// 764/// - Each successor is categorized as \a Weight::Local, a local edge 765/// within the current loop, \a Weight::Backedge, a backedge to the 766/// loop header, or \a Weight::Exit, any successor outside the loop. 767/// The weight, the successor, and its category are stored in \a 768/// Distribution. There can be multiple edges to each successor. 769/// 770/// - If there's a backedge to a non-header, there's an irreducible SCC. 771/// The usual flow is temporarily aborted. \a 772/// computeIrreducibleMass() finds the irreducible SCCs within the 773/// loop, packages them up, and restarts the flow. 774/// 775/// - Normalize the distribution: scale weights down so that their sum 776/// is 32-bits, and coalesce multiple edges to the same node. 777/// 778/// - Distribute the mass accordingly, dithering to minimize mass loss, 779/// as described in \a distributeMass(). 780/// 781/// In the case of irreducible loops, instead of a single loop header, 782/// there will be several. The computation of backedge masses is similar 783/// but instead of having a single backedge mass, there will be one 784/// backedge per loop header. In these cases, each backedge will carry 785/// a mass proportional to the edge weights along the corresponding 786/// path. 787/// 788/// At the end of propagation, the full mass assigned to the loop will be 789/// distributed among the loop headers proportionally according to the 790/// mass flowing through their backedges. 791/// 792/// Finally, calculate the loop scale from the accumulated backedge mass. 793/// 794/// 3. Distribute mass in the function (\a computeMassInFunction()). 795/// 796/// Finally, distribute mass through the DAG resulting from packaging all 797/// loops in the function. This uses the same algorithm as distributing 798/// mass in a loop, except that there are no exit or backedge edges. 799/// 800/// 4. Unpackage loops (\a unwrapLoops()). 801/// 802/// Initialize each block's frequency to a floating point representation of 803/// its mass. 804/// 805/// Visit loops top-down, scaling the frequencies of its immediate members 806/// by the loop's pseudo-node's frequency. 807/// 808/// 5. Convert frequencies to a 64-bit range (\a finalizeMetrics()). 809/// 810/// Using the min and max frequencies as a guide, translate floating point 811/// frequencies to an appropriate range in uint64_t. 812/// 813/// It has some known flaws. 814/// 815/// - The model of irreducible control flow is a rough approximation. 816/// 817/// Modelling irreducible control flow exactly involves setting up and 818/// solving a group of infinite geometric series. Such precision is 819/// unlikely to be worthwhile, since most of our algorithms give up on 820/// irreducible control flow anyway. 821/// 822/// Nevertheless, we might find that we need to get closer. Here's a sort 823/// of TODO list for the model with diminishing returns, to be completed as 824/// necessary. 825/// 826/// - The headers for the \a LoopData representing an irreducible SCC 827/// include non-entry blocks. When these extra blocks exist, they 828/// indicate a self-contained irreducible sub-SCC. We could treat them 829/// as sub-loops, rather than arbitrarily shoving the problematic 830/// blocks into the headers of the main irreducible SCC. 831/// 832/// - Entry frequencies are assumed to be evenly split between the 833/// headers of a given irreducible SCC, which is the only option if we 834/// need to compute mass in the SCC before its parent loop. Instead, 835/// we could partially compute mass in the parent loop, and stop when 836/// we get to the SCC. Here, we have the correct ratio of entry 837/// masses, which we can use to adjust their relative frequencies. 838/// Compute mass in the SCC, and then continue propagation in the 839/// parent. 840/// 841/// - We can propagate mass iteratively through the SCC, for some fixed 842/// number of iterations. Each iteration starts by assigning the entry 843/// blocks their backedge mass from the prior iteration. The final 844/// mass for each block (and each exit, and the total backedge mass 845/// used for computing loop scale) is the sum of all iterations. 846/// (Running this until fixed point would "solve" the geometric 847/// series by simulation.) 848template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase { 849 // This is part of a workaround for a GCC 4.7 crash on lambdas. 850 friend struct bfi_detail::BlockEdgesAdder<BT>; 851 852 using BlockT = typename bfi_detail::TypeMap<BT>::BlockT; 853 using BlockKeyT = typename bfi_detail::TypeMap<BT>::BlockKeyT; 854 using FunctionT = typename bfi_detail::TypeMap<BT>::FunctionT; 855 using BranchProbabilityInfoT = 856 typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT; 857 using LoopT = typename bfi_detail::TypeMap<BT>::LoopT; 858 using LoopInfoT = typename bfi_detail::TypeMap<BT>::LoopInfoT; 859 using Successor = GraphTraits<const BlockT *>; 860 using Predecessor = GraphTraits<Inverse<const BlockT *>>; 861 using BFICallbackVH = 862 bfi_detail::BFICallbackVH<BlockT, BlockFrequencyInfoImpl>; 863 864 const BranchProbabilityInfoT *BPI = nullptr; 865 const LoopInfoT *LI = nullptr; 866 const FunctionT *F = nullptr; 867 868 // All blocks in reverse postorder. 869 std::vector<const BlockT *> RPOT; 870 DenseMap<BlockKeyT, std::pair<BlockNode, BFICallbackVH>> Nodes; 871 872 using rpot_iterator = typename std::vector<const BlockT *>::const_iterator; 873 874 rpot_iterator rpot_begin() const { return RPOT.begin(); } 875 rpot_iterator rpot_end() const { return RPOT.end(); } 876 877 size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); } 878 879 BlockNode getNode(const rpot_iterator &I) const { 880 return BlockNode(getIndex(I)); 881 } 882 883 BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB).first; } 884 885 const BlockT *getBlock(const BlockNode &Node) const { 886 assert(Node.Index < RPOT.size()); 887 return RPOT[Node.Index]; 888 } 889 890 /// Run (and save) a post-order traversal. 891 /// 892 /// Saves a reverse post-order traversal of all the nodes in \a F. 893 void initializeRPOT(); 894 895 /// Initialize loop data. 896 /// 897 /// Build up \a Loops using \a LoopInfo. \a LoopInfo gives us a mapping from 898 /// each block to the deepest loop it's in, but we need the inverse. For each 899 /// loop, we store in reverse post-order its "immediate" members, defined as 900 /// the header, the headers of immediate sub-loops, and all other blocks in 901 /// the loop that are not in sub-loops. 902 void initializeLoops(); 903 904 /// Propagate to a block's successors. 905 /// 906 /// In the context of distributing mass through \c OuterLoop, divide the mass 907 /// currently assigned to \c Node between its successors. 908 /// 909 /// \return \c true unless there's an irreducible backedge. 910 bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node); 911 912 /// Compute mass in a particular loop. 913 /// 914 /// Assign mass to \c Loop's header, and then for each block in \c Loop in 915 /// reverse post-order, distribute mass to its successors. Only visits nodes 916 /// that have not been packaged into sub-loops. 917 /// 918 /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop. 919 /// \return \c true unless there's an irreducible backedge. 920 bool computeMassInLoop(LoopData &Loop); 921 922 /// Try to compute mass in the top-level function. 923 /// 924 /// Assign mass to the entry block, and then for each block in reverse 925 /// post-order, distribute mass to its successors. Skips nodes that have 926 /// been packaged into loops. 927 /// 928 /// \pre \a computeMassInLoops() has been called. 929 /// \return \c true unless there's an irreducible backedge. 930 bool tryToComputeMassInFunction(); 931 932 /// Compute mass in (and package up) irreducible SCCs. 933 /// 934 /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front 935 /// of \c Insert), and call \a computeMassInLoop() on each of them. 936 /// 937 /// If \c OuterLoop is \c nullptr, it refers to the top-level function. 938 /// 939 /// \pre \a computeMassInLoop() has been called for each subloop of \c 940 /// OuterLoop. 941 /// \pre \c Insert points at the last loop successfully processed by \a 942 /// computeMassInLoop(). 943 /// \pre \c OuterLoop has irreducible SCCs. 944 void computeIrreducibleMass(LoopData *OuterLoop, 945 std::list<LoopData>::iterator Insert); 946 947 /// Compute mass in all loops. 948 /// 949 /// For each loop bottom-up, call \a computeMassInLoop(). 950 /// 951 /// \a computeMassInLoop() aborts (and returns \c false) on loops that 952 /// contain a irreducible sub-SCCs. Use \a computeIrreducibleMass() and then 953 /// re-enter \a computeMassInLoop(). 954 /// 955 /// \post \a computeMassInLoop() has returned \c true for every loop. 956 void computeMassInLoops(); 957 958 /// Compute mass in the top-level function. 959 /// 960 /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to 961 /// compute mass in the top-level function. 962 /// 963 /// \post \a tryToComputeMassInFunction() has returned \c true. 964 void computeMassInFunction(); 965 966 std::string getBlockName(const BlockNode &Node) const override { 967 return bfi_detail::getBlockName(getBlock(Node)); 968 } 969 970 /// The current implementation for computing relative block frequencies does 971 /// not handle correctly control-flow graphs containing irreducible loops. To 972 /// resolve the problem, we apply a post-processing step, which iteratively 973 /// updates block frequencies based on the frequencies of their predesessors. 974 /// This corresponds to finding the stationary point of the Markov chain by 975 /// an iterative method aka "PageRank computation". 976 /// The algorithm takes at most O(|E| * IterativeBFIMaxIterations) steps but 977 /// typically converges faster. 978 /// 979 /// Decide whether we want to apply iterative inference for a given function. 980 bool needIterativeInference() const; 981 982 /// Apply an iterative post-processing to infer correct counts for irr loops. 983 void applyIterativeInference(); 984 985 using ProbMatrixType = std::vector<std::vector<std::pair<size_t, Scaled64>>>; 986 987 /// Run iterative inference for a probability matrix and initial frequencies. 988 void iterativeInference(const ProbMatrixType &ProbMatrix, 989 std::vector<Scaled64> &Freq) const; 990 991 /// Find all blocks to apply inference on, that is, reachable from the entry 992 /// and backward reachable from exists along edges with positive probability. 993 void findReachableBlocks(std::vector<const BlockT *> &Blocks) const; 994 995 /// Build a matrix of probabilities with transitions (edges) between the 996 /// blocks: ProbMatrix[I] holds pairs (J, P), where Pr[J -> I | J] = P 997 void initTransitionProbabilities( 998 const std::vector<const BlockT *> &Blocks, 999 const DenseMap<const BlockT *, size_t> &BlockIndex, 1000 ProbMatrixType &ProbMatrix) const; 1001 1002#ifndef NDEBUG 1003 /// Compute the discrepancy between current block frequencies and the 1004 /// probability matrix. 1005 Scaled64 discrepancy(const ProbMatrixType &ProbMatrix, 1006 const std::vector<Scaled64> &Freq) const; 1007#endif 1008 1009public: 1010 BlockFrequencyInfoImpl() = default; 1011 1012 const FunctionT *getFunction() const { return F; } 1013 1014 void calculate(const FunctionT &F, const BranchProbabilityInfoT &BPI, 1015 const LoopInfoT &LI); 1016 1017 using BlockFrequencyInfoImplBase::getEntryFreq; 1018 1019 BlockFrequency getBlockFreq(const BlockT *BB) const { 1020 return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB)); 1021 } 1022 1023 std::optional<uint64_t> 1024 getBlockProfileCount(const Function &F, const BlockT *BB, 1025 bool AllowSynthetic = false) const { 1026 return BlockFrequencyInfoImplBase::getBlockProfileCount(F, getNode(BB), 1027 AllowSynthetic); 1028 } 1029 1030 std::optional<uint64_t> 1031 getProfileCountFromFreq(const Function &F, BlockFrequency Freq, 1032 bool AllowSynthetic = false) const { 1033 return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq, 1034 AllowSynthetic); 1035 } 1036 1037 bool isIrrLoopHeader(const BlockT *BB) { 1038 return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB)); 1039 } 1040 1041 void setBlockFreq(const BlockT *BB, BlockFrequency Freq); 1042 1043 void forgetBlock(const BlockT *BB) { 1044 // We don't erase corresponding items from `Freqs`, `RPOT` and other to 1045 // avoid invalidating indices. Doing so would have saved some memory, but 1046 // it's not worth it. 1047 Nodes.erase(BB); 1048 } 1049 1050 Scaled64 getFloatingBlockFreq(const BlockT *BB) const { 1051 return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB)); 1052 } 1053 1054 const BranchProbabilityInfoT &getBPI() const { return *BPI; } 1055 1056 /// Print the frequencies for the current function. 1057 /// 1058 /// Prints the frequencies for the blocks in the current function. 1059 /// 1060 /// Blocks are printed in the natural iteration order of the function, rather 1061 /// than reverse post-order. This provides two advantages: writing -analyze 1062 /// tests is easier (since blocks come out in source order), and even 1063 /// unreachable blocks are printed. 1064 /// 1065 /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so 1066 /// we need to override it here. 1067 raw_ostream &print(raw_ostream &OS) const override; 1068 1069 using BlockFrequencyInfoImplBase::dump; 1070 1071 void verifyMatch(BlockFrequencyInfoImpl<BT> &Other) const; 1072}; 1073 1074namespace bfi_detail { 1075 1076template <class BFIImplT> 1077class BFICallbackVH<BasicBlock, BFIImplT> : public CallbackVH { 1078 BFIImplT *BFIImpl; 1079 1080public: 1081 BFICallbackVH() = default; 1082 1083 BFICallbackVH(const BasicBlock *BB, BFIImplT *BFIImpl) 1084 : CallbackVH(BB), BFIImpl(BFIImpl) {} 1085 1086 virtual ~BFICallbackVH() = default; 1087 1088 void deleted() override { 1089 BFIImpl->forgetBlock(cast<BasicBlock>(getValPtr())); 1090 } 1091}; 1092 1093/// Dummy implementation since MachineBasicBlocks aren't Values, so ValueHandles 1094/// don't apply to them. 1095template <class BFIImplT> 1096class BFICallbackVH<MachineBasicBlock, BFIImplT> { 1097public: 1098 BFICallbackVH() = default; 1099 BFICallbackVH(const MachineBasicBlock *, BFIImplT *) {} 1100}; 1101 1102} // end namespace bfi_detail 1103 1104template <class BT> 1105void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F, 1106 const BranchProbabilityInfoT &BPI, 1107 const LoopInfoT &LI) { 1108 // Save the parameters. 1109 this->BPI = &BPI; 1110 this->LI = &LI; 1111 this->F = &F; 1112 1113 // Clean up left-over data structures. 1114 BlockFrequencyInfoImplBase::clear(); 1115 RPOT.clear(); 1116 Nodes.clear(); 1117 1118 // Initialize. 1119 LLVM_DEBUG(dbgs() << "\nblock-frequency: " << F.getName() 1120 << "\n=================" 1121 << std::string(F.getName().size(), '=') << "\n"); 1122 initializeRPOT(); 1123 initializeLoops(); 1124 1125 // Visit loops in post-order to find the local mass distribution, and then do 1126 // the full function. 1127 computeMassInLoops(); 1128 computeMassInFunction(); 1129 unwrapLoops(); 1130 // Apply a post-processing step improving computed frequencies for functions 1131 // with irreducible loops. 1132 if (needIterativeInference()) 1133 applyIterativeInference(); 1134 finalizeMetrics(); 1135 1136 if (CheckBFIUnknownBlockQueries) { 1137 // To detect BFI queries for unknown blocks, add entries for unreachable 1138 // blocks, if any. This is to distinguish between known/existing unreachable 1139 // blocks and unknown blocks. 1140 for (const BlockT &BB : F) 1141 if (!Nodes.count(&BB)) 1142 setBlockFreq(&BB, BlockFrequency()); 1143 } 1144} 1145 1146template <class BT> 1147void BlockFrequencyInfoImpl<BT>::setBlockFreq(const BlockT *BB, 1148 BlockFrequency Freq) { 1149 if (Nodes.count(BB)) 1150 BlockFrequencyInfoImplBase::setBlockFreq(getNode(BB), Freq); 1151 else { 1152 // If BB is a newly added block after BFI is done, we need to create a new 1153 // BlockNode for it assigned with a new index. The index can be determined 1154 // by the size of Freqs. 1155 BlockNode NewNode(Freqs.size()); 1156 Nodes[BB] = {NewNode, BFICallbackVH(BB, this)}; 1157 Freqs.emplace_back(); 1158 BlockFrequencyInfoImplBase::setBlockFreq(NewNode, Freq); 1159 } 1160} 1161 1162template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() { 1163 const BlockT *Entry = &F->front(); 1164 RPOT.reserve(F->size()); 1165 std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT)); 1166 std::reverse(RPOT.begin(), RPOT.end()); 1167 1168 assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() && 1169 "More nodes in function than Block Frequency Info supports"); 1170 1171 LLVM_DEBUG(dbgs() << "reverse-post-order-traversal\n"); 1172 for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) { 1173 BlockNode Node = getNode(I); 1174 LLVM_DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) 1175 << "\n"); 1176 Nodes[*I] = {Node, BFICallbackVH(*I, this)}; 1177 } 1178 1179 Working.reserve(RPOT.size()); 1180 for (size_t Index = 0; Index < RPOT.size(); ++Index) 1181 Working.emplace_back(Index); 1182 Freqs.resize(RPOT.size()); 1183} 1184 1185template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() { 1186 LLVM_DEBUG(dbgs() << "loop-detection\n"); 1187 if (LI->empty()) 1188 return; 1189 1190 // Visit loops top down and assign them an index. 1191 std::deque<std::pair<const LoopT *, LoopData *>> Q; 1192 for (const LoopT *L : *LI) 1193 Q.emplace_back(L, nullptr); 1194 while (!Q.empty()) { 1195 const LoopT *Loop = Q.front().first; 1196 LoopData *Parent = Q.front().second; 1197 Q.pop_front(); 1198 1199 BlockNode Header = getNode(Loop->getHeader()); 1200 assert(Header.isValid()); 1201 1202 Loops.emplace_back(Parent, Header); 1203 Working[Header.Index].Loop = &Loops.back(); 1204 LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n"); 1205 1206 for (const LoopT *L : *Loop) 1207 Q.emplace_back(L, &Loops.back()); 1208 } 1209 1210 // Visit nodes in reverse post-order and add them to their deepest containing 1211 // loop. 1212 for (size_t Index = 0; Index < RPOT.size(); ++Index) { 1213 // Loop headers have already been mostly mapped. 1214 if (Working[Index].isLoopHeader()) { 1215 LoopData *ContainingLoop = Working[Index].getContainingLoop(); 1216 if (ContainingLoop) 1217 ContainingLoop->Nodes.push_back(Index); 1218 continue; 1219 } 1220 1221 const LoopT *Loop = LI->getLoopFor(RPOT[Index]); 1222 if (!Loop) 1223 continue; 1224 1225 // Add this node to its containing loop's member list. 1226 BlockNode Header = getNode(Loop->getHeader()); 1227 assert(Header.isValid()); 1228 const auto &HeaderData = Working[Header.Index]; 1229 assert(HeaderData.isLoopHeader()); 1230 1231 Working[Index].Loop = HeaderData.Loop; 1232 HeaderData.Loop->Nodes.push_back(Index); 1233 LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header) 1234 << ": member = " << getBlockName(Index) << "\n"); 1235 } 1236} 1237 1238template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() { 1239 // Visit loops with the deepest first, and the top-level loops last. 1240 for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) { 1241 if (computeMassInLoop(*L)) 1242 continue; 1243 auto Next = std::next(L); 1244 computeIrreducibleMass(&*L, L.base()); 1245 L = std::prev(Next); 1246 if (computeMassInLoop(*L)) 1247 continue; 1248 llvm_unreachable("unhandled irreducible control flow"); 1249 } 1250} 1251 1252template <class BT> 1253bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) { 1254 // Compute mass in loop. 1255 LLVM_DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n"); 1256 1257 if (Loop.isIrreducible()) { 1258 LLVM_DEBUG(dbgs() << "isIrreducible = true\n"); 1259 Distribution Dist; 1260 unsigned NumHeadersWithWeight = 0; 1261 std::optional<uint64_t> MinHeaderWeight; 1262 DenseSet<uint32_t> HeadersWithoutWeight; 1263 HeadersWithoutWeight.reserve(Loop.NumHeaders); 1264 for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { 1265 auto &HeaderNode = Loop.Nodes[H]; 1266 const BlockT *Block = getBlock(HeaderNode); 1267 IsIrrLoopHeader.set(Loop.Nodes[H].Index); 1268 std::optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight(); 1269 if (!HeaderWeight) { 1270 LLVM_DEBUG(dbgs() << "Missing irr loop header metadata on " 1271 << getBlockName(HeaderNode) << "\n"); 1272 HeadersWithoutWeight.insert(H); 1273 continue; 1274 } 1275 LLVM_DEBUG(dbgs() << getBlockName(HeaderNode) 1276 << " has irr loop header weight " << *HeaderWeight 1277 << "\n"); 1278 NumHeadersWithWeight++; 1279 uint64_t HeaderWeightValue = *HeaderWeight; 1280 if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight) 1281 MinHeaderWeight = HeaderWeightValue; 1282 if (HeaderWeightValue) { 1283 Dist.addLocal(HeaderNode, HeaderWeightValue); 1284 } 1285 } 1286 // As a heuristic, if some headers don't have a weight, give them the 1287 // minimum weight seen (not to disrupt the existing trends too much by 1288 // using a weight that's in the general range of the other headers' weights, 1289 // and the minimum seems to perform better than the average.) 1290 // FIXME: better update in the passes that drop the header weight. 1291 // If no headers have a weight, give them even weight (use weight 1). 1292 if (!MinHeaderWeight) 1293 MinHeaderWeight = 1; 1294 for (uint32_t H : HeadersWithoutWeight) { 1295 auto &HeaderNode = Loop.Nodes[H]; 1296 assert(!getBlock(HeaderNode)->getIrrLoopHeaderWeight() && 1297 "Shouldn't have a weight metadata"); 1298 uint64_t MinWeight = *MinHeaderWeight; 1299 LLVM_DEBUG(dbgs() << "Giving weight " << MinWeight << " to " 1300 << getBlockName(HeaderNode) << "\n"); 1301 if (MinWeight) 1302 Dist.addLocal(HeaderNode, MinWeight); 1303 } 1304 distributeIrrLoopHeaderMass(Dist); 1305 for (const BlockNode &M : Loop.Nodes) 1306 if (!propagateMassToSuccessors(&Loop, M)) 1307 llvm_unreachable("unhandled irreducible control flow"); 1308 if (NumHeadersWithWeight == 0) 1309 // No headers have a metadata. Adjust header mass. 1310 adjustLoopHeaderMass(Loop); 1311 } else { 1312 Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); 1313 if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) 1314 llvm_unreachable("irreducible control flow to loop header!?"); 1315 for (const BlockNode &M : Loop.members()) 1316 if (!propagateMassToSuccessors(&Loop, M)) 1317 // Irreducible backedge. 1318 return false; 1319 } 1320 1321 computeLoopScale(Loop); 1322 packageLoop(Loop); 1323 return true; 1324} 1325 1326template <class BT> 1327bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() { 1328 // Compute mass in function. 1329 LLVM_DEBUG(dbgs() << "compute-mass-in-function\n"); 1330 assert(!Working.empty() && "no blocks in function"); 1331 assert(!Working[0].isLoopHeader() && "entry block is a loop header"); 1332 1333 Working[0].getMass() = BlockMass::getFull(); 1334 for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) { 1335 // Check for nodes that have been packaged. 1336 BlockNode Node = getNode(I); 1337 if (Working[Node.Index].isPackaged()) 1338 continue; 1339 1340 if (!propagateMassToSuccessors(nullptr, Node)) 1341 return false; 1342 } 1343 return true; 1344} 1345 1346template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() { 1347 if (tryToComputeMassInFunction()) 1348 return; 1349 computeIrreducibleMass(nullptr, Loops.begin()); 1350 if (tryToComputeMassInFunction()) 1351 return; 1352 llvm_unreachable("unhandled irreducible control flow"); 1353} 1354 1355template <class BT> 1356bool BlockFrequencyInfoImpl<BT>::needIterativeInference() const { 1357 if (!UseIterativeBFIInference) 1358 return false; 1359 if (!F->getFunction().hasProfileData()) 1360 return false; 1361 // Apply iterative inference only if the function contains irreducible loops; 1362 // otherwise, computed block frequencies are reasonably correct. 1363 for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) { 1364 if (L->isIrreducible()) 1365 return true; 1366 } 1367 return false; 1368} 1369 1370template <class BT> void BlockFrequencyInfoImpl<BT>::applyIterativeInference() { 1371 // Extract blocks for processing: a block is considered for inference iff it 1372 // can be reached from the entry by edges with a positive probability. 1373 // Non-processed blocks are assigned with the zero frequency and are ignored 1374 // in the computation 1375 std::vector<const BlockT *> ReachableBlocks; 1376 findReachableBlocks(ReachableBlocks); 1377 if (ReachableBlocks.empty()) 1378 return; 1379 1380 // The map is used to index successors/predecessors of reachable blocks in 1381 // the ReachableBlocks vector 1382 DenseMap<const BlockT *, size_t> BlockIndex; 1383 // Extract initial frequencies for the reachable blocks 1384 auto Freq = std::vector<Scaled64>(ReachableBlocks.size()); 1385 Scaled64 SumFreq; 1386 for (size_t I = 0; I < ReachableBlocks.size(); I++) { 1387 const BlockT *BB = ReachableBlocks[I]; 1388 BlockIndex[BB] = I; 1389 Freq[I] = getFloatingBlockFreq(BB); 1390 SumFreq += Freq[I]; 1391 } 1392 assert(!SumFreq.isZero() && "empty initial block frequencies"); 1393 1394 LLVM_DEBUG(dbgs() << "Applying iterative inference for " << F->getName() 1395 << " with " << ReachableBlocks.size() << " blocks\n"); 1396 1397 // Normalizing frequencies so they sum up to 1.0 1398 for (auto &Value : Freq) { 1399 Value /= SumFreq; 1400 } 1401 1402 // Setting up edge probabilities using sparse matrix representation: 1403 // ProbMatrix[I] holds a vector of pairs (J, P) where Pr[J -> I | J] = P 1404 ProbMatrixType ProbMatrix; 1405 initTransitionProbabilities(ReachableBlocks, BlockIndex, ProbMatrix); 1406 1407 // Run the propagation 1408 iterativeInference(ProbMatrix, Freq); 1409 1410 // Assign computed frequency values 1411 for (const BlockT &BB : *F) { 1412 auto Node = getNode(&BB); 1413 if (!Node.isValid()) 1414 continue; 1415 if (BlockIndex.count(&BB)) { 1416 Freqs[Node.Index].Scaled = Freq[BlockIndex[&BB]]; 1417 } else { 1418 Freqs[Node.Index].Scaled = Scaled64::getZero(); 1419 } 1420 } 1421} 1422 1423template <class BT> 1424void BlockFrequencyInfoImpl<BT>::iterativeInference( 1425 const ProbMatrixType &ProbMatrix, std::vector<Scaled64> &Freq) const { 1426 assert(0.0 < IterativeBFIPrecision && IterativeBFIPrecision < 1.0 && 1427 "incorrectly specified precision"); 1428 // Convert double precision to Scaled64 1429 const auto Precision = 1430 Scaled64::getInverse(static_cast<uint64_t>(1.0 / IterativeBFIPrecision)); 1431 const size_t MaxIterations = IterativeBFIMaxIterationsPerBlock * Freq.size(); 1432 1433#ifndef NDEBUG 1434 LLVM_DEBUG(dbgs() << " Initial discrepancy = " 1435 << discrepancy(ProbMatrix, Freq).toString() << "\n"); 1436#endif 1437 1438 // Successors[I] holds unique sucessors of the I-th block 1439 auto Successors = std::vector<std::vector<size_t>>(Freq.size()); 1440 for (size_t I = 0; I < Freq.size(); I++) { 1441 for (const auto &Jump : ProbMatrix[I]) { 1442 Successors[Jump.first].push_back(I); 1443 } 1444 } 1445 1446 // To speedup computation, we maintain a set of "active" blocks whose 1447 // frequencies need to be updated based on the incoming edges. 1448 // The set is dynamic and changes after every update. Initially all blocks 1449 // with a positive frequency are active 1450 auto IsActive = BitVector(Freq.size(), false); 1451 std::queue<size_t> ActiveSet; 1452 for (size_t I = 0; I < Freq.size(); I++) { 1453 if (Freq[I] > 0) { 1454 ActiveSet.push(I); 1455 IsActive[I] = true; 1456 } 1457 } 1458 1459 // Iterate over the blocks propagating frequencies 1460 size_t It = 0; 1461 while (It++ < MaxIterations && !ActiveSet.empty()) { 1462 size_t I = ActiveSet.front(); 1463 ActiveSet.pop(); 1464 IsActive[I] = false; 1465 1466 // Compute a new frequency for the block: NewFreq := Freq \times ProbMatrix. 1467 // A special care is taken for self-edges that needs to be scaled by 1468 // (1.0 - SelfProb), where SelfProb is the sum of probabilities on the edges 1469 Scaled64 NewFreq; 1470 Scaled64 OneMinusSelfProb = Scaled64::getOne(); 1471 for (const auto &Jump : ProbMatrix[I]) { 1472 if (Jump.first == I) { 1473 OneMinusSelfProb -= Jump.second; 1474 } else { 1475 NewFreq += Freq[Jump.first] * Jump.second; 1476 } 1477 } 1478 if (OneMinusSelfProb != Scaled64::getOne()) 1479 NewFreq /= OneMinusSelfProb; 1480 1481 // If the block's frequency has changed enough, then 1482 // make sure the block and its successors are in the active set 1483 auto Change = Freq[I] >= NewFreq ? Freq[I] - NewFreq : NewFreq - Freq[I]; 1484 if (Change > Precision) { 1485 ActiveSet.push(I); 1486 IsActive[I] = true; 1487 for (size_t Succ : Successors[I]) { 1488 if (!IsActive[Succ]) { 1489 ActiveSet.push(Succ); 1490 IsActive[Succ] = true; 1491 } 1492 } 1493 } 1494 1495 // Update the frequency for the block 1496 Freq[I] = NewFreq; 1497 } 1498 1499 LLVM_DEBUG(dbgs() << " Completed " << It << " inference iterations" 1500 << format(" (%0.0f per block)", double(It) / Freq.size()) 1501 << "\n"); 1502#ifndef NDEBUG 1503 LLVM_DEBUG(dbgs() << " Final discrepancy = " 1504 << discrepancy(ProbMatrix, Freq).toString() << "\n"); 1505#endif 1506} 1507 1508template <class BT> 1509void BlockFrequencyInfoImpl<BT>::findReachableBlocks( 1510 std::vector<const BlockT *> &Blocks) const { 1511 // Find all blocks to apply inference on, that is, reachable from the entry 1512 // along edges with non-zero probablities 1513 std::queue<const BlockT *> Queue; 1514 SmallPtrSet<const BlockT *, 8> Reachable; 1515 const BlockT *Entry = &F->front(); 1516 Queue.push(Entry); 1517 Reachable.insert(Entry); 1518 while (!Queue.empty()) { 1519 const BlockT *SrcBB = Queue.front(); 1520 Queue.pop(); 1521 for (const BlockT *DstBB : children<const BlockT *>(SrcBB)) { 1522 auto EP = BPI->getEdgeProbability(SrcBB, DstBB); 1523 if (EP.isZero()) 1524 continue; 1525 if (Reachable.insert(DstBB).second) 1526 Queue.push(DstBB); 1527 } 1528 } 1529 1530 // Find all blocks to apply inference on, that is, backward reachable from 1531 // the entry along (backward) edges with non-zero probablities 1532 SmallPtrSet<const BlockT *, 8> InverseReachable; 1533 for (const BlockT &BB : *F) { 1534 // An exit block is a block without any successors 1535 bool HasSucc = !llvm::children<const BlockT *>(&BB).empty(); 1536 if (!HasSucc && Reachable.count(&BB)) { 1537 Queue.push(&BB); 1538 InverseReachable.insert(&BB); 1539 } 1540 } 1541 while (!Queue.empty()) { 1542 const BlockT *SrcBB = Queue.front(); 1543 Queue.pop(); 1544 for (const BlockT *DstBB : inverse_children<const BlockT *>(SrcBB)) { 1545 auto EP = BPI->getEdgeProbability(DstBB, SrcBB); 1546 if (EP.isZero()) 1547 continue; 1548 if (InverseReachable.insert(DstBB).second) 1549 Queue.push(DstBB); 1550 } 1551 } 1552 1553 // Collect the result 1554 Blocks.reserve(F->size()); 1555 for (const BlockT &BB : *F) { 1556 if (Reachable.count(&BB) && InverseReachable.count(&BB)) { 1557 Blocks.push_back(&BB); 1558 } 1559 } 1560} 1561 1562template <class BT> 1563void BlockFrequencyInfoImpl<BT>::initTransitionProbabilities( 1564 const std::vector<const BlockT *> &Blocks, 1565 const DenseMap<const BlockT *, size_t> &BlockIndex, 1566 ProbMatrixType &ProbMatrix) const { 1567 const size_t NumBlocks = Blocks.size(); 1568 auto Succs = std::vector<std::vector<std::pair<size_t, Scaled64>>>(NumBlocks); 1569 auto SumProb = std::vector<Scaled64>(NumBlocks); 1570 1571 // Find unique successors and corresponding probabilities for every block 1572 for (size_t Src = 0; Src < NumBlocks; Src++) { 1573 const BlockT *BB = Blocks[Src]; 1574 SmallPtrSet<const BlockT *, 2> UniqueSuccs; 1575 for (const auto SI : children<const BlockT *>(BB)) { 1576 // Ignore cold blocks 1577 if (!BlockIndex.contains(SI)) 1578 continue; 1579 // Ignore parallel edges between BB and SI blocks 1580 if (!UniqueSuccs.insert(SI).second) 1581 continue; 1582 // Ignore jumps with zero probability 1583 auto EP = BPI->getEdgeProbability(BB, SI); 1584 if (EP.isZero()) 1585 continue; 1586 1587 auto EdgeProb = 1588 Scaled64::getFraction(EP.getNumerator(), EP.getDenominator()); 1589 size_t Dst = BlockIndex.find(SI)->second; 1590 Succs[Src].push_back(std::make_pair(Dst, EdgeProb)); 1591 SumProb[Src] += EdgeProb; 1592 } 1593 } 1594 1595 // Add transitions for every jump with positive branch probability 1596 ProbMatrix = ProbMatrixType(NumBlocks); 1597 for (size_t Src = 0; Src < NumBlocks; Src++) { 1598 // Ignore blocks w/o successors 1599 if (Succs[Src].empty()) 1600 continue; 1601 1602 assert(!SumProb[Src].isZero() && "Zero sum probability of non-exit block"); 1603 for (auto &Jump : Succs[Src]) { 1604 size_t Dst = Jump.first; 1605 Scaled64 Prob = Jump.second; 1606 ProbMatrix[Dst].push_back(std::make_pair(Src, Prob / SumProb[Src])); 1607 } 1608 } 1609 1610 // Add transitions from sinks to the source 1611 size_t EntryIdx = BlockIndex.find(&F->front())->second; 1612 for (size_t Src = 0; Src < NumBlocks; Src++) { 1613 if (Succs[Src].empty()) { 1614 ProbMatrix[EntryIdx].push_back(std::make_pair(Src, Scaled64::getOne())); 1615 } 1616 } 1617} 1618 1619#ifndef NDEBUG 1620template <class BT> 1621BlockFrequencyInfoImplBase::Scaled64 BlockFrequencyInfoImpl<BT>::discrepancy( 1622 const ProbMatrixType &ProbMatrix, const std::vector<Scaled64> &Freq) const { 1623 assert(Freq[0] > 0 && "Incorrectly computed frequency of the entry block"); 1624 Scaled64 Discrepancy; 1625 for (size_t I = 0; I < ProbMatrix.size(); I++) { 1626 Scaled64 Sum; 1627 for (const auto &Jump : ProbMatrix[I]) { 1628 Sum += Freq[Jump.first] * Jump.second; 1629 } 1630 Discrepancy += Freq[I] >= Sum ? Freq[I] - Sum : Sum - Freq[I]; 1631 } 1632 // Normalizing by the frequency of the entry block 1633 return Discrepancy / Freq[0]; 1634} 1635#endif 1636 1637/// \note This should be a lambda, but that crashes GCC 4.7. 1638namespace bfi_detail { 1639 1640template <class BT> struct BlockEdgesAdder { 1641 using BlockT = BT; 1642 using LoopData = BlockFrequencyInfoImplBase::LoopData; 1643 using Successor = GraphTraits<const BlockT *>; 1644 1645 const BlockFrequencyInfoImpl<BT> &BFI; 1646 1647 explicit BlockEdgesAdder(const BlockFrequencyInfoImpl<BT> &BFI) 1648 : BFI(BFI) {} 1649 1650 void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr, 1651 const LoopData *OuterLoop) { 1652 const BlockT *BB = BFI.RPOT[Irr.Node.Index]; 1653 for (const auto *Succ : children<const BlockT *>(BB)) 1654 G.addEdge(Irr, BFI.getNode(Succ), OuterLoop); 1655 } 1656}; 1657 1658} // end namespace bfi_detail 1659 1660template <class BT> 1661void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass( 1662 LoopData *OuterLoop, std::list<LoopData>::iterator Insert) { 1663 LLVM_DEBUG(dbgs() << "analyze-irreducible-in-"; 1664 if (OuterLoop) dbgs() 1665 << "loop: " << getLoopName(*OuterLoop) << "\n"; 1666 else dbgs() << "function\n"); 1667 1668 using namespace bfi_detail; 1669 1670 // Ideally, addBlockEdges() would be declared here as a lambda, but that 1671 // crashes GCC 4.7. 1672 BlockEdgesAdder<BT> addBlockEdges(*this); 1673 IrreducibleGraph G(*this, OuterLoop, addBlockEdges); 1674 1675 for (auto &L : analyzeIrreducible(G, OuterLoop, Insert)) 1676 computeMassInLoop(L); 1677 1678 if (!OuterLoop) 1679 return; 1680 updateLoopWithIrreducible(*OuterLoop); 1681} 1682 1683// A helper function that converts a branch probability into weight. 1684inline uint32_t getWeightFromBranchProb(const BranchProbability Prob) { 1685 return Prob.getNumerator(); 1686} 1687 1688template <class BT> 1689bool 1690BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop, 1691 const BlockNode &Node) { 1692 LLVM_DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n"); 1693 // Calculate probability for successors. 1694 Distribution Dist; 1695 if (auto *Loop = Working[Node.Index].getPackagedLoop()) { 1696 assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop"); 1697 if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist)) 1698 // Irreducible backedge. 1699 return false; 1700 } else { 1701 const BlockT *BB = getBlock(Node); 1702 for (auto SI = GraphTraits<const BlockT *>::child_begin(BB), 1703 SE = GraphTraits<const BlockT *>::child_end(BB); 1704 SI != SE; ++SI) 1705 if (!addToDist( 1706 Dist, OuterLoop, Node, getNode(*SI), 1707 getWeightFromBranchProb(BPI->getEdgeProbability(BB, SI)))) 1708 // Irreducible backedge. 1709 return false; 1710 } 1711 1712 // Distribute mass to successors, saving exit and backedge data in the 1713 // loop header. 1714 distributeMass(Node, OuterLoop, Dist); 1715 return true; 1716} 1717 1718template <class BT> 1719raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const { 1720 if (!F) 1721 return OS; 1722 OS << "block-frequency-info: " << F->getName() << "\n"; 1723 for (const BlockT &BB : *F) { 1724 OS << " - " << bfi_detail::getBlockName(&BB) << ": float = "; 1725 getFloatingBlockFreq(&BB).print(OS, 5) 1726 << ", int = " << getBlockFreq(&BB).getFrequency(); 1727 if (std::optional<uint64_t> ProfileCount = 1728 BlockFrequencyInfoImplBase::getBlockProfileCount( 1729 F->getFunction(), getNode(&BB))) 1730 OS << ", count = " << *ProfileCount; 1731 if (std::optional<uint64_t> IrrLoopHeaderWeight = 1732 BB.getIrrLoopHeaderWeight()) 1733 OS << ", irr_loop_header_weight = " << *IrrLoopHeaderWeight; 1734 OS << "\n"; 1735 } 1736 1737 // Add an extra newline for readability. 1738 OS << "\n"; 1739 return OS; 1740} 1741 1742template <class BT> 1743void BlockFrequencyInfoImpl<BT>::verifyMatch( 1744 BlockFrequencyInfoImpl<BT> &Other) const { 1745 bool Match = true; 1746 DenseMap<const BlockT *, BlockNode> ValidNodes; 1747 DenseMap<const BlockT *, BlockNode> OtherValidNodes; 1748 for (auto &Entry : Nodes) { 1749 const BlockT *BB = Entry.first; 1750 if (BB) { 1751 ValidNodes[BB] = Entry.second.first; 1752 } 1753 } 1754 for (auto &Entry : Other.Nodes) { 1755 const BlockT *BB = Entry.first; 1756 if (BB) { 1757 OtherValidNodes[BB] = Entry.second.first; 1758 } 1759 } 1760 unsigned NumValidNodes = ValidNodes.size(); 1761 unsigned NumOtherValidNodes = OtherValidNodes.size(); 1762 if (NumValidNodes != NumOtherValidNodes) { 1763 Match = false; 1764 dbgs() << "Number of blocks mismatch: " << NumValidNodes << " vs " 1765 << NumOtherValidNodes << "\n"; 1766 } else { 1767 for (auto &Entry : ValidNodes) { 1768 const BlockT *BB = Entry.first; 1769 BlockNode Node = Entry.second; 1770 if (OtherValidNodes.count(BB)) { 1771 BlockNode OtherNode = OtherValidNodes[BB]; 1772 const auto &Freq = Freqs[Node.Index]; 1773 const auto &OtherFreq = Other.Freqs[OtherNode.Index]; 1774 if (Freq.Integer != OtherFreq.Integer) { 1775 Match = false; 1776 dbgs() << "Freq mismatch: " << bfi_detail::getBlockName(BB) << " " 1777 << Freq.Integer << " vs " << OtherFreq.Integer << "\n"; 1778 } 1779 } else { 1780 Match = false; 1781 dbgs() << "Block " << bfi_detail::getBlockName(BB) << " index " 1782 << Node.Index << " does not exist in Other.\n"; 1783 } 1784 } 1785 // If there's a valid node in OtherValidNodes that's not in ValidNodes, 1786 // either the above num check or the check on OtherValidNodes will fail. 1787 } 1788 if (!Match) { 1789 dbgs() << "This\n"; 1790 print(dbgs()); 1791 dbgs() << "Other\n"; 1792 Other.print(dbgs()); 1793 } 1794 assert(Match && "BFI mismatch"); 1795} 1796 1797// Graph trait base class for block frequency information graph 1798// viewer. 1799 1800enum GVDAGType { GVDT_None, GVDT_Fraction, GVDT_Integer, GVDT_Count }; 1801 1802template <class BlockFrequencyInfoT, class BranchProbabilityInfoT> 1803struct BFIDOTGraphTraitsBase : public DefaultDOTGraphTraits { 1804 using GTraits = GraphTraits<BlockFrequencyInfoT *>; 1805 using NodeRef = typename GTraits::NodeRef; 1806 using EdgeIter = typename GTraits::ChildIteratorType; 1807 using NodeIter = typename GTraits::nodes_iterator; 1808 1809 uint64_t MaxFrequency = 0; 1810 1811 explicit BFIDOTGraphTraitsBase(bool isSimple = false) 1812 : DefaultDOTGraphTraits(isSimple) {} 1813 1814 static StringRef getGraphName(const BlockFrequencyInfoT *G) { 1815 return G->getFunction()->getName(); 1816 } 1817 1818 std::string getNodeAttributes(NodeRef Node, const BlockFrequencyInfoT *Graph, 1819 unsigned HotPercentThreshold = 0) { 1820 std::string Result; 1821 if (!HotPercentThreshold) 1822 return Result; 1823 1824 // Compute MaxFrequency on the fly: 1825 if (!MaxFrequency) { 1826 for (NodeIter I = GTraits::nodes_begin(Graph), 1827 E = GTraits::nodes_end(Graph); 1828 I != E; ++I) { 1829 NodeRef N = *I; 1830 MaxFrequency = 1831 std::max(MaxFrequency, Graph->getBlockFreq(N).getFrequency()); 1832 } 1833 } 1834 BlockFrequency Freq = Graph->getBlockFreq(Node); 1835 BlockFrequency HotFreq = 1836 (BlockFrequency(MaxFrequency) * 1837 BranchProbability::getBranchProbability(HotPercentThreshold, 100)); 1838 1839 if (Freq < HotFreq) 1840 return Result; 1841 1842 raw_string_ostream OS(Result); 1843 OS << "color=\"red\""; 1844 OS.flush(); 1845 return Result; 1846 } 1847 1848 std::string getNodeLabel(NodeRef Node, const BlockFrequencyInfoT *Graph, 1849 GVDAGType GType, int layout_order = -1) { 1850 std::string Result; 1851 raw_string_ostream OS(Result); 1852 1853 if (layout_order != -1) 1854 OS << Node->getName() << "[" << layout_order << "] : "; 1855 else 1856 OS << Node->getName() << " : "; 1857 switch (GType) { 1858 case GVDT_Fraction: 1859 OS << printBlockFreq(*Graph, *Node); 1860 break; 1861 case GVDT_Integer: 1862 OS << Graph->getBlockFreq(Node).getFrequency(); 1863 break; 1864 case GVDT_Count: { 1865 auto Count = Graph->getBlockProfileCount(Node); 1866 if (Count) 1867 OS << *Count; 1868 else 1869 OS << "Unknown"; 1870 break; 1871 } 1872 case GVDT_None: 1873 llvm_unreachable("If we are not supposed to render a graph we should " 1874 "never reach this point."); 1875 } 1876 return Result; 1877 } 1878 1879 std::string getEdgeAttributes(NodeRef Node, EdgeIter EI, 1880 const BlockFrequencyInfoT *BFI, 1881 const BranchProbabilityInfoT *BPI, 1882 unsigned HotPercentThreshold = 0) { 1883 std::string Str; 1884 if (!BPI) 1885 return Str; 1886 1887 BranchProbability BP = BPI->getEdgeProbability(Node, EI); 1888 uint32_t N = BP.getNumerator(); 1889 uint32_t D = BP.getDenominator(); 1890 double Percent = 100.0 * N / D; 1891 raw_string_ostream OS(Str); 1892 OS << format("label=\"%.1f%%\"", Percent); 1893 1894 if (HotPercentThreshold) { 1895 BlockFrequency EFreq = BFI->getBlockFreq(Node) * BP; 1896 BlockFrequency HotFreq = BlockFrequency(MaxFrequency) * 1897 BranchProbability(HotPercentThreshold, 100); 1898 1899 if (EFreq >= HotFreq) { 1900 OS << ",color=\"red\""; 1901 } 1902 } 1903 1904 OS.flush(); 1905 return Str; 1906 } 1907}; 1908 1909} // end namespace llvm 1910 1911#undef DEBUG_TYPE 1912 1913#endif // LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H 1914