//===- CoroFrame.cpp - Builds and manipulates coroutine frame -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file contains classes used to discover whether, for a particular value,
// there is a path from its definition to a use that crosses a suspend point.
//
// Using the information discovered we form a Coroutine Frame structure to
// contain those values. All uses of those values are replaced with appropriate
// GEP + load from the coroutine frame. At the point of the definition we spill
// the value into the coroutine frame.
//===----------------------------------------------------------------------===//

#include "CoroInternal.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/StackLifetime.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Support/circular_raw_ostream.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <deque>
#include <optional>

using namespace llvm;

// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
// "coro-frame", which results in leaner debug spew.
#define DEBUG_TYPE "coro-suspend-crossing"

enum { SmallVectorThreshold = 32 };

// Provides a two-way mapping between blocks and their indices.
namespace {
class BlockToIndexMapping {
  SmallVector<BasicBlock *, SmallVectorThreshold> V;

public:
  size_t size() const { return V.size(); }

  BlockToIndexMapping(Function &F) {
    for (BasicBlock &BB : F)
      V.push_back(&BB);
    llvm::sort(V);
  }

  size_t blockToIndex(BasicBlock const *BB) const {
    auto *I = llvm::lower_bound(V, BB);
    assert(I != V.end() && *I == BB && "BlockToIndexMapping: Unknown block");
    return I - V.begin();
  }

  BasicBlock *indexToBlock(unsigned Index) const { return V[Index]; }
};
} // end anonymous namespace

// The SuspendCrossingInfo maintains data that allows us to answer the
// question: given two BasicBlocks A and B, is there a path from A to B that
// passes through a suspend point?
//
// For every basic block 'i' it maintains a BlockData that consists of:
//   Consumes:  a bit vector which contains a set of indices of blocks that can
//              reach block 'i'. A block can trivially reach itself.
//   Kills: a bit vector which contains a set of indices of blocks that can
//          reach block 'i', but along a path crossing a suspend point and
//          not repeating 'i' (a path to 'i' without cycles containing 'i').
//   Suspend: a boolean indicating whether block 'i' contains a suspend point.
//   End: a boolean indicating whether block 'i' contains a coro.end intrinsic.
//   KillLoop: There is a path from 'i' to 'i' not otherwise repeating 'i' that
//             crosses a suspend point.
//
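// For illustration (a sketch, not drawn from any particular test): given the
// simple CFG  A -> S -> B,  where S contains a suspend point, B's Consumes
// would be {A, S, B} and B's Kills would contain A and S, because the path
// from either of them to B crosses the suspend in S. Consequently
// hasPathCrossingSuspendPoint(A, B) is true, and a value defined in A and
// used in B must be spilled to the coroutine frame.
//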
namespace {
class SuspendCrossingInfo {
  BlockToIndexMapping Mapping;

  struct BlockData {
    BitVector Consumes;
    BitVector Kills;
    bool Suspend = false;
    bool End = false;
    bool KillLoop = false;
    bool Changed = false;
  };
  SmallVector<BlockData, SmallVectorThreshold> Block;

  iterator_range<pred_iterator> predecessors(BlockData const &BD) const {
    BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]);
    return llvm::predecessors(BB);
  }

  BlockData &getBlockData(BasicBlock *BB) {
    return Block[Mapping.blockToIndex(BB)];
  }

  /// Compute the BlockData for the current function in one iteration.
  /// Initialize - Whether this is the first iteration; we can optimize
  /// the initial case a little bit by switching on it at compile time.
  /// Returns whether the BlockData changed in this iteration.
  template <bool Initialize = false>
  bool computeBlockData(const ReversePostOrderTraversal<Function *> &RPOT);

public:
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() const;
  void dump(StringRef Label, BitVector const &BV) const;
#endif

  SuspendCrossingInfo(Function &F, coro::Shape &Shape);

  /// Returns true if there is a path from \p From to \p To crossing a suspend
  /// point without crossing \p From a 2nd time.
  bool hasPathCrossingSuspendPoint(BasicBlock *From, BasicBlock *To) const {
    size_t const FromIndex = Mapping.blockToIndex(From);
    size_t const ToIndex = Mapping.blockToIndex(To);
    bool const Result = Block[ToIndex].Kills[FromIndex];
    LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
                      << " answer is " << Result << "\n");
    return Result;
  }

  /// Returns true if there is a path from \p From to \p To crossing a suspend
  /// point without crossing \p From a 2nd time. If \p From is the same as
  /// \p To this will also check if there is a looping path crossing a suspend
  /// point.
  bool hasPathOrLoopCrossingSuspendPoint(BasicBlock *From,
                                         BasicBlock *To) const {
    size_t const FromIndex = Mapping.blockToIndex(From);
    size_t const ToIndex = Mapping.blockToIndex(To);
    bool Result = Block[ToIndex].Kills[FromIndex] ||
                  (From == To && Block[ToIndex].KillLoop);
    LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
                      << " answer is " << Result << " (path or loop)\n");
    return Result;
  }

  bool isDefinitionAcrossSuspend(BasicBlock *DefBB, User *U) const {
    auto *I = cast<Instruction>(U);

    // We rewrote PHINodes, so only the ones with exactly one incoming
    // value need to be analyzed.
    if (auto *PN = dyn_cast<PHINode>(I))
      if (PN->getNumIncomingValues() > 1)
        return false;

    BasicBlock *UseBB = I->getParent();

    // As a special case, treat uses by an llvm.coro.suspend.retcon or an
    // llvm.coro.suspend.async as if they were uses in the suspend's single
    // predecessor: the uses conceptually occur before the suspend.
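    // For illustration (a sketch): if %v is passed to
    // "%r = call ... @llvm.coro.suspend.retcon(..., %v)", then for this
    // analysis the use of %v is treated as happening in the suspend's single
    // predecessor block, so that use by itself does not force %v onto the
    // frame.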
    if (isa<CoroSuspendRetconInst>(I) || isa<CoroSuspendAsyncInst>(I)) {
      UseBB = UseBB->getSinglePredecessor();
      assert(UseBB && "should have split coro.suspend into its own block");
    }

    return hasPathCrossingSuspendPoint(DefBB, UseBB);
  }

  bool isDefinitionAcrossSuspend(Argument &A, User *U) const {
    return isDefinitionAcrossSuspend(&A.getParent()->getEntryBlock(), U);
  }

  bool isDefinitionAcrossSuspend(Instruction &I, User *U) const {
    auto *DefBB = I.getParent();

    // As a special case, treat values produced by an llvm.coro.suspend.*
    // as if they were defined in the single successor: the uses
    // conceptually occur after the suspend.
    if (isa<AnyCoroSuspendInst>(I)) {
      DefBB = DefBB->getSingleSuccessor();
      assert(DefBB && "should have split coro.suspend into its own block");
    }

    return isDefinitionAcrossSuspend(DefBB, U);
  }

  bool isDefinitionAcrossSuspend(Value &V, User *U) const {
    if (auto *Arg = dyn_cast<Argument>(&V))
      return isDefinitionAcrossSuspend(*Arg, U);
    if (auto *Inst = dyn_cast<Instruction>(&V))
      return isDefinitionAcrossSuspend(*Inst, U);

    llvm_unreachable(
        "Coroutine could only collect Argument and Instruction now.");
  }
};
} // end anonymous namespace

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label,
                                                BitVector const &BV) const {
  dbgs() << Label << ":";
  for (size_t I = 0, N = BV.size(); I < N; ++I)
    if (BV[I])
      dbgs() << " " << Mapping.indexToBlock(I)->getName();
  dbgs() << "\n";
}

LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
  for (size_t I = 0, N = Block.size(); I < N; ++I) {
    BasicBlock *const B = Mapping.indexToBlock(I);
    dbgs() << B->getName() << ":\n";
    dump("   Consumes", Block[I].Consumes);
    dump("      Kills", Block[I].Kills);
  }
  dbgs() << "\n";
}
#endif

template <bool Initialize>
bool SuspendCrossingInfo::computeBlockData(
    const ReversePostOrderTraversal<Function *> &RPOT) {
  bool Changed = false;

  for (const BasicBlock *BB : RPOT) {
    auto BBNo = Mapping.blockToIndex(BB);
    auto &B = Block[BBNo];

    // We don't need to check the predecessors on the initial iteration.
    if constexpr (!Initialize)
      // If none of the predecessors of the current block changed,
      // the BlockData for the current block cannot change either.
      if (all_of(predecessors(B), [this](BasicBlock *BB) {
            return !Block[Mapping.blockToIndex(BB)].Changed;
          })) {
        B.Changed = false;
        continue;
      }

    // Save the Consumes and Kills bitsets so that it is easy to see
    // if anything changed after propagation.
    auto SavedConsumes = B.Consumes;
    auto SavedKills = B.Kills;

    for (BasicBlock *PI : predecessors(B)) {
      auto PrevNo = Mapping.blockToIndex(PI);
      auto &P = Block[PrevNo];

      // Propagate Kills and Consumes from predecessors into B.
      B.Consumes |= P.Consumes;
      B.Kills |= P.Kills;

      // If block P is a suspend block, it should propagate kills into block
      // B for every block P consumes.
      if (P.Suspend)
        B.Kills |= P.Consumes;
    }

    if (B.Suspend) {
      // If block B is a suspend block, it should kill all of the blocks it
      // consumes.
      B.Kills |= B.Consumes;
    } else if (B.End) {
      // If block B is an end block, it should not propagate kills as the
      // blocks following coro.end() are reached during initial invocation
      // of the coroutine while all the data are still available on the
      // stack or in the registers.
      B.Kills.reset();
    } else {
      // This is reached when block B is neither a suspend block nor a
      // coro.end block; we need to make sure that B itself is not in its
      // own kill set.
      B.KillLoop |= B.Kills[BBNo];
      B.Kills.reset(BBNo);
    }

    if constexpr (!Initialize) {
      B.Changed = (B.Kills != SavedKills) || (B.Consumes != SavedConsumes);
      Changed |= B.Changed;
    }
  }

  return Changed;
}

SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
    : Mapping(F) {
  const size_t N = Mapping.size();
  Block.resize(N);

  // Initialize every block so that it consumes itself.
  for (size_t I = 0; I < N; ++I) {
    auto &B = Block[I];
    B.Consumes.resize(N);
    B.Kills.resize(N);
    B.Consumes.set(I);
    B.Changed = true;
  }

  // Mark all CoroEnd blocks. We do not propagate Kills beyond coro.ends as
  // the code beyond coro.end is reachable during initial invocation of the
  // coroutine.
  for (auto *CE : Shape.CoroEnds)
    getBlockData(CE->getParent()).End = true;

  // Mark all suspend blocks and indicate that they kill everything they
  // consume. Note that crossing coro.save also requires a spill, as any code
  // between coro.save and coro.suspend may resume the coroutine and all of the
  // state needs to be saved by that time.
  auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
    BasicBlock *SuspendBlock = BarrierInst->getParent();
    auto &B = getBlockData(SuspendBlock);
    B.Suspend = true;
    B.Kills |= B.Consumes;
  };
  for (auto *CSI : Shape.CoroSuspends) {
    markSuspendBlock(CSI);
    if (auto *Save = CSI->getCoroSave())
      markSuspendBlock(Save);
  }

  // It is considered faster to use an RPO traversal for a forward-edge
  // dataflow analysis.
  ReversePostOrderTraversal<Function *> RPOT(&F);
  computeBlockData</*Initialize=*/true>(RPOT);
  while (computeBlockData</*Initialize*/ false>(RPOT))
    ;

  LLVM_DEBUG(dump());
}

namespace {

// RematGraph is used to construct a DAG for rematerializable instructions.
// When the constructor is invoked with a candidate instruction (which is
// materializable) it builds a DAG of materializable instructions from that
// point.
// Typically, for each instruction identified as re-materializable across a
// suspend point, a RematGraph will be created.
struct RematGraph {
  // Each RematNode in the graph contains the edges to instructions providing
  // operands in the current node.
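  //
  // For illustration (a sketch): if %d = add i32 %a, %b and
  // %e = mul i32 %d, %c are both rematerializable and live across a suspend,
  // the graph rooted at %e's node has an Operands edge to %d's node, so %d
  // can be re-emitted before %e when the chain is rematerialized after the
  // suspend point.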
  struct RematNode {
    Instruction *Node;
    SmallVector<RematNode *> Operands;
    RematNode() = default;
    RematNode(Instruction *V) : Node(V) {}
  };

  RematNode *EntryNode;
  using RematNodeMap =
      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
  RematNodeMap Remats;
  const std::function<bool(Instruction &)> &MaterializableCallback;
  SuspendCrossingInfo &Checker;

  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
             Instruction *I, SuspendCrossingInfo &Checker)
      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
    EntryNode = FirstNode.get();
    std::deque<std::unique_ptr<RematNode>> WorkList;
    addNode(std::move(FirstNode), WorkList, cast<User>(I));
    while (WorkList.size()) {
      std::unique_ptr<RematNode> N = std::move(WorkList.front());
      WorkList.pop_front();
      addNode(std::move(N), WorkList, cast<User>(I));
    }
  }

  void addNode(std::unique_ptr<RematNode> NUPtr,
               std::deque<std::unique_ptr<RematNode>> &WorkList,
               User *FirstUse) {
    RematNode *N = NUPtr.get();
    if (Remats.count(N->Node))
      return;

    // We haven't seen this node yet - add it to the list.
    Remats[N->Node] = std::move(NUPtr);
    for (auto &Def : N->Node->operands()) {
      Instruction *D = dyn_cast<Instruction>(Def.get());
      if (!D || !MaterializableCallback(*D) ||
          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
        continue;

      if (Remats.count(D)) {
        // Already have this in the graph.
        N->Operands.push_back(Remats[D].get());
        continue;
      }

      bool NoMatch = true;
      for (auto &I : WorkList) {
        if (I->Node == D) {
          NoMatch = false;
          N->Operands.push_back(I.get());
          break;
        }
      }
      if (NoMatch) {
        // Create a new node.
        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
        N->Operands.push_back(ChildNode.get());
        WorkList.push_back(std::move(ChildNode));
      }
    }
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() const {
    dbgs() << "Entry (";
    if (EntryNode->Node->getParent()->hasName())
      dbgs() << EntryNode->Node->getParent()->getName();
    else
      EntryNode->Node->getParent()->printAsOperand(dbgs(), false);
    dbgs() << ") : " << *EntryNode->Node << "\n";
    for (auto &E : Remats) {
      dbgs() << *(E.first) << "\n";
      for (RematNode *U : E.second->Operands)
        dbgs() << "  " << *U->Node << "\n";
    }
  }
#endif
};
} // end anonymous namespace

namespace llvm {

template <> struct GraphTraits<RematGraph *> {
  using NodeRef = RematGraph::RematNode *;
  using ChildIteratorType = RematGraph::RematNode **;

  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
  static ChildIteratorType child_begin(NodeRef N) {
    return N->Operands.begin();
  }
  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
};

} // end namespace llvm

#undef DEBUG_TYPE // "coro-suspend-crossing"
#define DEBUG_TYPE "coro-frame"

namespace {
class FrameTypeBuilder;
// Mapping from the to-be-spilled value to all the users that need a reload.
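// For example (a sketch): if %x is defined before a suspend point and used by
// %u1 and %u2 after it, Spills contains the entry %x -> {%u1, %u2}, and each
// of those users is rewritten to reload %x from its frame slot.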
using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>;
struct AllocaInfo {
  AllocaInst *Alloca;
  DenseMap<Instruction *, std::optional<APInt>> Aliases;
  bool MayWriteBeforeCoroBegin;
  AllocaInfo(AllocaInst *Alloca,
             DenseMap<Instruction *, std::optional<APInt>> Aliases,
             bool MayWriteBeforeCoroBegin)
      : Alloca(Alloca), Aliases(std::move(Aliases)),
        MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {}
};
struct FrameDataInfo {
  // All the values (that are not allocas) that need to be spilled to the
  // frame.
  SpillInfo Spills;
  // Allocas contains all values defined as allocas that need to live in the
  // frame.
  SmallVector<AllocaInfo, 8> Allocas;

  SmallVector<Value *, 8> getAllDefs() const {
    SmallVector<Value *, 8> Defs;
    for (const auto &P : Spills)
      Defs.push_back(P.first);
    for (const auto &A : Allocas)
      Defs.push_back(A.Alloca);
    return Defs;
  }

  uint32_t getFieldIndex(Value *V) const {
    auto Itr = FieldIndexMap.find(V);
    assert(Itr != FieldIndexMap.end() &&
           "Value does not have a frame field index");
    return Itr->second;
  }

  void setFieldIndex(Value *V, uint32_t Index) {
    assert((LayoutIndexUpdateStarted || FieldIndexMap.count(V) == 0) &&
           "Cannot set the index for the same field twice.");
    FieldIndexMap[V] = Index;
  }

  Align getAlign(Value *V) const {
    auto Iter = FieldAlignMap.find(V);
    assert(Iter != FieldAlignMap.end());
    return Iter->second;
  }

  void setAlign(Value *V, Align AL) {
    assert(FieldAlignMap.count(V) == 0);
    FieldAlignMap.insert({V, AL});
  }

  uint64_t getDynamicAlign(Value *V) const {
    auto Iter = FieldDynamicAlignMap.find(V);
    assert(Iter != FieldDynamicAlignMap.end());
    return Iter->second;
  }

  void setDynamicAlign(Value *V, uint64_t Align) {
    assert(FieldDynamicAlignMap.count(V) == 0);
    FieldDynamicAlignMap.insert({V, Align});
  }

  uint64_t getOffset(Value *V) const {
    auto Iter = FieldOffsetMap.find(V);
    assert(Iter != FieldOffsetMap.end());
    return Iter->second;
  }

  void setOffset(Value *V, uint64_t Offset) {
    assert(FieldOffsetMap.count(V) == 0);
    FieldOffsetMap.insert({V, Offset});
  }

  // Remap the index of every field in the frame, using the final layout index.
  void updateLayoutIndex(FrameTypeBuilder &B);

private:
  // LayoutIndexUpdateStarted is used to avoid updating the index of any field
  // twice by mistake.
  bool LayoutIndexUpdateStarted = false;
  // Map from values to their slot indexes on the frame. They will first be
  // set to their original insertion field index. After the frame is built,
  // their indexes will be updated to the final layout index.
  DenseMap<Value *, uint32_t> FieldIndexMap;
  // Map from values to their alignment on the frame. They are set after
  // the frame is built.
  DenseMap<Value *, Align> FieldAlignMap;
  DenseMap<Value *, uint64_t> FieldDynamicAlignMap;
  // Map from values to their offset on the frame. They are set after
  // the frame is built.
  DenseMap<Value *, uint64_t> FieldOffsetMap;
};
} // namespace

#ifndef NDEBUG
static void dumpSpills(StringRef Title, const SpillInfo &Spills) {
  dbgs() << "------------- " << Title << " --------------\n";
  for (const auto &E : Spills) {
    E.first->dump();
    dbgs() << "   user: ";
    for (auto *I : E.second)
      I->dump();
  }
}
static void dumpRemats(
    StringRef Title,
    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
  dbgs() << "------------- " << Title << " --------------\n";
  for (const auto &E : RM) {
    E.second->dump();
    dbgs() << "--\n";
  }
}

static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) {
  dbgs() << "------------- Allocas --------------\n";
  for (const auto &A : Allocas) {
    A.Alloca->dump();
  }
}
#endif

namespace {
using FieldIDType = size_t;
// We cannot rely solely on natural alignment of a type when building a
// coroutine frame, and if the alignment specified on the Alloca instruction
// differs from the natural alignment of the alloca type we will need to
// insert padding.
class FrameTypeBuilder {
private:
  struct Field {
    uint64_t Size;
    uint64_t Offset;
    Type *Ty;
    FieldIDType LayoutFieldIndex;
    Align Alignment;
    Align TyAlignment;
    uint64_t DynamicAlignBuffer;
  };

  const DataLayout &DL;
  LLVMContext &Context;
  uint64_t StructSize = 0;
  Align StructAlign;
  bool IsFinished = false;

  std::optional<Align> MaxFrameAlignment;

  SmallVector<Field, 8> Fields;
  DenseMap<Value*, unsigned> FieldIndexByKey;

public:
  FrameTypeBuilder(LLVMContext &Context, const DataLayout &DL,
                   std::optional<Align> MaxFrameAlignment)
      : DL(DL), Context(Context), MaxFrameAlignment(MaxFrameAlignment) {}

  /// Add a field to this structure for the storage of an `alloca`
  /// instruction.
  [[nodiscard]] FieldIDType addFieldForAlloca(AllocaInst *AI,
                                              bool IsHeader = false) {
    Type *Ty = AI->getAllocatedType();

    // Make an array type if this is a static array allocation.
    if (AI->isArrayAllocation()) {
      if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
        Ty = ArrayType::get(Ty, CI->getValue().getZExtValue());
      else
        report_fatal_error("Coroutines cannot handle non static allocas yet");
    }

    return addField(Ty, AI->getAlign(), IsHeader);
  }

  /// We want to put the allocas whose lifetime ranges do not overlap
  /// into one slot of the coroutine frame.
  /// Consider the example at: https://bugs.llvm.org/show_bug.cgi?id=45566
  ///
  ///     cppcoro::task<void> alternative_paths(bool cond) {
  ///       if (cond) {
  ///         big_structure a;
  ///         process(a);
  ///         co_await something();
  ///       } else {
  ///         big_structure b;
  ///         process2(b);
  ///         co_await something();
  ///       }
  ///     }
  ///
  /// We want to put variable a and variable b in the same slot to
  /// reduce the size of the coroutine frame.
  ///
  /// This function uses the StackLifetime algorithm to partition the
  /// AllocaInsts in Spills into non-overlapping sets, in order to put allocas
  /// in the same non-overlapping set into the same slot in the coroutine
  /// frame. It then adds a field for each non-overlapping set, using the
  /// largest type in the set as the field type.
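  ///
  /// For illustration (a sketch): if alloca `a` occupies 64 bytes and alloca
  /// `b` occupies 32 bytes, and their lifetime ranges do not overlap, both
  /// are assigned to a single field whose type is that of `a` (the largest
  /// alloca in the set), instead of two separate fields totaling 96 bytes.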
  ///
  /// Side Effects: Because we sort the allocas, the order of the allocas in
  /// the frame may differ from their order in the source code.
  void addFieldForAllocas(const Function &F, FrameDataInfo &FrameData,
                          coro::Shape &Shape);

  /// Add a field to this structure.
  [[nodiscard]] FieldIDType addField(Type *Ty, MaybeAlign MaybeFieldAlignment,
                                     bool IsHeader = false,
                                     bool IsSpillOfValue = false) {
    assert(!IsFinished && "adding fields to a finished builder");
    assert(Ty && "must provide a type for a field");

    // The field size is always the alloc size of the type.
    uint64_t FieldSize = DL.getTypeAllocSize(Ty);

    // For an alloca with size=0, we don't need to add a field and it can
    // just point to any index in the frame. Use index 0.
    if (FieldSize == 0) {
      return 0;
    }

    // The field alignment might not be the type alignment, but we need
    // to remember the type alignment anyway to build the type.
    // If we are spilling values we don't need to worry about ABI alignment
    // concerns.
    Align ABIAlign = DL.getABITypeAlign(Ty);
    Align TyAlignment = ABIAlign;
    if (IsSpillOfValue && MaxFrameAlignment && *MaxFrameAlignment < ABIAlign)
      TyAlignment = *MaxFrameAlignment;
    Align FieldAlignment = MaybeFieldAlignment.value_or(TyAlignment);

    // The field alignment could be bigger than the max frame alignment; in
    // that case we request additional storage to be able to dynamically
    // align the pointer.
    uint64_t DynamicAlignBuffer = 0;
    if (MaxFrameAlignment && (FieldAlignment > *MaxFrameAlignment)) {
      DynamicAlignBuffer =
          offsetToAlignment(MaxFrameAlignment->value(), FieldAlignment);
      FieldAlignment = *MaxFrameAlignment;
      FieldSize = FieldSize + DynamicAlignBuffer;
    }

    // Lay out header fields immediately.
    uint64_t Offset;
    if (IsHeader) {
      Offset = alignTo(StructSize, FieldAlignment);
      StructSize = Offset + FieldSize;

      // Everything else has a flexible offset.
    } else {
      Offset = OptimizedStructLayoutField::FlexibleOffset;
    }

    Fields.push_back({FieldSize, Offset, Ty, 0, FieldAlignment, TyAlignment,
                      DynamicAlignBuffer});
    return Fields.size() - 1;
  }

  /// Finish the layout and set the body on the given type.
  void finish(StructType *Ty);

  uint64_t getStructSize() const {
    assert(IsFinished && "not yet finished!");
    return StructSize;
  }

  Align getStructAlign() const {
    assert(IsFinished && "not yet finished!");
    return StructAlign;
  }

  FieldIDType getLayoutFieldIndex(FieldIDType Id) const {
    assert(IsFinished && "not yet finished!");
    return Fields[Id].LayoutFieldIndex;
  }

  Field getLayoutField(FieldIDType Id) const {
    assert(IsFinished && "not yet finished!");
    return Fields[Id];
  }
};
} // namespace

void FrameDataInfo::updateLayoutIndex(FrameTypeBuilder &B) {
  auto Updater = [&](Value *I) {
    auto Field = B.getLayoutField(getFieldIndex(I));
    setFieldIndex(I, Field.LayoutFieldIndex);
    setAlign(I, Field.Alignment);
    uint64_t dynamicAlign =
        Field.DynamicAlignBuffer
            ? Field.DynamicAlignBuffer + Field.Alignment.value()
            : 0;
    setDynamicAlign(I, dynamicAlign);
    setOffset(I, Field.Offset);
  };
  LayoutIndexUpdateStarted = true;
  for (auto &S : Spills)
    Updater(S.first);
  for (const auto &A : Allocas)
    Updater(A.Alloca);
  LayoutIndexUpdateStarted = false;
}

void FrameTypeBuilder::addFieldForAllocas(const Function &F,
                                          FrameDataInfo &FrameData,
                                          coro::Shape &Shape) {
  using AllocaSetType = SmallVector<AllocaInst *, 4>;
  SmallVector<AllocaSetType, 4> NonOverlapedAllocas;

  // We need to add fields for the allocas at the end of this function.
  auto AddFieldForAllocasAtExit = make_scope_exit([&]() {
    for (auto AllocaList : NonOverlapedAllocas) {
      auto *LargestAI = *AllocaList.begin();
      FieldIDType Id = addFieldForAlloca(LargestAI);
      for (auto *Alloca : AllocaList)
        FrameData.setFieldIndex(Alloca, Id);
    }
  });

  if (!Shape.OptimizeFrame) {
    for (const auto &A : FrameData.Allocas) {
      AllocaInst *Alloca = A.Alloca;
      NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
    }
    return;
  }

  // Because there are paths from the lifetime.start to coro.end for each
  // alloca, the live ranges of all the allocas overlap in the blocks that
  // contain coro.end and in their successor blocks. So we choose to skip
  // those blocks when we calculate the live range for each alloca. This
  // should be reasonable, since there shouldn't be uses in those blocks and
  // the coroutine frame shouldn't be used outside the coroutine body.
  //
  // Note that the user of coro.suspend may not be a SwitchInst. However,
  // this case seems too complex to handle. And it is harmless to skip these
  // patterns, since it just prevents putting the allocas into the same slot.
  DenseMap<SwitchInst *, BasicBlock *> DefaultSuspendDest;
  for (auto *CoroSuspendInst : Shape.CoroSuspends) {
    for (auto *U : CoroSuspendInst->users()) {
      if (auto *ConstSWI = dyn_cast<SwitchInst>(U)) {
        auto *SWI = const_cast<SwitchInst *>(ConstSWI);
        DefaultSuspendDest[SWI] = SWI->getDefaultDest();
        SWI->setDefaultDest(SWI->getSuccessor(1));
      }
    }
  }

  auto ExtractAllocas = [&]() {
    AllocaSetType Allocas;
    Allocas.reserve(FrameData.Allocas.size());
    for (const auto &A : FrameData.Allocas)
      Allocas.push_back(A.Alloca);
    return Allocas;
  };
  StackLifetime StackLifetimeAnalyzer(F, ExtractAllocas(),
                                      StackLifetime::LivenessType::May);
  StackLifetimeAnalyzer.run();
  auto IsAllocaInferenre = [&](const AllocaInst *AI1, const AllocaInst *AI2) {
    return StackLifetimeAnalyzer.getLiveRange(AI1).overlaps(
        StackLifetimeAnalyzer.getLiveRange(AI2));
  };
  auto GetAllocaSize = [&](const AllocaInfo &A) {
    std::optional<TypeSize> RetSize = A.Alloca->getAllocationSize(DL);
    assert(RetSize && "Variable Length Arrays (VLA) are not supported.\n");
    assert(!RetSize->isScalable() && "Scalable vectors are not yet supported");
    return RetSize->getFixedValue();
  };
  // Put larger allocas in the front, so larger allocas have higher priority
  // to be merged, which can potentially save more space. Also, each AllocaSet
  // is then ordered, so we can easily get the largest alloca in an AllocaSet.
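  //
  // For example (a sketch): given non-overlapping allocas of sizes 64, 32
  // and 16 bytes (with compatible alignments), the 64-byte alloca seeds the
  // first set and the smaller ones are merged into it, so all three end up
  // sharing a single 64-byte frame slot.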
  sort(FrameData.Allocas, [&](const auto &Iter1, const auto &Iter2) {
    return GetAllocaSize(Iter1) > GetAllocaSize(Iter2);
  });
  for (const auto &A : FrameData.Allocas) {
    AllocaInst *Alloca = A.Alloca;
    bool Merged = false;
    // Try to find an existing non-overlapping AllocaSet that the alloca does
    // not interfere with. If we find one, insert the alloca into that set.
    for (auto &AllocaSet : NonOverlapedAllocas) {
      assert(!AllocaSet.empty() && "Processing Alloca Set is not empty.\n");
      bool NoInference = none_of(AllocaSet, [&](auto Iter) {
        return IsAllocaInferenre(Alloca, Iter);
      });
      // If the alignment of A is a multiple of the alignment of B, the
      // address of A should satisfy the alignment requirement for B.
      //
      // There may be other, more fine-grained strategies to handle the
      // alignment information during the merging process, but they seem hard
      // to implement and of little benefit.
      bool Alignable = [&]() -> bool {
        auto *LargestAlloca = *AllocaSet.begin();
        return LargestAlloca->getAlign().value() % Alloca->getAlign().value() ==
               0;
      }();
      bool CouldMerge = NoInference && Alignable;
      if (!CouldMerge)
        continue;
      AllocaSet.push_back(Alloca);
      Merged = true;
      break;
    }
    if (!Merged) {
      NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
    }
  }
  // Restore the default target destination for each switch statement we
  // modified above.
  for (auto SwitchAndDefaultDest : DefaultSuspendDest) {
    SwitchInst *SWI = SwitchAndDefaultDest.first;
    BasicBlock *DestBB = SwitchAndDefaultDest.second;
    SWI->setDefaultDest(DestBB);
  }
  // This debug output tells us which allocas were merged into one slot.
  LLVM_DEBUG(for (auto &AllocaSet
                  : NonOverlapedAllocas) {
    if (AllocaSet.size() > 1) {
      dbgs() << "In Function:" << F.getName() << "\n";
      dbgs() << "Find Union Set "
             << "\n";
      dbgs() << "\tAllocas are \n";
      for (auto Alloca : AllocaSet)
        dbgs() << "\t\t" << *Alloca << "\n";
    }
  });
}

void FrameTypeBuilder::finish(StructType *Ty) {
  assert(!IsFinished && "already finished!");

  // Prepare the optimal-layout field array.
  // The Id in the layout field is a pointer to our Field for it.
  SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
  LayoutFields.reserve(Fields.size());
  for (auto &Field : Fields) {
    LayoutFields.emplace_back(&Field, Field.Size, Field.Alignment,
                              Field.Offset);
  }

  // Perform layout.
  auto SizeAndAlign = performOptimizedStructLayout(LayoutFields);
  StructSize = SizeAndAlign.first;
  StructAlign = SizeAndAlign.second;

  auto getField = [](const OptimizedStructLayoutField &LayoutField) -> Field & {
    return *static_cast<Field *>(const_cast<void*>(LayoutField.Id));
  };

  // We need to produce a packed struct type if there's a field whose
  // assigned offset isn't a multiple of its natural type alignment.
  bool Packed = [&] {
    for (auto &LayoutField : LayoutFields) {
      auto &F = getField(LayoutField);
      if (!isAligned(F.TyAlignment, LayoutField.Offset))
        return true;
    }
    return false;
  }();

  // Build the struct body.
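  // For illustration (a sketch): if the previous field ended at offset 8 and
  // the layout assigned the next field to offset 12 of a packed struct, a
  // [4 x i8] padding field is emitted first so the field lands at offset 12;
  // LastOffset tracks where the previous field ended.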
  SmallVector<Type*, 16> FieldTypes;
  FieldTypes.reserve(LayoutFields.size() * 3 / 2);
  uint64_t LastOffset = 0;
  for (auto &LayoutField : LayoutFields) {
    auto &F = getField(LayoutField);

    auto Offset = LayoutField.Offset;

    // Add a padding field if there's a padding gap and we're either
    // building a packed struct or the padding gap is more than we'd
    // get from aligning to the field type's natural alignment.
    assert(Offset >= LastOffset);
    if (Offset != LastOffset) {
      if (Packed || alignTo(LastOffset, F.TyAlignment) != Offset)
        FieldTypes.push_back(ArrayType::get(Type::getInt8Ty(Context),
                                            Offset - LastOffset));
    }

    F.Offset = Offset;
    F.LayoutFieldIndex = FieldTypes.size();

    FieldTypes.push_back(F.Ty);
    if (F.DynamicAlignBuffer) {
      FieldTypes.push_back(
          ArrayType::get(Type::getInt8Ty(Context), F.DynamicAlignBuffer));
    }
    LastOffset = Offset + F.Size;
  }

  Ty->setBody(FieldTypes, Packed);

#ifndef NDEBUG
  // Check that the IR layout matches the offsets we expect.
  auto Layout = DL.getStructLayout(Ty);
  for (auto &F : Fields) {
    assert(Ty->getElementType(F.LayoutFieldIndex) == F.Ty);
    assert(Layout->getElementOffset(F.LayoutFieldIndex) == F.Offset);
  }
#endif

  IsFinished = true;
}

static void cacheDIVar(FrameDataInfo &FrameData,
                       DenseMap<Value *, DILocalVariable *> &DIVarCache) {
  for (auto *V : FrameData.getAllDefs()) {
    if (DIVarCache.contains(V))
      continue;

    auto CacheIt = [&DIVarCache, V](const auto &Container) {
      auto *I = llvm::find_if(Container, [](auto *DDI) {
        return DDI->getExpression()->getNumElements() == 0;
      });
      if (I != Container.end())
        DIVarCache.insert({V, (*I)->getVariable()});
    };
    CacheIt(findDbgDeclares(V));
    CacheIt(findDPVDeclares(V));
  }
}

/// Create a name for the type. It uses MDString to store the newly created
/// string, to avoid a memory leak.
static StringRef solveTypeName(Type *Ty) {
  if (Ty->isIntegerTy()) {
    // The longest name in common use may be '__int_128', which is 9
    // characters long.
    SmallString<16> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << "__int_" << cast<IntegerType>(Ty)->getBitWidth();
    auto *MDName = MDString::get(Ty->getContext(), OS.str());
    return MDName->getString();
  }

  if (Ty->isFloatingPointTy()) {
    if (Ty->isFloatTy())
      return "__float_";
    if (Ty->isDoubleTy())
      return "__double_";
    return "__floating_type_";
  }

  if (Ty->isPointerTy())
    return "PointerType";

  if (Ty->isStructTy()) {
    if (!cast<StructType>(Ty)->hasName())
      return "__LiteralStructType_";

    auto Name = Ty->getStructName();

    SmallString<16> Buffer(Name);
    for (auto &Iter : Buffer)
      if (Iter == '.' || Iter == ':')
        Iter = '_';
    auto *MDName = MDString::get(Ty->getContext(), Buffer.str());
    return MDName->getString();
  }

  return "UnknownType";
}

static DIType *solveDIType(DIBuilder &Builder, Type *Ty,
                           const DataLayout &Layout, DIScope *Scope,
                           unsigned LineNum,
                           DenseMap<Type *, DIType *> &DITypeCache) {
  if (DIType *DT = DITypeCache.lookup(Ty))
    return DT;

  StringRef Name = solveTypeName(Ty);

  DIType *RetType = nullptr;

  if (Ty->isIntegerTy()) {
    auto BitWidth = cast<IntegerType>(Ty)->getBitWidth();
    RetType = Builder.createBasicType(Name, BitWidth, dwarf::DW_ATE_signed,
                                      llvm::DINode::FlagArtificial);
  } else if (Ty->isFloatingPointTy()) {
    RetType = Builder.createBasicType(Name, Layout.getTypeSizeInBits(Ty),
                                      dwarf::DW_ATE_float,
                                      llvm::DINode::FlagArtificial);
  } else if (Ty->isPointerTy()) {
    // Construct a PointerType pointing to null (i.e., void *) instead of
    // exploring the pointee type, to avoid an infinite search problem. For
    // example, we would be in trouble if we traversed recursively:
    //
    //   struct Node {
    //       Node* ptr;
    //   };
    RetType =
        Builder.createPointerType(nullptr, Layout.getTypeSizeInBits(Ty),
                                  Layout.getABITypeAlign(Ty).value() * CHAR_BIT,
                                  /*DWARFAddressSpace=*/std::nullopt, Name);
  } else if (Ty->isStructTy()) {
    auto *DIStruct = Builder.createStructType(
        Scope, Name, Scope->getFile(), LineNum, Layout.getTypeSizeInBits(Ty),
        Layout.getPrefTypeAlign(Ty).value() * CHAR_BIT,
        llvm::DINode::FlagArtificial, nullptr, llvm::DINodeArray());

    auto *StructTy = cast<StructType>(Ty);
    SmallVector<Metadata *, 16> Elements;
    for (unsigned I = 0; I < StructTy->getNumElements(); I++) {
      DIType *DITy = solveDIType(Builder, StructTy->getElementType(I), Layout,
                                 Scope, LineNum, DITypeCache);
      assert(DITy);
      Elements.push_back(Builder.createMemberType(
          Scope, DITy->getName(), Scope->getFile(), LineNum,
          DITy->getSizeInBits(), DITy->getAlignInBits(),
          Layout.getStructLayout(StructTy)->getElementOffsetInBits(I),
          llvm::DINode::FlagArtificial, DITy));
    }

    Builder.replaceArrays(DIStruct, Builder.getOrCreateArray(Elements));

    RetType = DIStruct;
  } else {
    LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n");
    TypeSize Size = Layout.getTypeSizeInBits(Ty);
    auto *CharSizeType = Builder.createBasicType(
        Name, 8, dwarf::DW_ATE_unsigned_char, llvm::DINode::FlagArtificial);

    if (Size <= 8)
      RetType = CharSizeType;
    else {
      if (Size % 8 != 0)
        Size = TypeSize::getFixed(Size + 8 - (Size % 8));

      RetType = Builder.createArrayType(
          Size, Layout.getPrefTypeAlign(Ty).value(), CharSizeType,
          Builder.getOrCreateArray(Builder.getOrCreateSubrange(0, Size / 8)));
    }
  }

  DITypeCache.insert({Ty, RetType});
  return RetType;
}

/// Build artificial debug info for C++ coroutine frames to allow users to
/// inspect the contents of the frame directly.
///
/// Create debug information for the coroutine frame with the debug name
/// "__coro_frame". The debug information for the fields of the coroutine
/// frame is constructed as follows:
/// 1. For every value in the frame, we search for a use in a dbg.declare to
///    find the corresponding debug variable for the value. If we can find
///    the debug variable, we can get full and accurate debug information.
/// 2. If we can't get debug information from step 1, we can only try to
///    build the DIType from the Type. We do this in solveDIType. We only
///    handle integer, floating-point, pointer and struct types for now.
static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
                                FrameDataInfo &FrameData) {
  DISubprogram *DIS = F.getSubprogram();
  // If there is no DISubprogram for F, it implies the function was not
  // compiled with debug info, so we don't need to generate debug info for
  // the frame either.
  if (!DIS || !DIS->getUnit() ||
      !dwarf::isCPlusPlus(
          (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage()))
    return;

  assert(Shape.ABI == coro::ABI::Switch &&
         "We can only build debug information for C++ coroutines now.\n");

  DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false);

  AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
  assert(PromiseAlloca &&
         "Coroutine with switch ABI should own Promise alloca");

  TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(PromiseAlloca);
  TinyPtrVector<DPValue *> DPVs = findDPVDeclares(PromiseAlloca);

  DILocalVariable *PromiseDIVariable = nullptr;
  DILocation *DILoc = nullptr;
  if (!DIs.empty()) {
    DbgDeclareInst *PromiseDDI = DIs.front();
    PromiseDIVariable = PromiseDDI->getVariable();
    DILoc = PromiseDDI->getDebugLoc().get();
  } else if (!DPVs.empty()) {
    DPValue *PromiseDPV = DPVs.front();
    PromiseDIVariable = PromiseDPV->getVariable();
    DILoc = PromiseDPV->getDebugLoc().get();
  } else {
    return;
  }

  DILocalScope *PromiseDIScope = PromiseDIVariable->getScope();
  DIFile *DFile = PromiseDIScope->getFile();
  unsigned LineNum = PromiseDIVariable->getLine();

  DICompositeType *FrameDITy = DBuilder.createStructType(
      DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(),
      DFile, LineNum, Shape.FrameSize * 8,
      Shape.FrameAlign.value() * 8, llvm::DINode::FlagArtificial, nullptr,
      llvm::DINodeArray());
  StructType *FrameTy = Shape.FrameTy;
  SmallVector<Metadata *, 16> Elements;
  DataLayout Layout = F.getParent()->getDataLayout();

  DenseMap<Value *, DILocalVariable *> DIVarCache;
  cacheDIVar(FrameData, DIVarCache);

  unsigned ResumeIndex = coro::Shape::SwitchFieldIndex::Resume;
  unsigned DestroyIndex = coro::Shape::SwitchFieldIndex::Destroy;
  unsigned IndexIndex = Shape.SwitchLowering.IndexField;

  DenseMap<unsigned, StringRef> NameCache;
  NameCache.insert({ResumeIndex, "__resume_fn"});
  NameCache.insert({DestroyIndex, "__destroy_fn"});
  NameCache.insert({IndexIndex, "__coro_index"});

  Type *ResumeFnTy = FrameTy->getElementType(ResumeIndex),
       *DestroyFnTy = FrameTy->getElementType(DestroyIndex),
       *IndexTy = FrameTy->getElementType(IndexIndex);

  DenseMap<unsigned, DIType *> TyCache;
  TyCache.insert(
      {ResumeIndex, DBuilder.createPointerType(
                        nullptr, Layout.getTypeSizeInBits(ResumeFnTy))});
  TyCache.insert(
      {DestroyIndex, DBuilder.createPointerType(
                         nullptr, Layout.getTypeSizeInBits(DestroyFnTy))});

  /// FIXME: If we fill the field `SizeInBits` with the actual size of
  /// __coro_index in bits, then __coro_index wouldn't show in the debugger.
  TyCache.insert({IndexIndex, DBuilder.createBasicType(
                                  "__coro_index",
                                  (Layout.getTypeSizeInBits(IndexTy) < 8)
                                      ? 8
                                      : Layout.getTypeSizeInBits(IndexTy),
                                  dwarf::DW_ATE_unsigned_char)});

  for (auto *V : FrameData.getAllDefs()) {
    if (!DIVarCache.contains(V))
      continue;

    auto Index = FrameData.getFieldIndex(V);

    NameCache.insert({Index, DIVarCache[V]->getName()});
    TyCache.insert({Index, DIVarCache[V]->getType()});
  }

  // Cache from index to (Align, Offset) pair.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> OffsetCache;
  // The align and offset of the resume function and destroy function
  // pointers are fixed.
  OffsetCache.insert({ResumeIndex, {8, 0}});
  OffsetCache.insert({DestroyIndex, {8, 8}});
  OffsetCache.insert(
      {IndexIndex,
       {Shape.SwitchLowering.IndexAlign, Shape.SwitchLowering.IndexOffset}});

  for (auto *V : FrameData.getAllDefs()) {
    auto Index = FrameData.getFieldIndex(V);

    OffsetCache.insert(
        {Index, {FrameData.getAlign(V).value(), FrameData.getOffset(V)}});
  }

  DenseMap<Type *, DIType *> DITypeCache;
  // This counter is used to avoid duplicate type names, e.g., there may be
  // many i32 and i64 types in one coroutine, so we use i32_0 and i32_1 to
  // distinguish them; it makes no sense for the names of the fields to
  // conflict with each other.
  unsigned UnknownTypeNum = 0;
  for (unsigned Index = 0; Index < FrameTy->getNumElements(); Index++) {
    if (!OffsetCache.contains(Index))
      continue;

    std::string Name;
    uint64_t SizeInBits;
    uint32_t AlignInBits;
    uint64_t OffsetInBits;
    DIType *DITy = nullptr;

    Type *Ty = FrameTy->getElementType(Index);
    assert(Ty->isSized() && "We can't handle type which is not sized.\n");
    SizeInBits = Layout.getTypeSizeInBits(Ty).getFixedValue();
    AlignInBits = OffsetCache[Index].first * 8;
    OffsetInBits = OffsetCache[Index].second * 8;

    if (NameCache.contains(Index)) {
      Name = NameCache[Index].str();
      DITy = TyCache[Index];
    } else {
      DITy = solveDIType(DBuilder, Ty, Layout, FrameDITy, LineNum, DITypeCache);
      assert(DITy && "SolveDIType shouldn't return nullptr.\n");
      Name = DITy->getName().str();
      Name += "_" + std::to_string(UnknownTypeNum);
      UnknownTypeNum++;
    }

    Elements.push_back(DBuilder.createMemberType(
        FrameDITy, Name, DFile, LineNum, SizeInBits, AlignInBits, OffsetInBits,
        llvm::DINode::FlagArtificial, DITy));
  }

  DBuilder.replaceArrays(FrameDITy, DBuilder.getOrCreateArray(Elements));

  auto *FrameDIVar = DBuilder.createAutoVariable(PromiseDIScope, "__coro_frame",
                                                 DFile, LineNum, FrameDITy,
                                                 true, DINode::FlagArtificial);
  assert(FrameDIVar->isValidLocationForIntrinsic(DILoc));

  // The subprogram has a RetainedNodes field which records the debug
  // variables it contains, so we need to add __coro_frame to its
  // RetainedNodes.
  //
  // If we don't add __coro_frame to the RetainedNodes, users may get
  // `no symbol __coro_frame in context` rather than the more precise
  // `__coro_frame` has been optimized out.
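  //
  // For illustration (a hypothetical debugger session): without the retained
  // node, `p __coro_frame` may fail with something like `No symbol
  // "__coro_frame" in current context.`; with it, the debugger can either
  // show the frame contents or at least report the variable as optimized out.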
  if (auto *SubProgram = dyn_cast<DISubprogram>(PromiseDIScope)) {
    auto RetainedNodes = SubProgram->getRetainedNodes();
    SmallVector<Metadata *, 32> RetainedNodesVec(RetainedNodes.begin(),
                                                 RetainedNodes.end());
    RetainedNodesVec.push_back(FrameDIVar);
    SubProgram->replaceOperandWith(
        7, (MDTuple::get(F.getContext(), RetainedNodesVec)));
  }

  if (UseNewDbgInfoFormat) {
    DPValue *NewDPV = new DPValue(ValueAsMetadata::get(Shape.FramePtr),
                                  FrameDIVar, DBuilder.createExpression(),
                                  DILoc, DPValue::LocationType::Declare);
    BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr();
    It->getParent()->insertDPValueBefore(NewDPV, It);
  } else {
    DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
                           DBuilder.createExpression(), DILoc,
                           &*Shape.getInsertPtAfterFramePtr());
  }
}

// Build a struct that will keep state for an active coroutine.
//   struct f.frame {
//     ResumeFnTy ResumeFnAddr;
//     ResumeFnTy DestroyFnAddr;
//     ... promise (if present) ...
//     int ResumeIndex;
//     ... spills ...
//   };
static StructType *buildFrameType(Function &F, coro::Shape &Shape,
                                  FrameDataInfo &FrameData) {
  LLVMContext &C = F.getContext();
  const DataLayout &DL = F.getParent()->getDataLayout();
  StructType *FrameTy = [&] {
    SmallString<32> Name(F.getName());
    Name.append(".Frame");
    return StructType::create(C, Name);
  }();

  // We will use this value to cap the alignment of spilled values.
  std::optional<Align> MaxFrameAlignment;
  if (Shape.ABI == coro::ABI::Async)
    MaxFrameAlignment = Shape.AsyncLowering.getContextAlignment();
  FrameTypeBuilder B(C, DL, MaxFrameAlignment);

  AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
  std::optional<FieldIDType> SwitchIndexFieldId;

  if (Shape.ABI == coro::ABI::Switch) {
    auto *FnPtrTy = PointerType::getUnqual(C);

    // Add header fields for the resume and destroy functions.
    // We can rely on these being perfectly packed.
    (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);
    (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);

    // The PromiseAlloca field needs to be explicitly added here because it's
    // a header field with a fixed offset based on its alignment. Hence it
    // needs special handling and cannot be added to FrameData.Allocas.
    if (PromiseAlloca)
      FrameData.setFieldIndex(
          PromiseAlloca, B.addFieldForAlloca(PromiseAlloca, /*header*/ true));

    // Add a field to store the suspend index. This doesn't need to
    // be in the header.
    unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size()));
    Type *IndexType = Type::getIntNTy(C, IndexBits);

    SwitchIndexFieldId = B.addField(IndexType, std::nullopt);
  } else {
    assert(PromiseAlloca == nullptr && "lowering doesn't support promises");
  }

  // Because multiple allocas may share the same field slot,
  // we add allocas to fields here.
  B.addFieldForAllocas(F, FrameData, Shape);
  // Add PromiseAlloca to the Allocas list so that
  //  1. updateLayoutIndex can update its index after
  //     `performOptimizedStructLayout`, and
  //  2. it is processed in insertSpills.
  if (Shape.ABI == coro::ABI::Switch && PromiseAlloca)
    // We assume that the promise alloca won't be modified before
    // CoroBegin and that no alias will be created before CoroBegin.
    FrameData.Allocas.emplace_back(
        PromiseAlloca, DenseMap<Instruction *, std::optional<APInt>>{}, false);
  // Create an entry for every spilled value.
  for (auto &S : FrameData.Spills) {
    Type *FieldType = S.first->getType();
    // For byval arguments, we need to store the pointed-to value in the
    // frame, instead of the pointer itself.
    if (const Argument *A = dyn_cast<Argument>(S.first))
      if (A->hasByValAttr())
        FieldType = A->getParamByValType();
    FieldIDType Id = B.addField(FieldType, std::nullopt, false /*header*/,
                                true /*IsSpillOfValue*/);
    FrameData.setFieldIndex(S.first, Id);
  }

  B.finish(FrameTy);
  FrameData.updateLayoutIndex(B);
  Shape.FrameAlign = B.getStructAlign();
  Shape.FrameSize = B.getStructSize();

  switch (Shape.ABI) {
  case coro::ABI::Switch: {
    // In the switch ABI, remember the switch-index field.
    auto IndexField = B.getLayoutField(*SwitchIndexFieldId);
    Shape.SwitchLowering.IndexField = IndexField.LayoutFieldIndex;
    Shape.SwitchLowering.IndexAlign = IndexField.Alignment.value();
    Shape.SwitchLowering.IndexOffset = IndexField.Offset;

    // Also round the frame size up to a multiple of its alignment, as is
    // generally expected in C/C++.
    Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign);
    break;
  }

  // In the retcon ABI, remember whether the frame is inline in the storage.
  case coro::ABI::Retcon:
  case coro::ABI::RetconOnce: {
    auto Id = Shape.getRetconCoroId();
    Shape.RetconLowering.IsFrameInlineInStorage
      = (B.getStructSize() <= Id->getStorageSize() &&
         B.getStructAlign() <= Id->getStorageAlignment());
    break;
  }
  case coro::ABI::Async: {
    Shape.AsyncLowering.FrameOffset =
        alignTo(Shape.AsyncLowering.ContextHeaderSize, Shape.FrameAlign);
    // Also make the final context size a multiple of the context alignment to
    // make allocation easier for allocators.
    Shape.AsyncLowering.ContextSize =
        alignTo(Shape.AsyncLowering.FrameOffset + Shape.FrameSize,
                Shape.AsyncLowering.getContextAlignment());
    if (Shape.AsyncLowering.getContextAlignment() < Shape.FrameAlign) {
      report_fatal_error(
          "The alignment requirement of frame variables cannot be higher "
          "than the alignment of the async function context");
    }
    break;
  }
  }

  return FrameTy;
}

// We use a pointer use visitor to track how an alloca is being used.
// The goal is to be able to answer the following three questions:
//   1. Should this alloca be allocated on the frame instead?
//   2. Could the content of the alloca be modified prior to CoroBegin, which
//      would require copying the data from the alloca to the frame after
//      CoroBegin?
//   3. Are there any aliases created for this alloca prior to CoroBegin but
//      used after CoroBegin? If so, we will need to recreate the aliases
//      after CoroBegin based off the frame.
//
// To answer question 1, we track two things:
//   a. The list of all basic blocks that use this alloca or any of its
//      aliases. At the end, we check whether there exist any two such basic
//      blocks with a suspension point between them. If so, this alloca must
//      be put on the frame.
//   b. Whether the alloca or any alias of the alloca escapes at some point,
//      either because the address is stored somewhere, or because the
//      address is used in a function call that might capture it. If it ever
//      escapes, this alloca must conservatively be put on the frame.
//
// To answer question 2, we track the variable MayWriteBeforeCoroBegin.
// Whenever a potential write happens, either through a store instruction, a
// function call or any of the memory intrinsics, we check whether the
// instruction is prior to CoroBegin.
//
// To answer question 3, we track the offsets of all aliases created for the
// alloca prior to CoroBegin but used after CoroBegin. std::optional is used
// to be able to represent the case when the offset is unknown (e.g. when you
// have a PHINode that takes in different offset values). We cannot handle
// unknown offsets and will assert. This is the potential issue left out. An
// ideal solution would likely require a significant redesign.
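//
// For illustration (a sketch in pseudo-IR):
//
//   %a = alloca i64
//   %g = getelementptr i64, ptr %a, i32 0   ; alias created before CoroBegin
//   ...
//   %hdl = call ptr @llvm.coro.begin(...)
//   ...
//   use of %g                               ; alias used after CoroBegin
//
// Here %g is an alias of %a with known offset 0; if %a is moved to the
// frame, %g must be recreated after CoroBegin from the frame address.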
namespace {
struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
  using Base = PtrUseVisitor<AllocaUseVisitor>;
  AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT,
                   const CoroBeginInst &CB, const SuspendCrossingInfo &Checker,
                   bool ShouldUseLifetimeStartInfo)
      : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker),
        ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {}

  void visit(Instruction &I) {
    Users.insert(&I);
    Base::visit(I);
    // If the pointer escaped prior to CoroBegin, we have to assume it would
    // be written into before CoroBegin as well.
    if (PI.isEscaped() && !DT.dominates(&CoroBegin, PI.getEscapingInst())) {
      MayWriteBeforeCoroBegin = true;
    }
  }
  // We need to provide this overload as PtrUseVisitor uses a pointer-based
  // visiting function.
  void visit(Instruction *I) { return visit(*I); }

  void visitPHINode(PHINode &I) {
    enqueueUsers(I);
    handleAlias(I);
  }

  void visitSelectInst(SelectInst &I) {
    enqueueUsers(I);
    handleAlias(I);
  }

  void visitStoreInst(StoreInst &SI) {
    // Regardless of whether the alias of the alloca is the value operand or
    // the pointer operand, we need to assume the alloca has been written to.
    handleMayWrite(SI);

    if (SI.getValueOperand() != U->get())
      return;

    // We are storing the pointer into a memory location, potentially escaping.
    // As an optimization, we try to detect simple cases where it doesn't
    // actually escape, for example:
    //   %ptr = alloca ..
    //   %addr = alloca ..
    //   store %ptr, %addr
    //   %x = load %addr
    //   ..
    // If %addr is only used by loading from it, we could simply treat %x as
    // another alias of %ptr, and not consider %ptr to be escaped.
    auto IsSimpleStoreThenLoad = [&]() {
      auto *AI = dyn_cast<AllocaInst>(SI.getPointerOperand());
      // If the memory location we are storing to is not an alloca, it
      // could be an alias of some other memory location, which is difficult
      // to analyze.
      if (!AI)
        return false;
      // StoreAliases contains aliases of the memory location stored into.
      SmallVector<Instruction *, 4> StoreAliases = {AI};
      while (!StoreAliases.empty()) {
        Instruction *I = StoreAliases.pop_back_val();
        for (User *U : I->users()) {
          // If we are loading from the memory location, we are creating an
          // alias of the original pointer.
          if (auto *LI = dyn_cast<LoadInst>(U)) {
            enqueueUsers(*LI);
            handleAlias(*LI);
            continue;
          }
          // If we are overwriting the memory location, the pointer certainly
          // won't escape.
          if (auto *S = dyn_cast<StoreInst>(U))
            if (S->getPointerOperand() == I)
              continue;
          if (auto *II = dyn_cast<IntrinsicInst>(U))
            if (II->isLifetimeStartOrEnd())
              continue;
          // A BitCastInst creates aliases of the memory location being
          // stored into.
          if (auto *BI = dyn_cast<BitCastInst>(U)) {
            StoreAliases.push_back(BI);
            continue;
          }
          return false;
        }
      }

      return true;
    };

    if (!IsSimpleStoreThenLoad())
      PI.setEscaped(&SI);
  }

  // All mem intrinsics modify the data.
  void visitMemIntrinsic(MemIntrinsic &MI) { handleMayWrite(MI); }

  void visitBitCastInst(BitCastInst &BC) {
    Base::visitBitCastInst(BC);
    handleAlias(BC);
  }

  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
    Base::visitAddrSpaceCastInst(ASC);
    handleAlias(ASC);
  }

  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    // The base visitor will adjust Offset accordingly.
    Base::visitGetElementPtrInst(GEPI);
    handleAlias(GEPI);
  }

  void visitIntrinsicInst(IntrinsicInst &II) {
    // When we find that lifetime markers refer to a subrange of the
    // original alloca, we ignore the lifetime markers to avoid misleading
    // the analysis.
    if (II.getIntrinsicID() != Intrinsic::lifetime_start || !IsOffsetKnown ||
        !Offset.isZero())
      return Base::visitIntrinsicInst(II);
    LifetimeStarts.insert(&II);
  }

  void visitCallBase(CallBase &CB) {
    for (unsigned Op = 0, OpCount = CB.arg_size(); Op < OpCount; ++Op)
      if (U->get() == CB.getArgOperand(Op) && !CB.doesNotCapture(Op))
        PI.setEscaped(&CB);
    handleMayWrite(CB);
  }

  bool getShouldLiveOnFrame() const {
    if (!ShouldLiveOnFrame)
      ShouldLiveOnFrame = computeShouldLiveOnFrame();
    return *ShouldLiveOnFrame;
  }

  bool getMayWriteBeforeCoroBegin() const { return MayWriteBeforeCoroBegin; }

  DenseMap<Instruction *, std::optional<APInt>> getAliasesCopy() const {
    assert(getShouldLiveOnFrame() && "This method should only be called if the "
                                     "alloca needs to live on the frame.");
    for (const auto &P : AliasOffetMap)
      if (!P.second)
        report_fatal_error("Unable to handle an alias with unknown offset "
                           "created before CoroBegin.");
    return AliasOffetMap;
  }

private:
  const DominatorTree &DT;
  const CoroBeginInst &CoroBegin;
  const SuspendCrossingInfo &Checker;
  // All aliases of the original AllocaInst, created before CoroBegin and
  // used after CoroBegin. Each entry contains the instruction and the offset
  // into the original alloca. They need to be recreated after CoroBegin off
  // the frame.
  DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{};
  SmallPtrSet<Instruction *, 4> Users{};
  SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{};
  bool MayWriteBeforeCoroBegin{false};
  bool ShouldUseLifetimeStartInfo{true};

  mutable std::optional<bool> ShouldLiveOnFrame{};

  bool computeShouldLiveOnFrame() const {
    // If lifetime information is available, we check it first since it's
    // We check every pair of a lifetime.start intrinsic and a basic block
    // that uses the pointer to see if they cross suspension points. The uses
    // cover both direct uses and indirect uses.
    if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) {
      for (auto *I : Users)
        for (auto *S : LifetimeStarts)
          if (Checker.isDefinitionAcrossSuspend(*S, I))
            return true;
      // Addresses are guaranteed to be identical after every lifetime.start so
      // we cannot use the local stack if the address escaped and there is a
      // suspend point between lifetime markers. This should also cover the
      // case of a single lifetime.start intrinsic in a loop with a suspend
      // point.
      if (PI.isEscaped()) {
        for (auto *A : LifetimeStarts) {
          for (auto *B : LifetimeStarts) {
            if (Checker.hasPathOrLoopCrossingSuspendPoint(A->getParent(),
                                                          B->getParent()))
              return true;
          }
        }
      }
      return false;
    }
    // FIXME: Ideally the isEscaped check should come at the beginning.
    // However there are a few loose ends that need to be fixed first before
    // we can do that. We need to make sure we are not over-conservative, so
    // that the data accessed in-between await_suspend and symmetric transfer
    // is always put on the stack, and also data accessed after coro.end is
    // always put on the stack (esp the return object). To fix that, we need
    // to:
    // 1) Potentially treat sret as nocapture in calls
    // 2) Specially handle the return object and put it on the stack
    // 3) Utilize lifetime.end intrinsic
    if (PI.isEscaped())
      return true;

    for (auto *U1 : Users)
      for (auto *U2 : Users)
        if (Checker.isDefinitionAcrossSuspend(*U1, U2))
          return true;

    return false;
  }

  void handleMayWrite(const Instruction &I) {
    if (!DT.dominates(&CoroBegin, &I))
      MayWriteBeforeCoroBegin = true;
  }

  bool usedAfterCoroBegin(Instruction &I) {
    for (auto &U : I.uses())
      if (DT.dominates(&CoroBegin, U))
        return true;
    return false;
  }

  void handleAlias(Instruction &I) {
    // We track all aliases created prior to CoroBegin but used after.
    // These aliases may need to be recreated after CoroBegin if the alloca
    // needs to live on the frame.
    if (DT.dominates(&CoroBegin, &I) || !usedAfterCoroBegin(I))
      return;

    if (!IsOffsetKnown) {
      AliasOffsetMap[&I].reset();
    } else {
      auto Itr = AliasOffsetMap.find(&I);
      if (Itr == AliasOffsetMap.end()) {
        AliasOffsetMap[&I] = Offset;
      } else if (Itr->second && *Itr->second != Offset) {
        // If we have seen two different possible values for this alias, we set
        // it to empty.
        AliasOffsetMap[&I].reset();
      }
    }
  }
};
} // namespace

// We need to make room to insert a spill after the initial PHIs, but before
// the catchswitch instruction. Placing the spill before the catchswitch would
// violate the requirement that a catchswitch, like all other EH pads, must be
// the first non-PHI in a block.
//
// Split away the catchswitch into a separate block and insert in its place:
//
//    cleanuppad <InsertPt> cleanupret.
//
// The cleanupret instruction will act as an insert point for the spill.
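//
// For illustration only (all block and value names here are hypothetical,
// not produced by this code), a block such as:
//
//   catch.dispatch:
//     %val = phi i32 ...
//     %cs = catchswitch within none [label %handler] unwind to caller
//
// becomes:
//
//   catch.dispatch:
//     %val = phi i32 ...
//     %pad = cleanuppad within none []
//     cleanupret from %pad unwind label %catch.dispatch.split
//
//   catch.dispatch.split:
//     %cs = catchswitch within none [label %handler] unwind to caller
//
// with the spill inserted before the cleanupret.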
static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) {
  BasicBlock *CurrentBlock = CatchSwitch->getParent();
  BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch);
  CurrentBlock->getTerminator()->eraseFromParent();

  auto *CleanupPad =
      CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock);
  auto *CleanupRet =
      CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock);
  return CleanupRet;
}

// Replace all alloca and SSA values that are accessed across suspend points
// with GetElementPointer from coroutine frame + loads and stores. Create an
// AllocaSpillBB that will become the new entry block for the resume parts of
// the coroutine:
//
//    %hdl = coro.begin(...)
//    whatever
//
// becomes:
//
//    %hdl = coro.begin(...)
//    br label %AllocaSpillBB
//
//  AllocaSpillBB:
//    ; geps corresponding to allocas that were moved to coroutine frame
//    br label %PostSpill
//
//  PostSpill:
//    whatever
//
//
static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
  auto *CB = Shape.CoroBegin;
  LLVMContext &C = CB->getContext();
  Function *F = CB->getFunction();
  IRBuilder<> Builder(C);
  StructType *FrameTy = Shape.FrameTy;
  Value *FramePtr = Shape.FramePtr;
  DominatorTree DT(*F);
  SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;

  // Create a GEP with the given index into the coroutine frame for the
  // original value Orig. Appends an extra 0 index for array-allocas,
  // preserving the original type.
  auto GetFramePointer = [&](Value *Orig) -> Value * {
    FieldIDType Index = FrameData.getFieldIndex(Orig);
    SmallVector<Value *, 3> Indices = {
        ConstantInt::get(Type::getInt32Ty(C), 0),
        ConstantInt::get(Type::getInt32Ty(C), Index),
    };

    if (auto *AI = dyn_cast<AllocaInst>(Orig)) {
      if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
        auto Count = CI->getValue().getZExtValue();
        if (Count > 1) {
          Indices.push_back(ConstantInt::get(Type::getInt32Ty(C), 0));
        }
      } else {
        report_fatal_error("Coroutines cannot handle non static allocas yet");
      }
    }

    auto GEP = cast<GetElementPtrInst>(
        Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices));
    if (auto *AI = dyn_cast<AllocaInst>(Orig)) {
      if (FrameData.getDynamicAlign(Orig) != 0) {
        assert(FrameData.getDynamicAlign(Orig) == AI->getAlign().value());
        auto *M = AI->getModule();
        auto *IntPtrTy = M->getDataLayout().getIntPtrType(AI->getType());
        auto *PtrValue = Builder.CreatePtrToInt(GEP, IntPtrTy);
        auto *AlignMask =
            ConstantInt::get(IntPtrTy, AI->getAlign().value() - 1);
        // Round the pointer up to the alignment boundary:
        // (ptr + align - 1) & ~(align - 1).
        PtrValue = Builder.CreateAdd(PtrValue, AlignMask);
        PtrValue = Builder.CreateAnd(PtrValue, Builder.CreateNot(AlignMask));
        return Builder.CreateIntToPtr(PtrValue, AI->getType());
      }
      // If the type of the GEP is not equal to the type of the AllocaInst,
      // it implies that the frame slot may be shared with another AllocaInst.
      // So we cast the GEP to the type of the AllocaInst here to reuse the
      // frame storage.
      //
      // Note: If we change the strategy dealing with alignment, we need to
      // refine this casting.
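      //
      // For illustration only (the address space below is hypothetical): if
      // the alloca has type `ptr addrspace(5)` while the frame GEP produces a
      // plain `ptr`, the cast emitted here is
      //    %x.cast = addrspacecast ptr %gep to ptr addrspace(5)
      // so existing users keep the pointer type they expect.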
      if (GEP->getType() != Orig->getType())
        return Builder.CreateAddrSpaceCast(GEP, Orig->getType(),
                                           Orig->getName() + Twine(".cast"));
    }
    return GEP;
  };

  for (auto const &E : FrameData.Spills) {
    Value *Def = E.first;
    auto SpillAlignment = Align(FrameData.getAlign(Def));
    // Create a store instruction storing the value into the
    // coroutine frame.
    BasicBlock::iterator InsertPt;
    Type *ByValTy = nullptr;
    if (auto *Arg = dyn_cast<Argument>(Def)) {
      // For arguments, we will place the store instruction right after
      // the coroutine frame pointer instruction, i.e. coro.begin.
      InsertPt = Shape.getInsertPtAfterFramePtr();

      // If we're spilling an Argument, make sure we clear 'nocapture'
      // from the coroutine function.
      Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture);

      if (Arg->hasByValAttr())
        ByValTy = Arg->getParamByValType();
    } else if (auto *CSI = dyn_cast<AnyCoroSuspendInst>(Def)) {
      // Don't spill immediately after a suspend; splitting assumes
      // that the suspend will be followed by a branch.
      InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHIIt();
    } else {
      auto *I = cast<Instruction>(Def);
      if (!DT.dominates(CB, I)) {
        // If it is not dominated by CoroBegin, then the spill should be
        // inserted immediately after the coroutine frame pointer is computed.
        InsertPt = Shape.getInsertPtAfterFramePtr();
      } else if (auto *II = dyn_cast<InvokeInst>(I)) {
        // If we are spilling the result of the invoke instruction, split
        // the normal edge and insert the spill in the new block.
        auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest());
        InsertPt = NewBB->getTerminator()->getIterator();
      } else if (isa<PHINode>(I)) {
        // Skip PHINodes and EH pad instructions.
        BasicBlock *DefBlock = I->getParent();
        if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator()))
          InsertPt = splitBeforeCatchSwitch(CSI)->getIterator();
        else
          InsertPt = DefBlock->getFirstInsertionPt();
      } else {
        assert(!I->isTerminator() && "unexpected terminator");
        // For all other values, the spill is placed immediately after
        // the definition.
        InsertPt = I->getNextNode()->getIterator();
      }
    }

    auto Index = FrameData.getFieldIndex(Def);
    Builder.SetInsertPoint(InsertPt->getParent(), InsertPt);
    auto *G = Builder.CreateConstInBoundsGEP2_32(
        FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr"));
    if (ByValTy) {
      // For byval arguments, we need to store the pointed-to value in the
      // frame instead of the pointer itself.
      auto *Value = Builder.CreateLoad(ByValTy, Def);
      Builder.CreateAlignedStore(Value, G, SpillAlignment);
    } else {
      Builder.CreateAlignedStore(Def, G, SpillAlignment);
    }

    BasicBlock *CurrentBlock = nullptr;
    Value *CurrentReload = nullptr;
    for (auto *U : E.second) {
      // If we have not seen the use block yet, create a load instruction to
      // reload the spilled value from the coroutine frame. CurrentReload is
      // set to the frame GEP for byval arguments, or to the loaded value
      // otherwise.
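      //
      // For illustration only (names and types here are hypothetical): for a
      // spilled i32 %x kept in frame field N, the code below emits
      //    %x.reload.addr = getelementptr inbounds %f.Frame, ptr %frame,
      //                                            i32 0, i32 N
      //    %x.reload = load i32, ptr %x.reload.addr
      // at the head of each block that contains uses of %x.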
      if (CurrentBlock != U->getParent()) {
        CurrentBlock = U->getParent();
        Builder.SetInsertPoint(CurrentBlock,
                               CurrentBlock->getFirstInsertionPt());

        auto *GEP = GetFramePointer(E.first);
        GEP->setName(E.first->getName() + Twine(".reload.addr"));
        if (ByValTy)
          CurrentReload = GEP;
        else
          CurrentReload = Builder.CreateAlignedLoad(
              FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP,
              SpillAlignment, E.first->getName() + Twine(".reload"));

        TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(Def);
        TinyPtrVector<DPValue *> DPVs = findDPVDeclares(Def);
        // Try our best to find the dbg.declare. If the spill is a temporary,
        // there may not be a direct dbg.declare. Walk up the load chain to
        // find one from an alias.
        if (F->getSubprogram()) {
          auto *CurDef = Def;
          while (DIs.empty() && DPVs.empty() && isa<LoadInst>(CurDef)) {
            auto *LdInst = cast<LoadInst>(CurDef);
            // Only consider loads where a pointer is loaded from a pointer
            // of the same type.
            if (LdInst->getPointerOperandType() != LdInst->getType())
              break;
            CurDef = LdInst->getPointerOperand();
            if (!isa<AllocaInst, LoadInst>(CurDef))
              break;
            DIs = findDbgDeclares(CurDef);
            DPVs = findDPVDeclares(CurDef);
          }
        }

        auto SalvageOne = [&](auto *DDI) {
          bool AllowUnresolved = false;
          // This dbg.declare is preserved for all coro-split function
          // fragments. It will be unreachable in the main function, and
          // processed by coro::salvageDebugInfo() by CoroCloner.
          if (UseNewDbgInfoFormat) {
            DPValue *NewDPV =
                new DPValue(ValueAsMetadata::get(CurrentReload),
                            DDI->getVariable(), DDI->getExpression(),
                            DDI->getDebugLoc(), DPValue::LocationType::Declare);
            Builder.GetInsertPoint()->getParent()->insertDPValueBefore(
                NewDPV, Builder.GetInsertPoint());
          } else {
            DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved)
                .insertDeclare(CurrentReload, DDI->getVariable(),
                               DDI->getExpression(), DDI->getDebugLoc(),
                               &*Builder.GetInsertPoint());
          }
          // This dbg.declare is for the main function entry point. It
          // will be deleted in all coro-split functions.
          coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame,
                                 false /*UseEntryValue*/);
        };
        for_each(DIs, SalvageOne);
        for_each(DPVs, SalvageOne);
      }

      // If we have a single edge PHINode, remove it and replace it with a
      // reload from the coroutine frame. (We already took care of multi edge
      // PHINodes by rewriting them in the rewritePHIs function.)
      if (auto *PN = dyn_cast<PHINode>(U)) {
        assert(PN->getNumIncomingValues() == 1 &&
               "unexpected number of incoming "
               "values in the PHINode");
        PN->replaceAllUsesWith(CurrentReload);
        PN->eraseFromParent();
        continue;
      }

      // Replace all uses of Def in the current instruction with the reload.
      U->replaceUsesOfWith(Def, CurrentReload);
      // Instructions are added to Def's user list if the attached
      // debug records use Def. Update those now.
      for (auto &DPV : U->getDbgValueRange())
        DPV.replaceVariableLocationOp(Def, CurrentReload, true);
    }
  }

  BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent();

  auto SpillBlock = FramePtrBB->splitBasicBlock(
      Shape.getInsertPtAfterFramePtr(), "AllocaSpillBB");
  SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill");
  Shape.AllocaSpillBlock = SpillBlock;

  // retcon and retcon.once lowering assumes all uses have been sunk.
  if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
      Shape.ABI == coro::ABI::Async) {
    // If we found any allocas, replace all of their remaining uses with GEPs.
    Builder.SetInsertPoint(SpillBlock, SpillBlock->begin());
    for (const auto &P : FrameData.Allocas) {
      AllocaInst *Alloca = P.Alloca;
      auto *G = GetFramePointer(Alloca);

      // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G))
      // here, as we are changing the location of the instruction.
      G->takeName(Alloca);
      Alloca->replaceAllUsesWith(G);
      Alloca->eraseFromParent();
    }
    return;
  }

  // If we found any allocas, replace all of their remaining uses with GEP
  // instructions. To maintain debuggability, we replace the uses of allocas
  // for dbg.declares and dbg.values with the reload from the frame.
  // Note: We cannot replace the alloca with GEP instructions indiscriminately,
  // as some of the uses may not be dominated by CoroBegin.
  Builder.SetInsertPoint(Shape.AllocaSpillBlock,
                         Shape.AllocaSpillBlock->begin());
  SmallVector<Instruction *, 4> UsersToUpdate;
  for (const auto &A : FrameData.Allocas) {
    AllocaInst *Alloca = A.Alloca;
    UsersToUpdate.clear();
    for (User *U : Alloca->users()) {
      auto *I = cast<Instruction>(U);
      if (DT.dominates(CB, I))
        UsersToUpdate.push_back(I);
    }
    if (UsersToUpdate.empty())
      continue;
    auto *G = GetFramePointer(Alloca);
    G->setName(Alloca->getName() + Twine(".reload.addr"));

    SmallVector<DbgVariableIntrinsic *, 4> DIs;
    SmallVector<DPValue *> DPValues;
    findDbgUsers(DIs, Alloca, &DPValues);
    for (auto *DVI : DIs)
      DVI->replaceUsesOfWith(Alloca, G);
    for (auto *DPV : DPValues)
      DPV->replaceVariableLocationOp(Alloca, G);

    for (Instruction *I : UsersToUpdate) {
      // It is meaningless to retain lifetime intrinsics that refer to members
      // of the coroutine frame, and such meaningless lifetime intrinsics may
      // block further optimizations.
      if (I->isLifetimeStartOrEnd()) {
        I->eraseFromParent();
        continue;
      }

      I->replaceUsesOfWith(Alloca, G);
    }
  }
  Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr());
  for (const auto &A : FrameData.Allocas) {
    AllocaInst *Alloca = A.Alloca;
    if (A.MayWriteBeforeCoroBegin) {
      // isEscaped really means potentially modified before CoroBegin.
      if (Alloca->isArrayAllocation())
        report_fatal_error(
            "Coroutines cannot handle copying of array allocas yet");

      auto *G = GetFramePointer(Alloca);
      auto *Value = Builder.CreateLoad(Alloca->getAllocatedType(), Alloca);
      Builder.CreateStore(Value, G);
    }
    // For each alias to Alloca created before CoroBegin but used after
    // CoroBegin, we recreate them after CoroBegin by applying the offset
    // to the pointer in the frame.
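    //
    // For illustration only (names and the offset are hypothetical): an alias
    // that pointed 8 bytes into the original alloca before CoroBegin is
    // rebuilt from the frame address as
    //    %alias = getelementptr i8, ptr %alloca.frame.addr, i64 8
    // and its uses dominated by CoroBegin are redirected to %alias.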
    for (const auto &Alias : A.Aliases) {
      auto *FramePtr = GetFramePointer(Alloca);
      auto &Value = *Alias.second;
      auto ITy = IntegerType::get(C, Value.getBitWidth());
      auto *AliasPtr =
          Builder.CreatePtrAdd(FramePtr, ConstantInt::get(ITy, Value));
      Alias.first->replaceUsesWithIf(
          AliasPtr, [&](Use &U) { return DT.dominates(CB, U); });
    }
  }

  // PromiseAlloca is not collected in FrameData.Allocas. So we don't handle
  // the case that the PromiseAlloca may have writes before CoroBegin in the
  // code above, and it may be problematic in edge cases. See
  // https://github.com/llvm/llvm-project/issues/57861 for an example.
  if (Shape.ABI == coro::ABI::Switch && Shape.SwitchLowering.PromiseAlloca) {
    AllocaInst *PA = Shape.SwitchLowering.PromiseAlloca;
    // Check whether there is a memory access to the promise alloca before
    // CoroBegin.
    bool HasAccessingPromiseBeforeCB = llvm::any_of(PA->uses(), [&](Use &U) {
      auto *Inst = dyn_cast<Instruction>(U.getUser());
      if (!Inst || DT.dominates(CB, Inst))
        return false;

      if (auto *CI = dyn_cast<CallInst>(Inst)) {
        // It is fine if the call wouldn't write to the Promise.
        // This is possible for @llvm.coro.id intrinsics, which
        // take the promise as the second argument as a marker.
        if (CI->onlyReadsMemory() ||
            CI->onlyReadsMemory(CI->getArgOperandNo(&U)))
          return false;
        return true;
      }

      return isa<StoreInst>(Inst) ||
             // It may take too much time to track the uses.
             // Be conservative about the case the use may escape.
             isa<GetElementPtrInst>(Inst) ||
             // Before opaque pointers were enabled, there would always be a
             // bitcast of the promise alloca. Now that opaque pointers are
             // enabled by default, this should be fine.
             isa<BitCastInst>(Inst);
    });
    if (HasAccessingPromiseBeforeCB) {
      Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr());
      auto *G = GetFramePointer(PA);
      auto *Value = Builder.CreateLoad(PA->getAllocatedType(), PA);
      Builder.CreateStore(Value, G);
    }
  }
}

// Moves the values in the PHIs in SuccBB that correspond to PredBB into a new
// PHI in InsertedBB.
static void movePHIValuesToInsertedBlock(BasicBlock *SuccBB,
                                         BasicBlock *InsertedBB,
                                         BasicBlock *PredBB,
                                         PHINode *UntilPHI = nullptr) {
  auto *PN = cast<PHINode>(&SuccBB->front());
  do {
    int Index = PN->getBasicBlockIndex(InsertedBB);
    Value *V = PN->getIncomingValue(Index);
    PHINode *InputV = PHINode::Create(
        V->getType(), 1, V->getName() + Twine(".") + SuccBB->getName());
    InputV->insertBefore(InsertedBB->begin());
    InputV->addIncoming(V, PredBB);
    PN->setIncomingValue(Index, InputV);
    PN = dyn_cast<PHINode>(PN->getNextNode());
  } while (PN != UntilPHI);
}

// Rewrites the PHI Nodes in a cleanuppad.
static void rewritePHIsForCleanupPad(BasicBlock *CleanupPadBB,
                                     CleanupPadInst *CleanupPad) {
  // For every incoming edge to a CleanupPad we will create a new block holding
  // all incoming values in single-value PHI nodes. We will then create another
  // block to act as a dispatcher (as all unwind edges for related EH blocks
  // must be the same).
  //
  // cleanuppad:
  //    %2 = phi i32[%0, %catchswitch], [%1, %catch.1]
  //    %3 = cleanuppad within none []
  //
  // It will create:
  //
  // cleanuppad.corodispatch:
  //    %2 = phi i8[0, %catchswitch], [1, %catch.1]
  //    %3 = cleanuppad within none []
  //    switch i8 %2, label %unreachable
  //            [i8 0, label %cleanuppad.from.catchswitch
  //             i8 1, label %cleanuppad.from.catch.1]
  // cleanuppad.from.catchswitch:
  //    %4 = phi i32 [%0, %catchswitch]
  //    br label %cleanuppad
  // cleanuppad.from.catch.1:
  //    %6 = phi i32 [%1, %catch.1]
  //    br label %cleanuppad
  // cleanuppad:
  //    %8 = phi i32 [%4, %cleanuppad.from.catchswitch],
  //                 [%6, %cleanuppad.from.catch.1]

  // Unreachable BB, in case switching on an invalid value in the dispatcher.
  auto *UnreachBB = BasicBlock::Create(
      CleanupPadBB->getContext(), "unreachable", CleanupPadBB->getParent());
  IRBuilder<> Builder(UnreachBB);
  Builder.CreateUnreachable();

  // Create a new cleanuppad which will be the dispatcher.
  auto *NewCleanupPadBB =
      BasicBlock::Create(CleanupPadBB->getContext(),
                         CleanupPadBB->getName() + Twine(".corodispatch"),
                         CleanupPadBB->getParent(), CleanupPadBB);
  Builder.SetInsertPoint(NewCleanupPadBB);
  auto *SwitchType = Builder.getInt8Ty();
  auto *SetDispatchValuePN =
      Builder.CreatePHI(SwitchType, pred_size(CleanupPadBB));
  CleanupPad->removeFromParent();
  CleanupPad->insertAfter(SetDispatchValuePN);
  auto *SwitchOnDispatch = Builder.CreateSwitch(SetDispatchValuePN, UnreachBB,
                                                pred_size(CleanupPadBB));

  int SwitchIndex = 0;
  SmallVector<BasicBlock *, 8> Preds(predecessors(CleanupPadBB));
  for (BasicBlock *Pred : Preds) {
    // Create a new cleanuppad and move the PHI values to there.
    auto *CaseBB = BasicBlock::Create(CleanupPadBB->getContext(),
                                      CleanupPadBB->getName() +
                                          Twine(".from.") + Pred->getName(),
                                      CleanupPadBB->getParent(), CleanupPadBB);
    updatePhiNodes(CleanupPadBB, Pred, CaseBB);
    CaseBB->setName(CleanupPadBB->getName() + Twine(".from.") +
                    Pred->getName());
    Builder.SetInsertPoint(CaseBB);
    Builder.CreateBr(CleanupPadBB);
    movePHIValuesToInsertedBlock(CleanupPadBB, CaseBB, NewCleanupPadBB);

    // Update this Pred to the new unwind point.
    setUnwindEdgeTo(Pred->getTerminator(), NewCleanupPadBB);

    // Setup the switch in the dispatcher.
    auto *SwitchConstant = ConstantInt::get(SwitchType, SwitchIndex);
    SetDispatchValuePN->addIncoming(SwitchConstant, Pred);
    SwitchOnDispatch->addCase(SwitchConstant, CaseBB);
    SwitchIndex++;
  }
}

static void cleanupSinglePredPHIs(Function &F) {
  SmallVector<PHINode *, 32> Worklist;
  for (auto &BB : F) {
    for (auto &Phi : BB.phis()) {
      if (Phi.getNumIncomingValues() == 1) {
        Worklist.push_back(&Phi);
      } else
        break;
    }
  }
  while (!Worklist.empty()) {
    auto *Phi = Worklist.pop_back_val();
    auto *OriginalValue = Phi->getIncomingValue(0);
    Phi->replaceAllUsesWith(OriginalValue);
  }
}

static void rewritePHIs(BasicBlock &BB) {
  // For every incoming edge we will create a block holding all
  // incoming values in single-value PHI nodes.
2183 // 2184 // loop: 2185 // %n.val = phi i32[%n, %entry], [%inc, %loop] 2186 // 2187 // It will create: 2188 // 2189 // loop.from.entry: 2190 // %n.loop.pre = phi i32 [%n, %entry] 2191 // br %label loop 2192 // loop.from.loop: 2193 // %inc.loop.pre = phi i32 [%inc, %loop] 2194 // br %label loop 2195 // 2196 // After this rewrite, further analysis will ignore any phi nodes with more 2197 // than one incoming edge. 2198 2199 // TODO: Simplify PHINodes in the basic block to remove duplicate 2200 // predecessors. 2201 2202 // Special case for CleanupPad: all EH blocks must have the same unwind edge 2203 // so we need to create an additional "dispatcher" block. 2204 if (auto *CleanupPad = 2205 dyn_cast_or_null<CleanupPadInst>(BB.getFirstNonPHI())) { 2206 SmallVector<BasicBlock *, 8> Preds(predecessors(&BB)); 2207 for (BasicBlock *Pred : Preds) { 2208 if (CatchSwitchInst *CS = 2209 dyn_cast<CatchSwitchInst>(Pred->getTerminator())) { 2210 // CleanupPad with a CatchSwitch predecessor: therefore this is an 2211 // unwind destination that needs to be handle specially. 2212 assert(CS->getUnwindDest() == &BB); 2213 (void)CS; 2214 rewritePHIsForCleanupPad(&BB, CleanupPad); 2215 return; 2216 } 2217 } 2218 } 2219 2220 LandingPadInst *LandingPad = nullptr; 2221 PHINode *ReplPHI = nullptr; 2222 if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) { 2223 // ehAwareSplitEdge will clone the LandingPad in all the edge blocks. 2224 // We replace the original landing pad with a PHINode that will collect the 2225 // results from all of them. 2226 ReplPHI = PHINode::Create(LandingPad->getType(), 1, ""); 2227 ReplPHI->insertBefore(LandingPad->getIterator()); 2228 ReplPHI->takeName(LandingPad); 2229 LandingPad->replaceAllUsesWith(ReplPHI); 2230 // We will erase the original landing pad at the end of this function after 2231 // ehAwareSplitEdge cloned it in the transition blocks. 2232 } 2233 2234 SmallVector<BasicBlock *, 8> Preds(predecessors(&BB)); 2235 for (BasicBlock *Pred : Preds) { 2236 auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI); 2237 IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName()); 2238 2239 // Stop the moving of values at ReplPHI, as this is either null or the PHI 2240 // that replaced the landing pad. 2241 movePHIValuesToInsertedBlock(&BB, IncomingBB, Pred, ReplPHI); 2242 } 2243 2244 if (LandingPad) { 2245 // Calls to ehAwareSplitEdge function cloned the original lading pad. 2246 // No longer need it. 2247 LandingPad->eraseFromParent(); 2248 } 2249} 2250 2251static void rewritePHIs(Function &F) { 2252 SmallVector<BasicBlock *, 8> WorkList; 2253 2254 for (BasicBlock &BB : F) 2255 if (auto *PN = dyn_cast<PHINode>(&BB.front())) 2256 if (PN->getNumIncomingValues() > 1) 2257 WorkList.push_back(&BB); 2258 2259 for (BasicBlock *BB : WorkList) 2260 rewritePHIs(*BB); 2261} 2262 2263/// Default materializable callback 2264// Check for instructions that we can recreate on resume as opposed to spill 2265// the result into a coroutine frame. 2266bool coro::defaultMaterializable(Instruction &V) { 2267 return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) || 2268 isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V)); 2269} 2270 2271// Check for structural coroutine intrinsics that should not be spilled into 2272// the coroutine frame. 
static bool isCoroutineStructureIntrinsic(Instruction &I) {
  return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
         isa<CoroSuspendInst>(&I);
}

// For each instruction identified as materializable across the suspend point,
// and its associated DAG of other rematerializable instructions,
// recreate the DAG of instructions after the suspend point.
static void rewriteMaterializableInstructions(
    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
        &AllRemats) {
  // This has to be done in 2 phases:
  // 1) Do the remats and record the required defs to be replaced in the
  //    original use instructions.
  // 2) Once all the remats are complete, replace the uses in the final
  //    instructions with the new defs.
  typedef struct {
    Instruction *Use;
    Instruction *Def;
    Instruction *Remat;
  } ProcessNode;

  SmallVector<ProcessNode> FinalInstructionsToProcess;

  for (const auto &E : AllRemats) {
    Instruction *Use = E.first;
    Instruction *CurrentMaterialization = nullptr;
    RematGraph *RG = E.second.get();
    ReversePostOrderTraversal<RematGraph *> RPOT(RG);
    SmallVector<Instruction *> InstructionsToProcess;

    // If the target use is actually a suspend instruction then we have to
    // insert the remats into the end of the predecessor (there should only be
    // one). This is so that suspend blocks always have the suspend instruction
    // as the first instruction.
    auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
    if (isa<AnyCoroSuspendInst>(Use)) {
      BasicBlock *SuspendPredecessorBlock =
          Use->getParent()->getSinglePredecessor();
      assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
      InsertPoint = SuspendPredecessorBlock->getTerminator();
    }

    // Note: skip the first instruction as this is the actual use that we're
    // rematerializing everything for.
    auto I = RPOT.begin();
    ++I;
    for (; I != RPOT.end(); ++I) {
      Instruction *D = (*I)->Node;
      CurrentMaterialization = D->clone();
      CurrentMaterialization->setName(D->getName());
      CurrentMaterialization->insertBefore(InsertPoint);
      InsertPoint = CurrentMaterialization;

      // Replace all uses of Def in the instructions being added as part of
      // this rematerialization group.
      for (auto &I : InstructionsToProcess)
        I->replaceUsesOfWith(D, CurrentMaterialization);

      // Don't replace the final use at this point as this can cause problems
      // for other materializations. Instead, for any final use that uses a
      // def that's being rematerialized, record the replacement values.
      for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
        if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
          FinalInstructionsToProcess.push_back(
              {Use, D, CurrentMaterialization});

      InstructionsToProcess.push_back(CurrentMaterialization);
    }
  }

  // Finally, replace the uses with the defs that we've just rematerialized.
  for (auto &R : FinalInstructionsToProcess) {
    if (auto *PN = dyn_cast<PHINode>(R.Use)) {
      assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
                                                "values in the PHINode");
      PN->replaceAllUsesWith(R.Remat);
      PN->eraseFromParent();
      continue;
    }
    R.Use->replaceUsesOfWith(R.Def, R.Remat);
  }
}

// Splits the block at a particular instruction unless it is the first
// instruction in the block with a single predecessor.
static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) {
  auto *BB = I->getParent();
  if (&BB->front() == I) {
    if (BB->getSinglePredecessor()) {
      BB->setName(Name);
      return BB;
    }
  }
  return BB->splitBasicBlock(I, Name);
}

// Split above and below a particular instruction so that it
// will be all alone by itself in a block.
static void splitAround(Instruction *I, const Twine &Name) {
  splitBlockIfNotFirst(I, Name);
  splitBlockIfNotFirst(I->getNextNode(), "After" + Name);
}

static bool isSuspendBlock(BasicBlock *BB) {
  return isa<AnyCoroSuspendInst>(BB->front());
}

typedef SmallPtrSet<BasicBlock*, 8> VisitedBlocksSet;

/// Does control flow starting at the given block ever reach a suspend
/// instruction before reaching a block in VisitedOrFreeBBs?
static bool isSuspendReachableFrom(BasicBlock *From,
                                   VisitedBlocksSet &VisitedOrFreeBBs) {
  // Eagerly try to add this block to the visited set. If it's already
  // there, stop recursing; this path doesn't reach a suspend before
  // either looping or reaching a freeing block.
  if (!VisitedOrFreeBBs.insert(From).second)
    return false;

  // We assume that we'll already have split suspends into their own blocks.
  if (isSuspendBlock(From))
    return true;

  // Recurse on the successors.
  for (auto *Succ : successors(From)) {
    if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs))
      return true;
  }

  return false;
}

/// Is the given alloca "local", i.e. bounded in lifetime to not cross a
/// suspend point?
static bool isLocalAlloca(CoroAllocaAllocInst *AI) {
  // Seed the visited set with all the basic blocks containing a free
  // so that the walk won't go past them.
  VisitedBlocksSet VisitedOrFreeBBs;
  for (auto *User : AI->users()) {
    if (auto FI = dyn_cast<CoroAllocaFreeInst>(User))
      VisitedOrFreeBBs.insert(FI->getParent());
  }

  return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs);
}

/// After we split the coroutine, will the given basic block be along
/// an obvious exit path for the resumption function?
static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
                                              unsigned depth = 3) {
  // If we've bottomed out our depth count, stop searching and assume
  // that the path might loop back.
  if (depth == 0) return false;

  // If this is a suspend block, we're about to exit the resumption function.
  if (isSuspendBlock(BB)) return true;

  // Recurse into the successors.
  for (auto *Succ : successors(BB)) {
    if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1))
      return false;
  }

  // If none of the successors leads back in a loop, we're on an exit/abort.
  return true;
}

static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) {
  // Look for a free that isn't sufficiently obviously followed by
  // either a suspend or a termination, i.e. something that will leave
  // the coro resumption frame.
  for (auto *U : AI->users()) {
    auto FI = dyn_cast<CoroAllocaFreeInst>(U);
    if (!FI) continue;

    if (!willLeaveFunctionImmediatelyAfter(FI->getParent()))
      return true;
  }

  // If we never found one, we don't need a stack save.
  return false;
}

/// Turn each of the given local allocas into a normal (dynamic) alloca
/// instruction.
static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas,
                              SmallVectorImpl<Instruction*> &DeadInsts) {
  for (auto *AI : LocalAllocas) {
    IRBuilder<> Builder(AI);

    // Save the stack depth. Try to avoid doing this if the stackrestore
    // is going to immediately precede a return or something.
    Value *StackSave = nullptr;
    if (localAllocaNeedsStackSave(AI))
      StackSave = Builder.CreateStackSave();

    // Allocate memory.
    auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize());
    Alloca->setAlignment(AI->getAlignment());

    for (auto *U : AI->users()) {
      // Replace gets with the allocation.
      if (isa<CoroAllocaGetInst>(U)) {
        U->replaceAllUsesWith(Alloca);

        // Replace frees with stackrestores. This is safe because
        // alloca.alloc is required to obey a stack discipline, although we
        // don't enforce that structurally.
      } else {
        auto FI = cast<CoroAllocaFreeInst>(U);
        if (StackSave) {
          Builder.SetInsertPoint(FI);
          Builder.CreateStackRestore(StackSave);
        }
      }
      DeadInsts.push_back(cast<Instruction>(U));
    }

    DeadInsts.push_back(AI);
  }
}

/// Turn the given coro.alloca.alloc call into a dynamic allocation.
/// This happens during the all-instructions iteration, so it must not
/// delete the call.
static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI,
                                        coro::Shape &Shape,
                                        SmallVectorImpl<Instruction*> &DeadInsts) {
  IRBuilder<> Builder(AI);
  auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr);

  for (User *U : AI->users()) {
    if (isa<CoroAllocaGetInst>(U)) {
      U->replaceAllUsesWith(Alloc);
    } else {
      auto FI = cast<CoroAllocaFreeInst>(U);
      Builder.SetInsertPoint(FI);
      Shape.emitDealloc(Builder, Alloc, nullptr);
    }
    DeadInsts.push_back(cast<Instruction>(U));
  }

  // Push this on last so that it gets deleted after all the others.
  DeadInsts.push_back(AI);

  // Return the new allocation value so that we can check for needed spills.
  return cast<Instruction>(Alloc);
}

/// Get the current swifterror value.
static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy,
                                     coro::Shape &Shape) {
  // Make a fake function pointer as a sort of intrinsic.
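  // The callee is a null pointer, so the call is never meant to be executed
  // as is. It is recorded in Shape.SwiftErrorOps below; as noted in
  // eliminateSwiftError, these placeholder calls are fixed up later, when the
  // function is split.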
  auto FnTy = FunctionType::get(ValueTy, {}, false);
  auto Fn = ConstantPointerNull::get(Builder.getPtrTy());

  auto Call = Builder.CreateCall(FnTy, Fn, {});
  Shape.SwiftErrorOps.push_back(Call);

  return Call;
}

/// Set the given value as the current swifterror value.
///
/// Returns a slot that can be used as a swifterror slot.
static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V,
                                     coro::Shape &Shape) {
  // Make a fake function pointer as a sort of intrinsic.
  auto FnTy = FunctionType::get(Builder.getPtrTy(),
                                {V->getType()}, false);
  auto Fn = ConstantPointerNull::get(Builder.getPtrTy());

  auto Call = Builder.CreateCall(FnTy, Fn, { V });
  Shape.SwiftErrorOps.push_back(Call);

  return Call;
}

/// Set the swifterror value from the given alloca before a call,
/// then put it back in the alloca afterwards.
///
/// Returns an address that will stand in for the swifterror slot
/// until splitting.
static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call,
                                                 AllocaInst *Alloca,
                                                 coro::Shape &Shape) {
  auto ValueTy = Alloca->getAllocatedType();
  IRBuilder<> Builder(Call);

  // Load the current value from the alloca and set it as the
  // swifterror value.
  auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca);
  auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape);

  // Move to after the call. Since swifterror only has a guaranteed
  // value on normal exits, we can ignore implicit and explicit unwind
  // edges.
  if (isa<CallInst>(Call)) {
    Builder.SetInsertPoint(Call->getNextNode());
  } else {
    auto Invoke = cast<InvokeInst>(Call);
    Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg());
  }

  // Get the current swifterror value and store it to the alloca.
  auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape);
  Builder.CreateStore(ValueAfterCall, Alloca);

  return Addr;
}

/// Eliminate a formerly-swifterror alloca by inserting the get/set
/// intrinsics and attempting to MemToReg the alloca away.
static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca,
                                      coro::Shape &Shape) {
  for (Use &Use : llvm::make_early_inc_range(Alloca->uses())) {
    // swifterror values can only be used in very specific ways.
    // We take advantage of that here.
    auto User = Use.getUser();
    if (isa<LoadInst>(User) || isa<StoreInst>(User))
      continue;

    assert(isa<CallInst>(User) || isa<InvokeInst>(User));
    auto Call = cast<Instruction>(User);

    auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape);

    // Use the returned slot address as the call argument.
    Use.set(Addr);
  }

  // All the uses should be loads and stores now.
  assert(isAllocaPromotable(Alloca));
}

/// "Eliminate" a swifterror argument by reducing it to the alloca case
/// and then loading and storing in the prologue and epilogue.
///
/// The argument keeps the swifterror flag.
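///
/// A minimal sketch of the resulting IR (value names here are hypothetical,
/// not produced by this code):
///
///   define void @f(ptr swifterror %err) {
///   entry:
///     %err.slot = alloca ptr       ; replaces all uses of %err
///     store ptr null, ptr %err.slot
///     ...
///     ; around each suspend and before each coro.end, the current value is
///     ; shuttled between %err.slot and the placeholder get/set calls
///   }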
static void eliminateSwiftErrorArgument(Function &F, Argument &Arg,
                                        coro::Shape &Shape,
                                        SmallVectorImpl<AllocaInst*> &AllocasToPromote) {
  IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg());

  auto ArgTy = cast<PointerType>(Arg.getType());
  auto ValueTy = PointerType::getUnqual(F.getContext());

  // Reduce to the alloca case:

  // Create an alloca and replace all uses of the arg with it.
  auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace());
  Arg.replaceAllUsesWith(Alloca);

  // Set an initial value in the alloca. swifterror is always null on entry.
  auto InitialValue = Constant::getNullValue(ValueTy);
  Builder.CreateStore(InitialValue, Alloca);

  // Find all the suspends in the function and save and restore around them.
  for (auto *Suspend : Shape.CoroSuspends) {
    (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape);
  }

  // Find all the coro.ends in the function and restore the error value.
  for (auto *End : Shape.CoroEnds) {
    Builder.SetInsertPoint(End);
    auto FinalValue = Builder.CreateLoad(ValueTy, Alloca);
    (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape);
  }

  // Now we can use the alloca logic.
  AllocasToPromote.push_back(Alloca);
  eliminateSwiftErrorAlloca(F, Alloca, Shape);
}

/// Eliminate all problematic uses of swifterror arguments and allocas
/// from the function. We'll fix them up later when splitting the function.
static void eliminateSwiftError(Function &F, coro::Shape &Shape) {
  SmallVector<AllocaInst*, 4> AllocasToPromote;

  // Look for a swifterror argument.
  for (auto &Arg : F.args()) {
    if (!Arg.hasSwiftErrorAttr()) continue;

    eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote);
    break;
  }

  // Look for swifterror allocas.
  for (auto &Inst : F.getEntryBlock()) {
    auto Alloca = dyn_cast<AllocaInst>(&Inst);
    if (!Alloca || !Alloca->isSwiftError()) continue;

    // Clear the swifterror flag.
    Alloca->setSwiftError(false);

    AllocasToPromote.push_back(Alloca);
    eliminateSwiftErrorAlloca(F, Alloca, Shape);
  }

  // If we have any allocas to promote, compute a dominator tree and
  // promote them en masse.
  if (!AllocasToPromote.empty()) {
    DominatorTree DT(F);
    PromoteMemToReg(AllocasToPromote, DT);
  }
}

/// retcon and retcon.once conventions assume that all spill uses can be sunk
/// after the coro.begin intrinsic.
static void sinkSpillUsesAfterCoroBegin(Function &F,
                                        const FrameDataInfo &FrameData,
                                        CoroBeginInst *CoroBegin) {
  DominatorTree Dom(F);

  SmallSetVector<Instruction *, 32> ToMove;
  SmallVector<Instruction *, 32> Worklist;

  // Collect all users that precede coro.begin.
  for (auto *Def : FrameData.getAllDefs()) {
    for (User *U : Def->users()) {
      auto Inst = cast<Instruction>(U);
      if (Inst->getParent() != CoroBegin->getParent() ||
          Dom.dominates(CoroBegin, Inst))
        continue;
      if (ToMove.insert(Inst))
        Worklist.push_back(Inst);
    }
  }
  // Recursively collect users before coro.begin.
  while (!Worklist.empty()) {
    auto *Def = Worklist.pop_back_val();
    for (User *U : Def->users()) {
      auto Inst = cast<Instruction>(U);
      if (Dom.dominates(CoroBegin, Inst))
        continue;
      if (ToMove.insert(Inst))
        Worklist.push_back(Inst);
    }
  }

  // Sort by dominance.
  SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end());
  llvm::sort(InsertionList, [&Dom](Instruction *A, Instruction *B) -> bool {
    // If A dominates B, it should precede (<) B.
    return Dom.dominates(A, B);
  });

  Instruction *InsertPt = CoroBegin->getNextNode();
  for (Instruction *Inst : InsertionList)
    Inst->moveBefore(InsertPt);
}

// For each local variable whose users are all inside one suspended region, we
// sink its lifetime.start markers to a point after the suspend block. Doing
// so minimizes the lifetime of each variable, hence minimizing the amount of
// data we end up putting on the frame.
static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
                                     SuspendCrossingInfo &Checker) {
  if (F.hasOptNone())
    return;

  DominatorTree DT(F);

  // Collect all possible basic blocks which may dominate all uses of allocas.
  SmallPtrSet<BasicBlock *, 4> DomSet;
  DomSet.insert(&F.getEntryBlock());
  for (auto *CSI : Shape.CoroSuspends) {
    BasicBlock *SuspendBlock = CSI->getParent();
    assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() &&
           "should have split coro.suspend into its own block");
    DomSet.insert(SuspendBlock->getSingleSuccessor());
  }

  for (Instruction &I : instructions(F)) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&I);
    if (!AI)
      continue;

    for (BasicBlock *DomBB : DomSet) {
      bool Valid = true;
      SmallVector<Instruction *, 1> Lifetimes;

      auto isLifetimeStart = [](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I))
          return II->getIntrinsicID() == Intrinsic::lifetime_start;
        return false;
      };

      auto collectLifetimeStart = [&](Instruction *U, AllocaInst *AI) {
        if (isLifetimeStart(U)) {
          Lifetimes.push_back(U);
          return true;
        }
        if (!U->hasOneUse() || U->stripPointerCasts() != AI)
          return false;
        if (isLifetimeStart(U->user_back())) {
          Lifetimes.push_back(U->user_back());
          return true;
        }
        return false;
      };

      for (User *U : AI->users()) {
        Instruction *UI = cast<Instruction>(U);
        // For all users except lifetime.start markers, if they are all
        // dominated by one of the basic blocks and do not cross
        // suspend points as well, then there is no need to spill the
        // instruction.
        if (!DT.dominates(DomBB, UI->getParent()) ||
            Checker.isDefinitionAcrossSuspend(DomBB, UI)) {
          // Skip lifetime.start, and the GEPs and bitcasts used by
          // lifetime.start markers.
          if (collectLifetimeStart(UI, AI))
            continue;
          Valid = false;
          break;
        }
      }
      // Sink lifetime.start markers to the dominating block when they are
      // only used outside the region.
      if (Valid && Lifetimes.size() != 0) {
        auto *NewLifetime = Lifetimes[0]->clone();
        NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI);
        NewLifetime->insertBefore(DomBB->getTerminator());

        // All the lifetime.start markers outside the region are no longer
        // necessary.
        for (Instruction *S : Lifetimes)
          S->eraseFromParent();

        break;
      }
    }
  }
}

static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape,
                               const SuspendCrossingInfo &Checker,
                               SmallVectorImpl<AllocaInfo> &Allocas,
                               const DominatorTree &DT) {
  if (Shape.CoroSuspends.empty())
    return;

  // The PromiseAlloca will be specially handled since it needs to be in a
  // fixed position in the frame.
  if (AI == Shape.SwitchLowering.PromiseAlloca)
    return;

  // The __coro_gro alloca should outlive the promise, so make sure we
  // keep it outside the frame.
  if (AI->hasMetadata(LLVMContext::MD_coro_outside_frame))
    return;

  // The code that uses the lifetime.start intrinsic does not work for
  // functions with loops that have no exit. Disable it on ABIs we know to
  // generate such code.
  bool ShouldUseLifetimeStartInfo =
      (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
       Shape.ABI != coro::ABI::RetconOnce);
  AllocaUseVisitor Visitor{AI->getModule()->getDataLayout(), DT,
                           *Shape.CoroBegin, Checker,
                           ShouldUseLifetimeStartInfo};
  Visitor.visitPtr(*AI);
  if (!Visitor.getShouldLiveOnFrame())
    return;
  Allocas.emplace_back(AI, Visitor.getAliasesCopy(),
                       Visitor.getMayWriteBeforeCoroBegin());
}

static std::optional<std::pair<Value &, DIExpression &>>
salvageDebugInfoImpl(SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
                     bool OptimizeFrame, bool UseEntryValue, Function *F,
                     Value *Storage, DIExpression *Expr,
                     bool SkipOutermostLoad) {
  IRBuilder<> Builder(F->getContext());
  auto InsertPt = F->getEntryBlock().getFirstInsertionPt();
  while (isa<IntrinsicInst>(InsertPt))
    ++InsertPt;
  Builder.SetInsertPoint(&F->getEntryBlock(), InsertPt);

  while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) {
    if (auto *LdInst = dyn_cast<LoadInst>(Inst)) {
      Storage = LdInst->getPointerOperand();
      // FIXME: This is a heuristic that works around the fact that
      // LLVM IR debug intrinsics cannot yet distinguish between
      // memory and value locations: Because a dbg.declare(alloca) is
      // implicitly a memory location no DW_OP_deref operation for the
      // last direct load from an alloca is necessary. This condition
      // effectively drops the *last* DW_OP_deref in the expression.
      if (!SkipOutermostLoad)
        Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
    } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) {
      Storage = StInst->getValueOperand();
    } else {
      SmallVector<uint64_t, 16> Ops;
      SmallVector<Value *, 0> AdditionalValues;
      Value *Op = llvm::salvageDebugInfoImpl(
          *Inst, Expr ? Expr->getNumLocationOperands() : 0, Ops,
          AdditionalValues);
      if (!Op || !AdditionalValues.empty()) {
        // If salvaging failed or salvaging produced more than one location
        // operand, give up.
        break;
      }
      Storage = Op;
      Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, /*StackValue*/ false);
    }
    SkipOutermostLoad = false;
  }
  if (!Storage)
    return std::nullopt;

  auto *StorageAsArg = dyn_cast<Argument>(Storage);
  const bool IsSwiftAsyncArg =
      StorageAsArg && StorageAsArg->hasAttribute(Attribute::SwiftAsync);

  // Swift async arguments are described by an entry value of the ABI-defined
  // register containing the coroutine context.
  // Entry values in variadic expressions are not supported.
  if (IsSwiftAsyncArg && UseEntryValue && !Expr->isEntryValue() &&
      Expr->isSingleLocationExpression())
    Expr = DIExpression::prepend(Expr, DIExpression::EntryValue);

  // If the coroutine frame is an Argument, store it in an alloca to improve
  // its availability (e.g. registers may be clobbered).
  // Avoid this if optimizations are enabled (they would remove the alloca) or
  // if the value is guaranteed to be available through other means (e.g. swift
  // ABI guarantees).
  if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) {
    auto &Cached = ArgToAllocaMap[StorageAsArg];
    if (!Cached) {
      Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr,
                                    Storage->getName() + ".debug");
      Builder.CreateStore(Storage, Cached);
    }
    Storage = Cached;
    // FIXME: LLVM lacks nuanced semantics to differentiate between
    // memory and direct locations at the IR level. The backend will
    // turn a dbg.declare(alloca, ..., DIExpression()) into a memory
    // location. Thus, if there are deref and offset operations in the
    // expression, we need to add a DW_OP_deref at the *start* of the
    // expression to first load the contents of the alloca before
    // adjusting it with the expression.
    Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
  }

  return {{*Storage, *Expr}};
}

void coro::salvageDebugInfo(
    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
    DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool UseEntryValue) {

  Function *F = DVI.getFunction();
  // Follow the pointer arithmetic all the way to the incoming
  // function argument and convert into a DIExpression.
  bool SkipOutermostLoad = !isa<DbgValueInst>(DVI);
  Value *OriginalStorage = DVI.getVariableLocationOp(0);

  auto SalvagedInfo = ::salvageDebugInfoImpl(
      ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage,
      DVI.getExpression(), SkipOutermostLoad);
  if (!SalvagedInfo)
    return;

  Value *Storage = &SalvagedInfo->first;
  DIExpression *Expr = &SalvagedInfo->second;

  DVI.replaceVariableLocationOp(OriginalStorage, Storage);
  DVI.setExpression(Expr);
  // We only hoist dbg.declare today; it does not make sense to hoist
  // dbg.value, which lacks the function-wide guarantees that dbg.declare
  // provides.
  if (isa<DbgDeclareInst>(DVI)) {
    std::optional<BasicBlock::iterator> InsertPt;
    if (auto *I = dyn_cast<Instruction>(Storage)) {
      InsertPt = I->getInsertionPointAfterDef();
      // Update DILocation only in O0 since it is easy to get out of sync in
      // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
      // an example.
      if (!OptimizeFrame && I->getDebugLoc())
        DVI.setDebugLoc(I->getDebugLoc());
    } else if (isa<Argument>(Storage))
      InsertPt = F->getEntryBlock().begin();
    if (InsertPt)
      DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt);
  }
}

void coro::salvageDebugInfo(
    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, DPValue &DPV,
    bool OptimizeFrame, bool UseEntryValue) {

  Function *F = DPV.getFunction();
  // Follow the pointer arithmetic all the way to the incoming
  // function argument and convert into a DIExpression.
  bool SkipOutermostLoad = DPV.isDbgDeclare();
  Value *OriginalStorage = DPV.getVariableLocationOp(0);

  auto SalvagedInfo = ::salvageDebugInfoImpl(
      ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage,
      DPV.getExpression(), SkipOutermostLoad);
  if (!SalvagedInfo)
    return;

  Value *Storage = &SalvagedInfo->first;
  DIExpression *Expr = &SalvagedInfo->second;

  DPV.replaceVariableLocationOp(OriginalStorage, Storage);
  DPV.setExpression(Expr);
  // We only hoist dbg.declare today; it does not make sense to hoist
  // dbg.value, which lacks the function-wide guarantees that dbg.declare
  // provides.
  if (DPV.getType() == DPValue::LocationType::Declare) {
    std::optional<BasicBlock::iterator> InsertPt;
    if (auto *I = dyn_cast<Instruction>(Storage)) {
      InsertPt = I->getInsertionPointAfterDef();
      // Update DILocation only in O0 since it is easy to get out of sync in
      // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
      // an example.
      if (!OptimizeFrame && I->getDebugLoc())
        DPV.setDebugLoc(I->getDebugLoc());
    } else if (isa<Argument>(Storage))
      InsertPt = F->getEntryBlock().begin();
    if (InsertPt) {
      DPV.removeFromParent();
      (*InsertPt)->getParent()->insertDPValueBefore(&DPV, *InsertPt);
    }
  }
}

static void doRematerializations(
    Function &F, SuspendCrossingInfo &Checker,
    const std::function<bool(Instruction &)> &MaterializableCallback) {
  if (F.hasOptNone())
    return;

  SpillInfo Spills;

  // See if there are materializable instructions across suspend points. We
  // record these as the starting points to also identify materializable defs
  // of uses in these operations.
  for (Instruction &I : instructions(F)) {
    if (!MaterializableCallback(I))
      continue;
    for (User *U : I.users())
      if (Checker.isDefinitionAcrossSuspend(I, U))
        Spills[&I].push_back(cast<Instruction>(U));
  }

  // Process each of the identified rematerializable instructions
  // and add predecessor instructions that can also be rematerialized.
  // This is actually a graph of instructions since we could potentially
  // have multiple uses of a def in the set of predecessor instructions.
  // The approach here is to maintain a graph of instructions for each bottom
  // level instruction - where we have a unique set of instructions (nodes)
  // and edges between them. We then walk the graph in reverse post-order to
  // insert them past the suspend point, while ensuring that the ordering is
  // correct. We also rely on CSE removing duplicate defs for remats of
  // different instructions with a def in common (rather than maintaining more
  // complex graphs for each suspend point).

  // We can do this by adding new nodes to the list for each suspend point.
  // Then standard GraphTraits gives a reverse post-order traversal that we
  // use when inserting the nodes after the suspend point.
  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
  for (auto &E : Spills) {
    for (Instruction *U : E.second) {
      // Don't process a user twice (this can happen if the instruction uses
      // more than one rematerializable def).
      if (AllRemats.count(U))
        continue;

      // The constructor creates the whole RematGraph for the given Use.
      auto RematUPtr =
          std::make_unique<RematGraph>(MaterializableCallback, U, Checker);

      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
                 for (auto I = RPOT.begin(); I != RPOT.end();
                      ++I) { (*I)->Node->dump(); } dbgs()
                 << "\n";);

      AllRemats[U] = std::move(RematUPtr);
    }
  }

  // Rewrite materializable instructions to be materialized at the use
  // point.
  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
  rewriteMaterializableInstructions(AllRemats);
}

void coro::buildCoroutineFrame(
    Function &F, Shape &Shape,
    const std::function<bool(Instruction &)> &MaterializableCallback) {
  // Don't eliminate swifterror in async functions that won't be split.
  if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
    eliminateSwiftError(F, Shape);

  if (Shape.ABI == coro::ABI::Switch &&
      Shape.SwitchLowering.PromiseAlloca) {
    Shape.getSwitchCoroId()->clearPromise();
  }

  // Make sure that all coro.save, coro.suspend and the fallthrough coro.end
  // intrinsics are in their own blocks to simplify the logic of building up
  // SuspendCrossing data.
  for (auto *CSI : Shape.CoroSuspends) {
    if (auto *Save = CSI->getCoroSave())
      splitAround(Save, "CoroSave");
    splitAround(CSI, "CoroSuspend");
  }

  // Put CoroEnds into their own blocks.
  for (AnyCoroEndInst *CE : Shape.CoroEnds) {
    splitAround(CE, "CoroEnd");

    // Emit the musttail call function in a new block before the CoroEnd.
    // We do this here so that the right suspend crossing info is computed for
    // the uses of the musttail call function call. (Arguments to the coro.end
    // instructions would be ignored.)
    if (auto *AsyncEnd = dyn_cast<CoroAsyncEndInst>(CE)) {
      auto *MustTailCallFn = AsyncEnd->getMustTailCallFunction();
      if (!MustTailCallFn)
        continue;
      IRBuilder<> Builder(AsyncEnd);
      SmallVector<Value *, 8> Args(AsyncEnd->args());
      auto Arguments = ArrayRef<Value *>(Args).drop_front(3);
      auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn,
                                      Arguments, Builder);
      splitAround(Call, "MustTailCall.Before.CoroEnd");
    }
  }

  // Later code makes structural assumptions about single-predecessor PHIs,
  // e.g. that they are not live across a suspend point.
  cleanupSinglePredPHIs(F);

  // Transform multi-edge PHI Nodes, so that any value feeding into a PHI will
  // never have its definition separated from the PHI by a suspend point.
  rewritePHIs(F);

  // Build suspend crossing info.
  // Build suspend crossing info.
  SuspendCrossingInfo Checker(F, Shape);

  doRematerializations(F, Checker, MaterializableCallback);

  FrameDataInfo FrameData;
  SmallVector<CoroAllocaAllocInst *, 4> LocalAllocas;
  SmallVector<Instruction *, 4> DeadInstructions;
  if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
      Shape.ABI != coro::ABI::RetconOnce)
    sinkLifetimeStartMarkers(F, Shape, Checker);

  // Collect the spills for arguments and other not-materializable values.
  for (Argument &A : F.args())
    for (User *U : A.users())
      if (Checker.isDefinitionAcrossSuspend(A, U))
        FrameData.Spills[&A].push_back(cast<Instruction>(U));

  const DominatorTree DT(F);
  for (Instruction &I : instructions(F)) {
    // Values returned from coroutine structure intrinsics should not be part
    // of the Coroutine Frame.
    if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin)
      continue;

    // Handle coro.alloca.alloc specially here.
    if (auto AI = dyn_cast<CoroAllocaAllocInst>(&I)) {
      // Check whether the alloca's lifetime is bounded by suspend points.
      if (isLocalAlloca(AI)) {
        LocalAllocas.push_back(AI);
        continue;
      }

      // If not, do a quick rewrite of the alloca and then add spills of
      // the rewritten value. The rewrite doesn't invalidate anything in
      // Spills because the other alloca intrinsics have no other operands
      // besides AI, and it doesn't invalidate the iteration because we delay
      // erasing AI.
      auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions);

      for (User *U : Alloc->users()) {
        if (Checker.isDefinitionAcrossSuspend(*Alloc, U))
          FrameData.Spills[Alloc].push_back(cast<Instruction>(U));
      }
      continue;
    }

    // Ignore coro.alloca.get; we process it as part of coro.alloca.alloc.
    if (isa<CoroAllocaGetInst>(I))
      continue;

    if (auto *AI = dyn_cast<AllocaInst>(&I)) {
      collectFrameAlloca(AI, Shape, Checker, FrameData.Allocas, DT);
      continue;
    }

    for (User *U : I.users())
      if (Checker.isDefinitionAcrossSuspend(I, U)) {
        // We cannot spill a token.
        if (I.getType()->isTokenTy())
          report_fatal_error(
              "token definition is separated from the use by a suspend point");
        FrameData.Spills[&I].push_back(cast<Instruction>(U));
      }
  }

  LLVM_DEBUG(dumpAllocas(FrameData.Allocas));
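
  // As an illustrative sketch (hypothetical IR; the frame field index is
  // made up): once buildFrameType and insertSpills run below, a spilled
  // value %v that is live across a suspend becomes a GEP + store at the
  // definition and a GEP + load at each use:
  //
  //   %v.spill.addr = getelementptr inbounds %f.Frame, ptr %hdl, i32 0, i32 2
  //   store i32 %v, ptr %v.spill.addr          ; right after the definition
  //   ...suspend...
  //   %v.reload = load i32, ptr %v.spill.addr  ; in front of the use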
  // We don't want the layout of the coroutine frame to be affected by debug
  // information, so we only choose to salvage DbgValueInsts whose values are
  // already in the frame. The dbg.values for allocas are handled specially.
  for (auto &Iter : FrameData.Spills) {
    auto *V = Iter.first;
    SmallVector<DbgValueInst *, 16> DVIs;
    SmallVector<DPValue *, 16> DPVs;
    findDbgValues(DVIs, V, &DPVs);
    for (DbgValueInst *DVI : DVIs)
      if (Checker.isDefinitionAcrossSuspend(*V, DVI))
        FrameData.Spills[V].push_back(DVI);
    // Add the instructions which carry debug info that is in the frame.
    for (DPValue *DPV : DPVs)
      if (Checker.isDefinitionAcrossSuspend(*V, DPV->Marker->MarkedInstr))
        FrameData.Spills[V].push_back(DPV->Marker->MarkedInstr);
  }

  LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills));
  if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
      Shape.ABI == coro::ABI::Async)
    sinkSpillUsesAfterCoroBegin(F, FrameData, Shape.CoroBegin);
  Shape.FrameTy = buildFrameType(F, Shape, FrameData);
  Shape.FramePtr = Shape.CoroBegin;
  // For now, this works for C++ programs only.
  buildFrameDebugInfo(F, Shape, FrameData);
  insertSpills(FrameData, Shape);
  lowerLocalAllocas(LocalAllocas, DeadInstructions);

  for (auto *I : DeadInstructions)
    I->eraseFromParent();
}