//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there is
// at most one divergent exiting block.
//
// StructurizeCFG can't deal with multi-exit regions formed by branches to
// multiple return nodes. It is not desirable to structurize regions with
// uniform branches, and unifying those exits into the same return block as
// the divergent exits would inhibit the use of scalar branching, so uniformly
// reached exits are left untouched. StructurizeCFG still can't deal with the
// case where one branch goes to a return and another to an unreachable;
// replace the unreachable with a return in that case.
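//
// For illustration only (a sketch, not taken from any actual test): given two
// returns that are reached divergently,
//
//   ret.a:                                ret.b:
//     ret float %a                          ret float %b
//
// the pass rewrites both returns into branches to a single block:
//
//   UnifiedReturnBlock:
//     %UnifiedRetVal = phi float [ %a, %ret.a ], [ %b, %ret.b ]
//     ret float %UnifiedRetVal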
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"

namespace {

class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(
        *PassRegistry::getPassRegistry());
  }

  // We can preserve non-critical-edgeness when we unify function exit nodes
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace

char AMDGPUUnifyDivergentExitNodes::ID = 0;

char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;

INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                      "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                    "Unify divergent function exit nodes", false, false)

void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
  // TODO: Preserve dominator tree.
  AU.addRequired<PostDominatorTreeWrapperPass>();

  AU.addRequired<LegacyDivergenceAnalysis>();

  // No divergent values are changed, only blocks and branch edges.
  AU.addPreserved<LegacyDivergenceAnalysis>();

  // We preserve the non-critical-edgeness property
  AU.addPreservedID(BreakCriticalEdgesID);

  // This is a cluster of orthogonal Transforms
  AU.addPreservedID(LowerSwitchID);
  FunctionPass::getAnalysisUsage(AU);

  AU.addRequired<TargetTransformInfoWrapperPass>();
}

/// \returns true if \p BB is reachable through only uniform branches.
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                               BasicBlock &BB) {
  SmallVector<BasicBlock *, 8> Stack;
  SmallPtrSet<BasicBlock *, 8> Visited;

  for (BasicBlock *Pred : predecessors(&BB))
    Stack.push_back(Pred);

  while (!Stack.empty()) {
    BasicBlock *Top = Stack.pop_back_val();
    if (!DA.isUniform(Top->getTerminator()))
      return false;

    for (BasicBlock *Pred : predecessors(Top)) {
      if (Visited.insert(Pred).second)
        Stack.push_back(Pred);
    }
  }

  return true;
}

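/// Clear the "done" bit on every export intrinsic in \p F. The unified return
/// block inserts its own null export with the "done" bit set, and more than
/// one "done" export can lead to undefined behavior.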
static void removeDoneExport(Function &F) {
  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
          Intrin->setArgOperand(6, BoolFalse); // done
        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
          Intrin->setArgOperand(4, BoolFalse); // done
        }
      }
    }
  }
}

static BasicBlock *unifyReturnBlockSet(Function &F,
                                       ArrayRef<BasicBlock *> ReturningBlocks,
                                       bool InsertExport,
                                       const TargetTransformInfo &TTI,
                                       StringRef Name) {
  // We need to insert a new basic block into the function, add a PHI node (if
  // the function returns a value), and convert all of the return instructions
  // into unconditional branches.
  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
  IRBuilder<> B(NewRetBlock);

  if (InsertExport) {
    // Ensure that there's only one "done" export in the shader by removing the
    // "done" bit set on the original final export. More than one "done" export
    // can lead to undefined behavior.
    removeDoneExport(F);

    Value *Undef = UndefValue::get(B.getFloatTy());
    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
                      {
                        B.getInt32(9), // target, SQ_EXP_NULL
                        B.getInt32(0), // enabled channels
                        Undef, Undef, Undef, Undef, // values
                        B.getTrue(), // done
                        B.getTrue(), // valid mask
                      });
  }

  PHINode *PN = nullptr;
  if (F.getReturnType()->isVoidTy()) {
    B.CreateRetVoid();
  } else {
    // If the function doesn't return void... add a PHI node to the block...
    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                     "UnifiedRetVal");
    assert(!InsertExport);
    B.CreateRet(PN);
  }

  // Loop over all of the blocks, replacing the return instruction with an
  // unconditional branch.
  for (BasicBlock *BB : ReturningBlocks) {
    // Add an incoming element to the PHI node for every return instruction
    // that is merging into this new block...
    if (PN)
      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);

    // Remove and delete the return inst.
    BB->getTerminator()->eraseFromParent();
    BranchInst::Create(NewRetBlock, BB);
  }

  for (BasicBlock *BB : ReturningBlocks) {
    // Cleanup possible branch to unconditional branch to the return.
    simplifyCFG(BB, TTI, {2});
  }

  return NewRetBlock;
}

bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
  if (PDT.getRoots().size() <= 1)
    return false;

  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();

  // Loop over all of the blocks in a function, tracking all of the blocks that
  // return.
  SmallVector<BasicBlock *, 4> ReturningBlocks;
  SmallVector<BasicBlock *, 4> UnreachableBlocks;

  // Dummy return block for infinite loop.
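  // A post-dominator tree root whose terminator is a branch (rather than a
  // return or unreachable) sits in an infinite loop. Such a branch is given an
  // artificial edge to this block, e.g. an unconditional "br label %loop"
  // becomes "br i1 true, label %loop, label %DummyReturnBlock", so the
  // function gains a reachable exit that the structurizer can work with.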
  BasicBlock *DummyReturnBB = nullptr;

  bool InsertExport = false;

  for (BasicBlock *BB : PDT.getRoots()) {
    if (isa<ReturnInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        ReturningBlocks.push_back(BB);
    } else if (isa<UnreachableInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        UnreachableBlocks.push_back(BB);
    } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {

      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
      if (DummyReturnBB == nullptr) {
        DummyReturnBB = BasicBlock::Create(F.getContext(),
                                           "DummyReturnBlock", &F);
        Type *RetTy = F.getReturnType();
        Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);

        // For pixel shaders, the producer guarantees that an export is
        // executed before each return instruction. However, if there is an
        // infinite loop and we insert a return ourselves, we need to uphold
        // that guarantee by inserting a null export. This can happen e.g. in
        // an infinite loop with kill instructions, which is supposed to
        // terminate. However, we don't need to do this if there is a non-void
        // return value, since then there is an epilog afterwards which will
        // still export.
        //
        // Note: In the case where only some threads enter the infinite loop,
        // this can result in the null export happening redundantly after the
        // original exports. However, the last "real" export happens after all
        // the threads that didn't enter an infinite loop converged, which
        // means that the only extra threads to execute the null export are
        // threads that entered the infinite loop, and they only could've
        // exited through being killed which sets their exec bit to 0.
        // Therefore, unless there's an actual infinite loop, which can have
        // invalid results, or there's a kill after the last export, which we
        // assume the frontend won't do, this export will have the same exec
        // mask as the last "real" export, and therefore the valid mask will be
        // overwritten with the same value and will still be correct. Also,
        // even though this forces an extra unnecessary export wait, we assume
        // that this happens rarely enough in practice that we don't have to
        // worry about performance.
        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
            RetTy->isVoidTy()) {
          InsertExport = true;
        }

        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
        ReturningBlocks.push_back(DummyReturnBB);
      }

      if (BI->isUnconditional()) {
        BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
        BI->eraseFromParent(); // Delete the unconditional branch.
        // Add a new conditional branch with a dummy edge to the return block.
        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
      } else { // Conditional branch.
        // Create a new transition block to hold the conditional branch.
        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");

        // Create a branch that will always branch to the transition block and
        // references DummyReturnBB.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
      }
    }
  }

  if (!UnreachableBlocks.empty()) {
    BasicBlock *UnreachableBlock = nullptr;

    if (UnreachableBlocks.size() == 1) {
      UnreachableBlock = UnreachableBlocks.front();
    } else {
      UnreachableBlock = BasicBlock::Create(F.getContext(),
                                            "UnifiedUnreachableBlock", &F);
      new UnreachableInst(F.getContext(), UnreachableBlock);

      for (BasicBlock *BB : UnreachableBlocks) {
        // Remove and delete the unreachable inst.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(UnreachableBlock, BB);
      }
    }

    if (!ReturningBlocks.empty()) {
      // Don't create a new unreachable inst if we have a return. The
      // structurizer/annotator can't handle the multiple exits.

      Type *RetTy = F.getReturnType();
      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
      // Remove and delete the unreachable inst.
      UnreachableBlock->getTerminator()->eraseFromParent();

      Function *UnreachableIntrin =
        Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);

      // Insert a call to an intrinsic tracking that this is an unreachable
      // point, in case we want to kill the active lanes or something later.
      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);

      // Don't create a scalar trap. We would only want to trap if this code was
      // really reached, but a scalar trap would happen even if no lanes
      // actually reached here.
      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
      ReturningBlocks.push_back(UnreachableBlock);
    }
  }

  // Now handle return blocks.
  if (ReturningBlocks.empty())
    return false; // No blocks return

  if (ReturningBlocks.size() == 1)
    return false; // Already has a single return block

  const TargetTransformInfo &TTI
    = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI,
                      "UnifiedReturnBlock");
  return true;
}