//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vsrc> = COPY %0 <sgpr>
///   ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %3 to
/// <sgpr> so we end up with final code like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains its definition class to
/// <vgpr> if the user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr> then the coalescer
/// will be unable to perform the COPY removal from the above example which
/// ultimately led to the creation of an illegal COPY.
65284677Sdim//===----------------------------------------------------------------------===// 66284677Sdim 67284677Sdim#include "AMDGPU.h" 68284677Sdim#include "AMDGPUSubtarget.h" 69360784Sdim#include "MCTargetDesc/AMDGPUMCTargetDesc.h" 70284677Sdim#include "SIInstrInfo.h" 71327952Sdim#include "SIRegisterInfo.h" 72321369Sdim#include "llvm/ADT/DenseSet.h" 73327952Sdim#include "llvm/ADT/STLExtras.h" 74327952Sdim#include "llvm/ADT/SmallSet.h" 75327952Sdim#include "llvm/ADT/SmallVector.h" 76327952Sdim#include "llvm/CodeGen/MachineBasicBlock.h" 77314564Sdim#include "llvm/CodeGen/MachineDominators.h" 78327952Sdim#include "llvm/CodeGen/MachineFunction.h" 79284677Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 80327952Sdim#include "llvm/CodeGen/MachineInstr.h" 81284677Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 82327952Sdim#include "llvm/CodeGen/MachineOperand.h" 83284677Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 84327952Sdim#include "llvm/CodeGen/TargetRegisterInfo.h" 85360784Sdim#include "llvm/InitializePasses.h" 86327952Sdim#include "llvm/Pass.h" 87327952Sdim#include "llvm/Support/CodeGen.h" 88327952Sdim#include "llvm/Support/CommandLine.h" 89284677Sdim#include "llvm/Support/Debug.h" 90284677Sdim#include "llvm/Support/raw_ostream.h" 91284677Sdim#include "llvm/Target/TargetMachine.h" 92327952Sdim#include <cassert> 93327952Sdim#include <cstdint> 94327952Sdim#include <iterator> 95327952Sdim#include <list> 96327952Sdim#include <map> 97327952Sdim#include <tuple> 98327952Sdim#include <utility> 99284677Sdim 100284677Sdimusing namespace llvm; 101284677Sdim 102309124Sdim#define DEBUG_TYPE "si-fix-sgpr-copies" 103284677Sdim 104321369Sdimstatic cl::opt<bool> EnableM0Merge( 105321369Sdim "amdgpu-enable-merge-m0", 106321369Sdim cl::desc("Merge and hoist M0 initializations"), 107353358Sdim cl::init(true)); 108321369Sdim 109284677Sdimnamespace { 110284677Sdim 111284677Sdimclass SIFixSGPRCopies : public MachineFunctionPass { 112314564Sdim MachineDominatorTree *MDT; 
113341825Sdim 114296417Sdimpublic: 115284677Sdim static char ID; 116284677Sdim 117360784Sdim MachineRegisterInfo *MRI; 118360784Sdim const SIRegisterInfo *TRI; 119360784Sdim const SIInstrInfo *TII; 120360784Sdim 121327952Sdim SIFixSGPRCopies() : MachineFunctionPass(ID) {} 122284677Sdim 123284677Sdim bool runOnMachineFunction(MachineFunction &MF) override; 124284677Sdim 125360784Sdim void processPHINode(MachineInstr &MI); 126360784Sdim 127314564Sdim StringRef getPassName() const override { return "SI Fix SGPR copies"; } 128284677Sdim 129296417Sdim void getAnalysisUsage(AnalysisUsage &AU) const override { 130314564Sdim AU.addRequired<MachineDominatorTree>(); 131314564Sdim AU.addPreserved<MachineDominatorTree>(); 132296417Sdim AU.setPreservesCFG(); 133296417Sdim MachineFunctionPass::getAnalysisUsage(AU); 134296417Sdim } 135284677Sdim}; 136284677Sdim 137327952Sdim} // end anonymous namespace 138284677Sdim 139314564SdimINITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, 140314564Sdim "SI Fix SGPR copies", false, false) 141321369SdimINITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 142314564SdimINITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, 143314564Sdim "SI Fix SGPR copies", false, false) 144296417Sdim 145284677Sdimchar SIFixSGPRCopies::ID = 0; 146284677Sdim 147296417Sdimchar &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; 148296417Sdim 149296417SdimFunctionPass *llvm::createSIFixSGPRCopiesPass() { 150296417Sdim return new SIFixSGPRCopies(); 151284677Sdim} 152284677Sdim 153353358Sdimstatic bool hasVectorOperands(const MachineInstr &MI, 154353358Sdim const SIRegisterInfo *TRI) { 155284677Sdim const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 156284677Sdim for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 157284677Sdim if (!MI.getOperand(i).isReg() || 158360784Sdim !Register::isVirtualRegister(MI.getOperand(i).getReg())) 159284677Sdim continue; 160284677Sdim 161353358Sdim if 
(TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg()))) 162284677Sdim return true; 163284677Sdim } 164284677Sdim return false; 165284677Sdim} 166284677Sdim 167296417Sdimstatic std::pair<const TargetRegisterClass *, const TargetRegisterClass *> 168296417SdimgetCopyRegClasses(const MachineInstr &Copy, 169296417Sdim const SIRegisterInfo &TRI, 170296417Sdim const MachineRegisterInfo &MRI) { 171360784Sdim Register DstReg = Copy.getOperand(0).getReg(); 172360784Sdim Register SrcReg = Copy.getOperand(1).getReg(); 173284677Sdim 174360784Sdim const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg) 175360784Sdim ? MRI.getRegClass(SrcReg) 176360784Sdim : TRI.getPhysRegClass(SrcReg); 177284677Sdim 178296417Sdim // We don't really care about the subregister here. 179296417Sdim // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); 180284677Sdim 181360784Sdim const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg) 182360784Sdim ? MRI.getRegClass(DstReg) 183360784Sdim : TRI.getPhysRegClass(DstReg); 184296417Sdim 185296417Sdim return std::make_pair(SrcRC, DstRC); 186284677Sdim} 187284677Sdim 188296417Sdimstatic bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, 189296417Sdim const TargetRegisterClass *DstRC, 190296417Sdim const SIRegisterInfo &TRI) { 191344779Sdim return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) && 192353358Sdim TRI.hasVectorRegisters(SrcRC); 193296417Sdim} 194284677Sdim 195296417Sdimstatic bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, 196296417Sdim const TargetRegisterClass *DstRC, 197296417Sdim const SIRegisterInfo &TRI) { 198344779Sdim return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) && 199353358Sdim TRI.hasVectorRegisters(DstRC); 200284677Sdim} 201284677Sdim 202321369Sdimstatic bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, 203321369Sdim const SIRegisterInfo *TRI, 204321369Sdim const SIInstrInfo *TII) { 205321369Sdim MachineRegisterInfo &MRI = 
MI.getParent()->getParent()->getRegInfo(); 206321369Sdim auto &Src = MI.getOperand(1); 207360784Sdim Register DstReg = MI.getOperand(0).getReg(); 208360784Sdim Register SrcReg = Src.getReg(); 209360784Sdim if (!Register::isVirtualRegister(SrcReg) || 210360784Sdim !Register::isVirtualRegister(DstReg)) 211321369Sdim return false; 212321369Sdim 213321369Sdim for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { 214321369Sdim const auto *UseMI = MO.getParent(); 215321369Sdim if (UseMI == &MI) 216321369Sdim continue; 217321369Sdim if (MO.isDef() || UseMI->getParent() != MI.getParent() || 218321369Sdim UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END || 219321369Sdim !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src)) 220321369Sdim return false; 221321369Sdim } 222321369Sdim // Change VGPR to SGPR destination. 223321369Sdim MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg))); 224321369Sdim return true; 225321369Sdim} 226321369Sdim 227296417Sdim// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. 228296417Sdim// 229296417Sdim// SGPRx = ... 230296417Sdim// SGPRy = REG_SEQUENCE SGPRx, sub0 ... 231296417Sdim// VGPRz = COPY SGPRy 232296417Sdim// 233296417Sdim// ==> 234296417Sdim// 235296417Sdim// VGPRx = COPY SGPRx 236296417Sdim// VGPRz = REG_SEQUENCE VGPRx, sub0 237296417Sdim// 238296417Sdim// This exposes immediate folding opportunities when materializing 64-bit 239296417Sdim// immediates. 
// Perform the SGPR->VGPR REG_SEQUENCE distribution described in the comment
// above. \p MI must be a REG_SEQUENCE; returns true if the transform (or the
// tryChangeVGPRtoSGPRinCopy fallback) fired. On success the single COPY user
// is erased and MI is rewritten in place to define the copy's destination.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  // Only SGPR REG_SEQUENCEs whose sole use is a copy are candidates.
  Register DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  // Prefer keeping everything scalar if the copy's users allow it.
  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  // Retype the REG_SEQUENCE result to the copy's (vector) destination class
  // before rewriting its operands.
  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy

  // =>
  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->hasAGPRs(DstRC);

  // REG_SEQUENCE operands come in (reg, subreg-index) pairs starting at 1.
  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    Register SrcReg = MI.getOperand(I).getReg();
    unsigned SrcSubReg = MI.getOperand(I).getSubReg();

    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");

    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    // Copy each scalar input into a fresh VGPR feeding the new sequence.
    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      // AGPR destinations need a second hop VGPR->AGPR; 32-bit pieces use
      // V_ACCVGPR_WRITE_B32, wider ones a plain COPY.
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
            TmpAReg)
        .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}

// Check whether COPY \p Copy of the value defined by \p MoveImm can be
// replaced by a scalar move of the immediate. On success, \p SMovOp receives
// the S_MOV opcode to use and \p Imm the immediate value.
static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (Copy->getOpcode() != AMDGPU::COPY)
    return false;

  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(0).getSubReg())
    return false;

  // Map the vector move opcode to its scalar equivalent.
  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}

// Depth-first search over the transitive predecessors of \p MBB (excluding
// paths through \p CutOff), returning true if \p Predicate holds for any
// visited block.
template <class UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
                        const MachineBasicBlock *CutOff,
                        UnaryPredicate Predicate) {
  if (MBB == CutOff)
    return false;

  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
                                               MBB->pred_end());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (!Visited.insert(MBB).second)
      continue;
    if (MBB == CutOff)
      continue;
    if (Predicate(MBB))
      return true;

    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }

  return false;
}

// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
static bool isReachable(const MachineInstr *From,
                        const MachineInstr *To,
                        const MachineBasicBlock *CutOff,
                        MachineDominatorTree &MDT) {
  // If either From block dominates To block or instructions are in the same
  // block and From is higher.
  if (MDT.dominates(From, To))
    return true;

  const MachineBasicBlock *MBBFrom = From->getParent();
  const MachineBasicBlock *MBBTo = To->getParent();
  if (MBBFrom == MBBTo)
    return false;

  // Instructions are in different blocks, do predecessor search.
  // We should almost never get here since we do not usually produce M0 stores
  // other than -1.
  return searchPredecessors(MBBTo, CutOff, [MBBFrom]
                            (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}

// Return the first non-prologue instruction in the block.
static MachineBasicBlock::iterator
getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
    ++I;

  return I;
}

// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  // Classify every def of Reg: a def whose only operands are the Reg def
  // itself plus exactly one immediate is an "init"; anything else clobbers.
  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      } else if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  // For each immediate value, try to merge every pair of its inits.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          // A clobber (or an init of a different value) interferes if it can
          // execute between To and From on some path.
          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both can clobber, this is not an interference only if both are
            // dominated by Clobber and belong to the same block or if Clobber
            // properly dominates To, given that To >> From, so it dominates
            // both and located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          // MI1 already covers MI2: mark MI2 redundant.
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          // MI2 already covers MI1: mark MI1 redundant.
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          // Neither dominates the other: hoist MI2 into the nearest common
          // dominator (after its prologue) and drop MI1.
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << "and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto MI : Defs) {
      auto MBB = MI->getParent();
      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      // Stop at any read/write of Reg or a scheduling boundary.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}

bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTree>();

  SmallVector<MachineInstr *, 16> Worklist;

  // Single forward walk over every instruction; each interesting opcode is
  // fixed up in place.
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY:
      case AMDGPU::WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::WWM: {
        Register DstReg = MI.getOperand(0).getReg();

        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (!Register::isVirtualRegister(DstReg)) {
          // If the destination register is a physical register there isn't
          // really much we can do to fix this.
          // Some special instructions use M0 as an input. Some even only use
          // the first lane. Insert a readfirstlane and hope for the best.
          if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
            Register TmpReg
              = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

            BuildMI(MBB, MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
              .add(MI.getOperand(1));
            MI.getOperand(1).setReg(TmpReg);
          }

          continue;
        }

        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
          // Physical (e.g. live-in) vector sources can't be analyzed further;
          // move the whole copy to the VALU.
          Register SrcReg = MI.getOperand(1).getReg();
          if (!Register::isVirtualRegister(SrcReg)) {
            TII->moveToVALU(MI, MDT);
            break;
          }

          MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
          unsigned SMovOp;
          int64_t Imm;
          // If we are just copying an immediate, we can replace the copy with
          // s_mov_b32.
          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
            MI.getOperand(1).ChangeToImmediate(Imm);
            MI.addImplicitDefUseOperands(MF);
            MI.setDesc(TII->get(SMovOp));
            break;
          }
          TII->moveToVALU(MI, MDT);
        } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
        }

        break;
      }
      case AMDGPU::PHI: {
        processPHINode(MI);
        break;
      }
      case AMDGPU::REG_SEQUENCE:
        // Already a vector result, or no vector inputs: try the SGPR->VGPR
        // distribution instead of a full VALU move.
        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
            !hasVectorOperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
          continue;
        }

        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        TII->moveToVALU(MI, MDT);
        break;
      case AMDGPU::INSERT_SUBREG: {
        // An SGPR INSERT_SUBREG with any vector input must go to the VALU.
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVectorRegisters(Src0RC) ||
             TRI->hasVectorRegisters(Src1RC))) {
          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          TII->moveToVALU(MI, MDT);
        }
        break;
      }
      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // SGPR restriction
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case it
        // is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check for trivially easy constant prop into one of the operands
          // If this is the case then perform the operation now to resolve SGPR
          // issue. If we don't do that here we will always insert a mov to m0
          // that can't be resolved in later operand folding pass
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (Register::isVirtualRegister(MO->getReg())) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && TII->isFoldableCopy(*DefMI)) {
                const MachineOperand &Def = DefMI->getOperand(0);
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  const MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // Haven't managed to resolve by replacing an SGPR with an immediate
            // Move src1 to be in M0
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  return true;
}

// Classify the transitive users of PHI \p MI (following copies, reg
// sequences and other PHIs), then either retype the PHI result to AGPRs,
// move the PHI to the VALU, or just legalize its operands.
void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
  unsigned numVGPRUses = 0;
  bool AllAGPRUses = true;
  SetVector<const MachineInstr *> worklist;
  SmallSet<const MachineInstr *, 4> Visited;
  worklist.insert(&MI);
  Visited.insert(&MI);
  // Walk the def-use web rooted at the PHI result, counting users that
  // require a VGPR and checking whether every use is an AGPR use.
  while (!worklist.empty()) {
    const MachineInstr *Instr = worklist.pop_back_val();
    unsigned Reg = Instr->getOperand(0).getReg();
    for (const auto &Use : MRI->use_operands(Reg)) {
      const MachineInstr *UseMI = Use.getParent();
      AllAGPRUses &= (UseMI->isCopy() &&
                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                     TRI->isAGPR(*MRI, Use.getReg());
      if (UseMI->isCopy() || UseMI->isRegSequence()) {
        // Copies/reg_sequences are followed transitively; a copy into a
        // non-SGPR physical register counts as a VGPR use.
        if (UseMI->isCopy() &&
          UseMI->getOperand(0).getReg().isPhysical() &&
          !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
          numVGPRUses++;
        }
        if (Visited.insert(UseMI).second)
          worklist.insert(UseMI);

        continue;
      }

      if (UseMI->isPHI()) {
        // A use in another PHI counts as a VGPR use unless that PHI's result
        // is an SGPR (or the VReg_1 i1 pseudo class).
        const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
        if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
          UseRC != &AMDGPU::VReg_1RegClass)
          numVGPRUses++;
        continue;
      }

      // Ordinary instruction: consult the operand's required register class.
      const TargetRegisterClass *OpRC =
        TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
      if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
          OpRC != &AMDGPU::VS_64RegClass) {
        numVGPRUses++;
      }
    }
  }

  // If every use is an AGPR use, retype the PHI result to the equivalent
  // AGPR class.
  Register PHIRes = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
  }

  // Determine whether any incoming value is genuinely vector: a vector
  // register that is not just a copy of an SGPR, or a copy whose source is a
  // vector register. PHI operands come in (value, block) pairs from index 1.
  bool hasVGPRInput = false;
  for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
    unsigned InputReg = MI.getOperand(i).getReg();
    MachineInstr *Def = MRI->getVRegDef(InputReg);
    if (TRI->isVectorRegister(*MRI, InputReg)) {
      if (Def->isCopy()) {
        unsigned SrcReg = Def->getOperand(1).getReg();
        const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, SrcReg);
        if (TRI->isSGPRClass(RC))
          continue;
      }
      hasVGPRInput = true;
      break;
    }
    else if (Def->isCopy() &&
        TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
      hasVGPRInput = true;
      break;
    }
  }

  // A scalar PHI with vector inputs (or multiple VGPR uses) is moved to the
  // VALU; otherwise only its operands are legalized.
  if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
       RC0 != &AMDGPU::VReg_1RegClass) &&
    (hasVGPRInput || numVGPRUses > 1)) {
    LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
    TII->moveToVALU(MI);
  }
  else {
    LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
    TII->legalizeOperands(MI, MDT);
  }

}