//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
/// Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %vreg0 <sgpr> = SCALAR_INST
///   %vreg1 <vsrc> = COPY %vreg0 <sgpr>
///    ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %vreg2 <vgpr> = VECTOR_INST
///   %vreg3 <vsrc> = COPY %vreg2 <vgpr>
/// BB2:
///   %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vsrc>, <BB#1>
///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
///
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %vreg0 <sgpr> = SCALAR_INST
///    ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %vreg2 <vgpr> = VECTOR_INST
///   %vreg3 <vsrc> = COPY %vreg2 <vgpr>
/// BB2:
///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1>
///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is now forced to constrain the register class of %vreg3 to
/// <sgpr> so we end up with final code like this:
///
/// BB0:
///   %vreg0 <sgpr> = SCALAR_INST
///    ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %vreg2 <vgpr> = VECTOR_INST
///   %vreg3 <sgpr> = COPY %vreg2 <vgpr>
/// BB2:
///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains its definition class to
/// <vgpr> if the user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr> then the coalescer
/// will be unable to perform the COPY removal from the above example which
/// ultimately led to the creation of an illegal COPY.
66284677Sdim//===----------------------------------------------------------------------===// 67284677Sdim 68284677Sdim#include "AMDGPU.h" 69284677Sdim#include "AMDGPUSubtarget.h" 70284677Sdim#include "SIInstrInfo.h" 71284677Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 72284677Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 73284677Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 74284677Sdim#include "llvm/Support/Debug.h" 75284677Sdim#include "llvm/Support/raw_ostream.h" 76284677Sdim#include "llvm/Target/TargetMachine.h" 77284677Sdim 78284677Sdimusing namespace llvm; 79284677Sdim 80284677Sdim#define DEBUG_TYPE "sgpr-copies" 81284677Sdim 82284677Sdimnamespace { 83284677Sdim 84284677Sdimclass SIFixSGPRCopies : public MachineFunctionPass { 85296417Sdimpublic: 86284677Sdim static char ID; 87284677Sdim 88296417Sdim SIFixSGPRCopies() : MachineFunctionPass(ID) { } 89284677Sdim 90284677Sdim bool runOnMachineFunction(MachineFunction &MF) override; 91284677Sdim 92284677Sdim const char *getPassName() const override { 93284677Sdim return "SI Fix SGPR copies"; 94284677Sdim } 95284677Sdim 96296417Sdim void getAnalysisUsage(AnalysisUsage &AU) const override { 97296417Sdim AU.setPreservesCFG(); 98296417Sdim MachineFunctionPass::getAnalysisUsage(AU); 99296417Sdim } 100284677Sdim}; 101284677Sdim 102284677Sdim} // End anonymous namespace 103284677Sdim 104296417SdimINITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE, 105296417Sdim "SI Fix SGPR copies", false, false) 106296417Sdim 107284677Sdimchar SIFixSGPRCopies::ID = 0; 108284677Sdim 109296417Sdimchar &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; 110296417Sdim 111296417SdimFunctionPass *llvm::createSIFixSGPRCopiesPass() { 112296417Sdim return new SIFixSGPRCopies(); 113284677Sdim} 114284677Sdim 115284677Sdimstatic bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { 116284677Sdim const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 117284677Sdim for (unsigned i = 0, e = 
MI.getNumOperands(); i != e; ++i) { 118284677Sdim if (!MI.getOperand(i).isReg() || 119284677Sdim !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) 120284677Sdim continue; 121284677Sdim 122284677Sdim if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) 123284677Sdim return true; 124284677Sdim } 125284677Sdim return false; 126284677Sdim} 127284677Sdim 128296417Sdimstatic std::pair<const TargetRegisterClass *, const TargetRegisterClass *> 129296417SdimgetCopyRegClasses(const MachineInstr &Copy, 130296417Sdim const SIRegisterInfo &TRI, 131296417Sdim const MachineRegisterInfo &MRI) { 132296417Sdim unsigned DstReg = Copy.getOperand(0).getReg(); 133296417Sdim unsigned SrcReg = Copy.getOperand(1).getReg(); 134284677Sdim 135296417Sdim const TargetRegisterClass *SrcRC = 136296417Sdim TargetRegisterInfo::isVirtualRegister(SrcReg) ? 137296417Sdim MRI.getRegClass(SrcReg) : 138296417Sdim TRI.getPhysRegClass(SrcReg); 139284677Sdim 140296417Sdim // We don't really care about the subregister here. 141296417Sdim // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); 142284677Sdim 143296417Sdim const TargetRegisterClass *DstRC = 144296417Sdim TargetRegisterInfo::isVirtualRegister(DstReg) ? 
145296417Sdim MRI.getRegClass(DstReg) : 146296417Sdim TRI.getPhysRegClass(DstReg); 147296417Sdim 148296417Sdim return std::make_pair(SrcRC, DstRC); 149284677Sdim} 150284677Sdim 151296417Sdimstatic bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, 152296417Sdim const TargetRegisterClass *DstRC, 153296417Sdim const SIRegisterInfo &TRI) { 154296417Sdim return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC); 155296417Sdim} 156284677Sdim 157296417Sdimstatic bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, 158296417Sdim const TargetRegisterClass *DstRC, 159296417Sdim const SIRegisterInfo &TRI) { 160296417Sdim return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); 161284677Sdim} 162284677Sdim 163296417Sdim// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. 164296417Sdim// 165296417Sdim// SGPRx = ... 166296417Sdim// SGPRy = REG_SEQUENCE SGPRx, sub0 ... 167296417Sdim// VGPRz = COPY SGPRy 168296417Sdim// 169296417Sdim// ==> 170296417Sdim// 171296417Sdim// VGPRx = COPY SGPRx 172296417Sdim// VGPRz = REG_SEQUENCE VGPRx, sub0 173296417Sdim// 174296417Sdim// This exposes immediate folding opportunities when materializing 64-bit 175296417Sdim// immediates. 
176296417Sdimstatic bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, 177296417Sdim const SIRegisterInfo *TRI, 178296417Sdim const SIInstrInfo *TII, 179296417Sdim MachineRegisterInfo &MRI) { 180296417Sdim assert(MI.isRegSequence()); 181284677Sdim 182296417Sdim unsigned DstReg = MI.getOperand(0).getReg(); 183296417Sdim if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) 184296417Sdim return false; 185284677Sdim 186296417Sdim if (!MRI.hasOneUse(DstReg)) 187284677Sdim return false; 188284677Sdim 189296417Sdim MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg); 190296417Sdim if (!CopyUse.isCopy()) 191296417Sdim return false; 192284677Sdim 193296417Sdim const TargetRegisterClass *SrcRC, *DstRC; 194296417Sdim std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); 195284677Sdim 196296417Sdim if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) 197284677Sdim return false; 198284677Sdim 199296417Sdim // TODO: Could have multiple extracts? 200296417Sdim unsigned SubReg = CopyUse.getOperand(1).getSubReg(); 201296417Sdim if (SubReg != AMDGPU::NoSubRegister) 202296417Sdim return false; 203296417Sdim 204296417Sdim MRI.setRegClass(DstReg, DstRC); 205296417Sdim 206296417Sdim // SGPRx = ... 207296417Sdim // SGPRy = REG_SEQUENCE SGPRx, sub0 ... 
208296417Sdim // VGPRz = COPY SGPRy 209296417Sdim 210296417Sdim // => 211296417Sdim // VGPRx = COPY SGPRx 212296417Sdim // VGPRz = REG_SEQUENCE VGPRx, sub0 213296417Sdim 214296417Sdim MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); 215296417Sdim 216296417Sdim for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { 217296417Sdim unsigned SrcReg = MI.getOperand(I).getReg(); 218296417Sdim unsigned SrcSubReg = MI.getOperand(I).getSubReg(); 219296417Sdim 220296417Sdim const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); 221296417Sdim assert(TRI->isSGPRClass(SrcRC) && 222296417Sdim "Expected SGPR REG_SEQUENCE to only have SGPR inputs"); 223296417Sdim 224296417Sdim SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); 225296417Sdim const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); 226296417Sdim 227296417Sdim unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); 228296417Sdim 229296417Sdim BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) 230296417Sdim .addOperand(MI.getOperand(I)); 231296417Sdim 232296417Sdim MI.getOperand(I).setReg(TmpReg); 233296417Sdim } 234296417Sdim 235296417Sdim CopyUse.eraseFromParent(); 236296417Sdim return true; 237284677Sdim} 238284677Sdim 239284677Sdimbool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { 240284677Sdim MachineRegisterInfo &MRI = MF.getRegInfo(); 241284677Sdim const SIRegisterInfo *TRI = 242284677Sdim static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); 243284677Sdim const SIInstrInfo *TII = 244284677Sdim static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); 245296417Sdim 246296417Sdim SmallVector<MachineInstr *, 16> Worklist; 247296417Sdim 248284677Sdim for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 249284677Sdim BI != BE; ++BI) { 250284677Sdim 251284677Sdim MachineBasicBlock &MBB = *BI; 252284677Sdim for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); 253296417Sdim I != E; ++I) { 
254284677Sdim MachineInstr &MI = *I; 255284677Sdim 256296417Sdim switch (MI.getOpcode()) { 257296417Sdim default: 258296417Sdim continue; 259296417Sdim case AMDGPU::COPY: { 260296417Sdim // If the destination register is a physical register there isn't really 261296417Sdim // much we can do to fix this. 262296417Sdim if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) 263296417Sdim continue; 264296417Sdim 265296417Sdim const TargetRegisterClass *SrcRC, *DstRC; 266296417Sdim std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); 267296417Sdim if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { 268296417Sdim DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI); 269296417Sdim TII->moveToVALU(MI); 270296417Sdim } 271296417Sdim 272296417Sdim break; 273284677Sdim } 274284677Sdim case AMDGPU::PHI: { 275284677Sdim DEBUG(dbgs() << "Fixing PHI: " << MI); 276284677Sdim unsigned Reg = MI.getOperand(0).getReg(); 277284677Sdim if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) 278284677Sdim break; 279284677Sdim 280284677Sdim // If a PHI node defines an SGPR and any of its operands are VGPRs, 281284677Sdim // then we need to move it to the VALU. 282284677Sdim // 283284677Sdim // Also, if a PHI node defines an SGPR and has all SGPR operands 284284677Sdim // we must move it to the VALU, because the SGPR operands will 285284677Sdim // all end up being assigned the same register, which means 286284677Sdim // there is a potential for a conflict if different threads take 287284677Sdim // different control flow paths. 288284677Sdim // 289284677Sdim // For Example: 290284677Sdim // 291284677Sdim // sgpr0 = def; 292284677Sdim // ... 293284677Sdim // sgpr1 = def; 294284677Sdim // ... 295284677Sdim // sgpr2 = PHI sgpr0, sgpr1 296284677Sdim // use sgpr2; 297284677Sdim // 298284677Sdim // Will Become: 299284677Sdim // 300284677Sdim // sgpr2 = def; 301284677Sdim // ... 302284677Sdim // sgpr2 = def; 303284677Sdim // ... 
304284677Sdim // use sgpr2 305284677Sdim // 306284677Sdim // FIXME: This is OK if the branching decision is made based on an 307284677Sdim // SGPR value. 308284677Sdim bool SGPRBranch = false; 309284677Sdim 310284677Sdim // The one exception to this rule is when one of the operands 311284677Sdim // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK 312284677Sdim // instruction. In this case, there we know the program will 313284677Sdim // never enter the second block (the loop) without entering 314284677Sdim // the first block (where the condition is computed), so there 315284677Sdim // is no chance for values to be over-written. 316284677Sdim 317284677Sdim bool HasBreakDef = false; 318284677Sdim for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { 319284677Sdim unsigned Reg = MI.getOperand(i).getReg(); 320284677Sdim if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { 321284677Sdim TII->moveToVALU(MI); 322284677Sdim break; 323284677Sdim } 324284677Sdim MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); 325284677Sdim assert(DefInstr); 326284677Sdim switch(DefInstr->getOpcode()) { 327284677Sdim 328284677Sdim case AMDGPU::SI_BREAK: 329284677Sdim case AMDGPU::SI_IF_BREAK: 330284677Sdim case AMDGPU::SI_ELSE_BREAK: 331284677Sdim // If we see a PHI instruction that defines an SGPR, then that PHI 332284677Sdim // instruction has already been considered and should have 333284677Sdim // a *_BREAK as an operand. 
334284677Sdim case AMDGPU::PHI: 335284677Sdim HasBreakDef = true; 336284677Sdim break; 337284677Sdim } 338284677Sdim } 339284677Sdim 340284677Sdim if (!SGPRBranch && !HasBreakDef) 341284677Sdim TII->moveToVALU(MI); 342284677Sdim break; 343284677Sdim } 344284677Sdim case AMDGPU::REG_SEQUENCE: { 345284677Sdim if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || 346296417Sdim !hasVGPROperands(MI, TRI)) { 347296417Sdim foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); 348284677Sdim continue; 349296417Sdim } 350284677Sdim 351284677Sdim DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); 352284677Sdim 353284677Sdim TII->moveToVALU(MI); 354284677Sdim break; 355284677Sdim } 356284677Sdim case AMDGPU::INSERT_SUBREG: { 357284677Sdim const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; 358284677Sdim DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); 359284677Sdim Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); 360284677Sdim Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); 361284677Sdim if (TRI->isSGPRClass(DstRC) && 362284677Sdim (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { 363284677Sdim DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); 364284677Sdim TII->moveToVALU(MI); 365284677Sdim } 366284677Sdim break; 367284677Sdim } 368284677Sdim } 369284677Sdim } 370284677Sdim } 371284677Sdim 372284677Sdim return true; 373284677Sdim} 374