//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %vreg0 <sgpr> = SCALAR_INST
///   %vreg1 <vsrc> = COPY %vreg0 <sgpr>
///    ...
///    BRANCH %cond BB1, BB2
///  BB1:
///    %vreg2 <vgpr> = VECTOR_INST
///    %vreg3 <vsrc> = COPY %vreg2 <vgpr>
///  BB2:
///    %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vsrc>, <BB#1>
///    %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
///
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %vreg0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %vreg2 <vgpr> = VECTOR_INST
///   %vreg3 <vsrc> = COPY %vreg2 <vgpr>
/// BB2:
///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1>
///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %vreg3 to
/// <sgpr> so we end up with final code like this:
///
/// BB0:
///   %vreg0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %vreg2 <vgpr> = VECTOR_INST
///   %vreg3 <sgpr> = COPY %vreg2 <vgpr>
/// BB2:
///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains its definition class to
/// <vgpr> if the user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr> then the coalescer
/// will be unable to perform the COPY removal from the above example which
/// ultimately led to the creation of an illegal COPY.
//===----------------------------------------------------------------------===//
67284677Sdim
68284677Sdim#include "AMDGPU.h"
69284677Sdim#include "AMDGPUSubtarget.h"
70284677Sdim#include "SIInstrInfo.h"
71284677Sdim#include "llvm/CodeGen/MachineFunctionPass.h"
72284677Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
73284677Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
74284677Sdim#include "llvm/Support/Debug.h"
75284677Sdim#include "llvm/Support/raw_ostream.h"
76284677Sdim#include "llvm/Target/TargetMachine.h"
77284677Sdim
78284677Sdimusing namespace llvm;
79284677Sdim
80284677Sdim#define DEBUG_TYPE "sgpr-copies"
81284677Sdim
82284677Sdimnamespace {
83284677Sdim
84284677Sdimclass SIFixSGPRCopies : public MachineFunctionPass {
85296417Sdimpublic:
86284677Sdim  static char ID;
87284677Sdim
88296417Sdim  SIFixSGPRCopies() : MachineFunctionPass(ID) { }
89284677Sdim
90284677Sdim  bool runOnMachineFunction(MachineFunction &MF) override;
91284677Sdim
92284677Sdim  const char *getPassName() const override {
93284677Sdim    return "SI Fix SGPR copies";
94284677Sdim  }
95284677Sdim
96296417Sdim  void getAnalysisUsage(AnalysisUsage &AU) const override {
97296417Sdim    AU.setPreservesCFG();
98296417Sdim    MachineFunctionPass::getAnalysisUsage(AU);
99296417Sdim  }
100284677Sdim};
101284677Sdim
102284677Sdim} // End anonymous namespace
103284677Sdim
104296417SdimINITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE,
105296417Sdim                "SI Fix SGPR copies", false, false)
106296417Sdim
107284677Sdimchar SIFixSGPRCopies::ID = 0;
108284677Sdim
109296417Sdimchar &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
110296417Sdim
111296417SdimFunctionPass *llvm::createSIFixSGPRCopiesPass() {
112296417Sdim  return new SIFixSGPRCopies();
113284677Sdim}
114284677Sdim
115284677Sdimstatic bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
116284677Sdim  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
117284677Sdim  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
118284677Sdim    if (!MI.getOperand(i).isReg() ||
119284677Sdim        !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
120284677Sdim      continue;
121284677Sdim
122284677Sdim    if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
123284677Sdim      return true;
124284677Sdim  }
125284677Sdim  return false;
126284677Sdim}
127284677Sdim
128296417Sdimstatic std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
129296417SdimgetCopyRegClasses(const MachineInstr &Copy,
130296417Sdim                  const SIRegisterInfo &TRI,
131296417Sdim                  const MachineRegisterInfo &MRI) {
132296417Sdim  unsigned DstReg = Copy.getOperand(0).getReg();
133296417Sdim  unsigned SrcReg = Copy.getOperand(1).getReg();
134284677Sdim
135296417Sdim  const TargetRegisterClass *SrcRC =
136296417Sdim    TargetRegisterInfo::isVirtualRegister(SrcReg) ?
137296417Sdim    MRI.getRegClass(SrcReg) :
138296417Sdim    TRI.getPhysRegClass(SrcReg);
139284677Sdim
140296417Sdim  // We don't really care about the subregister here.
141296417Sdim  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
142284677Sdim
143296417Sdim  const TargetRegisterClass *DstRC =
144296417Sdim    TargetRegisterInfo::isVirtualRegister(DstReg) ?
145296417Sdim    MRI.getRegClass(DstReg) :
146296417Sdim    TRI.getPhysRegClass(DstReg);
147296417Sdim
148296417Sdim  return std::make_pair(SrcRC, DstRC);
149284677Sdim}
150284677Sdim
151296417Sdimstatic bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
152296417Sdim                             const TargetRegisterClass *DstRC,
153296417Sdim                             const SIRegisterInfo &TRI) {
154296417Sdim  return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
155296417Sdim}
156284677Sdim
157296417Sdimstatic bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
158296417Sdim                             const TargetRegisterClass *DstRC,
159296417Sdim                             const SIRegisterInfo &TRI) {
160296417Sdim  return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
161284677Sdim}
162284677Sdim
163296417Sdim// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
164296417Sdim//
165296417Sdim// SGPRx = ...
166296417Sdim// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
167296417Sdim// VGPRz = COPY SGPRy
168296417Sdim//
169296417Sdim// ==>
170296417Sdim//
171296417Sdim// VGPRx = COPY SGPRx
172296417Sdim// VGPRz = REG_SEQUENCE VGPRx, sub0
173296417Sdim//
174296417Sdim// This exposes immediate folding opportunities when materializing 64-bit
175296417Sdim// immediates.
176296417Sdimstatic bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
177296417Sdim                                        const SIRegisterInfo *TRI,
178296417Sdim                                        const SIInstrInfo *TII,
179296417Sdim                                        MachineRegisterInfo &MRI) {
180296417Sdim  assert(MI.isRegSequence());
181284677Sdim
182296417Sdim  unsigned DstReg = MI.getOperand(0).getReg();
183296417Sdim  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
184296417Sdim    return false;
185284677Sdim
186296417Sdim  if (!MRI.hasOneUse(DstReg))
187284677Sdim    return false;
188284677Sdim
189296417Sdim  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
190296417Sdim  if (!CopyUse.isCopy())
191296417Sdim    return false;
192284677Sdim
193296417Sdim  const TargetRegisterClass *SrcRC, *DstRC;
194296417Sdim  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
195284677Sdim
196296417Sdim  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
197284677Sdim    return false;
198284677Sdim
199296417Sdim  // TODO: Could have multiple extracts?
200296417Sdim  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
201296417Sdim  if (SubReg != AMDGPU::NoSubRegister)
202296417Sdim    return false;
203296417Sdim
204296417Sdim  MRI.setRegClass(DstReg, DstRC);
205296417Sdim
206296417Sdim  // SGPRx = ...
207296417Sdim  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
208296417Sdim  // VGPRz = COPY SGPRy
209296417Sdim
210296417Sdim  // =>
211296417Sdim  // VGPRx = COPY SGPRx
212296417Sdim  // VGPRz = REG_SEQUENCE VGPRx, sub0
213296417Sdim
214296417Sdim  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
215296417Sdim
216296417Sdim  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
217296417Sdim    unsigned SrcReg = MI.getOperand(I).getReg();
218296417Sdim    unsigned SrcSubReg = MI.getOperand(I).getSubReg();
219296417Sdim
220296417Sdim    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
221296417Sdim    assert(TRI->isSGPRClass(SrcRC) &&
222296417Sdim           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
223296417Sdim
224296417Sdim    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
225296417Sdim    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
226296417Sdim
227296417Sdim    unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
228296417Sdim
229296417Sdim    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
230296417Sdim      .addOperand(MI.getOperand(I));
231296417Sdim
232296417Sdim    MI.getOperand(I).setReg(TmpReg);
233296417Sdim  }
234296417Sdim
235296417Sdim  CopyUse.eraseFromParent();
236296417Sdim  return true;
237284677Sdim}
238284677Sdim
239284677Sdimbool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
240284677Sdim  MachineRegisterInfo &MRI = MF.getRegInfo();
241284677Sdim  const SIRegisterInfo *TRI =
242284677Sdim      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
243284677Sdim  const SIInstrInfo *TII =
244284677Sdim      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
245296417Sdim
246296417Sdim  SmallVector<MachineInstr *, 16> Worklist;
247296417Sdim
248284677Sdim  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
249284677Sdim                                                  BI != BE; ++BI) {
250284677Sdim
251284677Sdim    MachineBasicBlock &MBB = *BI;
252284677Sdim    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
253296417Sdim         I != E; ++I) {
254284677Sdim      MachineInstr &MI = *I;
255284677Sdim
256296417Sdim      switch (MI.getOpcode()) {
257296417Sdim      default:
258296417Sdim        continue;
259296417Sdim      case AMDGPU::COPY: {
260296417Sdim        // If the destination register is a physical register there isn't really
261296417Sdim        // much we can do to fix this.
262296417Sdim        if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
263296417Sdim          continue;
264296417Sdim
265296417Sdim        const TargetRegisterClass *SrcRC, *DstRC;
266296417Sdim        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
267296417Sdim        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
268296417Sdim          DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI);
269296417Sdim          TII->moveToVALU(MI);
270296417Sdim        }
271296417Sdim
272296417Sdim        break;
273284677Sdim      }
274284677Sdim      case AMDGPU::PHI: {
275284677Sdim        DEBUG(dbgs() << "Fixing PHI: " << MI);
276284677Sdim        unsigned Reg = MI.getOperand(0).getReg();
277284677Sdim        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
278284677Sdim          break;
279284677Sdim
280284677Sdim        // If a PHI node defines an SGPR and any of its operands are VGPRs,
281284677Sdim        // then we need to move it to the VALU.
282284677Sdim        //
283284677Sdim        // Also, if a PHI node defines an SGPR and has all SGPR operands
284284677Sdim        // we must move it to the VALU, because the SGPR operands will
285284677Sdim        // all end up being assigned the same register, which means
286284677Sdim        // there is a potential for a conflict if different threads take
287284677Sdim        // different control flow paths.
288284677Sdim        //
289284677Sdim        // For Example:
290284677Sdim        //
291284677Sdim        // sgpr0 = def;
292284677Sdim        // ...
293284677Sdim        // sgpr1 = def;
294284677Sdim        // ...
295284677Sdim        // sgpr2 = PHI sgpr0, sgpr1
296284677Sdim        // use sgpr2;
297284677Sdim        //
298284677Sdim        // Will Become:
299284677Sdim        //
300284677Sdim        // sgpr2 = def;
301284677Sdim        // ...
302284677Sdim        // sgpr2 = def;
303284677Sdim        // ...
304284677Sdim        // use sgpr2
305284677Sdim        //
306284677Sdim        // FIXME: This is OK if the branching decision is made based on an
307284677Sdim        // SGPR value.
308284677Sdim        bool SGPRBranch = false;
309284677Sdim
310284677Sdim        // The one exception to this rule is when one of the operands
311284677Sdim        // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
312284677Sdim        // instruction.  In this case, there we know the program will
313284677Sdim        // never enter the second block (the loop) without entering
314284677Sdim        // the first block (where the condition is computed), so there
315284677Sdim        // is no chance for values to be over-written.
316284677Sdim
317284677Sdim        bool HasBreakDef = false;
318284677Sdim        for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
319284677Sdim          unsigned Reg = MI.getOperand(i).getReg();
320284677Sdim          if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
321284677Sdim            TII->moveToVALU(MI);
322284677Sdim            break;
323284677Sdim          }
324284677Sdim          MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
325284677Sdim          assert(DefInstr);
326284677Sdim          switch(DefInstr->getOpcode()) {
327284677Sdim
328284677Sdim          case AMDGPU::SI_BREAK:
329284677Sdim          case AMDGPU::SI_IF_BREAK:
330284677Sdim          case AMDGPU::SI_ELSE_BREAK:
331284677Sdim          // If we see a PHI instruction that defines an SGPR, then that PHI
332284677Sdim          // instruction has already been considered and should have
333284677Sdim          // a *_BREAK as an operand.
334284677Sdim          case AMDGPU::PHI:
335284677Sdim            HasBreakDef = true;
336284677Sdim            break;
337284677Sdim          }
338284677Sdim        }
339284677Sdim
340284677Sdim        if (!SGPRBranch && !HasBreakDef)
341284677Sdim          TII->moveToVALU(MI);
342284677Sdim        break;
343284677Sdim      }
344284677Sdim      case AMDGPU::REG_SEQUENCE: {
345284677Sdim        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
346296417Sdim            !hasVGPROperands(MI, TRI)) {
347296417Sdim          foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
348284677Sdim          continue;
349296417Sdim        }
350284677Sdim
351284677Sdim        DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
352284677Sdim
353284677Sdim        TII->moveToVALU(MI);
354284677Sdim        break;
355284677Sdim      }
356284677Sdim      case AMDGPU::INSERT_SUBREG: {
357284677Sdim        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
358284677Sdim        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
359284677Sdim        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
360284677Sdim        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
361284677Sdim        if (TRI->isSGPRClass(DstRC) &&
362284677Sdim            (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
363284677Sdim          DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
364284677Sdim          TII->moveToVALU(MI);
365284677Sdim        }
366284677Sdim        break;
367284677Sdim      }
368284677Sdim      }
369284677Sdim    }
370284677Sdim  }
371284677Sdim
372284677Sdim  return true;
373284677Sdim}
374