1259698Sdim//===--------------------- R600MergeVectorRegisters.cpp -------------------===//
2259698Sdim//
3259698Sdim//                     The LLVM Compiler Infrastructure
4259698Sdim//
5259698Sdim// This file is distributed under the University of Illinois Open Source
6259698Sdim// License. See LICENSE.TXT for details.
7259698Sdim//
8259698Sdim//===----------------------------------------------------------------------===//
9259698Sdim//
10259698Sdim/// \file
/// This pass merges the inputs of swizzlable instructions into vectors that
/// share common data and/or have enough undef subregs, using swizzle abilities.
13259698Sdim///
14259698Sdim/// For instance let's consider the following pseudo code :
15259698Sdim/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
16259698Sdim/// ...
17259698Sdim/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
18259698Sdim/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
19259698Sdim///
20259698Sdim/// is turned into :
21259698Sdim/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
22259698Sdim/// ...
23259698Sdim/// vreg7<def> = INSERT_SUBREG vreg4, sub3
24259698Sdim/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
25259698Sdim///
/// This allows regalloc to reduce register pressure for vector registers and
/// to reduce the MOV count.
28259698Sdim//===----------------------------------------------------------------------===//
29259698Sdim
30259698Sdim#define DEBUG_TYPE "vec-merger"
31259698Sdim#include "llvm/Support/Debug.h"
32259698Sdim#include "AMDGPU.h"
33259698Sdim#include "R600InstrInfo.h"
34259698Sdim#include "llvm/CodeGen/DFAPacketizer.h"
35259698Sdim#include "llvm/CodeGen/MachineDominators.h"
36259698Sdim#include "llvm/CodeGen/MachineFunctionPass.h"
37259698Sdim#include "llvm/CodeGen/MachineLoopInfo.h"
38259698Sdim#include "llvm/CodeGen/Passes.h"
39259698Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
40259698Sdim#include "llvm/Support/raw_ostream.h"
41259698Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
42259698Sdim
43259698Sdimusing namespace llvm;
44259698Sdim
45259698Sdimnamespace {
46259698Sdim
47259698Sdimstatic bool
48259698SdimisImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
49259698Sdim  for (MachineRegisterInfo::def_iterator It = MRI.def_begin(Reg),
50259698Sdim      E = MRI.def_end(); It != E; ++It) {
51259698Sdim    return (*It).isImplicitDef();
52259698Sdim  }
53259698Sdim  if (MRI.isReserved(Reg)) {
54259698Sdim    return false;
55259698Sdim  }
56259698Sdim  llvm_unreachable("Reg without a def");
57259698Sdim  return false;
58259698Sdim}
59259698Sdim
60259698Sdimclass RegSeqInfo {
61259698Sdimpublic:
62259698Sdim  MachineInstr *Instr;
63259698Sdim  DenseMap<unsigned, unsigned> RegToChan;
64259698Sdim  std::vector<unsigned> UndefReg;
65259698Sdim  RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
66259698Sdim    assert (MI->getOpcode() == AMDGPU::REG_SEQUENCE);
67259698Sdim    for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
68259698Sdim      MachineOperand &MO = Instr->getOperand(i);
69259698Sdim      unsigned Chan = Instr->getOperand(i + 1).getImm();
70259698Sdim      if (isImplicitlyDef(MRI, MO.getReg()))
71259698Sdim        UndefReg.push_back(Chan);
72259698Sdim      else
73259698Sdim        RegToChan[MO.getReg()] = Chan;
74259698Sdim    }
75259698Sdim  }
76259698Sdim  RegSeqInfo() {}
77259698Sdim
78259698Sdim  bool operator==(const RegSeqInfo &RSI) const {
79259698Sdim    return RSI.Instr == Instr;
80259698Sdim  }
81259698Sdim};
82259698Sdim
83259698Sdimclass R600VectorRegMerger : public MachineFunctionPass {
84259698Sdimprivate:
85259698Sdim  MachineRegisterInfo *MRI;
86259698Sdim  const R600InstrInfo *TII;
87259698Sdim  bool canSwizzle(const MachineInstr &) const;
88259698Sdim  bool areAllUsesSwizzeable(unsigned Reg) const;
89259698Sdim  void SwizzleInput(MachineInstr &,
90259698Sdim      const std::vector<std::pair<unsigned, unsigned> > &) const;
91259698Sdim  bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *,
92259698Sdim      std::vector<std::pair<unsigned, unsigned> > &Remap) const;
93259698Sdim  bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
94259698Sdim      std::vector<std::pair<unsigned, unsigned> > &RemapChan);
95259698Sdim  bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
96259698Sdim      std::vector<std::pair<unsigned, unsigned> > &RemapChan);
97259698Sdim  MachineInstr *RebuildVector(RegSeqInfo *MI,
98259698Sdim      const RegSeqInfo *BaseVec,
99259698Sdim      const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const;
100259698Sdim  void RemoveMI(MachineInstr *);
101259698Sdim  void trackRSI(const RegSeqInfo &RSI);
102259698Sdim
103259698Sdim  typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap;
104259698Sdim  DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
105259698Sdim  InstructionSetMap PreviousRegSeqByReg;
106259698Sdim  InstructionSetMap PreviousRegSeqByUndefCount;
107259698Sdimpublic:
108259698Sdim  static char ID;
109259698Sdim  R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
110259698Sdim  TII(0) { }
111259698Sdim
112259698Sdim  void getAnalysisUsage(AnalysisUsage &AU) const {
113259698Sdim    AU.setPreservesCFG();
114259698Sdim    AU.addRequired<MachineDominatorTree>();
115259698Sdim    AU.addPreserved<MachineDominatorTree>();
116259698Sdim    AU.addRequired<MachineLoopInfo>();
117259698Sdim    AU.addPreserved<MachineLoopInfo>();
118259698Sdim    MachineFunctionPass::getAnalysisUsage(AU);
119259698Sdim  }
120259698Sdim
121259698Sdim  const char *getPassName() const {
122259698Sdim    return "R600 Vector Registers Merge Pass";
123259698Sdim  }
124259698Sdim
125259698Sdim  bool runOnMachineFunction(MachineFunction &Fn);
126259698Sdim};
127259698Sdim
128259698Sdimchar R600VectorRegMerger::ID = 0;
129259698Sdim
130259698Sdimbool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
131259698Sdim    const {
132259698Sdim  if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
133259698Sdim    return true;
134259698Sdim  switch (MI.getOpcode()) {
135259698Sdim  case AMDGPU::R600_ExportSwz:
136259698Sdim  case AMDGPU::EG_ExportSwz:
137259698Sdim    return true;
138259698Sdim  default:
139259698Sdim    return false;
140259698Sdim  }
141259698Sdim}
142259698Sdim
143259698Sdimbool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
144259698Sdim    RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap)
145259698Sdim    const {
146259698Sdim  unsigned CurrentUndexIdx = 0;
147259698Sdim  for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
148259698Sdim      E = ToMerge->RegToChan.end(); It != E; ++It) {
149259698Sdim    DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
150259698Sdim        Untouched->RegToChan.find((*It).first);
151259698Sdim    if (PosInUntouched != Untouched->RegToChan.end()) {
152259698Sdim      Remap.push_back(std::pair<unsigned, unsigned>
153259698Sdim          ((*It).second, (*PosInUntouched).second));
154259698Sdim      continue;
155259698Sdim    }
156259698Sdim    if (CurrentUndexIdx >= Untouched->UndefReg.size())
157259698Sdim      return false;
158259698Sdim    Remap.push_back(std::pair<unsigned, unsigned>
159259698Sdim        ((*It).second, Untouched->UndefReg[CurrentUndexIdx++]));
160259698Sdim  }
161259698Sdim
162259698Sdim  return true;
163259698Sdim}
164259698Sdim
165259698Sdimstatic
166259698Sdimunsigned getReassignedChan(
167259698Sdim    const std::vector<std::pair<unsigned, unsigned> > &RemapChan,
168259698Sdim    unsigned Chan) {
169259698Sdim  for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
170259698Sdim    if (RemapChan[j].first == Chan)
171259698Sdim      return RemapChan[j].second;
172259698Sdim  }
173259698Sdim  llvm_unreachable("Chan wasn't reassigned");
174259698Sdim}
175259698Sdim
176259698SdimMachineInstr *R600VectorRegMerger::RebuildVector(
177259698Sdim    RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
178259698Sdim    const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
179259698Sdim  unsigned Reg = RSI->Instr->getOperand(0).getReg();
180259698Sdim  MachineBasicBlock::iterator Pos = RSI->Instr;
181259698Sdim  MachineBasicBlock &MBB = *Pos->getParent();
182259698Sdim  DebugLoc DL = Pos->getDebugLoc();
183259698Sdim
184259698Sdim  unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
185259698Sdim  DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
186259698Sdim  std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
187259698Sdim  for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
188259698Sdim      E = RSI->RegToChan.end(); It != E; ++It) {
189259698Sdim    unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
190259698Sdim    unsigned SubReg = (*It).first;
191259698Sdim    unsigned Swizzle = (*It).second;
192259698Sdim    unsigned Chan = getReassignedChan(RemapChan, Swizzle);
193259698Sdim
194259698Sdim    MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
195259698Sdim        DstReg)
196259698Sdim        .addReg(SrcVec)
197259698Sdim        .addReg(SubReg)
198259698Sdim        .addImm(Chan);
199259698Sdim    UpdatedRegToChan[SubReg] = Chan;
200259698Sdim    std::vector<unsigned>::iterator ChanPos =
201259698Sdim        std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan);
202259698Sdim    if (ChanPos != UpdatedUndef.end())
203259698Sdim      UpdatedUndef.erase(ChanPos);
204259698Sdim    assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) ==
205259698Sdim               UpdatedUndef.end() &&
206259698Sdim           "UpdatedUndef shouldn't contain Chan more than once!");
207259698Sdim    DEBUG(dbgs() << "    ->"; Tmp->dump(););
208259698Sdim    (void)Tmp;
209259698Sdim    SrcVec = DstReg;
210259698Sdim  }
211259698Sdim  Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg)
212259698Sdim      .addReg(SrcVec);
213259698Sdim  DEBUG(dbgs() << "    ->"; Pos->dump(););
214259698Sdim
215259698Sdim  DEBUG(dbgs() << "  Updating Swizzle:\n");
216259698Sdim  for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
217259698Sdim      E = MRI->use_end(); It != E; ++It) {
218259698Sdim    DEBUG(dbgs() << "    ";(*It).dump(); dbgs() << "    ->");
219259698Sdim    SwizzleInput(*It, RemapChan);
220259698Sdim    DEBUG((*It).dump());
221259698Sdim  }
222259698Sdim  RSI->Instr->eraseFromParent();
223259698Sdim
224259698Sdim  // Update RSI
225259698Sdim  RSI->Instr = Pos;
226259698Sdim  RSI->RegToChan = UpdatedRegToChan;
227259698Sdim  RSI->UndefReg = UpdatedUndef;
228259698Sdim
229259698Sdim  return Pos;
230259698Sdim}
231259698Sdim
232259698Sdimvoid R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
233259698Sdim  for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
234259698Sdim      E = PreviousRegSeqByReg.end(); It != E; ++It) {
235259698Sdim    std::vector<MachineInstr *> &MIs = (*It).second;
236259698Sdim    MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
237259698Sdim  }
238259698Sdim  for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
239259698Sdim      E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
240259698Sdim    std::vector<MachineInstr *> &MIs = (*It).second;
241259698Sdim    MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
242259698Sdim  }
243259698Sdim}
244259698Sdim
245259698Sdimvoid R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
246259698Sdim    const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
247259698Sdim  unsigned Offset;
248259698Sdim  if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
249259698Sdim    Offset = 2;
250259698Sdim  else
251259698Sdim    Offset = 3;
252259698Sdim  for (unsigned i = 0; i < 4; i++) {
253259698Sdim    unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1;
254259698Sdim    for (unsigned j = 0, e = RemapChan.size(); j < e; j++) {
255259698Sdim      if (RemapChan[j].first == Swizzle) {
256259698Sdim        MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1);
257259698Sdim        break;
258259698Sdim      }
259259698Sdim    }
260259698Sdim  }
261259698Sdim}
262259698Sdim
263259698Sdimbool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
264259698Sdim  for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
265259698Sdim      E = MRI->use_end(); It != E; ++It) {
266259698Sdim    if (!canSwizzle(*It))
267259698Sdim      return false;
268259698Sdim  }
269259698Sdim  return true;
270259698Sdim}
271259698Sdim
272259698Sdimbool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
273259698Sdim    RegSeqInfo &CompatibleRSI,
274259698Sdim    std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
275259698Sdim  for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(),
276259698Sdim      MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
277259698Sdim    if (!MOp->isReg())
278259698Sdim      continue;
279259698Sdim    if (PreviousRegSeqByReg[MOp->getReg()].empty())
280259698Sdim      continue;
281259698Sdim    std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()];
282259698Sdim    for (unsigned i = 0, e = MIs.size(); i < e; i++) {
283259698Sdim      CompatibleRSI = PreviousRegSeq[MIs[i]];
284259698Sdim      if (RSI == CompatibleRSI)
285259698Sdim        continue;
286259698Sdim      if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
287259698Sdim        return true;
288259698Sdim    }
289259698Sdim  }
290259698Sdim  return false;
291259698Sdim}
292259698Sdim
293259698Sdimbool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
294259698Sdim    RegSeqInfo &CompatibleRSI,
295259698Sdim    std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
296259698Sdim  unsigned NeededUndefs = 4 - RSI.UndefReg.size();
297259698Sdim  if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
298259698Sdim    return false;
299259698Sdim  std::vector<MachineInstr *> &MIs =
300259698Sdim      PreviousRegSeqByUndefCount[NeededUndefs];
301259698Sdim  CompatibleRSI = PreviousRegSeq[MIs.back()];
302259698Sdim  tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
303259698Sdim  return true;
304259698Sdim}
305259698Sdim
306259698Sdimvoid R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
307259698Sdim  for (DenseMap<unsigned, unsigned>::const_iterator
308259698Sdim  It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) {
309259698Sdim    PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
310259698Sdim  }
311259698Sdim  PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr);
312259698Sdim  PreviousRegSeq[RSI.Instr] = RSI;
313259698Sdim}
314259698Sdim
315259698Sdimbool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
316259698Sdim  TII = static_cast<const R600InstrInfo *>(Fn.getTarget().getInstrInfo());
317259698Sdim  MRI = &(Fn.getRegInfo());
318259698Sdim  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
319259698Sdim       MBB != MBBe; ++MBB) {
320259698Sdim    MachineBasicBlock *MB = MBB;
321259698Sdim    PreviousRegSeq.clear();
322259698Sdim    PreviousRegSeqByReg.clear();
323259698Sdim    PreviousRegSeqByUndefCount.clear();
324259698Sdim
325259698Sdim    for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
326259698Sdim         MII != MIIE; ++MII) {
327259698Sdim      MachineInstr *MI = MII;
328259698Sdim      if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) {
329259698Sdim        if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
330259698Sdim          unsigned Reg = MI->getOperand(1).getReg();
331259698Sdim          for (MachineRegisterInfo::def_iterator It = MRI->def_begin(Reg),
332259698Sdim              E = MRI->def_end(); It != E; ++It) {
333259698Sdim            RemoveMI(&(*It));
334259698Sdim          }
335259698Sdim        }
336259698Sdim        continue;
337259698Sdim      }
338259698Sdim
339259698Sdim
340259698Sdim      RegSeqInfo RSI(*MRI, MI);
341259698Sdim
342259698Sdim      // All uses of MI are swizzeable ?
343259698Sdim      unsigned Reg = MI->getOperand(0).getReg();
344259698Sdim      if (!areAllUsesSwizzeable(Reg))
345259698Sdim        continue;
346259698Sdim
347259698Sdim      DEBUG (dbgs() << "Trying to optimize ";
348259698Sdim          MI->dump();
349259698Sdim      );
350259698Sdim
351259698Sdim      RegSeqInfo CandidateRSI;
352259698Sdim      std::vector<std::pair<unsigned, unsigned> > RemapChan;
353259698Sdim      DEBUG(dbgs() << "Using common slots...\n";);
354259698Sdim      if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
355259698Sdim        // Remove CandidateRSI mapping
356259698Sdim        RemoveMI(CandidateRSI.Instr);
357259698Sdim        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
358259698Sdim        trackRSI(RSI);
359259698Sdim        continue;
360259698Sdim      }
361259698Sdim      DEBUG(dbgs() << "Using free slots...\n";);
362259698Sdim      RemapChan.clear();
363259698Sdim      if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
364259698Sdim        RemoveMI(CandidateRSI.Instr);
365259698Sdim        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
366259698Sdim        trackRSI(RSI);
367259698Sdim        continue;
368259698Sdim      }
369259698Sdim      //Failed to merge
370259698Sdim      trackRSI(RSI);
371259698Sdim    }
372259698Sdim  }
373259698Sdim  return false;
374259698Sdim}
375259698Sdim
376259698Sdim}
377259698Sdim
378259698Sdimllvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
379259698Sdim  return new R600VectorRegMerger(tm);
380259698Sdim}
381