X86VZeroUpper.cpp revision 243830
//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which inserts x86 AVX vzeroupper instructions
// before calls to SSE encoded functions. This avoids transition latency
// penalty when transferring control between AVX encoded instructions and old
// SSE encoding mode.
//
//===----------------------------------------------------------------------===//
16226584Sdim
17234353Sdim#define DEBUG_TYPE "x86-vzeroupper"
18226584Sdim#include "X86.h"
19226584Sdim#include "X86InstrInfo.h"
20226584Sdim#include "llvm/ADT/Statistic.h"
21226584Sdim#include "llvm/CodeGen/MachineFunctionPass.h"
22226584Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
23234353Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
24226584Sdim#include "llvm/CodeGen/Passes.h"
25234353Sdim#include "llvm/Support/Debug.h"
26234353Sdim#include "llvm/Support/raw_ostream.h"
27226584Sdim#include "llvm/Target/TargetInstrInfo.h"
28226584Sdimusing namespace llvm;
29226584Sdim
30226584SdimSTATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
31226584Sdim
32226584Sdimnamespace {
33226584Sdim  struct VZeroUpperInserter : public MachineFunctionPass {
34226584Sdim    static char ID;
35226584Sdim    VZeroUpperInserter() : MachineFunctionPass(ID) {}
36226584Sdim
37226584Sdim    virtual bool runOnMachineFunction(MachineFunction &MF);
38226584Sdim
39226584Sdim    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
40226584Sdim
41226584Sdim    virtual const char *getPassName() const { return "X86 vzeroupper inserter";}
42226584Sdim
43226584Sdim  private:
44226584Sdim    const TargetInstrInfo *TII; // Machine instruction info.
45234353Sdim
46234353Sdim    // Any YMM register live-in to this function?
47234353Sdim    bool FnHasLiveInYmm;
48234353Sdim
49234353Sdim    // BBState - Contains the state of each MBB: unknown, clean, dirty
50234353Sdim    SmallVector<uint8_t, 8> BBState;
51234353Sdim
52234353Sdim    // BBSolved - Keep track of all MBB which had been already analyzed
53234353Sdim    // and there is no further processing required.
54234353Sdim    BitVector BBSolved;
55234353Sdim
56234353Sdim    // Machine Basic Blocks are classified according this pass:
57234353Sdim    //
58234353Sdim    //  ST_UNKNOWN - The MBB state is unknown, meaning from the entry state
59234353Sdim    //    until the MBB exit there isn't a instruction using YMM to change
60234353Sdim    //    the state to dirty, or one of the incoming predecessors is unknown
61234353Sdim    //    and there's not a dirty predecessor between them.
62234353Sdim    //
63234353Sdim    //  ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have
64234353Sdim    //    instructions using YMM and be marked ST_CLEAN, as long as the state
65234353Sdim    //    is cleaned by a vzeroupper before any call.
66234353Sdim    //
67234353Sdim    //  ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a
68234353Sdim    //    vzeroupper instruction.
69234353Sdim    //
70234353Sdim    //  ST_INIT - Placeholder for an empty state set
71234353Sdim    //
72234353Sdim    enum {
73234353Sdim      ST_UNKNOWN = 0,
74234353Sdim      ST_CLEAN   = 1,
75234353Sdim      ST_DIRTY   = 2,
76234353Sdim      ST_INIT    = 3
77234353Sdim    };
78234353Sdim
79234353Sdim    // computeState - Given two states, compute the resulting state, in
80234353Sdim    // the following way
81234353Sdim    //
82234353Sdim    //  1) One dirty state yields another dirty state
83234353Sdim    //  2) All states must be clean for the result to be clean
84234353Sdim    //  3) If none above and one unknown, the result state is also unknown
85234353Sdim    //
86243830Sdim    static unsigned computeState(unsigned PrevState, unsigned CurState) {
87234353Sdim      if (PrevState == ST_INIT)
88234353Sdim        return CurState;
89234353Sdim
90234353Sdim      if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
91234353Sdim        return ST_DIRTY;
92234353Sdim
93234353Sdim      if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
94234353Sdim        return ST_CLEAN;
95234353Sdim
96234353Sdim      return ST_UNKNOWN;
97234353Sdim    }
98234353Sdim
99226584Sdim  };
100226584Sdim  char VZeroUpperInserter::ID = 0;
101226584Sdim}
102226584Sdim
103226584SdimFunctionPass *llvm::createX86IssueVZeroUpperPass() {
104226584Sdim  return new VZeroUpperInserter();
105226584Sdim}
106226584Sdim
107234353Sdimstatic bool isYmmReg(unsigned Reg) {
108234353Sdim  if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
109234353Sdim    return true;
110234353Sdim
111234353Sdim  return false;
112234353Sdim}
113234353Sdim
114234353Sdimstatic bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
115234353Sdim  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
116234353Sdim       E = MRI.livein_end(); I != E; ++I)
117234353Sdim    if (isYmmReg(I->first))
118234353Sdim      return true;
119234353Sdim
120234353Sdim  return false;
121234353Sdim}
122234353Sdim
123234353Sdimstatic bool hasYmmReg(MachineInstr *MI) {
124243830Sdim  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
125234353Sdim    const MachineOperand &MO = MI->getOperand(i);
126234353Sdim    if (!MO.isReg())
127234353Sdim      continue;
128234353Sdim    if (MO.isDebug())
129234353Sdim      continue;
130234353Sdim    if (isYmmReg(MO.getReg()))
131234353Sdim      return true;
132234353Sdim  }
133234353Sdim  return false;
134234353Sdim}
135234353Sdim
136226584Sdim/// runOnMachineFunction - Loop over all of the basic blocks, inserting
137226584Sdim/// vzero upper instructions before function calls.
138226584Sdimbool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
139226584Sdim  TII = MF.getTarget().getInstrInfo();
140234353Sdim  MachineRegisterInfo &MRI = MF.getRegInfo();
141234353Sdim  bool EverMadeChange = false;
142226584Sdim
143234353Sdim  // Fast check: if the function doesn't use any ymm registers, we don't need
144234353Sdim  // to insert any VZEROUPPER instructions.  This is constant-time, so it is
145234353Sdim  // cheap in the common case of no ymm use.
146234353Sdim  bool YMMUsed = false;
147239462Sdim  const TargetRegisterClass *RC = &X86::VR256RegClass;
148234353Sdim  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
149234353Sdim       i != e; i++) {
150243830Sdim    if (!MRI.reg_nodbg_empty(*i)) {
151234353Sdim      YMMUsed = true;
152234353Sdim      break;
153234353Sdim    }
154234353Sdim  }
155234353Sdim  if (!YMMUsed)
156234353Sdim    return EverMadeChange;
157226584Sdim
158234353Sdim  // Pre-compute the existence of any live-in YMM registers to this function
159234353Sdim  FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
160226584Sdim
161234353Sdim  assert(BBState.empty());
162234353Sdim  BBState.resize(MF.getNumBlockIDs(), 0);
163234353Sdim  BBSolved.resize(MF.getNumBlockIDs(), 0);
164226584Sdim
165234353Sdim  // Each BB state depends on all predecessors, loop over until everything
166234353Sdim  // converges.  (Once we converge, we can implicitly mark everything that is
167234353Sdim  // still ST_UNKNOWN as ST_CLEAN.)
168234353Sdim  while (1) {
169234353Sdim    bool MadeChange = false;
170226584Sdim
171234353Sdim    // Process all basic blocks.
172234353Sdim    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
173234353Sdim      MadeChange |= processBasicBlock(MF, *I);
174226584Sdim
175234353Sdim    // If this iteration over the code changed anything, keep iterating.
176234353Sdim    if (!MadeChange) break;
177234353Sdim    EverMadeChange = true;
178234353Sdim  }
179226584Sdim
180234353Sdim  BBState.clear();
181234353Sdim  BBSolved.clear();
182234353Sdim  return EverMadeChange;
183226584Sdim}
184226584Sdim
185226584Sdim/// processBasicBlock - Loop over all of the instructions in the basic block,
186226584Sdim/// inserting vzero upper instructions before function calls.
187226584Sdimbool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
188226584Sdim                                           MachineBasicBlock &BB) {
189226584Sdim  bool Changed = false;
190234353Sdim  unsigned BBNum = BB.getNumber();
191226584Sdim
192234353Sdim  // Don't process already solved BBs
193234353Sdim  if (BBSolved[BBNum])
194234353Sdim    return false; // No changes
195234353Sdim
196234353Sdim  // Check the state of all predecessors
197234353Sdim  unsigned EntryState = ST_INIT;
198234353Sdim  for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(),
199234353Sdim       PE = BB.pred_end(); PI != PE; ++PI) {
200234353Sdim    EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]);
201234353Sdim    if (EntryState == ST_DIRTY)
202234353Sdim      break;
203234353Sdim  }
204234353Sdim
205234353Sdim
206239462Sdim  // The entry MBB for the function may set the initial state to dirty if
207234353Sdim  // the function receives any YMM incoming arguments
208243830Sdim  if (&BB == MF.begin()) {
209234353Sdim    EntryState = ST_CLEAN;
210234353Sdim    if (FnHasLiveInYmm)
211234353Sdim      EntryState = ST_DIRTY;
212234353Sdim  }
213234353Sdim
214234353Sdim  // The current state is initialized according to the predecessors
215234353Sdim  unsigned CurState = EntryState;
216234353Sdim  bool BBHasCall = false;
217234353Sdim
218226584Sdim  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
219226584Sdim    MachineInstr *MI = I;
220226584Sdim    DebugLoc dl = I->getDebugLoc();
221234353Sdim    bool isControlFlow = MI->isCall() || MI->isReturn();
222226584Sdim
223239462Sdim    // Shortcut: don't need to check regular instructions in dirty state.
224234353Sdim    if (!isControlFlow && CurState == ST_DIRTY)
225234353Sdim      continue;
226234353Sdim
227234353Sdim    if (hasYmmReg(MI)) {
228234353Sdim      // We found a ymm-using instruction; this could be an AVX instruction,
229234353Sdim      // or it could be control flow.
230234353Sdim      CurState = ST_DIRTY;
231234353Sdim      continue;
232226584Sdim    }
233234353Sdim
234234353Sdim    // Check for control-flow out of the current function (which might
235234353Sdim    // indirectly execute SSE instructions).
236234353Sdim    if (!isControlFlow)
237234353Sdim      continue;
238234353Sdim
239234353Sdim    BBHasCall = true;
240234353Sdim
241234353Sdim    // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
242234353Sdim    // registers. This instruction has zero latency. In addition, the processor
243234353Sdim    // changes back to Clean state, after which execution of Intel SSE
244234353Sdim    // instructions or Intel AVX instructions has no transition penalty. Add
245234353Sdim    // the VZEROUPPER instruction before any function call/return that might
246234353Sdim    // execute SSE code.
247234353Sdim    // FIXME: In some cases, we may want to move the VZEROUPPER into a
248234353Sdim    // predecessor block.
249234353Sdim    if (CurState == ST_DIRTY) {
250234353Sdim      // Only insert the VZEROUPPER in case the entry state isn't unknown.
251234353Sdim      // When unknown, only compute the information within the block to have
252234353Sdim      // it available in the exit if possible, but don't change the block.
253234353Sdim      if (EntryState != ST_UNKNOWN) {
254243830Sdim        BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
255234353Sdim        ++NumVZU;
256234353Sdim      }
257234353Sdim
258234353Sdim      // After the inserted VZEROUPPER the state becomes clean again, but
259234353Sdim      // other YMM may appear before other subsequent calls or even before
260234353Sdim      // the end of the BB.
261234353Sdim      CurState = ST_CLEAN;
262234353Sdim    }
263226584Sdim  }
264226584Sdim
265234353Sdim  DEBUG(dbgs() << "MBB #" << BBNum
266234353Sdim               << ", current state: " << CurState << '\n');
267234353Sdim
268234353Sdim  // A BB can only be considered solved when we both have done all the
269234353Sdim  // necessary transformations, and have computed the exit state.  This happens
270234353Sdim  // in two cases:
271234353Sdim  //  1) We know the entry state: this immediately implies the exit state and
272234353Sdim  //     all the necessary transformations.
273234353Sdim  //  2) There are no calls, and and a non-call instruction marks this block:
274234353Sdim  //     no transformations are necessary, and we know the exit state.
275234353Sdim  if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN))
276234353Sdim    BBSolved[BBNum] = true;
277234353Sdim
278234353Sdim  if (CurState != BBState[BBNum])
279234353Sdim    Changed = true;
280234353Sdim
281234353Sdim  BBState[BBNum] = CurState;
282226584Sdim  return Changed;
283226584Sdim}
284