//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which inserts x86 AVX vzeroupper instructions
// before calls to SSE encoded functions. This avoids the transition latency
// penalty when transferring control between AVX encoded instructions and the
// old SSE encoding mode.
//
//===----------------------------------------------------------------------===//
16226584Sdim
17234353Sdim#define DEBUG_TYPE "x86-vzeroupper"
18226584Sdim#include "X86.h"
19226584Sdim#include "X86InstrInfo.h"
20226584Sdim#include "llvm/ADT/Statistic.h"
21226584Sdim#include "llvm/CodeGen/MachineFunctionPass.h"
22226584Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
23234353Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
24226584Sdim#include "llvm/CodeGen/Passes.h"
25234353Sdim#include "llvm/Support/Debug.h"
26234353Sdim#include "llvm/Support/raw_ostream.h"
27226584Sdim#include "llvm/Target/TargetInstrInfo.h"
28226584Sdimusing namespace llvm;
29226584Sdim
30226584SdimSTATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
31226584Sdim
32226584Sdimnamespace {
33226584Sdim  struct VZeroUpperInserter : public MachineFunctionPass {
34226584Sdim    static char ID;
35226584Sdim    VZeroUpperInserter() : MachineFunctionPass(ID) {}
36226584Sdim
37226584Sdim    virtual bool runOnMachineFunction(MachineFunction &MF);
38226584Sdim
39226584Sdim    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
40226584Sdim
41226584Sdim    virtual const char *getPassName() const { return "X86 vzeroupper inserter";}
42226584Sdim
43226584Sdim  private:
44226584Sdim    const TargetInstrInfo *TII; // Machine instruction info.
45234353Sdim
46234353Sdim    // Any YMM register live-in to this function?
47234353Sdim    bool FnHasLiveInYmm;
48234353Sdim
49234353Sdim    // BBState - Contains the state of each MBB: unknown, clean, dirty
50234353Sdim    SmallVector<uint8_t, 8> BBState;
51234353Sdim
52234353Sdim    // BBSolved - Keep track of all MBB which had been already analyzed
53234353Sdim    // and there is no further processing required.
54234353Sdim    BitVector BBSolved;
55234353Sdim
56234353Sdim    // Machine Basic Blocks are classified according this pass:
57234353Sdim    //
58234353Sdim    //  ST_UNKNOWN - The MBB state is unknown, meaning from the entry state
59234353Sdim    //    until the MBB exit there isn't a instruction using YMM to change
60234353Sdim    //    the state to dirty, or one of the incoming predecessors is unknown
61234353Sdim    //    and there's not a dirty predecessor between them.
62234353Sdim    //
63234353Sdim    //  ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have
64234353Sdim    //    instructions using YMM and be marked ST_CLEAN, as long as the state
65234353Sdim    //    is cleaned by a vzeroupper before any call.
66234353Sdim    //
67234353Sdim    //  ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a
68234353Sdim    //    vzeroupper instruction.
69234353Sdim    //
70234353Sdim    //  ST_INIT - Placeholder for an empty state set
71234353Sdim    //
72234353Sdim    enum {
73234353Sdim      ST_UNKNOWN = 0,
74234353Sdim      ST_CLEAN   = 1,
75234353Sdim      ST_DIRTY   = 2,
76234353Sdim      ST_INIT    = 3
77234353Sdim    };
78234353Sdim
79234353Sdim    // computeState - Given two states, compute the resulting state, in
80234353Sdim    // the following way
81234353Sdim    //
82234353Sdim    //  1) One dirty state yields another dirty state
83234353Sdim    //  2) All states must be clean for the result to be clean
84234353Sdim    //  3) If none above and one unknown, the result state is also unknown
85234353Sdim    //
86243830Sdim    static unsigned computeState(unsigned PrevState, unsigned CurState) {
87234353Sdim      if (PrevState == ST_INIT)
88234353Sdim        return CurState;
89234353Sdim
90234353Sdim      if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
91234353Sdim        return ST_DIRTY;
92234353Sdim
93234353Sdim      if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
94234353Sdim        return ST_CLEAN;
95234353Sdim
96234353Sdim      return ST_UNKNOWN;
97234353Sdim    }
98234353Sdim
99226584Sdim  };
100226584Sdim  char VZeroUpperInserter::ID = 0;
101226584Sdim}
102226584Sdim
103226584SdimFunctionPass *llvm::createX86IssueVZeroUpperPass() {
104226584Sdim  return new VZeroUpperInserter();
105226584Sdim}
106226584Sdim
107234353Sdimstatic bool isYmmReg(unsigned Reg) {
108263508Sdim  return (Reg >= X86::YMM0 && Reg <= X86::YMM31);
109263508Sdim}
110234353Sdim
111263508Sdimstatic bool isZmmReg(unsigned Reg) {
112263508Sdim  return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31);
113234353Sdim}
114234353Sdim
115234353Sdimstatic bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
116234353Sdim  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
117234353Sdim       E = MRI.livein_end(); I != E; ++I)
118263508Sdim    if (isYmmReg(I->first) || isZmmReg(I->first))
119234353Sdim      return true;
120234353Sdim
121234353Sdim  return false;
122234353Sdim}
123234353Sdim
124249423Sdimstatic bool clobbersAllYmmRegs(const MachineOperand &MO) {
125263508Sdim  for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
126249423Sdim    if (!MO.clobbersPhysReg(reg))
127249423Sdim      return false;
128249423Sdim  }
129263508Sdim  for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
130263508Sdim    if (!MO.clobbersPhysReg(reg))
131263508Sdim      return false;
132263508Sdim  }
133249423Sdim  return true;
134249423Sdim}
135249423Sdim
136234353Sdimstatic bool hasYmmReg(MachineInstr *MI) {
137243830Sdim  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
138234353Sdim    const MachineOperand &MO = MI->getOperand(i);
139249423Sdim    if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
140249423Sdim      return true;
141234353Sdim    if (!MO.isReg())
142234353Sdim      continue;
143234353Sdim    if (MO.isDebug())
144234353Sdim      continue;
145234353Sdim    if (isYmmReg(MO.getReg()))
146234353Sdim      return true;
147234353Sdim  }
148234353Sdim  return false;
149234353Sdim}
150234353Sdim
151263508Sdim/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
152263508Sdim/// instruction.
153263508Sdimstatic bool clobbersAnyYmmReg(MachineInstr *MI) {
154263508Sdim  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
155263508Sdim    const MachineOperand &MO = MI->getOperand(i);
156263508Sdim    if (!MO.isRegMask())
157263508Sdim      continue;
158263508Sdim    for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
159263508Sdim      if (MO.clobbersPhysReg(reg))
160263508Sdim        return true;
161263508Sdim    }
162263508Sdim    for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
163263508Sdim      if (MO.clobbersPhysReg(reg))
164263508Sdim        return true;
165263508Sdim    }
166263508Sdim  }
167263508Sdim  return false;
168263508Sdim}
169263508Sdim
170226584Sdim/// runOnMachineFunction - Loop over all of the basic blocks, inserting
171226584Sdim/// vzero upper instructions before function calls.
172226584Sdimbool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
173226584Sdim  TII = MF.getTarget().getInstrInfo();
174234353Sdim  MachineRegisterInfo &MRI = MF.getRegInfo();
175234353Sdim  bool EverMadeChange = false;
176226584Sdim
177234353Sdim  // Fast check: if the function doesn't use any ymm registers, we don't need
178234353Sdim  // to insert any VZEROUPPER instructions.  This is constant-time, so it is
179234353Sdim  // cheap in the common case of no ymm use.
180234353Sdim  bool YMMUsed = false;
181239462Sdim  const TargetRegisterClass *RC = &X86::VR256RegClass;
182234353Sdim  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
183234353Sdim       i != e; i++) {
184243830Sdim    if (!MRI.reg_nodbg_empty(*i)) {
185234353Sdim      YMMUsed = true;
186234353Sdim      break;
187234353Sdim    }
188234353Sdim  }
189234353Sdim  if (!YMMUsed)
190234353Sdim    return EverMadeChange;
191226584Sdim
192234353Sdim  // Pre-compute the existence of any live-in YMM registers to this function
193234353Sdim  FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
194226584Sdim
195234353Sdim  assert(BBState.empty());
196234353Sdim  BBState.resize(MF.getNumBlockIDs(), 0);
197234353Sdim  BBSolved.resize(MF.getNumBlockIDs(), 0);
198226584Sdim
199234353Sdim  // Each BB state depends on all predecessors, loop over until everything
200234353Sdim  // converges.  (Once we converge, we can implicitly mark everything that is
201234353Sdim  // still ST_UNKNOWN as ST_CLEAN.)
202234353Sdim  while (1) {
203234353Sdim    bool MadeChange = false;
204226584Sdim
205234353Sdim    // Process all basic blocks.
206234353Sdim    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
207234353Sdim      MadeChange |= processBasicBlock(MF, *I);
208226584Sdim
209234353Sdim    // If this iteration over the code changed anything, keep iterating.
210234353Sdim    if (!MadeChange) break;
211234353Sdim    EverMadeChange = true;
212234353Sdim  }
213226584Sdim
214234353Sdim  BBState.clear();
215234353Sdim  BBSolved.clear();
216234353Sdim  return EverMadeChange;
217226584Sdim}
218226584Sdim
219226584Sdim/// processBasicBlock - Loop over all of the instructions in the basic block,
220226584Sdim/// inserting vzero upper instructions before function calls.
221226584Sdimbool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
222226584Sdim                                           MachineBasicBlock &BB) {
223226584Sdim  bool Changed = false;
224234353Sdim  unsigned BBNum = BB.getNumber();
225226584Sdim
226234353Sdim  // Don't process already solved BBs
227234353Sdim  if (BBSolved[BBNum])
228234353Sdim    return false; // No changes
229234353Sdim
230234353Sdim  // Check the state of all predecessors
231234353Sdim  unsigned EntryState = ST_INIT;
232234353Sdim  for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(),
233234353Sdim       PE = BB.pred_end(); PI != PE; ++PI) {
234234353Sdim    EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]);
235234353Sdim    if (EntryState == ST_DIRTY)
236234353Sdim      break;
237234353Sdim  }
238234353Sdim
239234353Sdim
240239462Sdim  // The entry MBB for the function may set the initial state to dirty if
241234353Sdim  // the function receives any YMM incoming arguments
242243830Sdim  if (&BB == MF.begin()) {
243234353Sdim    EntryState = ST_CLEAN;
244234353Sdim    if (FnHasLiveInYmm)
245234353Sdim      EntryState = ST_DIRTY;
246234353Sdim  }
247234353Sdim
248234353Sdim  // The current state is initialized according to the predecessors
249234353Sdim  unsigned CurState = EntryState;
250234353Sdim  bool BBHasCall = false;
251234353Sdim
252226584Sdim  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
253263508Sdim    DebugLoc dl = I->getDebugLoc();
254226584Sdim    MachineInstr *MI = I;
255263508Sdim
256234353Sdim    bool isControlFlow = MI->isCall() || MI->isReturn();
257226584Sdim
258239462Sdim    // Shortcut: don't need to check regular instructions in dirty state.
259234353Sdim    if (!isControlFlow && CurState == ST_DIRTY)
260234353Sdim      continue;
261234353Sdim
262234353Sdim    if (hasYmmReg(MI)) {
263234353Sdim      // We found a ymm-using instruction; this could be an AVX instruction,
264234353Sdim      // or it could be control flow.
265234353Sdim      CurState = ST_DIRTY;
266234353Sdim      continue;
267226584Sdim    }
268234353Sdim
269234353Sdim    // Check for control-flow out of the current function (which might
270234353Sdim    // indirectly execute SSE instructions).
271234353Sdim    if (!isControlFlow)
272234353Sdim      continue;
273234353Sdim
274263508Sdim    // If the call won't clobber any YMM register, skip it as well. It usually
275263508Sdim    // happens on helper function calls (such as '_chkstk', '_ftol2') where
276263508Sdim    // standard calling convention is not used (RegMask is not used to mark
277263508Sdim    // register clobbered and register usage (def/imp-def/use) is well-dfined
278263508Sdim    // and explicitly specified.
279263508Sdim    if (MI->isCall() && !clobbersAnyYmmReg(MI))
280263508Sdim      continue;
281263508Sdim
282234353Sdim    BBHasCall = true;
283234353Sdim
284234353Sdim    // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
285234353Sdim    // registers. This instruction has zero latency. In addition, the processor
286234353Sdim    // changes back to Clean state, after which execution of Intel SSE
287234353Sdim    // instructions or Intel AVX instructions has no transition penalty. Add
288234353Sdim    // the VZEROUPPER instruction before any function call/return that might
289234353Sdim    // execute SSE code.
290234353Sdim    // FIXME: In some cases, we may want to move the VZEROUPPER into a
291234353Sdim    // predecessor block.
292234353Sdim    if (CurState == ST_DIRTY) {
293234353Sdim      // Only insert the VZEROUPPER in case the entry state isn't unknown.
294234353Sdim      // When unknown, only compute the information within the block to have
295234353Sdim      // it available in the exit if possible, but don't change the block.
296234353Sdim      if (EntryState != ST_UNKNOWN) {
297243830Sdim        BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
298234353Sdim        ++NumVZU;
299234353Sdim      }
300234353Sdim
301234353Sdim      // After the inserted VZEROUPPER the state becomes clean again, but
302234353Sdim      // other YMM may appear before other subsequent calls or even before
303234353Sdim      // the end of the BB.
304234353Sdim      CurState = ST_CLEAN;
305234353Sdim    }
306226584Sdim  }
307226584Sdim
308234353Sdim  DEBUG(dbgs() << "MBB #" << BBNum
309234353Sdim               << ", current state: " << CurState << '\n');
310234353Sdim
311234353Sdim  // A BB can only be considered solved when we both have done all the
312234353Sdim  // necessary transformations, and have computed the exit state.  This happens
313234353Sdim  // in two cases:
314234353Sdim  //  1) We know the entry state: this immediately implies the exit state and
315234353Sdim  //     all the necessary transformations.
316234353Sdim  //  2) There are no calls, and and a non-call instruction marks this block:
317234353Sdim  //     no transformations are necessary, and we know the exit state.
318234353Sdim  if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN))
319234353Sdim    BBSolved[BBNum] = true;
320234353Sdim
321234353Sdim  if (CurState != BBState[BBNum])
322234353Sdim    Changed = true;
323234353Sdim
324234353Sdim  BBState[BBNum] = CurState;
325226584Sdim  return Changed;
326226584Sdim}
327