// X86VZeroUpper.cpp revision 243830
1226584Sdim//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// 2226584Sdim// 3226584Sdim// The LLVM Compiler Infrastructure 4226584Sdim// 5226584Sdim// This file is distributed under the University of Illinois Open Source 6226584Sdim// License. See LICENSE.TXT for details. 7226584Sdim// 8226584Sdim//===----------------------------------------------------------------------===// 9226584Sdim// 10226584Sdim// This file defines the pass which inserts x86 AVX vzeroupper instructions 11226584Sdim// before calls to SSE encoded functions. This avoids transition latency 12226584Sdim// penalty when tranfering control between AVX encoded instructions and old 13226584Sdim// SSE encoding mode. 14226584Sdim// 15226584Sdim//===----------------------------------------------------------------------===// 16226584Sdim 17234353Sdim#define DEBUG_TYPE "x86-vzeroupper" 18226584Sdim#include "X86.h" 19226584Sdim#include "X86InstrInfo.h" 20226584Sdim#include "llvm/ADT/Statistic.h" 21226584Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 22226584Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 23234353Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 24226584Sdim#include "llvm/CodeGen/Passes.h" 25234353Sdim#include "llvm/Support/Debug.h" 26234353Sdim#include "llvm/Support/raw_ostream.h" 27226584Sdim#include "llvm/Target/TargetInstrInfo.h" 28226584Sdimusing namespace llvm; 29226584Sdim 30226584SdimSTATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); 31226584Sdim 32226584Sdimnamespace { 33226584Sdim struct VZeroUpperInserter : public MachineFunctionPass { 34226584Sdim static char ID; 35226584Sdim VZeroUpperInserter() : MachineFunctionPass(ID) {} 36226584Sdim 37226584Sdim virtual bool runOnMachineFunction(MachineFunction &MF); 38226584Sdim 39226584Sdim bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); 40226584Sdim 41226584Sdim virtual const char *getPassName() const { return "X86 vzeroupper inserter";} 42226584Sdim 43226584Sdim 
private: 44226584Sdim const TargetInstrInfo *TII; // Machine instruction info. 45234353Sdim 46234353Sdim // Any YMM register live-in to this function? 47234353Sdim bool FnHasLiveInYmm; 48234353Sdim 49234353Sdim // BBState - Contains the state of each MBB: unknown, clean, dirty 50234353Sdim SmallVector<uint8_t, 8> BBState; 51234353Sdim 52234353Sdim // BBSolved - Keep track of all MBB which had been already analyzed 53234353Sdim // and there is no further processing required. 54234353Sdim BitVector BBSolved; 55234353Sdim 56234353Sdim // Machine Basic Blocks are classified according this pass: 57234353Sdim // 58234353Sdim // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state 59234353Sdim // until the MBB exit there isn't a instruction using YMM to change 60234353Sdim // the state to dirty, or one of the incoming predecessors is unknown 61234353Sdim // and there's not a dirty predecessor between them. 62234353Sdim // 63234353Sdim // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have 64234353Sdim // instructions using YMM and be marked ST_CLEAN, as long as the state 65234353Sdim // is cleaned by a vzeroupper before any call. 66234353Sdim // 67234353Sdim // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a 68234353Sdim // vzeroupper instruction. 
69234353Sdim // 70234353Sdim // ST_INIT - Placeholder for an empty state set 71234353Sdim // 72234353Sdim enum { 73234353Sdim ST_UNKNOWN = 0, 74234353Sdim ST_CLEAN = 1, 75234353Sdim ST_DIRTY = 2, 76234353Sdim ST_INIT = 3 77234353Sdim }; 78234353Sdim 79234353Sdim // computeState - Given two states, compute the resulting state, in 80234353Sdim // the following way 81234353Sdim // 82234353Sdim // 1) One dirty state yields another dirty state 83234353Sdim // 2) All states must be clean for the result to be clean 84234353Sdim // 3) If none above and one unknown, the result state is also unknown 85234353Sdim // 86243830Sdim static unsigned computeState(unsigned PrevState, unsigned CurState) { 87234353Sdim if (PrevState == ST_INIT) 88234353Sdim return CurState; 89234353Sdim 90234353Sdim if (PrevState == ST_DIRTY || CurState == ST_DIRTY) 91234353Sdim return ST_DIRTY; 92234353Sdim 93234353Sdim if (PrevState == ST_CLEAN && CurState == ST_CLEAN) 94234353Sdim return ST_CLEAN; 95234353Sdim 96234353Sdim return ST_UNKNOWN; 97234353Sdim } 98234353Sdim 99226584Sdim }; 100226584Sdim char VZeroUpperInserter::ID = 0; 101226584Sdim} 102226584Sdim 103226584SdimFunctionPass *llvm::createX86IssueVZeroUpperPass() { 104226584Sdim return new VZeroUpperInserter(); 105226584Sdim} 106226584Sdim 107234353Sdimstatic bool isYmmReg(unsigned Reg) { 108234353Sdim if (Reg >= X86::YMM0 && Reg <= X86::YMM15) 109234353Sdim return true; 110234353Sdim 111234353Sdim return false; 112234353Sdim} 113234353Sdim 114234353Sdimstatic bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { 115234353Sdim for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), 116234353Sdim E = MRI.livein_end(); I != E; ++I) 117234353Sdim if (isYmmReg(I->first)) 118234353Sdim return true; 119234353Sdim 120234353Sdim return false; 121234353Sdim} 122234353Sdim 123234353Sdimstatic bool hasYmmReg(MachineInstr *MI) { 124243830Sdim for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 125234353Sdim const MachineOperand &MO 
= MI->getOperand(i); 126234353Sdim if (!MO.isReg()) 127234353Sdim continue; 128234353Sdim if (MO.isDebug()) 129234353Sdim continue; 130234353Sdim if (isYmmReg(MO.getReg())) 131234353Sdim return true; 132234353Sdim } 133234353Sdim return false; 134234353Sdim} 135234353Sdim 136226584Sdim/// runOnMachineFunction - Loop over all of the basic blocks, inserting 137226584Sdim/// vzero upper instructions before function calls. 138226584Sdimbool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { 139226584Sdim TII = MF.getTarget().getInstrInfo(); 140234353Sdim MachineRegisterInfo &MRI = MF.getRegInfo(); 141234353Sdim bool EverMadeChange = false; 142226584Sdim 143234353Sdim // Fast check: if the function doesn't use any ymm registers, we don't need 144234353Sdim // to insert any VZEROUPPER instructions. This is constant-time, so it is 145234353Sdim // cheap in the common case of no ymm use. 146234353Sdim bool YMMUsed = false; 147239462Sdim const TargetRegisterClass *RC = &X86::VR256RegClass; 148234353Sdim for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); 149234353Sdim i != e; i++) { 150243830Sdim if (!MRI.reg_nodbg_empty(*i)) { 151234353Sdim YMMUsed = true; 152234353Sdim break; 153234353Sdim } 154234353Sdim } 155234353Sdim if (!YMMUsed) 156234353Sdim return EverMadeChange; 157226584Sdim 158234353Sdim // Pre-compute the existence of any live-in YMM registers to this function 159234353Sdim FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); 160226584Sdim 161234353Sdim assert(BBState.empty()); 162234353Sdim BBState.resize(MF.getNumBlockIDs(), 0); 163234353Sdim BBSolved.resize(MF.getNumBlockIDs(), 0); 164226584Sdim 165234353Sdim // Each BB state depends on all predecessors, loop over until everything 166234353Sdim // converges. (Once we converge, we can implicitly mark everything that is 167234353Sdim // still ST_UNKNOWN as ST_CLEAN.) 168234353Sdim while (1) { 169234353Sdim bool MadeChange = false; 170226584Sdim 171234353Sdim // Process all basic blocks. 
172234353Sdim for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) 173234353Sdim MadeChange |= processBasicBlock(MF, *I); 174226584Sdim 175234353Sdim // If this iteration over the code changed anything, keep iterating. 176234353Sdim if (!MadeChange) break; 177234353Sdim EverMadeChange = true; 178234353Sdim } 179226584Sdim 180234353Sdim BBState.clear(); 181234353Sdim BBSolved.clear(); 182234353Sdim return EverMadeChange; 183226584Sdim} 184226584Sdim 185226584Sdim/// processBasicBlock - Loop over all of the instructions in the basic block, 186226584Sdim/// inserting vzero upper instructions before function calls. 187226584Sdimbool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, 188226584Sdim MachineBasicBlock &BB) { 189226584Sdim bool Changed = false; 190234353Sdim unsigned BBNum = BB.getNumber(); 191226584Sdim 192234353Sdim // Don't process already solved BBs 193234353Sdim if (BBSolved[BBNum]) 194234353Sdim return false; // No changes 195234353Sdim 196234353Sdim // Check the state of all predecessors 197234353Sdim unsigned EntryState = ST_INIT; 198234353Sdim for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(), 199234353Sdim PE = BB.pred_end(); PI != PE; ++PI) { 200234353Sdim EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]); 201234353Sdim if (EntryState == ST_DIRTY) 202234353Sdim break; 203234353Sdim } 204234353Sdim 205234353Sdim 206239462Sdim // The entry MBB for the function may set the initial state to dirty if 207234353Sdim // the function receives any YMM incoming arguments 208243830Sdim if (&BB == MF.begin()) { 209234353Sdim EntryState = ST_CLEAN; 210234353Sdim if (FnHasLiveInYmm) 211234353Sdim EntryState = ST_DIRTY; 212234353Sdim } 213234353Sdim 214234353Sdim // The current state is initialized according to the predecessors 215234353Sdim unsigned CurState = EntryState; 216234353Sdim bool BBHasCall = false; 217234353Sdim 218226584Sdim for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); 
++I) { 219226584Sdim MachineInstr *MI = I; 220226584Sdim DebugLoc dl = I->getDebugLoc(); 221234353Sdim bool isControlFlow = MI->isCall() || MI->isReturn(); 222226584Sdim 223239462Sdim // Shortcut: don't need to check regular instructions in dirty state. 224234353Sdim if (!isControlFlow && CurState == ST_DIRTY) 225234353Sdim continue; 226234353Sdim 227234353Sdim if (hasYmmReg(MI)) { 228234353Sdim // We found a ymm-using instruction; this could be an AVX instruction, 229234353Sdim // or it could be control flow. 230234353Sdim CurState = ST_DIRTY; 231234353Sdim continue; 232226584Sdim } 233234353Sdim 234234353Sdim // Check for control-flow out of the current function (which might 235234353Sdim // indirectly execute SSE instructions). 236234353Sdim if (!isControlFlow) 237234353Sdim continue; 238234353Sdim 239234353Sdim BBHasCall = true; 240234353Sdim 241234353Sdim // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX 242234353Sdim // registers. This instruction has zero latency. In addition, the processor 243234353Sdim // changes back to Clean state, after which execution of Intel SSE 244234353Sdim // instructions or Intel AVX instructions has no transition penalty. Add 245234353Sdim // the VZEROUPPER instruction before any function call/return that might 246234353Sdim // execute SSE code. 247234353Sdim // FIXME: In some cases, we may want to move the VZEROUPPER into a 248234353Sdim // predecessor block. 249234353Sdim if (CurState == ST_DIRTY) { 250234353Sdim // Only insert the VZEROUPPER in case the entry state isn't unknown. 251234353Sdim // When unknown, only compute the information within the block to have 252234353Sdim // it available in the exit if possible, but don't change the block. 
253234353Sdim if (EntryState != ST_UNKNOWN) { 254243830Sdim BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER)); 255234353Sdim ++NumVZU; 256234353Sdim } 257234353Sdim 258234353Sdim // After the inserted VZEROUPPER the state becomes clean again, but 259234353Sdim // other YMM may appear before other subsequent calls or even before 260234353Sdim // the end of the BB. 261234353Sdim CurState = ST_CLEAN; 262234353Sdim } 263226584Sdim } 264226584Sdim 265234353Sdim DEBUG(dbgs() << "MBB #" << BBNum 266234353Sdim << ", current state: " << CurState << '\n'); 267234353Sdim 268234353Sdim // A BB can only be considered solved when we both have done all the 269234353Sdim // necessary transformations, and have computed the exit state. This happens 270234353Sdim // in two cases: 271234353Sdim // 1) We know the entry state: this immediately implies the exit state and 272234353Sdim // all the necessary transformations. 273234353Sdim // 2) There are no calls, and and a non-call instruction marks this block: 274234353Sdim // no transformations are necessary, and we know the exit state. 275234353Sdim if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN)) 276234353Sdim BBSolved[BBNum] = true; 277234353Sdim 278234353Sdim if (CurState != BBState[BBNum]) 279234353Sdim Changed = true; 280234353Sdim 281234353Sdim BBState[BBNum] = CurState; 282226584Sdim return Changed; 283226584Sdim} 284