1226584Sdim//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// 2226584Sdim// 3226584Sdim// The LLVM Compiler Infrastructure 4226584Sdim// 5226584Sdim// This file is distributed under the University of Illinois Open Source 6226584Sdim// License. See LICENSE.TXT for details. 7226584Sdim// 8226584Sdim//===----------------------------------------------------------------------===// 9226584Sdim// 10226584Sdim// This file defines the pass which inserts x86 AVX vzeroupper instructions 11226584Sdim// before calls to SSE encoded functions. This avoids transition latency 12226584Sdim// penalty when tranfering control between AVX encoded instructions and old 13226584Sdim// SSE encoding mode. 14226584Sdim// 15226584Sdim//===----------------------------------------------------------------------===// 16226584Sdim 17234353Sdim#define DEBUG_TYPE "x86-vzeroupper" 18226584Sdim#include "X86.h" 19226584Sdim#include "X86InstrInfo.h" 20226584Sdim#include "llvm/ADT/Statistic.h" 21226584Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 22226584Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 23234353Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 24226584Sdim#include "llvm/CodeGen/Passes.h" 25234353Sdim#include "llvm/Support/Debug.h" 26234353Sdim#include "llvm/Support/raw_ostream.h" 27226584Sdim#include "llvm/Target/TargetInstrInfo.h" 28226584Sdimusing namespace llvm; 29226584Sdim 30226584SdimSTATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); 31226584Sdim 32226584Sdimnamespace { 33226584Sdim struct VZeroUpperInserter : public MachineFunctionPass { 34226584Sdim static char ID; 35226584Sdim VZeroUpperInserter() : MachineFunctionPass(ID) {} 36226584Sdim 37226584Sdim virtual bool runOnMachineFunction(MachineFunction &MF); 38226584Sdim 39226584Sdim bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); 40226584Sdim 41226584Sdim virtual const char *getPassName() const { return "X86 vzeroupper inserter";} 42226584Sdim 43226584Sdim private: 44226584Sdim const TargetInstrInfo *TII; // Machine instruction info. 45234353Sdim 46234353Sdim // Any YMM register live-in to this function? 47234353Sdim bool FnHasLiveInYmm; 48234353Sdim 49234353Sdim // BBState - Contains the state of each MBB: unknown, clean, dirty 50234353Sdim SmallVector<uint8_t, 8> BBState; 51234353Sdim 52234353Sdim // BBSolved - Keep track of all MBB which had been already analyzed 53234353Sdim // and there is no further processing required. 54234353Sdim BitVector BBSolved; 55234353Sdim 56234353Sdim // Machine Basic Blocks are classified according this pass: 57234353Sdim // 58234353Sdim // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state 59234353Sdim // until the MBB exit there isn't a instruction using YMM to change 60234353Sdim // the state to dirty, or one of the incoming predecessors is unknown 61234353Sdim // and there's not a dirty predecessor between them. 62234353Sdim // 63234353Sdim // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have 64234353Sdim // instructions using YMM and be marked ST_CLEAN, as long as the state 65234353Sdim // is cleaned by a vzeroupper before any call. 66234353Sdim // 67234353Sdim // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a 68234353Sdim // vzeroupper instruction. 69234353Sdim // 70234353Sdim // ST_INIT - Placeholder for an empty state set 71234353Sdim // 72234353Sdim enum { 73234353Sdim ST_UNKNOWN = 0, 74234353Sdim ST_CLEAN = 1, 75234353Sdim ST_DIRTY = 2, 76234353Sdim ST_INIT = 3 77234353Sdim }; 78234353Sdim 79234353Sdim // computeState - Given two states, compute the resulting state, in 80234353Sdim // the following way 81234353Sdim // 82234353Sdim // 1) One dirty state yields another dirty state 83234353Sdim // 2) All states must be clean for the result to be clean 84234353Sdim // 3) If none above and one unknown, the result state is also unknown 85234353Sdim // 86243830Sdim static unsigned computeState(unsigned PrevState, unsigned CurState) { 87234353Sdim if (PrevState == ST_INIT) 88234353Sdim return CurState; 89234353Sdim 90234353Sdim if (PrevState == ST_DIRTY || CurState == ST_DIRTY) 91234353Sdim return ST_DIRTY; 92234353Sdim 93234353Sdim if (PrevState == ST_CLEAN && CurState == ST_CLEAN) 94234353Sdim return ST_CLEAN; 95234353Sdim 96234353Sdim return ST_UNKNOWN; 97234353Sdim } 98234353Sdim 99226584Sdim }; 100226584Sdim char VZeroUpperInserter::ID = 0; 101226584Sdim} 102226584Sdim 103226584SdimFunctionPass *llvm::createX86IssueVZeroUpperPass() { 104226584Sdim return new VZeroUpperInserter(); 105226584Sdim} 106226584Sdim 107234353Sdimstatic bool isYmmReg(unsigned Reg) { 108263508Sdim return (Reg >= X86::YMM0 && Reg <= X86::YMM31); 109263508Sdim} 110234353Sdim 111263508Sdimstatic bool isZmmReg(unsigned Reg) { 112263508Sdim return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31); 113234353Sdim} 114234353Sdim 115234353Sdimstatic bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { 116234353Sdim for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), 117234353Sdim E = MRI.livein_end(); I != E; ++I) 118263508Sdim if (isYmmReg(I->first) || isZmmReg(I->first)) 119234353Sdim return true; 120234353Sdim 121234353Sdim return false; 122234353Sdim} 123234353Sdim 124249423Sdimstatic bool clobbersAllYmmRegs(const MachineOperand &MO) { 125263508Sdim for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) { 126249423Sdim if (!MO.clobbersPhysReg(reg)) 127249423Sdim return false; 128249423Sdim } 129263508Sdim for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) { 130263508Sdim if (!MO.clobbersPhysReg(reg)) 131263508Sdim return false; 132263508Sdim } 133249423Sdim return true; 134249423Sdim} 135249423Sdim 136234353Sdimstatic bool hasYmmReg(MachineInstr *MI) { 137243830Sdim for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 138234353Sdim const MachineOperand &MO = MI->getOperand(i); 139249423Sdim if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO)) 140249423Sdim return true; 141234353Sdim if (!MO.isReg()) 142234353Sdim continue; 143234353Sdim if (MO.isDebug()) 144234353Sdim continue; 145234353Sdim if (isYmmReg(MO.getReg())) 146234353Sdim return true; 147234353Sdim } 148234353Sdim return false; 149234353Sdim} 150234353Sdim 151263508Sdim/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this 152263508Sdim/// instruction. 153263508Sdimstatic bool clobbersAnyYmmReg(MachineInstr *MI) { 154263508Sdim for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 155263508Sdim const MachineOperand &MO = MI->getOperand(i); 156263508Sdim if (!MO.isRegMask()) 157263508Sdim continue; 158263508Sdim for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) { 159263508Sdim if (MO.clobbersPhysReg(reg)) 160263508Sdim return true; 161263508Sdim } 162263508Sdim for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) { 163263508Sdim if (MO.clobbersPhysReg(reg)) 164263508Sdim return true; 165263508Sdim } 166263508Sdim } 167263508Sdim return false; 168263508Sdim} 169263508Sdim 170226584Sdim/// runOnMachineFunction - Loop over all of the basic blocks, inserting 171226584Sdim/// vzero upper instructions before function calls. 172226584Sdimbool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { 173226584Sdim TII = MF.getTarget().getInstrInfo(); 174234353Sdim MachineRegisterInfo &MRI = MF.getRegInfo(); 175234353Sdim bool EverMadeChange = false; 176226584Sdim 177234353Sdim // Fast check: if the function doesn't use any ymm registers, we don't need 178234353Sdim // to insert any VZEROUPPER instructions. This is constant-time, so it is 179234353Sdim // cheap in the common case of no ymm use. 180234353Sdim bool YMMUsed = false; 181239462Sdim const TargetRegisterClass *RC = &X86::VR256RegClass; 182234353Sdim for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); 183234353Sdim i != e; i++) { 184243830Sdim if (!MRI.reg_nodbg_empty(*i)) { 185234353Sdim YMMUsed = true; 186234353Sdim break; 187234353Sdim } 188234353Sdim } 189234353Sdim if (!YMMUsed) 190234353Sdim return EverMadeChange; 191226584Sdim 192234353Sdim // Pre-compute the existence of any live-in YMM registers to this function 193234353Sdim FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); 194226584Sdim 195234353Sdim assert(BBState.empty()); 196234353Sdim BBState.resize(MF.getNumBlockIDs(), 0); 197234353Sdim BBSolved.resize(MF.getNumBlockIDs(), 0); 198226584Sdim 199234353Sdim // Each BB state depends on all predecessors, loop over until everything 200234353Sdim // converges. (Once we converge, we can implicitly mark everything that is 201234353Sdim // still ST_UNKNOWN as ST_CLEAN.) 202234353Sdim while (1) { 203234353Sdim bool MadeChange = false; 204226584Sdim 205234353Sdim // Process all basic blocks. 206234353Sdim for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) 207234353Sdim MadeChange |= processBasicBlock(MF, *I); 208226584Sdim 209234353Sdim // If this iteration over the code changed anything, keep iterating. 210234353Sdim if (!MadeChange) break; 211234353Sdim EverMadeChange = true; 212234353Sdim } 213226584Sdim 214234353Sdim BBState.clear(); 215234353Sdim BBSolved.clear(); 216234353Sdim return EverMadeChange; 217226584Sdim} 218226584Sdim 219226584Sdim/// processBasicBlock - Loop over all of the instructions in the basic block, 220226584Sdim/// inserting vzero upper instructions before function calls. 221226584Sdimbool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, 222226584Sdim MachineBasicBlock &BB) { 223226584Sdim bool Changed = false; 224234353Sdim unsigned BBNum = BB.getNumber(); 225226584Sdim 226234353Sdim // Don't process already solved BBs 227234353Sdim if (BBSolved[BBNum]) 228234353Sdim return false; // No changes 229234353Sdim 230234353Sdim // Check the state of all predecessors 231234353Sdim unsigned EntryState = ST_INIT; 232234353Sdim for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(), 233234353Sdim PE = BB.pred_end(); PI != PE; ++PI) { 234234353Sdim EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]); 235234353Sdim if (EntryState == ST_DIRTY) 236234353Sdim break; 237234353Sdim } 238234353Sdim 239234353Sdim 240239462Sdim // The entry MBB for the function may set the initial state to dirty if 241234353Sdim // the function receives any YMM incoming arguments 242243830Sdim if (&BB == MF.begin()) { 243234353Sdim EntryState = ST_CLEAN; 244234353Sdim if (FnHasLiveInYmm) 245234353Sdim EntryState = ST_DIRTY; 246234353Sdim } 247234353Sdim 248234353Sdim // The current state is initialized according to the predecessors 249234353Sdim unsigned CurState = EntryState; 250234353Sdim bool BBHasCall = false; 251234353Sdim 252226584Sdim for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { 253263508Sdim DebugLoc dl = I->getDebugLoc(); 254226584Sdim MachineInstr *MI = I; 255263508Sdim 256234353Sdim bool isControlFlow = MI->isCall() || MI->isReturn(); 257226584Sdim 258239462Sdim // Shortcut: don't need to check regular instructions in dirty state. 259234353Sdim if (!isControlFlow && CurState == ST_DIRTY) 260234353Sdim continue; 261234353Sdim 262234353Sdim if (hasYmmReg(MI)) { 263234353Sdim // We found a ymm-using instruction; this could be an AVX instruction, 264234353Sdim // or it could be control flow. 265234353Sdim CurState = ST_DIRTY; 266234353Sdim continue; 267226584Sdim } 268234353Sdim 269234353Sdim // Check for control-flow out of the current function (which might 270234353Sdim // indirectly execute SSE instructions). 271234353Sdim if (!isControlFlow) 272234353Sdim continue; 273234353Sdim 274263508Sdim // If the call won't clobber any YMM register, skip it as well. It usually 275263508Sdim // happens on helper function calls (such as '_chkstk', '_ftol2') where 276263508Sdim // standard calling convention is not used (RegMask is not used to mark 277263508Sdim // register clobbered and register usage (def/imp-def/use) is well-dfined 278263508Sdim // and explicitly specified. 279263508Sdim if (MI->isCall() && !clobbersAnyYmmReg(MI)) 280263508Sdim continue; 281263508Sdim 282234353Sdim BBHasCall = true; 283234353Sdim 284234353Sdim // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX 285234353Sdim // registers. This instruction has zero latency. In addition, the processor 286234353Sdim // changes back to Clean state, after which execution of Intel SSE 287234353Sdim // instructions or Intel AVX instructions has no transition penalty. Add 288234353Sdim // the VZEROUPPER instruction before any function call/return that might 289234353Sdim // execute SSE code. 290234353Sdim // FIXME: In some cases, we may want to move the VZEROUPPER into a 291234353Sdim // predecessor block. 292234353Sdim if (CurState == ST_DIRTY) { 293234353Sdim // Only insert the VZEROUPPER in case the entry state isn't unknown. 294234353Sdim // When unknown, only compute the information within the block to have 295234353Sdim // it available in the exit if possible, but don't change the block. 296234353Sdim if (EntryState != ST_UNKNOWN) { 297243830Sdim BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER)); 298234353Sdim ++NumVZU; 299234353Sdim } 300234353Sdim 301234353Sdim // After the inserted VZEROUPPER the state becomes clean again, but 302234353Sdim // other YMM may appear before other subsequent calls or even before 303234353Sdim // the end of the BB. 304234353Sdim CurState = ST_CLEAN; 305234353Sdim } 306226584Sdim } 307226584Sdim 308234353Sdim DEBUG(dbgs() << "MBB #" << BBNum 309234353Sdim << ", current state: " << CurState << '\n'); 310234353Sdim 311234353Sdim // A BB can only be considered solved when we both have done all the 312234353Sdim // necessary transformations, and have computed the exit state. This happens 313234353Sdim // in two cases: 314234353Sdim // 1) We know the entry state: this immediately implies the exit state and 315234353Sdim // all the necessary transformations. 316234353Sdim // 2) There are no calls, and and a non-call instruction marks this block: 317234353Sdim // no transformations are necessary, and we know the exit state. 318234353Sdim if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN)) 319234353Sdim BBSolved[BBNum] = true; 320234353Sdim 321234353Sdim if (CurState != BBState[BBNum]) 322234353Sdim Changed = true; 323234353Sdim 324234353Sdim BBState[BBNum] = CurState; 325226584Sdim return Changed; 326226584Sdim} 327