1284677Sdim//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// 2284677Sdim// 3284677Sdim// The LLVM Compiler Infrastructure 4284677Sdim// 5284677Sdim// This file is distributed under the University of Illinois Open Source 6284677Sdim// License. See LICENSE.TXT for details. 7284677Sdim// 8284677Sdim/// The pass tries to use the 32-bit encoding for instructions when possible. 9284677Sdim//===----------------------------------------------------------------------===// 10284677Sdim// 11284677Sdim 12284677Sdim#include "AMDGPU.h" 13284677Sdim#include "AMDGPUMCInstLower.h" 14284677Sdim#include "AMDGPUSubtarget.h" 15284677Sdim#include "SIInstrInfo.h" 16284677Sdim#include "llvm/ADT/Statistic.h" 17284677Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 18284677Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 19284677Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 20284677Sdim#include "llvm/IR/Constants.h" 21284677Sdim#include "llvm/IR/Function.h" 22284677Sdim#include "llvm/IR/LLVMContext.h" 23284677Sdim#include "llvm/Support/Debug.h" 24284677Sdim#include "llvm/Support/raw_ostream.h" 25284677Sdim#include "llvm/Target/TargetMachine.h" 26284677Sdim 27284677Sdim#define DEBUG_TYPE "si-shrink-instructions" 28284677Sdim 29284677SdimSTATISTIC(NumInstructionsShrunk, 30284677Sdim "Number of 64-bit instruction reduced to 32-bit."); 31284677SdimSTATISTIC(NumLiteralConstantsFolded, 32284677Sdim "Number of literal constants folded into 32-bit instructions."); 33284677Sdim 34284677Sdimnamespace llvm { 35284677Sdim void initializeSIShrinkInstructionsPass(PassRegistry&); 36284677Sdim} 37284677Sdim 38284677Sdimusing namespace llvm; 39284677Sdim 40284677Sdimnamespace { 41284677Sdim 42284677Sdimclass SIShrinkInstructions : public MachineFunctionPass { 43284677Sdimpublic: 44284677Sdim static char ID; 45284677Sdim 46284677Sdimpublic: 47284677Sdim SIShrinkInstructions() : MachineFunctionPass(ID) { 48284677Sdim } 49284677Sdim 50284677Sdim bool 
runOnMachineFunction(MachineFunction &MF) override; 51284677Sdim 52284677Sdim const char *getPassName() const override { 53284677Sdim return "SI Shrink Instructions"; 54284677Sdim } 55284677Sdim 56284677Sdim void getAnalysisUsage(AnalysisUsage &AU) const override { 57284677Sdim AU.setPreservesCFG(); 58284677Sdim MachineFunctionPass::getAnalysisUsage(AU); 59284677Sdim } 60284677Sdim}; 61284677Sdim 62284677Sdim} // End anonymous namespace. 63284677Sdim 64284677SdimINITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE, 65284677Sdim "SI Lower il Copies", false, false) 66284677SdimINITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE, 67284677Sdim "SI Lower il Copies", false, false) 68284677Sdim 69284677Sdimchar SIShrinkInstructions::ID = 0; 70284677Sdim 71284677SdimFunctionPass *llvm::createSIShrinkInstructionsPass() { 72284677Sdim return new SIShrinkInstructions(); 73284677Sdim} 74284677Sdim 75284677Sdimstatic bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, 76284677Sdim const MachineRegisterInfo &MRI) { 77284677Sdim if (!MO->isReg()) 78284677Sdim return false; 79284677Sdim 80284677Sdim if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) 81284677Sdim return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); 82284677Sdim 83284677Sdim return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); 84284677Sdim} 85284677Sdim 86284677Sdimstatic bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, 87284677Sdim const SIRegisterInfo &TRI, 88284677Sdim const MachineRegisterInfo &MRI) { 89284677Sdim 90284677Sdim const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 91284677Sdim // Can't shrink instruction with three operands. 92284677Sdim // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add 93284677Sdim // a special case for it. It can only be shrunk if the third operand 94284677Sdim // is vcc. 
We should handle this the same way we handle vopc, by addding 95284677Sdim // a register allocation hint pre-regalloc and then do the shrining 96284677Sdim // post-regalloc. 97286684Sdim if (Src2) { 98286684Sdim switch (MI.getOpcode()) { 99286684Sdim default: return false; 100284677Sdim 101286684Sdim case AMDGPU::V_MAC_F32_e64: 102286684Sdim if (!isVGPR(Src2, TRI, MRI) || 103286684Sdim TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) 104286684Sdim return false; 105286684Sdim break; 106286684Sdim 107286684Sdim case AMDGPU::V_CNDMASK_B32_e64: 108286684Sdim break; 109286684Sdim } 110286684Sdim } 111286684Sdim 112284677Sdim const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 113284677Sdim const MachineOperand *Src1Mod = 114284677Sdim TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 115284677Sdim 116284677Sdim if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) 117284677Sdim return false; 118284677Sdim 119284677Sdim // We don't need to check src0, all input types are legal, so just make sure 120284677Sdim // src0 isn't using any modifiers. 121284677Sdim if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) 122284677Sdim return false; 123284677Sdim 124284677Sdim // Check output modifiers 125284677Sdim if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) 126284677Sdim return false; 127284677Sdim 128284677Sdim if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) 129284677Sdim return false; 130284677Sdim 131284677Sdim return true; 132284677Sdim} 133284677Sdim 134284677Sdim/// \brief This function checks \p MI for operands defined by a move immediate 135284677Sdim/// instruction and then folds the literal constant into the instruction if it 136284677Sdim/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction 137284677Sdim/// and will only fold literal constants if we are still in SSA. 
138284677Sdimstatic void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, 139284677Sdim MachineRegisterInfo &MRI, bool TryToCommute = true) { 140284677Sdim 141284677Sdim if (!MRI.isSSA()) 142284677Sdim return; 143284677Sdim 144296417Sdim assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); 145284677Sdim 146284677Sdim const SIRegisterInfo &TRI = TII->getRegisterInfo(); 147284677Sdim int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); 148284677Sdim MachineOperand &Src0 = MI.getOperand(Src0Idx); 149284677Sdim 150284677Sdim // Only one literal constant is allowed per instruction, so if src0 is a 151284677Sdim // literal constant then we can't do any folding. 152284677Sdim if (Src0.isImm() && 153284677Sdim TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) 154284677Sdim return; 155284677Sdim 156284677Sdim // Literal constants and SGPRs can only be used in Src0, so if Src0 is an 157284677Sdim // SGPR, we cannot commute the instruction, so we can't fold any literal 158284677Sdim // constants. 
159284677Sdim if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI)) 160284677Sdim return; 161284677Sdim 162284677Sdim // Try to fold Src0 163286684Sdim if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) { 164284677Sdim unsigned Reg = Src0.getReg(); 165284677Sdim MachineInstr *Def = MRI.getUniqueVRegDef(Reg); 166284677Sdim if (Def && Def->isMoveImmediate()) { 167284677Sdim MachineOperand &MovSrc = Def->getOperand(1); 168284677Sdim bool ConstantFolded = false; 169284677Sdim 170284677Sdim if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { 171284677Sdim Src0.ChangeToImmediate(MovSrc.getImm()); 172284677Sdim ConstantFolded = true; 173284677Sdim } 174284677Sdim if (ConstantFolded) { 175284677Sdim if (MRI.use_empty(Reg)) 176284677Sdim Def->eraseFromParent(); 177284677Sdim ++NumLiteralConstantsFolded; 178284677Sdim return; 179284677Sdim } 180284677Sdim } 181284677Sdim } 182284677Sdim 183284677Sdim // We have failed to fold src0, so commute the instruction and try again. 184284677Sdim if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) 185284677Sdim foldImmediates(MI, TII, MRI, false); 186284677Sdim 187284677Sdim} 188284677Sdim 189296417Sdim// Copy MachineOperand with all flags except setting it as implicit. 
// Copy MachineOperand with all flags except setting it as implicit.
// Used when an explicit operand of the 64-bit encoding (e.g. src2 of
// V_CNDMASK_B32_e64) becomes an implicit vcc read/def in the 32-bit form.
static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
  assert(!Orig.isImplicit());
  return MachineOperand::CreateReg(Orig.getReg(),
                                   Orig.isDef(),
                                   true,            // IsImplicit
                                   Orig.isKill(),
                                   Orig.isDead(),
                                   Orig.isUndef(),
                                   Orig.isEarlyClobber(),
                                   Orig.getSubReg(),
                                   Orig.isDebug(),
                                   Orig.isInternalRead());
}

// Walk every instruction in the function and, where legal, replace a 64-bit
// VALU encoding with its 32-bit equivalent (and S_MOV_B32 of a small
// immediate with S_MOVK_I32). Returns false: no CFG or interface changes are
// made, only in-place instruction rewrites.
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    // Next is captured before any rewrite so erasing/replacing MI does not
    // invalidate the iteration.
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm()) {
          // Only worthwhile for a 16-bit immediate that is not already free
          // as an inline constant; only the opcode descriptor changes.
          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
        }

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it. Note: commuteInstruction mutates MI in place.
        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          // Pre-RA: hint src2 toward VCC and try again post-RA.
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        // Post-RA: only shrinkable when src2 actually landed in VCC.
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // We can shrink this instruction
      DEBUG(dbgs() << "Shrinking " << MI);

      // Build the 32-bit replacement immediately before MI, then copy the
      // operands across one by one.
      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $dst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.addOperand(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }


      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.addOperand(*Src1);

      const MachineOperand *Src2 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.addOperand(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc.
          assert(Src2->getReg() == AMDGPU::VCC &&
                 "Unexpected missing register operand");
          Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
        }
      }

      ++NumInstructionsShrunk;
      MI.eraseFromParent();

      // The new e32 instruction may now accept a literal src0; try folding.
      foldImmediates(*Inst32, TII, MRI);
      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');


    }
  }
  return false;
}