//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The Cortex-A15 processor employs a tracking scheme in its register renaming
// in order to process each instruction's micro-ops speculatively and
// out-of-order with appropriate forwarding. The ARM architecture allows VFP
// instructions to read and write 32-bit S-registers.  Each S-register
// corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
//
// There are several instruction patterns which can be used to provide this
// capability which can provide higher performance than other, potentially more
// direct patterns, specifically around when one micro-op reads a D-register
// operand that has recently been written as one or more S-register results.
//
// This file defines a pre-regalloc pass which looks for SPR producers which
// are going to be used by DPR (or QPR) consumers and creates the more
// optimized access pattern.
24249259Sdim// 25249259Sdim//===----------------------------------------------------------------------===// 26249259Sdim 27249259Sdim#define DEBUG_TYPE "a15-sd-optimizer" 28249259Sdim#include "ARM.h" 29249259Sdim#include "ARMBaseInstrInfo.h" 30249259Sdim#include "ARMSubtarget.h" 31249259Sdim#include "ARMISelLowering.h" 32249259Sdim#include "ARMTargetMachine.h" 33249259Sdim 34249259Sdim#include "llvm/ADT/SmallPtrSet.h" 35249259Sdim#include "llvm/ADT/Statistic.h" 36249259Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 37249259Sdim#include "llvm/CodeGen/MachineInstr.h" 38249259Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 39249259Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 40249259Sdim#include "llvm/Support/CommandLine.h" 41249259Sdim#include "llvm/Support/Debug.h" 42249259Sdim#include "llvm/Support/raw_ostream.h" 43249259Sdim#include "llvm/Target/TargetRegisterInfo.h" 44249259Sdim 45249259Sdim#include <set> 46249259Sdim 47249259Sdimusing namespace llvm; 48249259Sdim 49249259Sdimnamespace { 50249259Sdim struct A15SDOptimizer : public MachineFunctionPass { 51249259Sdim static char ID; 52249259Sdim A15SDOptimizer() : MachineFunctionPass(ID) {} 53249259Sdim 54249259Sdim virtual bool runOnMachineFunction(MachineFunction &Fn); 55249259Sdim 56249259Sdim virtual const char *getPassName() const { 57249259Sdim return "ARM A15 S->D optimizer"; 58249259Sdim } 59249259Sdim 60249259Sdim private: 61249259Sdim const ARMBaseInstrInfo *TII; 62249259Sdim const TargetRegisterInfo *TRI; 63249259Sdim MachineRegisterInfo *MRI; 64249259Sdim 65249259Sdim bool runOnInstruction(MachineInstr *MI); 66249259Sdim 67249259Sdim // 68249259Sdim // Instruction builder helpers 69249259Sdim // 70249259Sdim unsigned createDupLane(MachineBasicBlock &MBB, 71249259Sdim MachineBasicBlock::iterator InsertBefore, 72249259Sdim DebugLoc DL, 73249259Sdim unsigned Reg, unsigned Lane, 74249259Sdim bool QPR=false); 75249259Sdim 76249259Sdim unsigned createExtractSubreg(MachineBasicBlock &MBB, 
77249259Sdim MachineBasicBlock::iterator InsertBefore, 78249259Sdim DebugLoc DL, 79249259Sdim unsigned DReg, unsigned Lane, 80249259Sdim const TargetRegisterClass *TRC); 81249259Sdim 82249259Sdim unsigned createVExt(MachineBasicBlock &MBB, 83249259Sdim MachineBasicBlock::iterator InsertBefore, 84249259Sdim DebugLoc DL, 85249259Sdim unsigned Ssub0, unsigned Ssub1); 86249259Sdim 87249259Sdim unsigned createRegSequence(MachineBasicBlock &MBB, 88249259Sdim MachineBasicBlock::iterator InsertBefore, 89249259Sdim DebugLoc DL, 90249259Sdim unsigned Reg1, unsigned Reg2); 91249259Sdim 92249259Sdim unsigned createInsertSubreg(MachineBasicBlock &MBB, 93249259Sdim MachineBasicBlock::iterator InsertBefore, 94249259Sdim DebugLoc DL, unsigned DReg, unsigned Lane, 95249259Sdim unsigned ToInsert); 96249259Sdim 97249259Sdim unsigned createImplicitDef(MachineBasicBlock &MBB, 98249259Sdim MachineBasicBlock::iterator InsertBefore, 99249259Sdim DebugLoc DL); 100249259Sdim 101249259Sdim // 102249259Sdim // Various property checkers 103249259Sdim // 104249259Sdim bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC); 105249259Sdim bool hasPartialWrite(MachineInstr *MI); 106249259Sdim SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI); 107249259Sdim unsigned getDPRLaneFromSPR(unsigned SReg); 108249259Sdim 109249259Sdim // 110249259Sdim // Methods used for getting the definitions of partial registers 111249259Sdim // 112249259Sdim 113249259Sdim MachineInstr *elideCopies(MachineInstr *MI); 114249259Sdim void elideCopiesAndPHIs(MachineInstr *MI, 115249259Sdim SmallVectorImpl<MachineInstr*> &Outs); 116249259Sdim 117249259Sdim // 118249259Sdim // Pattern optimization methods 119249259Sdim // 120249259Sdim unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg); 121249259Sdim unsigned optimizeSDPattern(MachineInstr *MI); 122249259Sdim unsigned getPrefSPRLane(unsigned SReg); 123249259Sdim 124249259Sdim // 125249259Sdim // Sanitizing method - used to make sure if don't 
leave dead code around. 126249259Sdim // 127249259Sdim void eraseInstrWithNoUses(MachineInstr *MI); 128249259Sdim 129249259Sdim // 130249259Sdim // A map used to track the changes done by this pass. 131249259Sdim // 132249259Sdim std::map<MachineInstr*, unsigned> Replacements; 133249259Sdim std::set<MachineInstr *> DeadInstr; 134249259Sdim }; 135249259Sdim char A15SDOptimizer::ID = 0; 136249259Sdim} // end anonymous namespace 137249259Sdim 138249259Sdim// Returns true if this is a use of a SPR register. 139249259Sdimbool A15SDOptimizer::usesRegClass(MachineOperand &MO, 140249259Sdim const TargetRegisterClass *TRC) { 141249259Sdim if (!MO.isReg()) 142249259Sdim return false; 143249259Sdim unsigned Reg = MO.getReg(); 144249259Sdim 145249259Sdim if (TargetRegisterInfo::isVirtualRegister(Reg)) 146249259Sdim return MRI->getRegClass(Reg)->hasSuperClassEq(TRC); 147249259Sdim else 148249259Sdim return TRC->contains(Reg); 149249259Sdim} 150249259Sdim 151249259Sdimunsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) { 152249259Sdim unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, 153249259Sdim &ARM::DPRRegClass); 154249259Sdim if (DReg != ARM::NoRegister) return ARM::ssub_1; 155249259Sdim return ARM::ssub_0; 156249259Sdim} 157249259Sdim 158249259Sdim// Get the subreg type that is most likely to be coalesced 159249259Sdim// for an SPR register that will be used in VDUP32d pseudo. 
160249259Sdimunsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { 161249259Sdim if (!TRI->isVirtualRegister(SReg)) 162249259Sdim return getDPRLaneFromSPR(SReg); 163249259Sdim 164249259Sdim MachineInstr *MI = MRI->getVRegDef(SReg); 165249259Sdim if (!MI) return ARM::ssub_0; 166249259Sdim MachineOperand *MO = MI->findRegisterDefOperand(SReg); 167249259Sdim 168249259Sdim assert(MO->isReg() && "Non register operand found!"); 169249259Sdim if (!MO) return ARM::ssub_0; 170249259Sdim 171249259Sdim if (MI->isCopy() && usesRegClass(MI->getOperand(1), 172249259Sdim &ARM::SPRRegClass)) { 173249259Sdim SReg = MI->getOperand(1).getReg(); 174249259Sdim } 175249259Sdim 176249259Sdim if (TargetRegisterInfo::isVirtualRegister(SReg)) { 177249259Sdim if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1; 178249259Sdim return ARM::ssub_0; 179249259Sdim } 180249259Sdim return getDPRLaneFromSPR(SReg); 181249259Sdim} 182249259Sdim 183249259Sdim// MI is known to be dead. Figure out what instructions 184249259Sdim// are also made dead by this and mark them for removal. 185249259Sdimvoid A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { 186249259Sdim SmallVector<MachineInstr *, 8> Front; 187249259Sdim DeadInstr.insert(MI); 188249259Sdim 189249259Sdim DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n"); 190249259Sdim Front.push_back(MI); 191249259Sdim 192249259Sdim while (Front.size() != 0) { 193249259Sdim MI = Front.back(); 194249259Sdim Front.pop_back(); 195249259Sdim 196249259Sdim // MI is already known to be dead. We need to see 197249259Sdim // if other instructions can also be removed. 
198249259Sdim for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { 199249259Sdim MachineOperand &MO = MI->getOperand(i); 200249259Sdim if ((!MO.isReg()) || (!MO.isUse())) 201249259Sdim continue; 202249259Sdim unsigned Reg = MO.getReg(); 203249259Sdim if (!TRI->isVirtualRegister(Reg)) 204249259Sdim continue; 205249259Sdim MachineOperand *Op = MI->findRegisterDefOperand(Reg); 206249259Sdim 207249259Sdim if (!Op) 208249259Sdim continue; 209249259Sdim 210249259Sdim MachineInstr *Def = Op->getParent(); 211249259Sdim 212249259Sdim // We don't need to do anything if we have already marked 213249259Sdim // this instruction as being dead. 214249259Sdim if (DeadInstr.find(Def) != DeadInstr.end()) 215249259Sdim continue; 216249259Sdim 217249259Sdim // Check if all the uses of this instruction are marked as 218249259Sdim // dead. If so, we can also mark this instruction as being 219249259Sdim // dead. 220249259Sdim bool IsDead = true; 221249259Sdim for (unsigned int j = 0; j < Def->getNumOperands(); ++j) { 222249259Sdim MachineOperand &MODef = Def->getOperand(j); 223249259Sdim if ((!MODef.isReg()) || (!MODef.isDef())) 224249259Sdim continue; 225249259Sdim unsigned DefReg = MODef.getReg(); 226249259Sdim if (!TRI->isVirtualRegister(DefReg)) { 227249259Sdim IsDead = false; 228249259Sdim break; 229249259Sdim } 230249259Sdim for (MachineRegisterInfo::use_iterator II = MRI->use_begin(Reg), 231249259Sdim EE = MRI->use_end(); 232249259Sdim II != EE; ++II) { 233249259Sdim // We don't care about self references. 
234249259Sdim if (&*II == Def) 235249259Sdim continue; 236249259Sdim if (DeadInstr.find(&*II) == DeadInstr.end()) { 237249259Sdim IsDead = false; 238249259Sdim break; 239249259Sdim } 240249259Sdim } 241249259Sdim } 242249259Sdim 243249259Sdim if (!IsDead) continue; 244249259Sdim 245249259Sdim DEBUG(dbgs() << "Deleting instruction " << *Def << "\n"); 246249259Sdim DeadInstr.insert(Def); 247249259Sdim } 248249259Sdim } 249249259Sdim} 250249259Sdim 251249259Sdim// Creates the more optimized patterns and generally does all the code 252249259Sdim// transformations in this pass. 253249259Sdimunsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { 254249259Sdim if (MI->isCopy()) { 255249259Sdim return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg()); 256249259Sdim } 257249259Sdim 258249259Sdim if (MI->isInsertSubreg()) { 259249259Sdim unsigned DPRReg = MI->getOperand(1).getReg(); 260249259Sdim unsigned SPRReg = MI->getOperand(2).getReg(); 261249259Sdim 262249259Sdim if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) { 263249259Sdim MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg()); 264249259Sdim MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg()); 265249259Sdim 266249259Sdim if (DPRMI && SPRMI) { 267249259Sdim // See if the first operand of this insert_subreg is IMPLICIT_DEF 268249259Sdim MachineInstr *ECDef = elideCopies(DPRMI); 269249259Sdim if (ECDef != 0 && ECDef->isImplicitDef()) { 270249259Sdim // Another corner case - if we're inserting something that is purely 271249259Sdim // a subreg copy of a DPR, just use that DPR. 272249259Sdim 273249259Sdim MachineInstr *EC = elideCopies(SPRMI); 274249259Sdim // Is it a subreg copy of ssub_0? 
275249259Sdim if (EC && EC->isCopy() && 276249259Sdim EC->getOperand(1).getSubReg() == ARM::ssub_0) { 277249259Sdim DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI); 278249259Sdim 279249259Sdim // Find the thing we're subreg copying out of - is it of the same 280249259Sdim // regclass as DPRMI? (i.e. a DPR or QPR). 281249259Sdim unsigned FullReg = SPRMI->getOperand(1).getReg(); 282249259Sdim const TargetRegisterClass *TRC = 283249259Sdim MRI->getRegClass(MI->getOperand(1).getReg()); 284249259Sdim if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) { 285249259Sdim DEBUG(dbgs() << "Subreg copy is compatible - returning "); 286249259Sdim DEBUG(dbgs() << PrintReg(FullReg) << "\n"); 287249259Sdim eraseInstrWithNoUses(MI); 288249259Sdim return FullReg; 289249259Sdim } 290249259Sdim } 291249259Sdim 292249259Sdim return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg()); 293249259Sdim } 294249259Sdim } 295249259Sdim } 296249259Sdim return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg()); 297249259Sdim } 298249259Sdim 299249259Sdim if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), 300249259Sdim &ARM::SPRRegClass)) { 301249259Sdim // See if all bar one of the operands are IMPLICIT_DEF and insert the 302249259Sdim // optimizer pattern accordingly. 
303249259Sdim unsigned NumImplicit = 0, NumTotal = 0; 304249259Sdim unsigned NonImplicitReg = ~0U; 305249259Sdim 306249259Sdim for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) { 307249259Sdim if (!MI->getOperand(I).isReg()) 308249259Sdim continue; 309249259Sdim ++NumTotal; 310249259Sdim unsigned OpReg = MI->getOperand(I).getReg(); 311249259Sdim 312249259Sdim if (!TRI->isVirtualRegister(OpReg)) 313249259Sdim break; 314249259Sdim 315249259Sdim MachineInstr *Def = MRI->getVRegDef(OpReg); 316249259Sdim if (!Def) 317249259Sdim break; 318249259Sdim if (Def->isImplicitDef()) 319249259Sdim ++NumImplicit; 320249259Sdim else 321249259Sdim NonImplicitReg = MI->getOperand(I).getReg(); 322249259Sdim } 323249259Sdim 324249259Sdim if (NumImplicit == NumTotal - 1) 325249259Sdim return optimizeAllLanesPattern(MI, NonImplicitReg); 326249259Sdim else 327249259Sdim return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg()); 328249259Sdim } 329249259Sdim 330249259Sdim assert(0 && "Unhandled update pattern!"); 331249259Sdim return 0; 332249259Sdim} 333249259Sdim 334249259Sdim// Return true if this MachineInstr inserts a scalar (SPR) value into 335249259Sdim// a D or Q register. 336249259Sdimbool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) { 337249259Sdim // The only way we can do a partial register update is through a COPY, 338249259Sdim // INSERT_SUBREG or REG_SEQUENCE. 339249259Sdim if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass)) 340249259Sdim return true; 341249259Sdim 342249259Sdim if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2), 343249259Sdim &ARM::SPRRegClass)) 344249259Sdim return true; 345249259Sdim 346249259Sdim if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass)) 347249259Sdim return true; 348249259Sdim 349249259Sdim return false; 350249259Sdim} 351249259Sdim 352249259Sdim// Looks through full copies to get the instruction that defines the input 353249259Sdim// operand for MI. 
354249259SdimMachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) { 355249259Sdim if (!MI->isFullCopy()) 356249259Sdim return MI; 357249259Sdim if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) 358249259Sdim return NULL; 359249259Sdim MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg()); 360249259Sdim if (!Def) 361249259Sdim return NULL; 362249259Sdim return elideCopies(Def); 363249259Sdim} 364249259Sdim 365249259Sdim// Look through full copies and PHIs to get the set of non-copy MachineInstrs 366249259Sdim// that can produce MI. 367249259Sdimvoid A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, 368249259Sdim SmallVectorImpl<MachineInstr*> &Outs) { 369249259Sdim // Looking through PHIs may create loops so we need to track what 370249259Sdim // instructions we have visited before. 371249259Sdim std::set<MachineInstr *> Reached; 372249259Sdim SmallVector<MachineInstr *, 8> Front; 373249259Sdim Front.push_back(MI); 374249259Sdim while (Front.size() != 0) { 375249259Sdim MI = Front.back(); 376249259Sdim Front.pop_back(); 377249259Sdim 378249259Sdim // If we have already explored this MachineInstr, ignore it. 
379249259Sdim if (Reached.find(MI) != Reached.end()) 380249259Sdim continue; 381249259Sdim Reached.insert(MI); 382249259Sdim if (MI->isPHI()) { 383249259Sdim for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { 384249259Sdim unsigned Reg = MI->getOperand(I).getReg(); 385249259Sdim if (!TRI->isVirtualRegister(Reg)) { 386249259Sdim continue; 387249259Sdim } 388249259Sdim MachineInstr *NewMI = MRI->getVRegDef(Reg); 389249259Sdim if (!NewMI) 390249259Sdim continue; 391249259Sdim Front.push_back(NewMI); 392249259Sdim } 393249259Sdim } else if (MI->isFullCopy()) { 394249259Sdim if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) 395249259Sdim continue; 396249259Sdim MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg()); 397249259Sdim if (!NewMI) 398249259Sdim continue; 399249259Sdim Front.push_back(NewMI); 400249259Sdim } else { 401249259Sdim DEBUG(dbgs() << "Found partial copy" << *MI <<"\n"); 402249259Sdim Outs.push_back(MI); 403249259Sdim } 404249259Sdim } 405249259Sdim} 406249259Sdim 407249259Sdim// Return the DPR virtual registers that are read by this machine instruction 408249259Sdim// (if any). 
409249259SdimSmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) { 410249259Sdim if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() || 411249259Sdim MI->isKill()) 412249259Sdim return SmallVector<unsigned, 8>(); 413249259Sdim 414249259Sdim SmallVector<unsigned, 8> Defs; 415249259Sdim for (unsigned i = 0; i < MI->getNumOperands(); ++i) { 416249259Sdim MachineOperand &MO = MI->getOperand(i); 417249259Sdim 418249259Sdim if (!MO.isReg() || !MO.isUse()) 419249259Sdim continue; 420249259Sdim if (!usesRegClass(MO, &ARM::DPRRegClass) && 421266715Sdim !usesRegClass(MO, &ARM::QPRRegClass) && 422266715Sdim !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR 423249259Sdim continue; 424249259Sdim 425249259Sdim Defs.push_back(MO.getReg()); 426249259Sdim } 427249259Sdim return Defs; 428249259Sdim} 429249259Sdim 430249259Sdim// Creates a DPR register from an SPR one by using a VDUP. 431249259Sdimunsigned 432249259SdimA15SDOptimizer::createDupLane(MachineBasicBlock &MBB, 433249259Sdim MachineBasicBlock::iterator InsertBefore, 434249259Sdim DebugLoc DL, 435249259Sdim unsigned Reg, unsigned Lane, bool QPR) { 436249259Sdim unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : 437249259Sdim &ARM::DPRRegClass); 438249259Sdim AddDefaultPred(BuildMI(MBB, 439249259Sdim InsertBefore, 440249259Sdim DL, 441249259Sdim TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d), 442249259Sdim Out) 443249259Sdim .addReg(Reg) 444249259Sdim .addImm(Lane)); 445249259Sdim 446249259Sdim return Out; 447249259Sdim} 448249259Sdim 449249259Sdim// Creates a SPR register from a DPR by copying the value in lane 0. 
450249259Sdimunsigned 451249259SdimA15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB, 452249259Sdim MachineBasicBlock::iterator InsertBefore, 453249259Sdim DebugLoc DL, 454249259Sdim unsigned DReg, unsigned Lane, 455249259Sdim const TargetRegisterClass *TRC) { 456249259Sdim unsigned Out = MRI->createVirtualRegister(TRC); 457249259Sdim BuildMI(MBB, 458249259Sdim InsertBefore, 459249259Sdim DL, 460249259Sdim TII->get(TargetOpcode::COPY), Out) 461249259Sdim .addReg(DReg, 0, Lane); 462249259Sdim 463249259Sdim return Out; 464249259Sdim} 465249259Sdim 466249259Sdim// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE. 467249259Sdimunsigned 468249259SdimA15SDOptimizer::createRegSequence(MachineBasicBlock &MBB, 469249259Sdim MachineBasicBlock::iterator InsertBefore, 470249259Sdim DebugLoc DL, 471249259Sdim unsigned Reg1, unsigned Reg2) { 472249259Sdim unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass); 473249259Sdim BuildMI(MBB, 474249259Sdim InsertBefore, 475249259Sdim DL, 476249259Sdim TII->get(TargetOpcode::REG_SEQUENCE), Out) 477249259Sdim .addReg(Reg1) 478249259Sdim .addImm(ARM::dsub_0) 479249259Sdim .addReg(Reg2) 480249259Sdim .addImm(ARM::dsub_1); 481249259Sdim return Out; 482249259Sdim} 483249259Sdim 484249259Sdim// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1) 485249259Sdim// and merges them into one DPR register. 
486249259Sdimunsigned 487249259SdimA15SDOptimizer::createVExt(MachineBasicBlock &MBB, 488249259Sdim MachineBasicBlock::iterator InsertBefore, 489249259Sdim DebugLoc DL, 490249259Sdim unsigned Ssub0, unsigned Ssub1) { 491249259Sdim unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); 492249259Sdim AddDefaultPred(BuildMI(MBB, 493249259Sdim InsertBefore, 494249259Sdim DL, 495249259Sdim TII->get(ARM::VEXTd32), Out) 496249259Sdim .addReg(Ssub0) 497249259Sdim .addReg(Ssub1) 498249259Sdim .addImm(1)); 499249259Sdim return Out; 500249259Sdim} 501249259Sdim 502249259Sdimunsigned 503249259SdimA15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB, 504249259Sdim MachineBasicBlock::iterator InsertBefore, 505249259Sdim DebugLoc DL, unsigned DReg, unsigned Lane, 506249259Sdim unsigned ToInsert) { 507249259Sdim unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); 508249259Sdim BuildMI(MBB, 509249259Sdim InsertBefore, 510249259Sdim DL, 511249259Sdim TII->get(TargetOpcode::INSERT_SUBREG), Out) 512249259Sdim .addReg(DReg) 513249259Sdim .addReg(ToInsert) 514249259Sdim .addImm(Lane); 515249259Sdim 516249259Sdim return Out; 517249259Sdim} 518249259Sdim 519249259Sdimunsigned 520249259SdimA15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB, 521249259Sdim MachineBasicBlock::iterator InsertBefore, 522249259Sdim DebugLoc DL) { 523249259Sdim unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); 524249259Sdim BuildMI(MBB, 525249259Sdim InsertBefore, 526249259Sdim DL, 527249259Sdim TII->get(TargetOpcode::IMPLICIT_DEF), Out); 528249259Sdim return Out; 529249259Sdim} 530249259Sdim 531249259Sdim// This function inserts instructions in order to optimize interactions between 532249259Sdim// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all 533249259Sdim// lanes, and the using VEXT instructions to recompose the result. 
534249259Sdimunsigned 535249259SdimA15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) { 536249259Sdim MachineBasicBlock::iterator InsertPt(MI); 537249259Sdim DebugLoc DL = MI->getDebugLoc(); 538249259Sdim MachineBasicBlock &MBB = *MI->getParent(); 539249259Sdim InsertPt++; 540249259Sdim unsigned Out; 541249259Sdim 542266715Sdim // DPair has the same length as QPR and also has two DPRs as subreg. 543266715Sdim // Treat DPair as QPR. 544266715Sdim if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) || 545266715Sdim MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) { 546249259Sdim unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg, 547249259Sdim ARM::dsub_0, &ARM::DPRRegClass); 548249259Sdim unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg, 549249259Sdim ARM::dsub_1, &ARM::DPRRegClass); 550249259Sdim 551249259Sdim unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0); 552249259Sdim unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1); 553249259Sdim Out = createVExt(MBB, InsertPt, DL, Out1, Out2); 554249259Sdim 555249259Sdim unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0); 556249259Sdim unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1); 557249259Sdim Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4); 558249259Sdim 559249259Sdim Out = createRegSequence(MBB, InsertPt, DL, Out, Out2); 560249259Sdim 561249259Sdim } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) { 562249259Sdim unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0); 563249259Sdim unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1); 564249259Sdim Out = createVExt(MBB, InsertPt, DL, Out1, Out2); 565249259Sdim 566249259Sdim } else { 567249259Sdim assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) && 568249259Sdim "Found unexpected regclass!"); 569249259Sdim 570249259Sdim unsigned PrefLane = getPrefSPRLane(Reg); 571249259Sdim unsigned Lane; 572249259Sdim switch (PrefLane) { 
573249259Sdim case ARM::ssub_0: Lane = 0; break; 574249259Sdim case ARM::ssub_1: Lane = 1; break; 575249259Sdim default: llvm_unreachable("Unknown preferred lane!"); 576249259Sdim } 577249259Sdim 578266715Sdim // Treat DPair as QPR 579266715Sdim bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) || 580266715Sdim usesRegClass(MI->getOperand(0), &ARM::DPairRegClass); 581249259Sdim 582249259Sdim Out = createImplicitDef(MBB, InsertPt, DL); 583249259Sdim Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg); 584249259Sdim Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR); 585249259Sdim eraseInstrWithNoUses(MI); 586249259Sdim } 587249259Sdim return Out; 588249259Sdim} 589249259Sdim 590249259Sdimbool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { 591249259Sdim // We look for instructions that write S registers that are then read as 592249259Sdim // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and 593249259Sdim // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or 594249259Sdim // merge two SPR values to form a DPR register. In order avoid false 595249259Sdim // positives we make sure that there is an SPR producer so we look past 596249259Sdim // COPY and PHI nodes to find it. 597249259Sdim // 598249259Sdim // The best code pattern for when an SPR producer is going to be used by a 599249259Sdim // DPR or QPR consumer depends on whether the other lanes of the 600249259Sdim // corresponding DPR/QPR are currently defined. 601249259Sdim // 602249259Sdim // We can handle these efficiently, depending on the type of 603249259Sdim // pseudo-instruction that is producing the pattern 604249259Sdim // 605249259Sdim // * COPY: * VDUP all lanes and merge the results together 606249259Sdim // using VEXTs. 
607249259Sdim // 608249259Sdim // * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR 609249259Sdim // lane, and the other lane(s) of the DPR/QPR register 610249259Sdim // that we are inserting in are undefined, use the 611249259Sdim // original DPR/QPR value. 612249259Sdim // * Otherwise, fall back on the same stategy as COPY. 613249259Sdim // 614249259Sdim // * REG_SEQUENCE: * If all except one of the input operands are 615249259Sdim // IMPLICIT_DEFs, insert the VDUP pattern for just the 616249259Sdim // defined input operand 617249259Sdim // * Otherwise, fall back on the same stategy as COPY. 618249259Sdim // 619249259Sdim 620249259Sdim // First, get all the reads of D-registers done by this instruction. 621249259Sdim SmallVector<unsigned, 8> Defs = getReadDPRs(MI); 622249259Sdim bool Modified = false; 623249259Sdim 624263508Sdim for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end(); 625249259Sdim I != E; ++I) { 626249259Sdim // Follow the def-use chain for this DPR through COPYs, and also through 627249259Sdim // PHIs (which are essentially multi-way COPYs). It is because of PHIs that 628249259Sdim // we can end up with multiple defs of this DPR. 629249259Sdim 630249259Sdim SmallVector<MachineInstr *, 8> DefSrcs; 631249259Sdim if (!TRI->isVirtualRegister(*I)) 632249259Sdim continue; 633249259Sdim MachineInstr *Def = MRI->getVRegDef(*I); 634249259Sdim if (!Def) 635249259Sdim continue; 636249259Sdim 637249259Sdim elideCopiesAndPHIs(Def, DefSrcs); 638249259Sdim 639263508Sdim for (SmallVectorImpl<MachineInstr *>::iterator II = DefSrcs.begin(), 640249259Sdim EE = DefSrcs.end(); II != EE; ++II) { 641249259Sdim MachineInstr *MI = *II; 642249259Sdim 643249259Sdim // If we've already analyzed and replaced this operand, don't do 644249259Sdim // anything. 645249259Sdim if (Replacements.find(MI) != Replacements.end()) 646249259Sdim continue; 647249259Sdim 648249259Sdim // Now, work out if the instruction causes a SPR->DPR dependency. 
649249259Sdim if (!hasPartialWrite(MI)) 650249259Sdim continue; 651249259Sdim 652249259Sdim // Collect all the uses of this MI's DPR def for updating later. 653249259Sdim SmallVector<MachineOperand*, 8> Uses; 654249259Sdim unsigned DPRDefReg = MI->getOperand(0).getReg(); 655249259Sdim for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg), 656249259Sdim E = MRI->use_end(); I != E; ++I) 657249259Sdim Uses.push_back(&I.getOperand()); 658249259Sdim 659249259Sdim // We can optimize this. 660249259Sdim unsigned NewReg = optimizeSDPattern(MI); 661249259Sdim 662249259Sdim if (NewReg != 0) { 663249259Sdim Modified = true; 664263508Sdim for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(), 665249259Sdim E = Uses.end(); I != E; ++I) { 666263508Sdim // Make sure to constrain the register class of the new register to 667263508Sdim // match what we're replacing. Otherwise we can optimize a DPR_VFP2 668263508Sdim // reference into a plain DPR, and that will end poorly. NewReg is 669263508Sdim // always virtual here, so there will always be a matching subclass 670263508Sdim // to find. 
671263508Sdim MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg())); 672263508Sdim 673249259Sdim DEBUG(dbgs() << "Replacing operand " 674249259Sdim << **I << " with " 675249259Sdim << PrintReg(NewReg) << "\n"); 676249259Sdim (*I)->substVirtReg(NewReg, 0, *TRI); 677249259Sdim } 678249259Sdim } 679249259Sdim Replacements[MI] = NewReg; 680249259Sdim } 681249259Sdim } 682249259Sdim return Modified; 683249259Sdim} 684249259Sdim 685249259Sdimbool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) { 686249259Sdim TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo()); 687249259Sdim TRI = Fn.getTarget().getRegisterInfo(); 688249259Sdim MRI = &Fn.getRegInfo(); 689249259Sdim bool Modified = false; 690249259Sdim 691249259Sdim DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n"); 692249259Sdim 693249259Sdim DeadInstr.clear(); 694249259Sdim Replacements.clear(); 695249259Sdim 696249259Sdim for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; 697249259Sdim ++MFI) { 698249259Sdim 699249259Sdim for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end(); 700249259Sdim MI != ME;) { 701249259Sdim Modified |= runOnInstruction(MI++); 702249259Sdim } 703249259Sdim 704249259Sdim } 705249259Sdim 706249259Sdim for (std::set<MachineInstr *>::iterator I = DeadInstr.begin(), 707249259Sdim E = DeadInstr.end(); 708249259Sdim I != E; ++I) { 709249259Sdim (*I)->eraseFromParent(); 710249259Sdim } 711249259Sdim 712249259Sdim return Modified; 713249259Sdim} 714249259Sdim 715249259SdimFunctionPass *llvm::createA15SDOptimizerPass() { 716249259Sdim return new A15SDOptimizer(); 717249259Sdim} 718