//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The Cortex-A15 processor employs a tracking scheme in its register renaming
// in order to process each instruction's micro-ops speculatively and
// out-of-order with appropriate forwarding. The ARM architecture allows VFP
// instructions to read and write 32-bit S-registers.  Each S-register
// corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
//
// There are several instruction patterns which can be used to provide this
// capability and which can provide higher performance than other, potentially
// more direct patterns, specifically around when one micro-op reads a
// D-register operand that has recently been written as one or more S-register
// results.
//
// This file defines a pre-regalloc pass which looks for SPR producers which
// are going to be used by DPR (or QPR) consumers and creates the more
// optimized access pattern.
23249259Sdim// 24249259Sdim//===----------------------------------------------------------------------===// 25249259Sdim 26249259Sdim#include "ARM.h" 27249259Sdim#include "ARMBaseInstrInfo.h" 28276479Sdim#include "ARMBaseRegisterInfo.h" 29288943Sdim#include "ARMSubtarget.h" 30249259Sdim#include "llvm/ADT/Statistic.h" 31288943Sdim#include "llvm/CodeGen/MachineFunction.h" 32249259Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 33249259Sdim#include "llvm/CodeGen/MachineInstr.h" 34249259Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 35249259Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 36327952Sdim#include "llvm/CodeGen/TargetRegisterInfo.h" 37327952Sdim#include "llvm/CodeGen/TargetSubtargetInfo.h" 38249259Sdim#include "llvm/Support/Debug.h" 39288943Sdim#include "llvm/Support/raw_ostream.h" 40280031Sdim#include <map> 41249259Sdim#include <set> 42249259Sdim 43249259Sdimusing namespace llvm; 44249259Sdim 45276479Sdim#define DEBUG_TYPE "a15-sd-optimizer" 46276479Sdim 47249259Sdimnamespace { 48249259Sdim struct A15SDOptimizer : public MachineFunctionPass { 49249259Sdim static char ID; 50249259Sdim A15SDOptimizer() : MachineFunctionPass(ID) {} 51249259Sdim 52276479Sdim bool runOnMachineFunction(MachineFunction &Fn) override; 53249259Sdim 54314564Sdim StringRef getPassName() const override { return "ARM A15 S->D optimizer"; } 55249259Sdim 56249259Sdim private: 57249259Sdim const ARMBaseInstrInfo *TII; 58249259Sdim const TargetRegisterInfo *TRI; 59249259Sdim MachineRegisterInfo *MRI; 60249259Sdim 61249259Sdim bool runOnInstruction(MachineInstr *MI); 62249259Sdim 63249259Sdim // 64249259Sdim // Instruction builder helpers 65249259Sdim // 66249259Sdim unsigned createDupLane(MachineBasicBlock &MBB, 67249259Sdim MachineBasicBlock::iterator InsertBefore, 68309124Sdim const DebugLoc &DL, unsigned Reg, unsigned Lane, 69309124Sdim bool QPR = false); 70249259Sdim 71249259Sdim unsigned createExtractSubreg(MachineBasicBlock &MBB, 72249259Sdim MachineBasicBlock::iterator 
InsertBefore, 73309124Sdim const DebugLoc &DL, unsigned DReg, 74309124Sdim unsigned Lane, const TargetRegisterClass *TRC); 75249259Sdim 76249259Sdim unsigned createVExt(MachineBasicBlock &MBB, 77249259Sdim MachineBasicBlock::iterator InsertBefore, 78309124Sdim const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1); 79249259Sdim 80249259Sdim unsigned createRegSequence(MachineBasicBlock &MBB, 81249259Sdim MachineBasicBlock::iterator InsertBefore, 82309124Sdim const DebugLoc &DL, unsigned Reg1, 83309124Sdim unsigned Reg2); 84249259Sdim 85249259Sdim unsigned createInsertSubreg(MachineBasicBlock &MBB, 86249259Sdim MachineBasicBlock::iterator InsertBefore, 87309124Sdim const DebugLoc &DL, unsigned DReg, 88309124Sdim unsigned Lane, unsigned ToInsert); 89249259Sdim 90249259Sdim unsigned createImplicitDef(MachineBasicBlock &MBB, 91249259Sdim MachineBasicBlock::iterator InsertBefore, 92309124Sdim const DebugLoc &DL); 93276479Sdim 94249259Sdim // 95249259Sdim // Various property checkers 96249259Sdim // 97249259Sdim bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC); 98249259Sdim bool hasPartialWrite(MachineInstr *MI); 99249259Sdim SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI); 100249259Sdim unsigned getDPRLaneFromSPR(unsigned SReg); 101249259Sdim 102249259Sdim // 103249259Sdim // Methods used for getting the definitions of partial registers 104249259Sdim // 105249259Sdim 106249259Sdim MachineInstr *elideCopies(MachineInstr *MI); 107249259Sdim void elideCopiesAndPHIs(MachineInstr *MI, 108249259Sdim SmallVectorImpl<MachineInstr*> &Outs); 109249259Sdim 110249259Sdim // 111249259Sdim // Pattern optimization methods 112249259Sdim // 113249259Sdim unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg); 114249259Sdim unsigned optimizeSDPattern(MachineInstr *MI); 115249259Sdim unsigned getPrefSPRLane(unsigned SReg); 116249259Sdim 117249259Sdim // 118249259Sdim // Sanitizing method - used to make sure if don't leave dead code around. 
119249259Sdim // 120249259Sdim void eraseInstrWithNoUses(MachineInstr *MI); 121249259Sdim 122249259Sdim // 123249259Sdim // A map used to track the changes done by this pass. 124249259Sdim // 125249259Sdim std::map<MachineInstr*, unsigned> Replacements; 126249259Sdim std::set<MachineInstr *> DeadInstr; 127249259Sdim }; 128249259Sdim char A15SDOptimizer::ID = 0; 129249259Sdim} // end anonymous namespace 130249259Sdim 131249259Sdim// Returns true if this is a use of a SPR register. 132249259Sdimbool A15SDOptimizer::usesRegClass(MachineOperand &MO, 133249259Sdim const TargetRegisterClass *TRC) { 134249259Sdim if (!MO.isReg()) 135249259Sdim return false; 136360784Sdim Register Reg = MO.getReg(); 137249259Sdim 138360784Sdim if (Register::isVirtualRegister(Reg)) 139249259Sdim return MRI->getRegClass(Reg)->hasSuperClassEq(TRC); 140249259Sdim else 141249259Sdim return TRC->contains(Reg); 142249259Sdim} 143249259Sdim 144249259Sdimunsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) { 145249259Sdim unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, 146249259Sdim &ARM::DPRRegClass); 147249259Sdim if (DReg != ARM::NoRegister) return ARM::ssub_1; 148249259Sdim return ARM::ssub_0; 149249259Sdim} 150249259Sdim 151249259Sdim// Get the subreg type that is most likely to be coalesced 152249259Sdim// for an SPR register that will be used in VDUP32d pseudo. 
153249259Sdimunsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { 154360784Sdim if (!Register::isVirtualRegister(SReg)) 155249259Sdim return getDPRLaneFromSPR(SReg); 156249259Sdim 157249259Sdim MachineInstr *MI = MRI->getVRegDef(SReg); 158249259Sdim if (!MI) return ARM::ssub_0; 159249259Sdim MachineOperand *MO = MI->findRegisterDefOperand(SReg); 160360784Sdim if (!MO) return ARM::ssub_0; 161276479Sdim assert(MO->isReg() && "Non-register operand found!"); 162249259Sdim 163249259Sdim if (MI->isCopy() && usesRegClass(MI->getOperand(1), 164249259Sdim &ARM::SPRRegClass)) { 165249259Sdim SReg = MI->getOperand(1).getReg(); 166249259Sdim } 167249259Sdim 168360784Sdim if (Register::isVirtualRegister(SReg)) { 169249259Sdim if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1; 170249259Sdim return ARM::ssub_0; 171249259Sdim } 172249259Sdim return getDPRLaneFromSPR(SReg); 173249259Sdim} 174249259Sdim 175249259Sdim// MI is known to be dead. Figure out what instructions 176249259Sdim// are also made dead by this and mark them for removal. 177249259Sdimvoid A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { 178249259Sdim SmallVector<MachineInstr *, 8> Front; 179249259Sdim DeadInstr.insert(MI); 180249259Sdim 181341825Sdim LLVM_DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n"); 182249259Sdim Front.push_back(MI); 183249259Sdim 184249259Sdim while (Front.size() != 0) { 185249259Sdim MI = Front.back(); 186249259Sdim Front.pop_back(); 187249259Sdim 188249259Sdim // MI is already known to be dead. We need to see 189249259Sdim // if other instructions can also be removed. 
190327952Sdim for (MachineOperand &MO : MI->operands()) { 191249259Sdim if ((!MO.isReg()) || (!MO.isUse())) 192249259Sdim continue; 193360784Sdim Register Reg = MO.getReg(); 194360784Sdim if (!Register::isVirtualRegister(Reg)) 195249259Sdim continue; 196249259Sdim MachineOperand *Op = MI->findRegisterDefOperand(Reg); 197249259Sdim 198249259Sdim if (!Op) 199249259Sdim continue; 200249259Sdim 201249259Sdim MachineInstr *Def = Op->getParent(); 202249259Sdim 203249259Sdim // We don't need to do anything if we have already marked 204249259Sdim // this instruction as being dead. 205249259Sdim if (DeadInstr.find(Def) != DeadInstr.end()) 206249259Sdim continue; 207249259Sdim 208249259Sdim // Check if all the uses of this instruction are marked as 209249259Sdim // dead. If so, we can also mark this instruction as being 210249259Sdim // dead. 211249259Sdim bool IsDead = true; 212327952Sdim for (MachineOperand &MODef : Def->operands()) { 213249259Sdim if ((!MODef.isReg()) || (!MODef.isDef())) 214249259Sdim continue; 215360784Sdim Register DefReg = MODef.getReg(); 216360784Sdim if (!Register::isVirtualRegister(DefReg)) { 217249259Sdim IsDead = false; 218249259Sdim break; 219249259Sdim } 220327952Sdim for (MachineInstr &Use : MRI->use_instructions(Reg)) { 221249259Sdim // We don't care about self references. 222327952Sdim if (&Use == Def) 223249259Sdim continue; 224327952Sdim if (DeadInstr.find(&Use) == DeadInstr.end()) { 225249259Sdim IsDead = false; 226249259Sdim break; 227249259Sdim } 228249259Sdim } 229249259Sdim } 230249259Sdim 231249259Sdim if (!IsDead) continue; 232249259Sdim 233341825Sdim LLVM_DEBUG(dbgs() << "Deleting instruction " << *Def << "\n"); 234249259Sdim DeadInstr.insert(Def); 235249259Sdim } 236249259Sdim } 237249259Sdim} 238249259Sdim 239249259Sdim// Creates the more optimized patterns and generally does all the code 240249259Sdim// transformations in this pass. 
241249259Sdimunsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { 242249259Sdim if (MI->isCopy()) { 243249259Sdim return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg()); 244249259Sdim } 245249259Sdim 246249259Sdim if (MI->isInsertSubreg()) { 247360784Sdim Register DPRReg = MI->getOperand(1).getReg(); 248360784Sdim Register SPRReg = MI->getOperand(2).getReg(); 249249259Sdim 250360784Sdim if (Register::isVirtualRegister(DPRReg) && Register::isVirtualRegister(SPRReg)) { 251249259Sdim MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg()); 252249259Sdim MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg()); 253249259Sdim 254249259Sdim if (DPRMI && SPRMI) { 255249259Sdim // See if the first operand of this insert_subreg is IMPLICIT_DEF 256249259Sdim MachineInstr *ECDef = elideCopies(DPRMI); 257276479Sdim if (ECDef && ECDef->isImplicitDef()) { 258249259Sdim // Another corner case - if we're inserting something that is purely 259249259Sdim // a subreg copy of a DPR, just use that DPR. 260249259Sdim 261249259Sdim MachineInstr *EC = elideCopies(SPRMI); 262249259Sdim // Is it a subreg copy of ssub_0? 263249259Sdim if (EC && EC->isCopy() && 264249259Sdim EC->getOperand(1).getSubReg() == ARM::ssub_0) { 265341825Sdim LLVM_DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI); 266249259Sdim 267249259Sdim // Find the thing we're subreg copying out of - is it of the same 268249259Sdim // regclass as DPRMI? (i.e. a DPR or QPR). 
269360784Sdim Register FullReg = SPRMI->getOperand(1).getReg(); 270249259Sdim const TargetRegisterClass *TRC = 271249259Sdim MRI->getRegClass(MI->getOperand(1).getReg()); 272249259Sdim if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) { 273341825Sdim LLVM_DEBUG(dbgs() << "Subreg copy is compatible - returning "); 274341825Sdim LLVM_DEBUG(dbgs() << printReg(FullReg) << "\n"); 275249259Sdim eraseInstrWithNoUses(MI); 276249259Sdim return FullReg; 277249259Sdim } 278249259Sdim } 279249259Sdim 280249259Sdim return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg()); 281249259Sdim } 282249259Sdim } 283249259Sdim } 284249259Sdim return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg()); 285249259Sdim } 286249259Sdim 287249259Sdim if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), 288249259Sdim &ARM::SPRRegClass)) { 289249259Sdim // See if all bar one of the operands are IMPLICIT_DEF and insert the 290249259Sdim // optimizer pattern accordingly. 291249259Sdim unsigned NumImplicit = 0, NumTotal = 0; 292249259Sdim unsigned NonImplicitReg = ~0U; 293249259Sdim 294249259Sdim for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) { 295249259Sdim if (!MI->getOperand(I).isReg()) 296249259Sdim continue; 297249259Sdim ++NumTotal; 298360784Sdim Register OpReg = MI->getOperand(I).getReg(); 299249259Sdim 300360784Sdim if (!Register::isVirtualRegister(OpReg)) 301249259Sdim break; 302249259Sdim 303249259Sdim MachineInstr *Def = MRI->getVRegDef(OpReg); 304249259Sdim if (!Def) 305249259Sdim break; 306249259Sdim if (Def->isImplicitDef()) 307249259Sdim ++NumImplicit; 308249259Sdim else 309249259Sdim NonImplicitReg = MI->getOperand(I).getReg(); 310249259Sdim } 311249259Sdim 312249259Sdim if (NumImplicit == NumTotal - 1) 313249259Sdim return optimizeAllLanesPattern(MI, NonImplicitReg); 314249259Sdim else 315249259Sdim return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg()); 316249259Sdim } 317249259Sdim 318276479Sdim llvm_unreachable("Unhandled update 
pattern!"); 319249259Sdim} 320249259Sdim 321249259Sdim// Return true if this MachineInstr inserts a scalar (SPR) value into 322249259Sdim// a D or Q register. 323249259Sdimbool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) { 324249259Sdim // The only way we can do a partial register update is through a COPY, 325249259Sdim // INSERT_SUBREG or REG_SEQUENCE. 326249259Sdim if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass)) 327249259Sdim return true; 328249259Sdim 329249259Sdim if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2), 330249259Sdim &ARM::SPRRegClass)) 331249259Sdim return true; 332249259Sdim 333249259Sdim if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass)) 334249259Sdim return true; 335249259Sdim 336249259Sdim return false; 337249259Sdim} 338249259Sdim 339249259Sdim// Looks through full copies to get the instruction that defines the input 340249259Sdim// operand for MI. 341249259SdimMachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) { 342249259Sdim if (!MI->isFullCopy()) 343249259Sdim return MI; 344360784Sdim if (!Register::isVirtualRegister(MI->getOperand(1).getReg())) 345276479Sdim return nullptr; 346249259Sdim MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg()); 347249259Sdim if (!Def) 348276479Sdim return nullptr; 349249259Sdim return elideCopies(Def); 350249259Sdim} 351249259Sdim 352249259Sdim// Look through full copies and PHIs to get the set of non-copy MachineInstrs 353249259Sdim// that can produce MI. 354249259Sdimvoid A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, 355249259Sdim SmallVectorImpl<MachineInstr*> &Outs) { 356249259Sdim // Looking through PHIs may create loops so we need to track what 357249259Sdim // instructions we have visited before. 
358249259Sdim std::set<MachineInstr *> Reached; 359249259Sdim SmallVector<MachineInstr *, 8> Front; 360249259Sdim Front.push_back(MI); 361249259Sdim while (Front.size() != 0) { 362249259Sdim MI = Front.back(); 363249259Sdim Front.pop_back(); 364249259Sdim 365249259Sdim // If we have already explored this MachineInstr, ignore it. 366249259Sdim if (Reached.find(MI) != Reached.end()) 367249259Sdim continue; 368249259Sdim Reached.insert(MI); 369249259Sdim if (MI->isPHI()) { 370249259Sdim for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { 371360784Sdim Register Reg = MI->getOperand(I).getReg(); 372360784Sdim if (!Register::isVirtualRegister(Reg)) { 373249259Sdim continue; 374249259Sdim } 375249259Sdim MachineInstr *NewMI = MRI->getVRegDef(Reg); 376249259Sdim if (!NewMI) 377249259Sdim continue; 378249259Sdim Front.push_back(NewMI); 379249259Sdim } 380249259Sdim } else if (MI->isFullCopy()) { 381360784Sdim if (!Register::isVirtualRegister(MI->getOperand(1).getReg())) 382249259Sdim continue; 383249259Sdim MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg()); 384249259Sdim if (!NewMI) 385249259Sdim continue; 386249259Sdim Front.push_back(NewMI); 387249259Sdim } else { 388341825Sdim LLVM_DEBUG(dbgs() << "Found partial copy" << *MI << "\n"); 389249259Sdim Outs.push_back(MI); 390249259Sdim } 391249259Sdim } 392249259Sdim} 393249259Sdim 394249259Sdim// Return the DPR virtual registers that are read by this machine instruction 395249259Sdim// (if any). 
396249259SdimSmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) { 397249259Sdim if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() || 398249259Sdim MI->isKill()) 399249259Sdim return SmallVector<unsigned, 8>(); 400249259Sdim 401249259Sdim SmallVector<unsigned, 8> Defs; 402327952Sdim for (MachineOperand &MO : MI->operands()) { 403249259Sdim if (!MO.isReg() || !MO.isUse()) 404249259Sdim continue; 405249259Sdim if (!usesRegClass(MO, &ARM::DPRRegClass) && 406265925Sdim !usesRegClass(MO, &ARM::QPRRegClass) && 407265925Sdim !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR 408249259Sdim continue; 409249259Sdim 410249259Sdim Defs.push_back(MO.getReg()); 411249259Sdim } 412249259Sdim return Defs; 413249259Sdim} 414249259Sdim 415249259Sdim// Creates a DPR register from an SPR one by using a VDUP. 416309124Sdimunsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, 417309124Sdim MachineBasicBlock::iterator InsertBefore, 418309124Sdim const DebugLoc &DL, unsigned Reg, 419309124Sdim unsigned Lane, bool QPR) { 420360784Sdim Register Out = 421360784Sdim MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : &ARM::DPRRegClass); 422321369Sdim BuildMI(MBB, InsertBefore, DL, 423321369Sdim TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d), Out) 424321369Sdim .addReg(Reg) 425321369Sdim .addImm(Lane) 426321369Sdim .add(predOps(ARMCC::AL)); 427276479Sdim 428249259Sdim return Out; 429249259Sdim} 430249259Sdim 431249259Sdim// Creates a SPR register from a DPR by copying the value in lane 0. 
432309124Sdimunsigned A15SDOptimizer::createExtractSubreg( 433309124Sdim MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, 434309124Sdim const DebugLoc &DL, unsigned DReg, unsigned Lane, 435309124Sdim const TargetRegisterClass *TRC) { 436360784Sdim Register Out = MRI->createVirtualRegister(TRC); 437249259Sdim BuildMI(MBB, 438249259Sdim InsertBefore, 439249259Sdim DL, 440249259Sdim TII->get(TargetOpcode::COPY), Out) 441249259Sdim .addReg(DReg, 0, Lane); 442249259Sdim 443249259Sdim return Out; 444249259Sdim} 445249259Sdim 446249259Sdim// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE. 447309124Sdimunsigned A15SDOptimizer::createRegSequence( 448309124Sdim MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, 449309124Sdim const DebugLoc &DL, unsigned Reg1, unsigned Reg2) { 450360784Sdim Register Out = MRI->createVirtualRegister(&ARM::QPRRegClass); 451249259Sdim BuildMI(MBB, 452249259Sdim InsertBefore, 453249259Sdim DL, 454249259Sdim TII->get(TargetOpcode::REG_SEQUENCE), Out) 455249259Sdim .addReg(Reg1) 456249259Sdim .addImm(ARM::dsub_0) 457249259Sdim .addReg(Reg2) 458249259Sdim .addImm(ARM::dsub_1); 459249259Sdim return Out; 460249259Sdim} 461249259Sdim 462249259Sdim// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1) 463249259Sdim// and merges them into one DPR register. 
464309124Sdimunsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB, 465309124Sdim MachineBasicBlock::iterator InsertBefore, 466309124Sdim const DebugLoc &DL, unsigned Ssub0, 467309124Sdim unsigned Ssub1) { 468360784Sdim Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass); 469321369Sdim BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out) 470321369Sdim .addReg(Ssub0) 471321369Sdim .addReg(Ssub1) 472321369Sdim .addImm(1) 473321369Sdim .add(predOps(ARMCC::AL)); 474249259Sdim return Out; 475249259Sdim} 476249259Sdim 477309124Sdimunsigned A15SDOptimizer::createInsertSubreg( 478309124Sdim MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, 479309124Sdim const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) { 480360784Sdim Register Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); 481249259Sdim BuildMI(MBB, 482249259Sdim InsertBefore, 483249259Sdim DL, 484249259Sdim TII->get(TargetOpcode::INSERT_SUBREG), Out) 485249259Sdim .addReg(DReg) 486249259Sdim .addReg(ToInsert) 487249259Sdim .addImm(Lane); 488249259Sdim 489249259Sdim return Out; 490249259Sdim} 491249259Sdim 492249259Sdimunsigned 493249259SdimA15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB, 494249259Sdim MachineBasicBlock::iterator InsertBefore, 495309124Sdim const DebugLoc &DL) { 496360784Sdim Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass); 497249259Sdim BuildMI(MBB, 498249259Sdim InsertBefore, 499249259Sdim DL, 500249259Sdim TII->get(TargetOpcode::IMPLICIT_DEF), Out); 501249259Sdim return Out; 502249259Sdim} 503249259Sdim 504249259Sdim// This function inserts instructions in order to optimize interactions between 505249259Sdim// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all 506249259Sdim// lanes, and the using VEXT instructions to recompose the result. 
507249259Sdimunsigned 508249259SdimA15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) { 509249259Sdim MachineBasicBlock::iterator InsertPt(MI); 510249259Sdim DebugLoc DL = MI->getDebugLoc(); 511249259Sdim MachineBasicBlock &MBB = *MI->getParent(); 512249259Sdim InsertPt++; 513249259Sdim unsigned Out; 514249259Sdim 515265925Sdim // DPair has the same length as QPR and also has two DPRs as subreg. 516265925Sdim // Treat DPair as QPR. 517265925Sdim if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) || 518265925Sdim MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) { 519249259Sdim unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg, 520249259Sdim ARM::dsub_0, &ARM::DPRRegClass); 521249259Sdim unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg, 522249259Sdim ARM::dsub_1, &ARM::DPRRegClass); 523249259Sdim 524249259Sdim unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0); 525249259Sdim unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1); 526249259Sdim Out = createVExt(MBB, InsertPt, DL, Out1, Out2); 527249259Sdim 528249259Sdim unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0); 529249259Sdim unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1); 530249259Sdim Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4); 531249259Sdim 532249259Sdim Out = createRegSequence(MBB, InsertPt, DL, Out, Out2); 533249259Sdim 534249259Sdim } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) { 535249259Sdim unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0); 536249259Sdim unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1); 537249259Sdim Out = createVExt(MBB, InsertPt, DL, Out1, Out2); 538249259Sdim 539249259Sdim } else { 540249259Sdim assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) && 541249259Sdim "Found unexpected regclass!"); 542249259Sdim 543249259Sdim unsigned PrefLane = getPrefSPRLane(Reg); 544249259Sdim unsigned Lane; 545249259Sdim switch (PrefLane) { 
546249259Sdim case ARM::ssub_0: Lane = 0; break; 547249259Sdim case ARM::ssub_1: Lane = 1; break; 548249259Sdim default: llvm_unreachable("Unknown preferred lane!"); 549249259Sdim } 550249259Sdim 551265925Sdim // Treat DPair as QPR 552265925Sdim bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) || 553265925Sdim usesRegClass(MI->getOperand(0), &ARM::DPairRegClass); 554249259Sdim 555249259Sdim Out = createImplicitDef(MBB, InsertPt, DL); 556249259Sdim Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg); 557249259Sdim Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR); 558249259Sdim eraseInstrWithNoUses(MI); 559249259Sdim } 560249259Sdim return Out; 561249259Sdim} 562249259Sdim 563249259Sdimbool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { 564249259Sdim // We look for instructions that write S registers that are then read as 565249259Sdim // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and 566249259Sdim // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or 567249259Sdim // merge two SPR values to form a DPR register. In order avoid false 568249259Sdim // positives we make sure that there is an SPR producer so we look past 569249259Sdim // COPY and PHI nodes to find it. 570249259Sdim // 571249259Sdim // The best code pattern for when an SPR producer is going to be used by a 572249259Sdim // DPR or QPR consumer depends on whether the other lanes of the 573249259Sdim // corresponding DPR/QPR are currently defined. 574249259Sdim // 575249259Sdim // We can handle these efficiently, depending on the type of 576249259Sdim // pseudo-instruction that is producing the pattern 577249259Sdim // 578249259Sdim // * COPY: * VDUP all lanes and merge the results together 579249259Sdim // using VEXTs. 
580249259Sdim // 581249259Sdim // * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR 582249259Sdim // lane, and the other lane(s) of the DPR/QPR register 583249259Sdim // that we are inserting in are undefined, use the 584276479Sdim // original DPR/QPR value. 585249259Sdim // * Otherwise, fall back on the same stategy as COPY. 586249259Sdim // 587249259Sdim // * REG_SEQUENCE: * If all except one of the input operands are 588249259Sdim // IMPLICIT_DEFs, insert the VDUP pattern for just the 589249259Sdim // defined input operand 590249259Sdim // * Otherwise, fall back on the same stategy as COPY. 591249259Sdim // 592249259Sdim 593249259Sdim // First, get all the reads of D-registers done by this instruction. 594249259Sdim SmallVector<unsigned, 8> Defs = getReadDPRs(MI); 595249259Sdim bool Modified = false; 596249259Sdim 597261991Sdim for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end(); 598249259Sdim I != E; ++I) { 599249259Sdim // Follow the def-use chain for this DPR through COPYs, and also through 600249259Sdim // PHIs (which are essentially multi-way COPYs). It is because of PHIs that 601249259Sdim // we can end up with multiple defs of this DPR. 602249259Sdim 603249259Sdim SmallVector<MachineInstr *, 8> DefSrcs; 604360784Sdim if (!Register::isVirtualRegister(*I)) 605249259Sdim continue; 606249259Sdim MachineInstr *Def = MRI->getVRegDef(*I); 607249259Sdim if (!Def) 608249259Sdim continue; 609249259Sdim 610249259Sdim elideCopiesAndPHIs(Def, DefSrcs); 611249259Sdim 612327952Sdim for (MachineInstr *MI : DefSrcs) { 613249259Sdim // If we've already analyzed and replaced this operand, don't do 614249259Sdim // anything. 615249259Sdim if (Replacements.find(MI) != Replacements.end()) 616249259Sdim continue; 617249259Sdim 618249259Sdim // Now, work out if the instruction causes a SPR->DPR dependency. 
619249259Sdim if (!hasPartialWrite(MI)) 620249259Sdim continue; 621249259Sdim 622249259Sdim // Collect all the uses of this MI's DPR def for updating later. 623249259Sdim SmallVector<MachineOperand*, 8> Uses; 624360784Sdim Register DPRDefReg = MI->getOperand(0).getReg(); 625249259Sdim for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg), 626249259Sdim E = MRI->use_end(); I != E; ++I) 627276479Sdim Uses.push_back(&*I); 628249259Sdim 629249259Sdim // We can optimize this. 630249259Sdim unsigned NewReg = optimizeSDPattern(MI); 631249259Sdim 632249259Sdim if (NewReg != 0) { 633249259Sdim Modified = true; 634261991Sdim for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(), 635249259Sdim E = Uses.end(); I != E; ++I) { 636261991Sdim // Make sure to constrain the register class of the new register to 637261991Sdim // match what we're replacing. Otherwise we can optimize a DPR_VFP2 638261991Sdim // reference into a plain DPR, and that will end poorly. NewReg is 639261991Sdim // always virtual here, so there will always be a matching subclass 640261991Sdim // to find. 641261991Sdim MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg())); 642261991Sdim 643341825Sdim LLVM_DEBUG(dbgs() << "Replacing operand " << **I << " with " 644341825Sdim << printReg(NewReg) << "\n"); 645249259Sdim (*I)->substVirtReg(NewReg, 0, *TRI); 646249259Sdim } 647249259Sdim } 648249259Sdim Replacements[MI] = NewReg; 649249259Sdim } 650249259Sdim } 651249259Sdim return Modified; 652249259Sdim} 653249259Sdim 654249259Sdimbool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) { 655327952Sdim if (skipFunction(Fn.getFunction())) 656309124Sdim return false; 657309124Sdim 658288943Sdim const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>(); 659288943Sdim // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be 660288943Sdim // enabled when NEON is available. 
661341825Sdim if (!(STI.useSplatVFPToNeon() && STI.hasNEON())) 662288943Sdim return false; 663341825Sdim 664288943Sdim TII = STI.getInstrInfo(); 665288943Sdim TRI = STI.getRegisterInfo(); 666249259Sdim MRI = &Fn.getRegInfo(); 667249259Sdim bool Modified = false; 668249259Sdim 669341825Sdim LLVM_DEBUG(dbgs() << "Running on function " << Fn.getName() << "\n"); 670249259Sdim 671249259Sdim DeadInstr.clear(); 672249259Sdim Replacements.clear(); 673249259Sdim 674327952Sdim for (MachineBasicBlock &MBB : Fn) { 675327952Sdim for (MachineInstr &MI : MBB) { 676327952Sdim Modified |= runOnInstruction(&MI); 677249259Sdim } 678249259Sdim } 679249259Sdim 680327952Sdim for (MachineInstr *MI : DeadInstr) { 681327952Sdim MI->eraseFromParent(); 682249259Sdim } 683249259Sdim 684249259Sdim return Modified; 685249259Sdim} 686249259Sdim 687249259SdimFunctionPass *llvm::createA15SDOptimizerPass() { 688249259Sdim return new A15SDOptimizer(); 689249259Sdim} 690