// Source listing: AArch64LoadStoreOptimizer.cpp (LLVM revision 280031)
1//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file contains a pass that performs load / store related peephole 11// optimizations. This pass should be run after register allocation. 12// 13//===----------------------------------------------------------------------===// 14 15#include "AArch64InstrInfo.h" 16#include "AArch64Subtarget.h" 17#include "MCTargetDesc/AArch64AddressingModes.h" 18#include "llvm/ADT/BitVector.h" 19#include "llvm/ADT/Statistic.h" 20#include "llvm/CodeGen/MachineBasicBlock.h" 21#include "llvm/CodeGen/MachineFunctionPass.h" 22#include "llvm/CodeGen/MachineInstr.h" 23#include "llvm/CodeGen/MachineInstrBuilder.h" 24#include "llvm/Support/CommandLine.h" 25#include "llvm/Support/Debug.h" 26#include "llvm/Support/ErrorHandling.h" 27#include "llvm/Support/raw_ostream.h" 28#include "llvm/Target/TargetInstrInfo.h" 29#include "llvm/Target/TargetMachine.h" 30#include "llvm/Target/TargetRegisterInfo.h" 31using namespace llvm; 32 33#define DEBUG_TYPE "aarch64-ldst-opt" 34 35/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine 36/// load / store instructions to form ldp / stp instructions. 
37 38STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); 39STATISTIC(NumPostFolded, "Number of post-index updates folded"); 40STATISTIC(NumPreFolded, "Number of pre-index updates folded"); 41STATISTIC(NumUnscaledPairCreated, 42 "Number of load/store from unscaled generated"); 43 44static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", 45 cl::init(20), cl::Hidden); 46 47// Place holder while testing unscaled load/store combining 48static cl::opt<bool> EnableAArch64UnscaledMemOp( 49 "aarch64-unscaled-mem-op", cl::Hidden, 50 cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true)); 51 52namespace { 53struct AArch64LoadStoreOpt : public MachineFunctionPass { 54 static char ID; 55 AArch64LoadStoreOpt() : MachineFunctionPass(ID) {} 56 57 const AArch64InstrInfo *TII; 58 const TargetRegisterInfo *TRI; 59 60 // Scan the instructions looking for a load/store that can be combined 61 // with the current instruction into a load/store pair. 62 // Return the matching instruction if one is found, else MBB->end(). 63 // If a matching instruction is found, MergeForward is set to true if the 64 // merge is to remove the first instruction and replace the second with 65 // a pair-wise insn, and false if the reverse is true. 66 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, 67 bool &MergeForward, 68 unsigned Limit); 69 // Merge the two instructions indicated into a single pair-wise instruction. 70 // If MergeForward is true, erase the first instruction and fold its 71 // operation into the second. If false, the reverse. Return the instruction 72 // following the first instruction (which may change during processing). 
73 MachineBasicBlock::iterator 74 mergePairedInsns(MachineBasicBlock::iterator I, 75 MachineBasicBlock::iterator Paired, bool MergeForward); 76 77 // Scan the instruction list to find a base register update that can 78 // be combined with the current instruction (a load or store) using 79 // pre or post indexed addressing with writeback. Scan forwards. 80 MachineBasicBlock::iterator 81 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, 82 int Value); 83 84 // Scan the instruction list to find a base register update that can 85 // be combined with the current instruction (a load or store) using 86 // pre or post indexed addressing with writeback. Scan backwards. 87 MachineBasicBlock::iterator 88 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); 89 90 // Merge a pre-index base register update into a ld/st instruction. 91 MachineBasicBlock::iterator 92 mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, 93 MachineBasicBlock::iterator Update); 94 95 // Merge a post-index base register update into a ld/st instruction. 
96 MachineBasicBlock::iterator 97 mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, 98 MachineBasicBlock::iterator Update); 99 100 bool optimizeBlock(MachineBasicBlock &MBB); 101 102 bool runOnMachineFunction(MachineFunction &Fn) override; 103 104 const char *getPassName() const override { 105 return "AArch64 load / store optimization pass"; 106 } 107 108private: 109 int getMemSize(MachineInstr *MemMI); 110}; 111char AArch64LoadStoreOpt::ID = 0; 112} // namespace 113 114static bool isUnscaledLdst(unsigned Opc) { 115 switch (Opc) { 116 default: 117 return false; 118 case AArch64::STURSi: 119 return true; 120 case AArch64::STURDi: 121 return true; 122 case AArch64::STURQi: 123 return true; 124 case AArch64::STURWi: 125 return true; 126 case AArch64::STURXi: 127 return true; 128 case AArch64::LDURSi: 129 return true; 130 case AArch64::LDURDi: 131 return true; 132 case AArch64::LDURQi: 133 return true; 134 case AArch64::LDURWi: 135 return true; 136 case AArch64::LDURXi: 137 return true; 138 } 139} 140 141// Size in bytes of the data moved by an unscaled load or store 142int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { 143 switch (MemMI->getOpcode()) { 144 default: 145 llvm_unreachable("Opcode has unknown size!"); 146 case AArch64::STRSui: 147 case AArch64::STURSi: 148 return 4; 149 case AArch64::STRDui: 150 case AArch64::STURDi: 151 return 8; 152 case AArch64::STRQui: 153 case AArch64::STURQi: 154 return 16; 155 case AArch64::STRWui: 156 case AArch64::STURWi: 157 return 4; 158 case AArch64::STRXui: 159 case AArch64::STURXi: 160 return 8; 161 case AArch64::LDRSui: 162 case AArch64::LDURSi: 163 return 4; 164 case AArch64::LDRDui: 165 case AArch64::LDURDi: 166 return 8; 167 case AArch64::LDRQui: 168 case AArch64::LDURQi: 169 return 16; 170 case AArch64::LDRWui: 171 case AArch64::LDURWi: 172 return 4; 173 case AArch64::LDRXui: 174 case AArch64::LDURXi: 175 return 8; 176 } 177} 178 179static unsigned getMatchingPairOpcode(unsigned Opc) { 180 switch (Opc) { 181 
default: 182 llvm_unreachable("Opcode has no pairwise equivalent!"); 183 case AArch64::STRSui: 184 case AArch64::STURSi: 185 return AArch64::STPSi; 186 case AArch64::STRDui: 187 case AArch64::STURDi: 188 return AArch64::STPDi; 189 case AArch64::STRQui: 190 case AArch64::STURQi: 191 return AArch64::STPQi; 192 case AArch64::STRWui: 193 case AArch64::STURWi: 194 return AArch64::STPWi; 195 case AArch64::STRXui: 196 case AArch64::STURXi: 197 return AArch64::STPXi; 198 case AArch64::LDRSui: 199 case AArch64::LDURSi: 200 return AArch64::LDPSi; 201 case AArch64::LDRDui: 202 case AArch64::LDURDi: 203 return AArch64::LDPDi; 204 case AArch64::LDRQui: 205 case AArch64::LDURQi: 206 return AArch64::LDPQi; 207 case AArch64::LDRWui: 208 case AArch64::LDURWi: 209 return AArch64::LDPWi; 210 case AArch64::LDRXui: 211 case AArch64::LDURXi: 212 return AArch64::LDPXi; 213 } 214} 215 216static unsigned getPreIndexedOpcode(unsigned Opc) { 217 switch (Opc) { 218 default: 219 llvm_unreachable("Opcode has no pre-indexed equivalent!"); 220 case AArch64::STRSui: 221 return AArch64::STRSpre; 222 case AArch64::STRDui: 223 return AArch64::STRDpre; 224 case AArch64::STRQui: 225 return AArch64::STRQpre; 226 case AArch64::STRWui: 227 return AArch64::STRWpre; 228 case AArch64::STRXui: 229 return AArch64::STRXpre; 230 case AArch64::LDRSui: 231 return AArch64::LDRSpre; 232 case AArch64::LDRDui: 233 return AArch64::LDRDpre; 234 case AArch64::LDRQui: 235 return AArch64::LDRQpre; 236 case AArch64::LDRWui: 237 return AArch64::LDRWpre; 238 case AArch64::LDRXui: 239 return AArch64::LDRXpre; 240 } 241} 242 243static unsigned getPostIndexedOpcode(unsigned Opc) { 244 switch (Opc) { 245 default: 246 llvm_unreachable("Opcode has no post-indexed wise equivalent!"); 247 case AArch64::STRSui: 248 return AArch64::STRSpost; 249 case AArch64::STRDui: 250 return AArch64::STRDpost; 251 case AArch64::STRQui: 252 return AArch64::STRQpost; 253 case AArch64::STRWui: 254 return AArch64::STRWpost; 255 case AArch64::STRXui: 256 
return AArch64::STRXpost; 257 case AArch64::LDRSui: 258 return AArch64::LDRSpost; 259 case AArch64::LDRDui: 260 return AArch64::LDRDpost; 261 case AArch64::LDRQui: 262 return AArch64::LDRQpost; 263 case AArch64::LDRWui: 264 return AArch64::LDRWpost; 265 case AArch64::LDRXui: 266 return AArch64::LDRXpost; 267 } 268} 269 270MachineBasicBlock::iterator 271AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, 272 MachineBasicBlock::iterator Paired, 273 bool MergeForward) { 274 MachineBasicBlock::iterator NextI = I; 275 ++NextI; 276 // If NextI is the second of the two instructions to be merged, we need 277 // to skip one further. Either way we merge will invalidate the iterator, 278 // and we don't need to scan the new instruction, as it's a pairwise 279 // instruction, which we're not considering for further action anyway. 280 if (NextI == Paired) 281 ++NextI; 282 283 bool IsUnscaled = isUnscaledLdst(I->getOpcode()); 284 int OffsetStride = 285 IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; 286 287 unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); 288 // Insert our new paired instruction after whichever of the paired 289 // instructions MergeForward indicates. 290 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; 291 // Also based on MergeForward is from where we copy the base register operand 292 // so we get the flags compatible with the input code. 293 MachineOperand &BaseRegOp = 294 MergeForward ? Paired->getOperand(1) : I->getOperand(1); 295 296 // Which register is Rt and which is Rt2 depends on the offset order. 
297 MachineInstr *RtMI, *Rt2MI; 298 if (I->getOperand(2).getImm() == 299 Paired->getOperand(2).getImm() + OffsetStride) { 300 RtMI = Paired; 301 Rt2MI = I; 302 } else { 303 RtMI = I; 304 Rt2MI = Paired; 305 } 306 // Handle Unscaled 307 int OffsetImm = RtMI->getOperand(2).getImm(); 308 if (IsUnscaled && EnableAArch64UnscaledMemOp) 309 OffsetImm /= OffsetStride; 310 311 // Construct the new instruction. 312 MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, 313 I->getDebugLoc(), TII->get(NewOpc)) 314 .addOperand(RtMI->getOperand(0)) 315 .addOperand(Rt2MI->getOperand(0)) 316 .addOperand(BaseRegOp) 317 .addImm(OffsetImm); 318 (void)MIB; 319 320 // FIXME: Do we need/want to copy the mem operands from the source 321 // instructions? Probably. What uses them after this? 322 323 DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); 324 DEBUG(I->print(dbgs())); 325 DEBUG(dbgs() << " "); 326 DEBUG(Paired->print(dbgs())); 327 DEBUG(dbgs() << " with instruction:\n "); 328 DEBUG(((MachineInstr *)MIB)->print(dbgs())); 329 DEBUG(dbgs() << "\n"); 330 331 // Erase the old instructions. 332 I->eraseFromParent(); 333 Paired->eraseFromParent(); 334 335 return NextI; 336} 337 338/// trackRegDefsUses - Remember what registers the specified instruction uses 339/// and modifies. 
340static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, 341 BitVector &UsedRegs, 342 const TargetRegisterInfo *TRI) { 343 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 344 MachineOperand &MO = MI->getOperand(i); 345 if (MO.isRegMask()) 346 ModifiedRegs.setBitsNotInMask(MO.getRegMask()); 347 348 if (!MO.isReg()) 349 continue; 350 unsigned Reg = MO.getReg(); 351 if (MO.isDef()) { 352 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) 353 ModifiedRegs.set(*AI); 354 } else { 355 assert(MO.isUse() && "Reg operand not a def and not a use?!?"); 356 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) 357 UsedRegs.set(*AI); 358 } 359 } 360} 361 362static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { 363 if (!IsUnscaled && (Offset > 63 || Offset < -64)) 364 return false; 365 if (IsUnscaled) { 366 // Convert the byte-offset used by unscaled into an "element" offset used 367 // by the scaled pair load/store instructions. 368 int ElemOffset = Offset / OffsetStride; 369 if (ElemOffset > 63 || ElemOffset < -64) 370 return false; 371 } 372 return true; 373} 374 375// Do alignment, specialized to power of 2 and for signed ints, 376// avoiding having to do a C-style cast from uint_64t to int when 377// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. 378// FIXME: Move this function to include/MathExtras.h? 379static int alignTo(int Num, int PowOf2) { 380 return (Num + PowOf2 - 1) & ~(PowOf2 - 1); 381} 382 383/// findMatchingInsn - Scan the instructions looking for a load/store that can 384/// be combined with the current instruction into a load/store pair. 
385MachineBasicBlock::iterator 386AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, 387 bool &MergeForward, unsigned Limit) { 388 MachineBasicBlock::iterator E = I->getParent()->end(); 389 MachineBasicBlock::iterator MBBI = I; 390 MachineInstr *FirstMI = I; 391 ++MBBI; 392 393 int Opc = FirstMI->getOpcode(); 394 bool MayLoad = FirstMI->mayLoad(); 395 bool IsUnscaled = isUnscaledLdst(Opc); 396 unsigned Reg = FirstMI->getOperand(0).getReg(); 397 unsigned BaseReg = FirstMI->getOperand(1).getReg(); 398 int Offset = FirstMI->getOperand(2).getImm(); 399 400 // Early exit if the first instruction modifies the base register. 401 // e.g., ldr x0, [x0] 402 // Early exit if the offset if not possible to match. (6 bits of positive 403 // range, plus allow an extra one in case we find a later insn that matches 404 // with Offset-1 405 if (FirstMI->modifiesRegister(BaseReg, TRI)) 406 return E; 407 int OffsetStride = 408 IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1; 409 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) 410 return E; 411 412 // Track which registers have been modified and used between the first insn 413 // (inclusive) and the second insn. 414 BitVector ModifiedRegs, UsedRegs; 415 ModifiedRegs.resize(TRI->getNumRegs()); 416 UsedRegs.resize(TRI->getNumRegs()); 417 for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { 418 MachineInstr *MI = MBBI; 419 // Skip DBG_VALUE instructions. Otherwise debug info can affect the 420 // optimization by changing how far we scan. 421 if (MI->isDebugValue()) 422 continue; 423 424 // Now that we know this is a real instruction, count it. 425 ++Count; 426 427 if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) { 428 // If we've found another instruction with the same opcode, check to see 429 // if the base and offset are compatible with our starting instruction. 430 // These instructions all have scaled immediate operands, so we just 431 // check for +1/-1. 
Make sure to check the new instruction offset is 432 // actually an immediate and not a symbolic reference destined for 433 // a relocation. 434 // 435 // Pairwise instructions have a 7-bit signed offset field. Single insns 436 // have a 12-bit unsigned offset field. To be a valid combine, the 437 // final offset must be in range. 438 unsigned MIBaseReg = MI->getOperand(1).getReg(); 439 int MIOffset = MI->getOperand(2).getImm(); 440 if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || 441 (Offset + OffsetStride == MIOffset))) { 442 int MinOffset = Offset < MIOffset ? Offset : MIOffset; 443 // If this is a volatile load/store that otherwise matched, stop looking 444 // as something is going on that we don't have enough information to 445 // safely transform. Similarly, stop if we see a hint to avoid pairs. 446 if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) 447 return E; 448 // If the resultant immediate offset of merging these instructions 449 // is out of range for a pairwise instruction, bail and keep looking. 450 bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); 451 if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { 452 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 453 continue; 454 } 455 // If the alignment requirements of the paired (scaled) instruction 456 // can't express the offset of the unscaled input, bail and keep 457 // looking. 458 if (IsUnscaled && EnableAArch64UnscaledMemOp && 459 (alignTo(MinOffset, OffsetStride) != MinOffset)) { 460 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 461 continue; 462 } 463 // If the destination register of the loads is the same register, bail 464 // and keep looking. A load-pair instruction with both destination 465 // registers the same is UNPREDICTABLE and will result in an exception. 
466 if (MayLoad && Reg == MI->getOperand(0).getReg()) { 467 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 468 continue; 469 } 470 471 // If the Rt of the second instruction was not modified or used between 472 // the two instructions, we can combine the second into the first. 473 if (!ModifiedRegs[MI->getOperand(0).getReg()] && 474 !UsedRegs[MI->getOperand(0).getReg()]) { 475 MergeForward = false; 476 return MBBI; 477 } 478 479 // Likewise, if the Rt of the first instruction is not modified or used 480 // between the two instructions, we can combine the first into the 481 // second. 482 if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && 483 !UsedRegs[FirstMI->getOperand(0).getReg()]) { 484 MergeForward = true; 485 return MBBI; 486 } 487 // Unable to combine these instructions due to interference in between. 488 // Keep looking. 489 } 490 } 491 492 // If the instruction wasn't a matching load or store, but does (or can) 493 // modify memory, stop searching, as we don't have alias analysis or 494 // anything like that to tell us whether the access is tromping on the 495 // locations we care about. The big one we want to catch is calls. 496 // 497 // FIXME: Theoretically, we can do better than that for SP and FP based 498 // references since we can effectively know where those are touching. It's 499 // unclear if it's worth the extra code, though. Most paired instructions 500 // will be sequential, perhaps with a few intervening non-memory related 501 // instructions. 502 if (MI->mayStore() || MI->isCall()) 503 return E; 504 // Likewise, if we're matching a store instruction, we don't want to 505 // move across a load, as it may be reading the same location. 506 if (FirstMI->mayStore() && MI->mayLoad()) 507 return E; 508 509 // Update modified / uses register lists. 510 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 511 512 // Otherwise, if the base register is modified, we have no match, so 513 // return early. 
514 if (ModifiedRegs[BaseReg]) 515 return E; 516 } 517 return E; 518} 519 520MachineBasicBlock::iterator 521AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, 522 MachineBasicBlock::iterator Update) { 523 assert((Update->getOpcode() == AArch64::ADDXri || 524 Update->getOpcode() == AArch64::SUBXri) && 525 "Unexpected base register update instruction to merge!"); 526 MachineBasicBlock::iterator NextI = I; 527 // Return the instruction following the merged instruction, which is 528 // the instruction following our unmerged load. Unless that's the add/sub 529 // instruction we're merging, in which case it's the one after that. 530 if (++NextI == Update) 531 ++NextI; 532 533 int Value = Update->getOperand(2).getImm(); 534 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && 535 "Can't merge 1 << 12 offset into pre-indexed load / store"); 536 if (Update->getOpcode() == AArch64::SUBXri) 537 Value = -Value; 538 539 unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); 540 MachineInstrBuilder MIB = 541 BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 542 .addOperand(Update->getOperand(0)) 543 .addOperand(I->getOperand(0)) 544 .addOperand(I->getOperand(1)) 545 .addImm(Value); 546 (void)MIB; 547 548 DEBUG(dbgs() << "Creating pre-indexed load/store."); 549 DEBUG(dbgs() << " Replacing instructions:\n "); 550 DEBUG(I->print(dbgs())); 551 DEBUG(dbgs() << " "); 552 DEBUG(Update->print(dbgs())); 553 DEBUG(dbgs() << " with instruction:\n "); 554 DEBUG(((MachineInstr *)MIB)->print(dbgs())); 555 DEBUG(dbgs() << "\n"); 556 557 // Erase the old instructions for the block. 
558 I->eraseFromParent(); 559 Update->eraseFromParent(); 560 561 return NextI; 562} 563 564MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( 565 MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { 566 assert((Update->getOpcode() == AArch64::ADDXri || 567 Update->getOpcode() == AArch64::SUBXri) && 568 "Unexpected base register update instruction to merge!"); 569 MachineBasicBlock::iterator NextI = I; 570 // Return the instruction following the merged instruction, which is 571 // the instruction following our unmerged load. Unless that's the add/sub 572 // instruction we're merging, in which case it's the one after that. 573 if (++NextI == Update) 574 ++NextI; 575 576 int Value = Update->getOperand(2).getImm(); 577 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && 578 "Can't merge 1 << 12 offset into post-indexed load / store"); 579 if (Update->getOpcode() == AArch64::SUBXri) 580 Value = -Value; 581 582 unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); 583 MachineInstrBuilder MIB = 584 BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 585 .addOperand(Update->getOperand(0)) 586 .addOperand(I->getOperand(0)) 587 .addOperand(I->getOperand(1)) 588 .addImm(Value); 589 (void)MIB; 590 591 DEBUG(dbgs() << "Creating post-indexed load/store."); 592 DEBUG(dbgs() << " Replacing instructions:\n "); 593 DEBUG(I->print(dbgs())); 594 DEBUG(dbgs() << " "); 595 DEBUG(Update->print(dbgs())); 596 DEBUG(dbgs() << " with instruction:\n "); 597 DEBUG(((MachineInstr *)MIB)->print(dbgs())); 598 DEBUG(dbgs() << "\n"); 599 600 // Erase the old instructions for the block. 601 I->eraseFromParent(); 602 Update->eraseFromParent(); 603 604 return NextI; 605} 606 607static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, 608 int Offset) { 609 switch (MI->getOpcode()) { 610 default: 611 break; 612 case AArch64::SUBXri: 613 // Negate the offset for a SUB instruction. 
614 Offset *= -1; 615 // FALLTHROUGH 616 case AArch64::ADDXri: 617 // Make sure it's a vanilla immediate operand, not a relocation or 618 // anything else we can't handle. 619 if (!MI->getOperand(2).isImm()) 620 break; 621 // Watch out for 1 << 12 shifted value. 622 if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) 623 break; 624 // If the instruction has the base register as source and dest and the 625 // immediate will fit in a signed 9-bit integer, then we have a match. 626 if (MI->getOperand(0).getReg() == BaseReg && 627 MI->getOperand(1).getReg() == BaseReg && 628 MI->getOperand(2).getImm() <= 255 && 629 MI->getOperand(2).getImm() >= -256) { 630 // If we have a non-zero Offset, we check that it matches the amount 631 // we're adding to the register. 632 if (!Offset || Offset == MI->getOperand(2).getImm()) 633 return true; 634 } 635 break; 636 } 637 return false; 638} 639 640MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( 641 MachineBasicBlock::iterator I, unsigned Limit, int Value) { 642 MachineBasicBlock::iterator E = I->getParent()->end(); 643 MachineInstr *MemMI = I; 644 MachineBasicBlock::iterator MBBI = I; 645 const MachineFunction &MF = *MemMI->getParent()->getParent(); 646 647 unsigned DestReg = MemMI->getOperand(0).getReg(); 648 unsigned BaseReg = MemMI->getOperand(1).getReg(); 649 int Offset = MemMI->getOperand(2).getImm() * 650 TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); 651 652 // If the base register overlaps the destination register, we can't 653 // merge the update. 654 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) 655 return E; 656 657 // Scan forward looking for post-index opportunities. 658 // Updating instructions can't be formed if the memory insn already 659 // has an offset other than the value we're looking for. 
660 if (Offset != Value) 661 return E; 662 663 // Track which registers have been modified and used between the first insn 664 // (inclusive) and the second insn. 665 BitVector ModifiedRegs, UsedRegs; 666 ModifiedRegs.resize(TRI->getNumRegs()); 667 UsedRegs.resize(TRI->getNumRegs()); 668 ++MBBI; 669 for (unsigned Count = 0; MBBI != E; ++MBBI) { 670 MachineInstr *MI = MBBI; 671 // Skip DBG_VALUE instructions. Otherwise debug info can affect the 672 // optimization by changing how far we scan. 673 if (MI->isDebugValue()) 674 continue; 675 676 // Now that we know this is a real instruction, count it. 677 ++Count; 678 679 // If we found a match, return it. 680 if (isMatchingUpdateInsn(MI, BaseReg, Value)) 681 return MBBI; 682 683 // Update the status of what the instruction clobbered and used. 684 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 685 686 // Otherwise, if the base register is used or modified, we have no match, so 687 // return early. 688 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) 689 return E; 690 } 691 return E; 692} 693 694MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( 695 MachineBasicBlock::iterator I, unsigned Limit) { 696 MachineBasicBlock::iterator B = I->getParent()->begin(); 697 MachineBasicBlock::iterator E = I->getParent()->end(); 698 MachineInstr *MemMI = I; 699 MachineBasicBlock::iterator MBBI = I; 700 const MachineFunction &MF = *MemMI->getParent()->getParent(); 701 702 unsigned DestReg = MemMI->getOperand(0).getReg(); 703 unsigned BaseReg = MemMI->getOperand(1).getReg(); 704 int Offset = MemMI->getOperand(2).getImm(); 705 unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); 706 707 // If the load/store is the first instruction in the block, there's obviously 708 // not any matching update. Ditto if the memory offset isn't zero. 709 if (MBBI == B || Offset != 0) 710 return E; 711 // If the base register overlaps the destination register, we can't 712 // merge the update. 
713 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) 714 return E; 715 716 // Track which registers have been modified and used between the first insn 717 // (inclusive) and the second insn. 718 BitVector ModifiedRegs, UsedRegs; 719 ModifiedRegs.resize(TRI->getNumRegs()); 720 UsedRegs.resize(TRI->getNumRegs()); 721 --MBBI; 722 for (unsigned Count = 0; MBBI != B; --MBBI) { 723 MachineInstr *MI = MBBI; 724 // Skip DBG_VALUE instructions. Otherwise debug info can affect the 725 // optimization by changing how far we scan. 726 if (MI->isDebugValue()) 727 continue; 728 729 // Now that we know this is a real instruction, count it. 730 ++Count; 731 732 // If we found a match, return it. 733 if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) 734 return MBBI; 735 736 // Update the status of what the instruction clobbered and used. 737 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 738 739 // Otherwise, if the base register is used or modified, we have no match, so 740 // return early. 741 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) 742 return E; 743 } 744 return E; 745} 746 747bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { 748 bool Modified = false; 749 // Two tranformations to do here: 750 // 1) Find loads and stores that can be merged into a single load or store 751 // pair instruction. 752 // e.g., 753 // ldr x0, [x2] 754 // ldr x1, [x2, #8] 755 // ; becomes 756 // ldp x0, x1, [x2] 757 // 2) Find base register updates that can be merged into the load or store 758 // as a base-reg writeback. 759 // e.g., 760 // ldr x0, [x2] 761 // add x2, x2, #4 762 // ; becomes 763 // ldr x0, [x2], #4 764 765 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); 766 MBBI != E;) { 767 MachineInstr *MI = MBBI; 768 switch (MI->getOpcode()) { 769 default: 770 // Just move on to the next instruction. 
771 ++MBBI; 772 break; 773 case AArch64::STRSui: 774 case AArch64::STRDui: 775 case AArch64::STRQui: 776 case AArch64::STRXui: 777 case AArch64::STRWui: 778 case AArch64::LDRSui: 779 case AArch64::LDRDui: 780 case AArch64::LDRQui: 781 case AArch64::LDRXui: 782 case AArch64::LDRWui: 783 // do the unscaled versions as well 784 case AArch64::STURSi: 785 case AArch64::STURDi: 786 case AArch64::STURQi: 787 case AArch64::STURWi: 788 case AArch64::STURXi: 789 case AArch64::LDURSi: 790 case AArch64::LDURDi: 791 case AArch64::LDURQi: 792 case AArch64::LDURWi: 793 case AArch64::LDURXi: { 794 // If this is a volatile load/store, don't mess with it. 795 if (MI->hasOrderedMemoryRef()) { 796 ++MBBI; 797 break; 798 } 799 // Make sure this is a reg+imm (as opposed to an address reloc). 800 if (!MI->getOperand(2).isImm()) { 801 ++MBBI; 802 break; 803 } 804 // Check if this load/store has a hint to avoid pair formation. 805 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 806 if (TII->isLdStPairSuppressed(MI)) { 807 ++MBBI; 808 break; 809 } 810 // Look ahead up to ScanLimit instructions for a pairable instruction. 811 bool MergeForward = false; 812 MachineBasicBlock::iterator Paired = 813 findMatchingInsn(MBBI, MergeForward, ScanLimit); 814 if (Paired != E) { 815 // Merge the loads into a pair. Keeping the iterator straight is a 816 // pain, so we let the merge routine tell us what the next instruction 817 // is after it's done mucking about. 818 MBBI = mergePairedInsns(MBBI, Paired, MergeForward); 819 820 Modified = true; 821 ++NumPairCreated; 822 if (isUnscaledLdst(MI->getOpcode())) 823 ++NumUnscaledPairCreated; 824 break; 825 } 826 ++MBBI; 827 break; 828 } 829 // FIXME: Do the other instructions. 830 } 831 } 832 833 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); 834 MBBI != E;) { 835 MachineInstr *MI = MBBI; 836 // Do update merging. It's simpler to keep this separate from the above 837 // switch, though not strictly necessary. 
838 int Opc = MI->getOpcode(); 839 switch (Opc) { 840 default: 841 // Just move on to the next instruction. 842 ++MBBI; 843 break; 844 case AArch64::STRSui: 845 case AArch64::STRDui: 846 case AArch64::STRQui: 847 case AArch64::STRXui: 848 case AArch64::STRWui: 849 case AArch64::LDRSui: 850 case AArch64::LDRDui: 851 case AArch64::LDRQui: 852 case AArch64::LDRXui: 853 case AArch64::LDRWui: 854 // do the unscaled versions as well 855 case AArch64::STURSi: 856 case AArch64::STURDi: 857 case AArch64::STURQi: 858 case AArch64::STURWi: 859 case AArch64::STURXi: 860 case AArch64::LDURSi: 861 case AArch64::LDURDi: 862 case AArch64::LDURQi: 863 case AArch64::LDURWi: 864 case AArch64::LDURXi: { 865 // Make sure this is a reg+imm (as opposed to an address reloc). 866 if (!MI->getOperand(2).isImm()) { 867 ++MBBI; 868 break; 869 } 870 // Look ahead up to ScanLimit instructions for a mergable instruction. 871 MachineBasicBlock::iterator Update = 872 findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); 873 if (Update != E) { 874 // Merge the update into the ld/st. 875 MBBI = mergePostIdxUpdateInsn(MBBI, Update); 876 Modified = true; 877 ++NumPostFolded; 878 break; 879 } 880 // Don't know how to handle pre/post-index versions, so move to the next 881 // instruction. 882 if (isUnscaledLdst(Opc)) { 883 ++MBBI; 884 break; 885 } 886 887 // Look back to try to find a pre-index instruction. For example, 888 // add x0, x0, #8 889 // ldr x1, [x0] 890 // merged into: 891 // ldr x1, [x0, #8]! 892 Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); 893 if (Update != E) { 894 // Merge the update into the ld/st. 895 MBBI = mergePreIdxUpdateInsn(MBBI, Update); 896 Modified = true; 897 ++NumPreFolded; 898 break; 899 } 900 901 // Look forward to try to find a post-index instruction. For example, 902 // ldr x1, [x0, #64] 903 // add x0, x0, #64 904 // merged into: 905 // ldr x1, [x0, #64]! 906 907 // The immediate in the load/store is scaled by the size of the register 908 // being loaded. 
The immediate in the add we're looking for, 909 // however, is not, so adjust here. 910 int Value = MI->getOperand(2).getImm() * 911 TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) 912 ->getSize(); 913 Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); 914 if (Update != E) { 915 // Merge the update into the ld/st. 916 MBBI = mergePreIdxUpdateInsn(MBBI, Update); 917 Modified = true; 918 ++NumPreFolded; 919 break; 920 } 921 922 // Nothing found. Just move to the next instruction. 923 ++MBBI; 924 break; 925 } 926 // FIXME: Do the other instructions. 927 } 928 } 929 930 return Modified; 931} 932 933bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { 934 const TargetMachine &TM = Fn.getTarget(); 935 TII = static_cast<const AArch64InstrInfo *>( 936 TM.getSubtargetImpl()->getInstrInfo()); 937 TRI = TM.getSubtargetImpl()->getRegisterInfo(); 938 939 bool Modified = false; 940 for (auto &MBB : Fn) 941 Modified |= optimizeBlock(MBB); 942 943 return Modified; 944} 945 946// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep 947// loads and stores near one another? 948 949/// createARMLoadStoreOptimizationPass - returns an instance of the load / store 950/// optimization pass. 951FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { 952 return new AArch64LoadStoreOpt(); 953} 954