AArch64SIMDInstrOpt.cpp revision 341825
1// 2// The LLVM Compiler Infrastructure 3// 4// This file is distributed under the University of Illinois Open Source 5// License. See LICENSE.TXT for details. 6// 7//===----------------------------------------------------------------------===// 8// 9// This file contains a pass that performs optimization on SIMD instructions 10// with high latency by splitting them into more efficient series of 11// instructions. 12// 13// 1. Rewrite certain SIMD instructions with vector element due to their 14// inefficiency on some targets. 15// 16// For example: 17// fmla v0.4s, v1.4s, v2.s[1] 18// 19// Is rewritten into: 20// dup v3.4s, v2.s[1] 21// fmla v0.4s, v1.4s, v3.4s 22// 23// 2. Rewrite interleaved memory access instructions due to their 24// inefficiency on some targets. 25// 26// For example: 27// st2 {v0.4s, v1.4s}, addr 28// 29// Is rewritten into: 30// zip1 v2.4s, v0.4s, v1.4s 31// zip2 v3.4s, v0.4s, v1.4s 32// stp q2, q3, addr 33// 34//===----------------------------------------------------------------------===// 35 36#include "AArch64InstrInfo.h" 37#include "llvm/ADT/SmallVector.h" 38#include "llvm/ADT/Statistic.h" 39#include "llvm/ADT/StringRef.h" 40#include "llvm/CodeGen/MachineBasicBlock.h" 41#include "llvm/CodeGen/MachineFunction.h" 42#include "llvm/CodeGen/MachineFunctionPass.h" 43#include "llvm/CodeGen/MachineInstr.h" 44#include "llvm/CodeGen/MachineInstrBuilder.h" 45#include "llvm/CodeGen/MachineOperand.h" 46#include "llvm/CodeGen/MachineRegisterInfo.h" 47#include "llvm/CodeGen/TargetInstrInfo.h" 48#include "llvm/CodeGen/TargetSchedule.h" 49#include "llvm/CodeGen/TargetSubtargetInfo.h" 50#include "llvm/MC/MCInstrDesc.h" 51#include "llvm/MC/MCSchedule.h" 52#include "llvm/Pass.h" 53#include <unordered_map> 54 55using namespace llvm; 56 57#define DEBUG_TYPE "aarch64-simdinstr-opt" 58 59STATISTIC(NumModifiedInstr, 60 "Number of SIMD instructions modified"); 61 62#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \ 63 "AArch64 SIMD instructions optimization pass" 64 65namespace { 66 67struct AArch64SIMDInstrOpt : public MachineFunctionPass { 68 static char ID; 69 70 const TargetInstrInfo *TII; 71 MachineRegisterInfo *MRI; 72 TargetSchedModel SchedModel; 73 74 // The two maps below are used to cache decisions instead of recomputing: 75 // This is used to cache instruction replacement decisions within function 76 // units and across function units. 77 std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable; 78 // This is used to cache the decision of whether to leave the interleaved 79 // store instructions replacement pass early or not for a particular target. 80 std::unordered_map<std::string, bool> InterlEarlyExit; 81 82 typedef enum { 83 VectorElem, 84 Interleave 85 } Subpass; 86 87 // Instruction represented by OrigOpc is replaced by instructions in ReplOpc. 88 struct InstReplInfo { 89 unsigned OrigOpc; 90 std::vector<unsigned> ReplOpc; 91 const TargetRegisterClass RC; 92 }; 93 94#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \ 95 {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC} 96#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \ 97 OpcR7, OpcR8, OpcR9, RC) \ 98 {OpcOrg, \ 99 {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC} 100 101 // The Instruction Replacement Table: 102 std::vector<InstReplInfo> IRT = { 103 // ST2 instructions 104 RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 105 AArch64::STPQi, AArch64::FPR128RegClass), 106 RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 107 AArch64::STPQi, AArch64::FPR128RegClass), 108 RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 109 AArch64::STPDi, AArch64::FPR64RegClass), 110 RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 111 AArch64::STPQi, AArch64::FPR128RegClass), 112 RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 113 AArch64::STPDi, AArch64::FPR64RegClass), 114 RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 115 AArch64::STPQi, AArch64::FPR128RegClass), 116 RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 117 AArch64::STPDi, AArch64::FPR64RegClass), 118 // ST4 instructions 119 RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 120 AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, 121 AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 122 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 123 RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 124 AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, 125 AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 126 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 127 RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 128 AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, 129 AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 130 AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), 131 RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 132 AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, 133 AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 134 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 135 RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 136 AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, 137 AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 138 AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), 139 RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 140 AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, 141 AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 142 AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 143 RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 144 AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, 145 AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 146 AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass) 147 }; 148 149 // A costly instruction is replaced in this work by N efficient instructions 150 // The maximum of N is curently 10 and it is for ST4 case. 151 static const unsigned MaxNumRepl = 10; 152 153 AArch64SIMDInstrOpt() : MachineFunctionPass(ID) { 154 initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry()); 155 } 156 157 /// Based only on latency of instructions, determine if it is cost efficient 158 /// to replace the instruction InstDesc by the instructions stored in the 159 /// array InstDescRepl. 160 /// Return true if replacement is expected to be faster. 161 bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, 162 SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID); 163 164 /// Determine if we need to exit the instruction replacement optimization 165 /// passes early. This makes sure that no compile time is spent in this pass 166 /// for targets with no need for any of these optimizations. 167 /// Return true if early exit of the pass is recommended. 168 bool shouldExitEarly(MachineFunction *MF, Subpass SP); 169 170 /// Check whether an equivalent DUP instruction has already been 171 /// created or not. 172 /// Return true when the DUP instruction already exists. In this case, 173 /// DestReg will point to the destination of the already created DUP. 174 bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg, 175 unsigned LaneNumber, unsigned *DestReg) const; 176 177 /// Certain SIMD instructions with vector element operand are not efficient. 178 /// Rewrite them into SIMD instructions with vector operands. This rewrite 179 /// is driven by the latency of the instructions. 180 /// Return true if the SIMD instruction is modified. 181 bool optimizeVectElement(MachineInstr &MI); 182 183 /// Process The REG_SEQUENCE instruction, and extract the source 184 /// operands of the ST2/4 instruction from it. 185 /// Example of such instructions. 186 /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; 187 /// Return true when the instruction is processed successfully. 188 bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg, 189 unsigned* StRegKill, unsigned NumArg) const; 190 191 /// Load/Store Interleaving instructions are not always beneficial. 192 /// Replace them by ZIP instructionand classical load/store. 193 /// Return true if the SIMD instruction is modified. 194 bool optimizeLdStInterleave(MachineInstr &MI); 195 196 /// Return the number of useful source registers for this 197 /// instruction (2 for ST2 and 4 for ST4). 198 unsigned determineSrcReg(MachineInstr &MI) const; 199 200 bool runOnMachineFunction(MachineFunction &Fn) override; 201 202 StringRef getPassName() const override { 203 return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; 204 } 205}; 206 207char AArch64SIMDInstrOpt::ID = 0; 208 209} // end anonymous namespace 210 211INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt", 212 AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) 213 214/// Based only on latency of instructions, determine if it is cost efficient 215/// to replace the instruction InstDesc by the instructions stored in the 216/// array InstDescRepl. 217/// Return true if replacement is expected to be faster. 218bool AArch64SIMDInstrOpt:: 219shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, 220 SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) { 221 // Check if replacement decision is already available in the cached table. 222 // if so, return it. 223 std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); 224 auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget); 225 if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end()) 226 return SIMDInstrTable[InstID]; 227 228 unsigned SCIdx = InstDesc->getSchedClass(); 229 const MCSchedClassDesc *SCDesc = 230 SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); 231 232 // If a target does not define resources for the instructions 233 // of interest, then return false for no replacement. 234 const MCSchedClassDesc *SCDescRepl; 235 if (!SCDesc->isValid() || SCDesc->isVariant()) 236 { 237 SIMDInstrTable[InstID] = false; 238 return false; 239 } 240 for (auto IDesc : InstDescRepl) 241 { 242 SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc( 243 IDesc->getSchedClass()); 244 if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) 245 { 246 SIMDInstrTable[InstID] = false; 247 return false; 248 } 249 } 250 251 // Replacement cost. 252 unsigned ReplCost = 0; 253 for (auto IDesc :InstDescRepl) 254 ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode()); 255 256 if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) 257 { 258 SIMDInstrTable[InstID] = true; 259 return true; 260 } 261 else 262 { 263 SIMDInstrTable[InstID] = false; 264 return false; 265 } 266} 267 268/// Determine if we need to exit this pass for a kind of instruction replacement 269/// early. This makes sure that no compile time is spent in this pass for 270/// targets with no need for any of these optimizations beyond performing this 271/// check. 272/// Return true if early exit of this pass for a kind of instruction 273/// replacement is recommended for a target. 274bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) { 275 const MCInstrDesc* OriginalMCID; 276 SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; 277 278 switch (SP) { 279 // For this optimization, check by comparing the latency of a representative 280 // instruction to that of the replacement instructions. 281 // TODO: check for all concerned instructions. 282 case VectorElem: 283 OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed); 284 ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane)); 285 ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32)); 286 if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) 287 return false; 288 break; 289 290 // For this optimization, check for all concerned instructions. 291 case Interleave: 292 std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); 293 if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end()) 294 return InterlEarlyExit[Subtarget]; 295 296 for (auto &I : IRT) { 297 OriginalMCID = &TII->get(I.OrigOpc); 298 for (auto &Repl : I.ReplOpc) 299 ReplInstrMCID.push_back(&TII->get(Repl)); 300 if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) { 301 InterlEarlyExit[Subtarget] = false; 302 return false; 303 } 304 ReplInstrMCID.clear(); 305 } 306 InterlEarlyExit[Subtarget] = true; 307 break; 308 } 309 310 return true; 311} 312 313/// Check whether an equivalent DUP instruction has already been 314/// created or not. 315/// Return true when the DUP instruction already exists. In this case, 316/// DestReg will point to the destination of the already created DUP. 317bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode, 318 unsigned SrcReg, unsigned LaneNumber, 319 unsigned *DestReg) const { 320 for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin(); 321 MII != MIE;) { 322 MII--; 323 MachineInstr *CurrentMI = &*MII; 324 325 if (CurrentMI->getOpcode() == DupOpcode && 326 CurrentMI->getNumOperands() == 3 && 327 CurrentMI->getOperand(1).getReg() == SrcReg && 328 CurrentMI->getOperand(2).getImm() == LaneNumber) { 329 *DestReg = CurrentMI->getOperand(0).getReg(); 330 return true; 331 } 332 } 333 334 return false; 335} 336 337/// Certain SIMD instructions with vector element operand are not efficient. 338/// Rewrite them into SIMD instructions with vector operands. This rewrite 339/// is driven by the latency of the instructions. 340/// The instruction of concerns are for the time being FMLA, FMLS, FMUL, 341/// and FMULX and hence they are hardcoded. 342/// 343/// For example: 344/// fmla v0.4s, v1.4s, v2.s[1] 345/// 346/// Is rewritten into 347/// dup v3.4s, v2.s[1] // DUP not necessary if redundant 348/// fmla v0.4s, v1.4s, v3.4s 349/// 350/// Return true if the SIMD instruction is modified. 351bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) { 352 const MCInstrDesc *MulMCID, *DupMCID; 353 const TargetRegisterClass *RC = &AArch64::FPR128RegClass; 354 355 switch (MI.getOpcode()) { 356 default: 357 return false; 358 359 // 4X32 instructions 360 case AArch64::FMLAv4i32_indexed: 361 DupMCID = &TII->get(AArch64::DUPv4i32lane); 362 MulMCID = &TII->get(AArch64::FMLAv4f32); 363 break; 364 case AArch64::FMLSv4i32_indexed: 365 DupMCID = &TII->get(AArch64::DUPv4i32lane); 366 MulMCID = &TII->get(AArch64::FMLSv4f32); 367 break; 368 case AArch64::FMULXv4i32_indexed: 369 DupMCID = &TII->get(AArch64::DUPv4i32lane); 370 MulMCID = &TII->get(AArch64::FMULXv4f32); 371 break; 372 case AArch64::FMULv4i32_indexed: 373 DupMCID = &TII->get(AArch64::DUPv4i32lane); 374 MulMCID = &TII->get(AArch64::FMULv4f32); 375 break; 376 377 // 2X64 instructions 378 case AArch64::FMLAv2i64_indexed: 379 DupMCID = &TII->get(AArch64::DUPv2i64lane); 380 MulMCID = &TII->get(AArch64::FMLAv2f64); 381 break; 382 case AArch64::FMLSv2i64_indexed: 383 DupMCID = &TII->get(AArch64::DUPv2i64lane); 384 MulMCID = &TII->get(AArch64::FMLSv2f64); 385 break; 386 case AArch64::FMULXv2i64_indexed: 387 DupMCID = &TII->get(AArch64::DUPv2i64lane); 388 MulMCID = &TII->get(AArch64::FMULXv2f64); 389 break; 390 case AArch64::FMULv2i64_indexed: 391 DupMCID = &TII->get(AArch64::DUPv2i64lane); 392 MulMCID = &TII->get(AArch64::FMULv2f64); 393 break; 394 395 // 2X32 instructions 396 case AArch64::FMLAv2i32_indexed: 397 RC = &AArch64::FPR64RegClass; 398 DupMCID = &TII->get(AArch64::DUPv2i32lane); 399 MulMCID = &TII->get(AArch64::FMLAv2f32); 400 break; 401 case AArch64::FMLSv2i32_indexed: 402 RC = &AArch64::FPR64RegClass; 403 DupMCID = &TII->get(AArch64::DUPv2i32lane); 404 MulMCID = &TII->get(AArch64::FMLSv2f32); 405 break; 406 case AArch64::FMULXv2i32_indexed: 407 RC = &AArch64::FPR64RegClass; 408 DupMCID = &TII->get(AArch64::DUPv2i32lane); 409 MulMCID = &TII->get(AArch64::FMULXv2f32); 410 break; 411 case AArch64::FMULv2i32_indexed: 412 RC = &AArch64::FPR64RegClass; 413 DupMCID = &TII->get(AArch64::DUPv2i32lane); 414 MulMCID = &TII->get(AArch64::FMULv2f32); 415 break; 416 } 417 418 SmallVector<const MCInstrDesc*, 2> ReplInstrMCID; 419 ReplInstrMCID.push_back(DupMCID); 420 ReplInstrMCID.push_back(MulMCID); 421 if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), 422 ReplInstrMCID)) 423 return false; 424 425 const DebugLoc &DL = MI.getDebugLoc(); 426 MachineBasicBlock &MBB = *MI.getParent(); 427 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 428 429 // Get the operands of the current SIMD arithmetic instruction. 430 unsigned MulDest = MI.getOperand(0).getReg(); 431 unsigned SrcReg0 = MI.getOperand(1).getReg(); 432 unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); 433 unsigned SrcReg1 = MI.getOperand(2).getReg(); 434 unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); 435 unsigned DupDest; 436 437 // Instructions of interest have either 4 or 5 operands. 438 if (MI.getNumOperands() == 5) { 439 unsigned SrcReg2 = MI.getOperand(3).getReg(); 440 unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); 441 unsigned LaneNumber = MI.getOperand(4).getImm(); 442 // Create a new DUP instruction. Note that if an equivalent DUP instruction 443 // has already been created before, then use that one instead of creating 444 // a new one. 445 if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) { 446 DupDest = MRI.createVirtualRegister(RC); 447 BuildMI(MBB, MI, DL, *DupMCID, DupDest) 448 .addReg(SrcReg2, Src2IsKill) 449 .addImm(LaneNumber); 450 } 451 BuildMI(MBB, MI, DL, *MulMCID, MulDest) 452 .addReg(SrcReg0, Src0IsKill) 453 .addReg(SrcReg1, Src1IsKill) 454 .addReg(DupDest, Src2IsKill); 455 } else if (MI.getNumOperands() == 4) { 456 unsigned LaneNumber = MI.getOperand(3).getImm(); 457 if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) { 458 DupDest = MRI.createVirtualRegister(RC); 459 BuildMI(MBB, MI, DL, *DupMCID, DupDest) 460 .addReg(SrcReg1, Src1IsKill) 461 .addImm(LaneNumber); 462 } 463 BuildMI(MBB, MI, DL, *MulMCID, MulDest) 464 .addReg(SrcReg0, Src0IsKill) 465 .addReg(DupDest, Src1IsKill); 466 } else { 467 return false; 468 } 469 470 ++NumModifiedInstr; 471 return true; 472} 473 474/// Load/Store Interleaving instructions are not always beneficial. 475/// Replace them by ZIP instructions and classical load/store. 476/// 477/// For example: 478/// st2 {v0.4s, v1.4s}, addr 479/// 480/// Is rewritten into: 481/// zip1 v2.4s, v0.4s, v1.4s 482/// zip2 v3.4s, v0.4s, v1.4s 483/// stp q2, q3, addr 484// 485/// For example: 486/// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr 487/// 488/// Is rewritten into: 489/// zip1 v4.4s, v0.4s, v2.4s 490/// zip2 v5.4s, v0.4s, v2.4s 491/// zip1 v6.4s, v1.4s, v3.4s 492/// zip2 v7.4s, v1.4s, v3.4s 493/// zip1 v8.4s, v4.4s, v6.4s 494/// zip2 v9.4s, v4.4s, v6.4s 495/// zip1 v10.4s, v5.4s, v7.4s 496/// zip2 v11.4s, v5.4s, v7.4s 497/// stp q8, q9, addr 498/// stp q10, q11, addr+32 499/// 500/// Currently only instructions related to ST2 and ST4 are considered. 501/// Other may be added later. 502/// Return true if the SIMD instruction is modified. 503bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) { 504 505 unsigned SeqReg, AddrReg; 506 unsigned StReg[4], StRegKill[4]; 507 MachineInstr *DefiningMI; 508 const DebugLoc &DL = MI.getDebugLoc(); 509 MachineBasicBlock &MBB = *MI.getParent(); 510 SmallVector<unsigned, MaxNumRepl> ZipDest; 511 SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; 512 513 // If current instruction matches any of the rewriting rules, then 514 // gather information about parameters of the new instructions. 515 bool Match = false; 516 for (auto &I : IRT) { 517 if (MI.getOpcode() == I.OrigOpc) { 518 SeqReg = MI.getOperand(0).getReg(); 519 AddrReg = MI.getOperand(1).getReg(); 520 DefiningMI = MRI->getUniqueVRegDef(SeqReg); 521 unsigned NumReg = determineSrcReg(MI); 522 if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg)) 523 return false; 524 525 for (auto &Repl : I.ReplOpc) { 526 ReplInstrMCID.push_back(&TII->get(Repl)); 527 // Generate destination registers but only for non-store instruction. 528 if (Repl != AArch64::STPQi && Repl != AArch64::STPDi) 529 ZipDest.push_back(MRI->createVirtualRegister(&I.RC)); 530 } 531 Match = true; 532 break; 533 } 534 } 535 536 if (!Match) 537 return false; 538 539 // Determine if it is profitable to replace MI by the series of instructions 540 // represented in ReplInstrMCID. 541 if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), 542 ReplInstrMCID)) 543 return false; 544 545 // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at 546 // this point, the code generation is hardcoded and does not rely on the IRT 547 // table used above given that code generation for ST2 replacement is somewhat 548 // different than for ST4 replacement. We could have added more info into the 549 // table related to how we build new instructions but we may be adding more 550 // complexity with that). 551 switch (MI.getOpcode()) { 552 default: 553 return false; 554 555 case AArch64::ST2Twov16b: 556 case AArch64::ST2Twov8b: 557 case AArch64::ST2Twov8h: 558 case AArch64::ST2Twov4h: 559 case AArch64::ST2Twov4s: 560 case AArch64::ST2Twov2s: 561 case AArch64::ST2Twov2d: 562 // ZIP instructions 563 BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) 564 .addReg(StReg[0]) 565 .addReg(StReg[1]); 566 BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) 567 .addReg(StReg[0], StRegKill[0]) 568 .addReg(StReg[1], StRegKill[1]); 569 // STP instructions 570 BuildMI(MBB, MI, DL, *ReplInstrMCID[2]) 571 .addReg(ZipDest[0]) 572 .addReg(ZipDest[1]) 573 .addReg(AddrReg) 574 .addImm(0); 575 break; 576 577 case AArch64::ST4Fourv16b: 578 case AArch64::ST4Fourv8b: 579 case AArch64::ST4Fourv8h: 580 case AArch64::ST4Fourv4h: 581 case AArch64::ST4Fourv4s: 582 case AArch64::ST4Fourv2s: 583 case AArch64::ST4Fourv2d: 584 // ZIP instructions 585 BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) 586 .addReg(StReg[0]) 587 .addReg(StReg[2]); 588 BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) 589 .addReg(StReg[0], StRegKill[0]) 590 .addReg(StReg[2], StRegKill[2]); 591 BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2]) 592 .addReg(StReg[1]) 593 .addReg(StReg[3]); 594 BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3]) 595 .addReg(StReg[1], StRegKill[1]) 596 .addReg(StReg[3], StRegKill[3]); 597 BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4]) 598 .addReg(ZipDest[0]) 599 .addReg(ZipDest[2]); 600 BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5]) 601 .addReg(ZipDest[0]) 602 .addReg(ZipDest[2]); 603 BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6]) 604 .addReg(ZipDest[1]) 605 .addReg(ZipDest[3]); 606 BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7]) 607 .addReg(ZipDest[1]) 608 .addReg(ZipDest[3]); 609 // stp instructions 610 BuildMI(MBB, MI, DL, *ReplInstrMCID[8]) 611 .addReg(ZipDest[4]) 612 .addReg(ZipDest[5]) 613 .addReg(AddrReg) 614 .addImm(0); 615 BuildMI(MBB, MI, DL, *ReplInstrMCID[9]) 616 .addReg(ZipDest[6]) 617 .addReg(ZipDest[7]) 618 .addReg(AddrReg) 619 .addImm(2); 620 break; 621 } 622 623 ++NumModifiedInstr; 624 return true; 625} 626 627/// Process The REG_SEQUENCE instruction, and extract the source 628/// operands of the ST2/4 instruction from it. 629/// Example of such instruction. 630/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; 631/// Return true when the instruction is processed successfully. 632bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI, 633 unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const { 634 assert (DefiningMI != NULL); 635 if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE) 636 return false; 637 638 for (unsigned i=0; i<NumArg; i++) { 639 StReg[i] = DefiningMI->getOperand(2*i+1).getReg(); 640 StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill()); 641 642 // Sanity check for the other arguments. 643 if (DefiningMI->getOperand(2*i+2).isImm()) { 644 switch (DefiningMI->getOperand(2*i+2).getImm()) { 645 default: 646 return false; 647 648 case AArch64::dsub0: 649 case AArch64::dsub1: 650 case AArch64::dsub2: 651 case AArch64::dsub3: 652 case AArch64::qsub0: 653 case AArch64::qsub1: 654 case AArch64::qsub2: 655 case AArch64::qsub3: 656 break; 657 } 658 } 659 else 660 return false; 661 } 662 return true; 663} 664 665/// Return the number of useful source registers for this instruction 666/// (2 for ST2 and 4 for ST4). 667unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const { 668 switch (MI.getOpcode()) { 669 default: 670 llvm_unreachable("Unsupported instruction for this pass"); 671 672 case AArch64::ST2Twov16b: 673 case AArch64::ST2Twov8b: 674 case AArch64::ST2Twov8h: 675 case AArch64::ST2Twov4h: 676 case AArch64::ST2Twov4s: 677 case AArch64::ST2Twov2s: 678 case AArch64::ST2Twov2d: 679 return 2; 680 681 case AArch64::ST4Fourv16b: 682 case AArch64::ST4Fourv8b: 683 case AArch64::ST4Fourv8h: 684 case AArch64::ST4Fourv4h: 685 case AArch64::ST4Fourv4s: 686 case AArch64::ST4Fourv2s: 687 case AArch64::ST4Fourv2d: 688 return 4; 689 } 690} 691 692bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) { 693 if (skipFunction(MF.getFunction())) 694 return false; 695 696 TII = MF.getSubtarget().getInstrInfo(); 697 MRI = &MF.getRegInfo(); 698 const TargetSubtargetInfo &ST = MF.getSubtarget(); 699 const AArch64InstrInfo *AAII = 700 static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); 701 if (!AAII) 702 return false; 703 SchedModel.init(&ST); 704 if (!SchedModel.hasInstrSchedModel()) 705 return false; 706 707 bool Changed = false; 708 for (auto OptimizationKind : {VectorElem, Interleave}) { 709 if (!shouldExitEarly(&MF, OptimizationKind)) { 710 SmallVector<MachineInstr *, 8> RemoveMIs; 711 for (MachineBasicBlock &MBB : MF) { 712 for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); 713 MII != MIE;) { 714 MachineInstr &MI = *MII; 715 bool InstRewrite; 716 if (OptimizationKind == VectorElem) 717 InstRewrite = optimizeVectElement(MI) ; 718 else 719 InstRewrite = optimizeLdStInterleave(MI); 720 if (InstRewrite) { 721 // Add MI to the list of instructions to be removed given that it 722 // has been replaced. 723 RemoveMIs.push_back(&MI); 724 Changed = true; 725 } 726 ++MII; 727 } 728 } 729 for (MachineInstr *MI : RemoveMIs) 730 MI->eraseFromParent(); 731 } 732 } 733 734 return Changed; 735} 736 737/// Returns an instance of the high cost ASIMD instruction replacement 738/// optimization pass. 739FunctionPass *llvm::createAArch64SIMDInstrOptPass() { 740 return new AArch64SIMDInstrOpt(); 741} 742