//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with vector element due to their
// inefficiency on some targets.
//
// For example:
//   fmla v0.4s, v1.4s, v2.s[1]
//
// Is rewritten into:
//   dup v3.4s, v2.s[1]
//   fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
// inefficiency on some targets.
24326938Sdim// 25326938Sdim// For example: 26326938Sdim// st2 {v0.4s, v1.4s}, addr 27326938Sdim// 28326938Sdim// Is rewritten into: 29326938Sdim// zip1 v2.4s, v0.4s, v1.4s 30326938Sdim// zip2 v3.4s, v0.4s, v1.4s 31326938Sdim// stp q2, q3, addr 32326938Sdim// 33326938Sdim//===----------------------------------------------------------------------===// 34326938Sdim 35326938Sdim#include "AArch64InstrInfo.h" 36326938Sdim#include "llvm/ADT/SmallVector.h" 37326938Sdim#include "llvm/ADT/Statistic.h" 38326938Sdim#include "llvm/ADT/StringRef.h" 39326938Sdim#include "llvm/CodeGen/MachineBasicBlock.h" 40326938Sdim#include "llvm/CodeGen/MachineFunction.h" 41326938Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 42326938Sdim#include "llvm/CodeGen/MachineInstr.h" 43326938Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 44326938Sdim#include "llvm/CodeGen/MachineOperand.h" 45326938Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 46326938Sdim#include "llvm/CodeGen/TargetInstrInfo.h" 47326938Sdim#include "llvm/CodeGen/TargetSchedule.h" 48326938Sdim#include "llvm/CodeGen/TargetSubtargetInfo.h" 49326938Sdim#include "llvm/MC/MCInstrDesc.h" 50326938Sdim#include "llvm/MC/MCSchedule.h" 51326938Sdim#include "llvm/Pass.h" 52326938Sdim#include <unordered_map> 53326938Sdim 54326938Sdimusing namespace llvm; 55326938Sdim 56326938Sdim#define DEBUG_TYPE "aarch64-simdinstr-opt" 57326938Sdim 58326938SdimSTATISTIC(NumModifiedInstr, 59326938Sdim "Number of SIMD instructions modified"); 60326938Sdim 61326938Sdim#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \ 62326938Sdim "AArch64 SIMD instructions optimization pass" 63326938Sdim 64326938Sdimnamespace { 65326938Sdim 66326938Sdimstruct AArch64SIMDInstrOpt : public MachineFunctionPass { 67326938Sdim static char ID; 68326938Sdim 69326938Sdim const TargetInstrInfo *TII; 70326938Sdim MachineRegisterInfo *MRI; 71326938Sdim TargetSchedModel SchedModel; 72326938Sdim 73326938Sdim // The two maps below are used to cache decisions instead of recomputing: 
74326938Sdim // This is used to cache instruction replacement decisions within function 75326938Sdim // units and across function units. 76326938Sdim std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable; 77326938Sdim // This is used to cache the decision of whether to leave the interleaved 78326938Sdim // store instructions replacement pass early or not for a particular target. 79326938Sdim std::unordered_map<std::string, bool> InterlEarlyExit; 80326938Sdim 81326938Sdim typedef enum { 82326938Sdim VectorElem, 83326938Sdim Interleave 84326938Sdim } Subpass; 85326938Sdim 86326938Sdim // Instruction represented by OrigOpc is replaced by instructions in ReplOpc. 87326938Sdim struct InstReplInfo { 88326938Sdim unsigned OrigOpc; 89326938Sdim std::vector<unsigned> ReplOpc; 90326938Sdim const TargetRegisterClass RC; 91326938Sdim }; 92326938Sdim 93326938Sdim#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \ 94326938Sdim {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC} 95326938Sdim#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \ 96326938Sdim OpcR7, OpcR8, OpcR9, RC) \ 97326938Sdim {OpcOrg, \ 98326938Sdim {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC} 99326938Sdim 100326938Sdim // The Instruction Replacement Table: 101326938Sdim std::vector<InstReplInfo> IRT = { 102326938Sdim // ST2 instructions 103326938Sdim RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 104326938Sdim AArch64::STPQi, AArch64::FPR128RegClass), 105326938Sdim RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 106326938Sdim AArch64::STPQi, AArch64::FPR128RegClass), 107326938Sdim RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 108326938Sdim AArch64::STPDi, AArch64::FPR64RegClass), 109326938Sdim RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 110326938Sdim AArch64::STPQi, AArch64::FPR128RegClass), 111326938Sdim RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 
112326938Sdim AArch64::STPDi, AArch64::FPR64RegClass), 113326938Sdim RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 114326938Sdim AArch64::STPQi, AArch64::FPR128RegClass), 115326938Sdim RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 116326938Sdim AArch64::STPDi, AArch64::FPR64RegClass), 117326938Sdim // ST4 instructions 118326938Sdim RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 119326938Sdim AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, 120326938Sdim AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, 121326938Sdim AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 122326938Sdim RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 123326938Sdim AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, 124326938Sdim AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, 125326938Sdim AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 126326938Sdim RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 127326938Sdim AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, 128326938Sdim AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, 129326938Sdim AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), 130326938Sdim RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 131326938Sdim AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, 132326938Sdim AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, 133326938Sdim AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 134326938Sdim RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 135326938Sdim AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, 136326938Sdim AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, 137326938Sdim AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), 138326938Sdim RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 139326938Sdim AArch64::ZIP1v16i8, 
AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, 140326938Sdim AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, 141326938Sdim AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), 142326938Sdim RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 143326938Sdim AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, 144326938Sdim AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, 145326938Sdim AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass) 146326938Sdim }; 147326938Sdim 148326938Sdim // A costly instruction is replaced in this work by N efficient instructions 149326938Sdim // The maximum of N is curently 10 and it is for ST4 case. 150326938Sdim static const unsigned MaxNumRepl = 10; 151326938Sdim 152326938Sdim AArch64SIMDInstrOpt() : MachineFunctionPass(ID) { 153326938Sdim initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry()); 154326938Sdim } 155326938Sdim 156326938Sdim /// Based only on latency of instructions, determine if it is cost efficient 157326938Sdim /// to replace the instruction InstDesc by the instructions stored in the 158326938Sdim /// array InstDescRepl. 159326938Sdim /// Return true if replacement is expected to be faster. 160326938Sdim bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, 161326938Sdim SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID); 162326938Sdim 163326938Sdim /// Determine if we need to exit the instruction replacement optimization 164326938Sdim /// passes early. This makes sure that no compile time is spent in this pass 165326938Sdim /// for targets with no need for any of these optimizations. 166326938Sdim /// Return true if early exit of the pass is recommended. 167326938Sdim bool shouldExitEarly(MachineFunction *MF, Subpass SP); 168326938Sdim 169326938Sdim /// Check whether an equivalent DUP instruction has already been 170326938Sdim /// created or not. 171326938Sdim /// Return true when the DUP instruction already exists. 
In this case, 172326938Sdim /// DestReg will point to the destination of the already created DUP. 173326938Sdim bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg, 174326938Sdim unsigned LaneNumber, unsigned *DestReg) const; 175326938Sdim 176326938Sdim /// Certain SIMD instructions with vector element operand are not efficient. 177326938Sdim /// Rewrite them into SIMD instructions with vector operands. This rewrite 178326938Sdim /// is driven by the latency of the instructions. 179326938Sdim /// Return true if the SIMD instruction is modified. 180326938Sdim bool optimizeVectElement(MachineInstr &MI); 181326938Sdim 182326938Sdim /// Process The REG_SEQUENCE instruction, and extract the source 183326938Sdim /// operands of the ST2/4 instruction from it. 184326938Sdim /// Example of such instructions. 185326938Sdim /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; 186326938Sdim /// Return true when the instruction is processed successfully. 187326938Sdim bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg, 188326938Sdim unsigned* StRegKill, unsigned NumArg) const; 189326938Sdim 190326938Sdim /// Load/Store Interleaving instructions are not always beneficial. 191326938Sdim /// Replace them by ZIP instructionand classical load/store. 192326938Sdim /// Return true if the SIMD instruction is modified. 193326938Sdim bool optimizeLdStInterleave(MachineInstr &MI); 194326938Sdim 195326938Sdim /// Return the number of useful source registers for this 196326938Sdim /// instruction (2 for ST2 and 4 for ST4). 
197326938Sdim unsigned determineSrcReg(MachineInstr &MI) const; 198326938Sdim 199326938Sdim bool runOnMachineFunction(MachineFunction &Fn) override; 200326938Sdim 201326938Sdim StringRef getPassName() const override { 202326938Sdim return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; 203326938Sdim } 204326938Sdim}; 205326938Sdim 206326938Sdimchar AArch64SIMDInstrOpt::ID = 0; 207326938Sdim 208326938Sdim} // end anonymous namespace 209326938Sdim 210326938SdimINITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt", 211326938Sdim AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) 212326938Sdim 213326938Sdim/// Based only on latency of instructions, determine if it is cost efficient 214326938Sdim/// to replace the instruction InstDesc by the instructions stored in the 215326938Sdim/// array InstDescRepl. 216326938Sdim/// Return true if replacement is expected to be faster. 217326938Sdimbool AArch64SIMDInstrOpt:: 218326938SdimshouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, 219326938Sdim SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) { 220326938Sdim // Check if replacement decision is already available in the cached table. 221326938Sdim // if so, return it. 222326938Sdim std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); 223326938Sdim auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget); 224326938Sdim if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end()) 225326938Sdim return SIMDInstrTable[InstID]; 226326938Sdim 227326938Sdim unsigned SCIdx = InstDesc->getSchedClass(); 228326938Sdim const MCSchedClassDesc *SCDesc = 229326938Sdim SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); 230326938Sdim 231326938Sdim // If a target does not define resources for the instructions 232326938Sdim // of interest, then return false for no replacement. 
233326938Sdim const MCSchedClassDesc *SCDescRepl; 234326938Sdim if (!SCDesc->isValid() || SCDesc->isVariant()) 235326938Sdim { 236326938Sdim SIMDInstrTable[InstID] = false; 237326938Sdim return false; 238326938Sdim } 239326938Sdim for (auto IDesc : InstDescRepl) 240326938Sdim { 241326938Sdim SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc( 242326938Sdim IDesc->getSchedClass()); 243326938Sdim if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) 244326938Sdim { 245326938Sdim SIMDInstrTable[InstID] = false; 246326938Sdim return false; 247326938Sdim } 248326938Sdim } 249326938Sdim 250326938Sdim // Replacement cost. 251326938Sdim unsigned ReplCost = 0; 252326938Sdim for (auto IDesc :InstDescRepl) 253326938Sdim ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode()); 254326938Sdim 255326938Sdim if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) 256326938Sdim { 257326938Sdim SIMDInstrTable[InstID] = true; 258326938Sdim return true; 259326938Sdim } 260326938Sdim else 261326938Sdim { 262326938Sdim SIMDInstrTable[InstID] = false; 263326938Sdim return false; 264326938Sdim } 265326938Sdim} 266326938Sdim 267326938Sdim/// Determine if we need to exit this pass for a kind of instruction replacement 268326938Sdim/// early. This makes sure that no compile time is spent in this pass for 269326938Sdim/// targets with no need for any of these optimizations beyond performing this 270326938Sdim/// check. 271326938Sdim/// Return true if early exit of this pass for a kind of instruction 272326938Sdim/// replacement is recommended for a target. 273326938Sdimbool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) { 274326938Sdim const MCInstrDesc* OriginalMCID; 275326938Sdim SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; 276326938Sdim 277326938Sdim switch (SP) { 278326938Sdim // For this optimization, check by comparing the latency of a representative 279326938Sdim // instruction to that of the replacement instructions. 
280326938Sdim // TODO: check for all concerned instructions. 281326938Sdim case VectorElem: 282326938Sdim OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed); 283326938Sdim ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane)); 284326938Sdim ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32)); 285326938Sdim if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) 286326938Sdim return false; 287326938Sdim break; 288326938Sdim 289326938Sdim // For this optimization, check for all concerned instructions. 290326938Sdim case Interleave: 291326938Sdim std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); 292326938Sdim if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end()) 293326938Sdim return InterlEarlyExit[Subtarget]; 294326938Sdim 295326938Sdim for (auto &I : IRT) { 296326938Sdim OriginalMCID = &TII->get(I.OrigOpc); 297326938Sdim for (auto &Repl : I.ReplOpc) 298326938Sdim ReplInstrMCID.push_back(&TII->get(Repl)); 299326938Sdim if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) { 300326938Sdim InterlEarlyExit[Subtarget] = false; 301326938Sdim return false; 302326938Sdim } 303326938Sdim ReplInstrMCID.clear(); 304326938Sdim } 305326938Sdim InterlEarlyExit[Subtarget] = true; 306326938Sdim break; 307326938Sdim } 308326938Sdim 309326938Sdim return true; 310326938Sdim} 311326938Sdim 312326938Sdim/// Check whether an equivalent DUP instruction has already been 313326938Sdim/// created or not. 314326938Sdim/// Return true when the DUP instruction already exists. In this case, 315326938Sdim/// DestReg will point to the destination of the already created DUP. 
316326938Sdimbool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode, 317326938Sdim unsigned SrcReg, unsigned LaneNumber, 318326938Sdim unsigned *DestReg) const { 319326938Sdim for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin(); 320326938Sdim MII != MIE;) { 321326938Sdim MII--; 322326938Sdim MachineInstr *CurrentMI = &*MII; 323326938Sdim 324326938Sdim if (CurrentMI->getOpcode() == DupOpcode && 325326938Sdim CurrentMI->getNumOperands() == 3 && 326326938Sdim CurrentMI->getOperand(1).getReg() == SrcReg && 327326938Sdim CurrentMI->getOperand(2).getImm() == LaneNumber) { 328326938Sdim *DestReg = CurrentMI->getOperand(0).getReg(); 329326938Sdim return true; 330326938Sdim } 331326938Sdim } 332326938Sdim 333326938Sdim return false; 334326938Sdim} 335326938Sdim 336326938Sdim/// Certain SIMD instructions with vector element operand are not efficient. 337326938Sdim/// Rewrite them into SIMD instructions with vector operands. This rewrite 338326938Sdim/// is driven by the latency of the instructions. 339326938Sdim/// The instruction of concerns are for the time being FMLA, FMLS, FMUL, 340326938Sdim/// and FMULX and hence they are hardcoded. 341326938Sdim/// 342326938Sdim/// For example: 343326938Sdim/// fmla v0.4s, v1.4s, v2.s[1] 344326938Sdim/// 345326938Sdim/// Is rewritten into 346326938Sdim/// dup v3.4s, v2.s[1] // DUP not necessary if redundant 347326938Sdim/// fmla v0.4s, v1.4s, v3.4s 348326938Sdim/// 349326938Sdim/// Return true if the SIMD instruction is modified. 
350326938Sdimbool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) { 351326938Sdim const MCInstrDesc *MulMCID, *DupMCID; 352326938Sdim const TargetRegisterClass *RC = &AArch64::FPR128RegClass; 353326938Sdim 354326938Sdim switch (MI.getOpcode()) { 355326938Sdim default: 356326938Sdim return false; 357326938Sdim 358326938Sdim // 4X32 instructions 359326938Sdim case AArch64::FMLAv4i32_indexed: 360326938Sdim DupMCID = &TII->get(AArch64::DUPv4i32lane); 361326938Sdim MulMCID = &TII->get(AArch64::FMLAv4f32); 362326938Sdim break; 363326938Sdim case AArch64::FMLSv4i32_indexed: 364326938Sdim DupMCID = &TII->get(AArch64::DUPv4i32lane); 365326938Sdim MulMCID = &TII->get(AArch64::FMLSv4f32); 366326938Sdim break; 367326938Sdim case AArch64::FMULXv4i32_indexed: 368326938Sdim DupMCID = &TII->get(AArch64::DUPv4i32lane); 369326938Sdim MulMCID = &TII->get(AArch64::FMULXv4f32); 370326938Sdim break; 371326938Sdim case AArch64::FMULv4i32_indexed: 372326938Sdim DupMCID = &TII->get(AArch64::DUPv4i32lane); 373326938Sdim MulMCID = &TII->get(AArch64::FMULv4f32); 374326938Sdim break; 375326938Sdim 376326938Sdim // 2X64 instructions 377326938Sdim case AArch64::FMLAv2i64_indexed: 378326938Sdim DupMCID = &TII->get(AArch64::DUPv2i64lane); 379326938Sdim MulMCID = &TII->get(AArch64::FMLAv2f64); 380326938Sdim break; 381326938Sdim case AArch64::FMLSv2i64_indexed: 382326938Sdim DupMCID = &TII->get(AArch64::DUPv2i64lane); 383326938Sdim MulMCID = &TII->get(AArch64::FMLSv2f64); 384326938Sdim break; 385326938Sdim case AArch64::FMULXv2i64_indexed: 386326938Sdim DupMCID = &TII->get(AArch64::DUPv2i64lane); 387326938Sdim MulMCID = &TII->get(AArch64::FMULXv2f64); 388326938Sdim break; 389326938Sdim case AArch64::FMULv2i64_indexed: 390326938Sdim DupMCID = &TII->get(AArch64::DUPv2i64lane); 391326938Sdim MulMCID = &TII->get(AArch64::FMULv2f64); 392326938Sdim break; 393326938Sdim 394326938Sdim // 2X32 instructions 395326938Sdim case AArch64::FMLAv2i32_indexed: 396326938Sdim RC = &AArch64::FPR64RegClass; 
397326938Sdim DupMCID = &TII->get(AArch64::DUPv2i32lane); 398326938Sdim MulMCID = &TII->get(AArch64::FMLAv2f32); 399326938Sdim break; 400326938Sdim case AArch64::FMLSv2i32_indexed: 401326938Sdim RC = &AArch64::FPR64RegClass; 402326938Sdim DupMCID = &TII->get(AArch64::DUPv2i32lane); 403326938Sdim MulMCID = &TII->get(AArch64::FMLSv2f32); 404326938Sdim break; 405326938Sdim case AArch64::FMULXv2i32_indexed: 406326938Sdim RC = &AArch64::FPR64RegClass; 407326938Sdim DupMCID = &TII->get(AArch64::DUPv2i32lane); 408326938Sdim MulMCID = &TII->get(AArch64::FMULXv2f32); 409326938Sdim break; 410326938Sdim case AArch64::FMULv2i32_indexed: 411326938Sdim RC = &AArch64::FPR64RegClass; 412326938Sdim DupMCID = &TII->get(AArch64::DUPv2i32lane); 413326938Sdim MulMCID = &TII->get(AArch64::FMULv2f32); 414326938Sdim break; 415326938Sdim } 416326938Sdim 417326938Sdim SmallVector<const MCInstrDesc*, 2> ReplInstrMCID; 418326938Sdim ReplInstrMCID.push_back(DupMCID); 419326938Sdim ReplInstrMCID.push_back(MulMCID); 420326938Sdim if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), 421326938Sdim ReplInstrMCID)) 422326938Sdim return false; 423326938Sdim 424326938Sdim const DebugLoc &DL = MI.getDebugLoc(); 425326938Sdim MachineBasicBlock &MBB = *MI.getParent(); 426326938Sdim MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 427326938Sdim 428326938Sdim // Get the operands of the current SIMD arithmetic instruction. 429360784Sdim Register MulDest = MI.getOperand(0).getReg(); 430360784Sdim Register SrcReg0 = MI.getOperand(1).getReg(); 431326938Sdim unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); 432360784Sdim Register SrcReg1 = MI.getOperand(2).getReg(); 433326938Sdim unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); 434326938Sdim unsigned DupDest; 435326938Sdim 436326938Sdim // Instructions of interest have either 4 or 5 operands. 
437326938Sdim if (MI.getNumOperands() == 5) { 438360784Sdim Register SrcReg2 = MI.getOperand(3).getReg(); 439326938Sdim unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); 440326938Sdim unsigned LaneNumber = MI.getOperand(4).getImm(); 441326938Sdim // Create a new DUP instruction. Note that if an equivalent DUP instruction 442326938Sdim // has already been created before, then use that one instead of creating 443326938Sdim // a new one. 444326938Sdim if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) { 445326938Sdim DupDest = MRI.createVirtualRegister(RC); 446326938Sdim BuildMI(MBB, MI, DL, *DupMCID, DupDest) 447326938Sdim .addReg(SrcReg2, Src2IsKill) 448326938Sdim .addImm(LaneNumber); 449326938Sdim } 450326938Sdim BuildMI(MBB, MI, DL, *MulMCID, MulDest) 451326938Sdim .addReg(SrcReg0, Src0IsKill) 452326938Sdim .addReg(SrcReg1, Src1IsKill) 453326938Sdim .addReg(DupDest, Src2IsKill); 454326938Sdim } else if (MI.getNumOperands() == 4) { 455326938Sdim unsigned LaneNumber = MI.getOperand(3).getImm(); 456326938Sdim if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) { 457326938Sdim DupDest = MRI.createVirtualRegister(RC); 458326938Sdim BuildMI(MBB, MI, DL, *DupMCID, DupDest) 459326938Sdim .addReg(SrcReg1, Src1IsKill) 460326938Sdim .addImm(LaneNumber); 461326938Sdim } 462326938Sdim BuildMI(MBB, MI, DL, *MulMCID, MulDest) 463326938Sdim .addReg(SrcReg0, Src0IsKill) 464326938Sdim .addReg(DupDest, Src1IsKill); 465326938Sdim } else { 466326938Sdim return false; 467326938Sdim } 468326938Sdim 469326938Sdim ++NumModifiedInstr; 470326938Sdim return true; 471326938Sdim} 472326938Sdim 473326938Sdim/// Load/Store Interleaving instructions are not always beneficial. 474326938Sdim/// Replace them by ZIP instructions and classical load/store. 
475326938Sdim/// 476326938Sdim/// For example: 477326938Sdim/// st2 {v0.4s, v1.4s}, addr 478326938Sdim/// 479326938Sdim/// Is rewritten into: 480326938Sdim/// zip1 v2.4s, v0.4s, v1.4s 481326938Sdim/// zip2 v3.4s, v0.4s, v1.4s 482326938Sdim/// stp q2, q3, addr 483326938Sdim// 484326938Sdim/// For example: 485326938Sdim/// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr 486326938Sdim/// 487326938Sdim/// Is rewritten into: 488326938Sdim/// zip1 v4.4s, v0.4s, v2.4s 489326938Sdim/// zip2 v5.4s, v0.4s, v2.4s 490326938Sdim/// zip1 v6.4s, v1.4s, v3.4s 491326938Sdim/// zip2 v7.4s, v1.4s, v3.4s 492326938Sdim/// zip1 v8.4s, v4.4s, v6.4s 493326938Sdim/// zip2 v9.4s, v4.4s, v6.4s 494326938Sdim/// zip1 v10.4s, v5.4s, v7.4s 495326938Sdim/// zip2 v11.4s, v5.4s, v7.4s 496326938Sdim/// stp q8, q9, addr 497326938Sdim/// stp q10, q11, addr+32 498326938Sdim/// 499326938Sdim/// Currently only instructions related to ST2 and ST4 are considered. 500326938Sdim/// Other may be added later. 501326938Sdim/// Return true if the SIMD instruction is modified. 502326938Sdimbool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) { 503326938Sdim 504326938Sdim unsigned SeqReg, AddrReg; 505326938Sdim unsigned StReg[4], StRegKill[4]; 506326938Sdim MachineInstr *DefiningMI; 507326938Sdim const DebugLoc &DL = MI.getDebugLoc(); 508326938Sdim MachineBasicBlock &MBB = *MI.getParent(); 509326938Sdim SmallVector<unsigned, MaxNumRepl> ZipDest; 510326938Sdim SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; 511326938Sdim 512326938Sdim // If current instruction matches any of the rewriting rules, then 513326938Sdim // gather information about parameters of the new instructions. 
514326938Sdim bool Match = false; 515326938Sdim for (auto &I : IRT) { 516326938Sdim if (MI.getOpcode() == I.OrigOpc) { 517326938Sdim SeqReg = MI.getOperand(0).getReg(); 518326938Sdim AddrReg = MI.getOperand(1).getReg(); 519326938Sdim DefiningMI = MRI->getUniqueVRegDef(SeqReg); 520326938Sdim unsigned NumReg = determineSrcReg(MI); 521326938Sdim if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg)) 522326938Sdim return false; 523326938Sdim 524326938Sdim for (auto &Repl : I.ReplOpc) { 525326938Sdim ReplInstrMCID.push_back(&TII->get(Repl)); 526326938Sdim // Generate destination registers but only for non-store instruction. 527326938Sdim if (Repl != AArch64::STPQi && Repl != AArch64::STPDi) 528326938Sdim ZipDest.push_back(MRI->createVirtualRegister(&I.RC)); 529326938Sdim } 530326938Sdim Match = true; 531326938Sdim break; 532326938Sdim } 533326938Sdim } 534326938Sdim 535326938Sdim if (!Match) 536326938Sdim return false; 537326938Sdim 538326938Sdim // Determine if it is profitable to replace MI by the series of instructions 539326938Sdim // represented in ReplInstrMCID. 540326938Sdim if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), 541326938Sdim ReplInstrMCID)) 542326938Sdim return false; 543326938Sdim 544326938Sdim // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at 545326938Sdim // this point, the code generation is hardcoded and does not rely on the IRT 546326938Sdim // table used above given that code generation for ST2 replacement is somewhat 547326938Sdim // different than for ST4 replacement. We could have added more info into the 548326938Sdim // table related to how we build new instructions but we may be adding more 549326938Sdim // complexity with that). 
550326938Sdim switch (MI.getOpcode()) { 551326938Sdim default: 552326938Sdim return false; 553326938Sdim 554326938Sdim case AArch64::ST2Twov16b: 555326938Sdim case AArch64::ST2Twov8b: 556326938Sdim case AArch64::ST2Twov8h: 557326938Sdim case AArch64::ST2Twov4h: 558326938Sdim case AArch64::ST2Twov4s: 559326938Sdim case AArch64::ST2Twov2s: 560326938Sdim case AArch64::ST2Twov2d: 561326938Sdim // ZIP instructions 562326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) 563326938Sdim .addReg(StReg[0]) 564326938Sdim .addReg(StReg[1]); 565326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) 566326938Sdim .addReg(StReg[0], StRegKill[0]) 567326938Sdim .addReg(StReg[1], StRegKill[1]); 568326938Sdim // STP instructions 569326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[2]) 570326938Sdim .addReg(ZipDest[0]) 571326938Sdim .addReg(ZipDest[1]) 572326938Sdim .addReg(AddrReg) 573326938Sdim .addImm(0); 574326938Sdim break; 575326938Sdim 576326938Sdim case AArch64::ST4Fourv16b: 577326938Sdim case AArch64::ST4Fourv8b: 578326938Sdim case AArch64::ST4Fourv8h: 579326938Sdim case AArch64::ST4Fourv4h: 580326938Sdim case AArch64::ST4Fourv4s: 581326938Sdim case AArch64::ST4Fourv2s: 582326938Sdim case AArch64::ST4Fourv2d: 583326938Sdim // ZIP instructions 584326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) 585326938Sdim .addReg(StReg[0]) 586326938Sdim .addReg(StReg[2]); 587326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) 588326938Sdim .addReg(StReg[0], StRegKill[0]) 589326938Sdim .addReg(StReg[2], StRegKill[2]); 590326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2]) 591326938Sdim .addReg(StReg[1]) 592326938Sdim .addReg(StReg[3]); 593326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3]) 594326938Sdim .addReg(StReg[1], StRegKill[1]) 595326938Sdim .addReg(StReg[3], StRegKill[3]); 596326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4]) 597326938Sdim .addReg(ZipDest[0]) 598326938Sdim .addReg(ZipDest[2]); 599326938Sdim 
BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5]) 600326938Sdim .addReg(ZipDest[0]) 601326938Sdim .addReg(ZipDest[2]); 602326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6]) 603326938Sdim .addReg(ZipDest[1]) 604326938Sdim .addReg(ZipDest[3]); 605326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7]) 606326938Sdim .addReg(ZipDest[1]) 607326938Sdim .addReg(ZipDest[3]); 608326938Sdim // stp instructions 609326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[8]) 610326938Sdim .addReg(ZipDest[4]) 611326938Sdim .addReg(ZipDest[5]) 612326938Sdim .addReg(AddrReg) 613326938Sdim .addImm(0); 614326938Sdim BuildMI(MBB, MI, DL, *ReplInstrMCID[9]) 615326938Sdim .addReg(ZipDest[6]) 616326938Sdim .addReg(ZipDest[7]) 617326938Sdim .addReg(AddrReg) 618326938Sdim .addImm(2); 619326938Sdim break; 620326938Sdim } 621326938Sdim 622326938Sdim ++NumModifiedInstr; 623326938Sdim return true; 624326938Sdim} 625326938Sdim 626326938Sdim/// Process The REG_SEQUENCE instruction, and extract the source 627326938Sdim/// operands of the ST2/4 instruction from it. 628326938Sdim/// Example of such instruction. 629326938Sdim/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; 630326938Sdim/// Return true when the instruction is processed successfully. 631326938Sdimbool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI, 632326938Sdim unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const { 633326938Sdim assert (DefiningMI != NULL); 634326938Sdim if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE) 635326938Sdim return false; 636326938Sdim 637326938Sdim for (unsigned i=0; i<NumArg; i++) { 638326938Sdim StReg[i] = DefiningMI->getOperand(2*i+1).getReg(); 639326938Sdim StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill()); 640326938Sdim 641326938Sdim // Sanity check for the other arguments. 
642326938Sdim if (DefiningMI->getOperand(2*i+2).isImm()) { 643326938Sdim switch (DefiningMI->getOperand(2*i+2).getImm()) { 644326938Sdim default: 645326938Sdim return false; 646326938Sdim 647326938Sdim case AArch64::dsub0: 648326938Sdim case AArch64::dsub1: 649326938Sdim case AArch64::dsub2: 650326938Sdim case AArch64::dsub3: 651326938Sdim case AArch64::qsub0: 652326938Sdim case AArch64::qsub1: 653326938Sdim case AArch64::qsub2: 654326938Sdim case AArch64::qsub3: 655326938Sdim break; 656326938Sdim } 657326938Sdim } 658326938Sdim else 659326938Sdim return false; 660326938Sdim } 661326938Sdim return true; 662326938Sdim} 663326938Sdim 664326938Sdim/// Return the number of useful source registers for this instruction 665326938Sdim/// (2 for ST2 and 4 for ST4). 666326938Sdimunsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const { 667326938Sdim switch (MI.getOpcode()) { 668326938Sdim default: 669326938Sdim llvm_unreachable("Unsupported instruction for this pass"); 670326938Sdim 671326938Sdim case AArch64::ST2Twov16b: 672326938Sdim case AArch64::ST2Twov8b: 673326938Sdim case AArch64::ST2Twov8h: 674326938Sdim case AArch64::ST2Twov4h: 675326938Sdim case AArch64::ST2Twov4s: 676326938Sdim case AArch64::ST2Twov2s: 677326938Sdim case AArch64::ST2Twov2d: 678326938Sdim return 2; 679326938Sdim 680326938Sdim case AArch64::ST4Fourv16b: 681326938Sdim case AArch64::ST4Fourv8b: 682326938Sdim case AArch64::ST4Fourv8h: 683326938Sdim case AArch64::ST4Fourv4h: 684326938Sdim case AArch64::ST4Fourv4s: 685326938Sdim case AArch64::ST4Fourv2s: 686326938Sdim case AArch64::ST4Fourv2d: 687326938Sdim return 4; 688326938Sdim } 689326938Sdim} 690326938Sdim 691326938Sdimbool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) { 692326938Sdim if (skipFunction(MF.getFunction())) 693326938Sdim return false; 694326938Sdim 695326938Sdim TII = MF.getSubtarget().getInstrInfo(); 696326938Sdim MRI = &MF.getRegInfo(); 697326938Sdim const TargetSubtargetInfo &ST = MF.getSubtarget(); 
698326938Sdim const AArch64InstrInfo *AAII = 699326938Sdim static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); 700326938Sdim if (!AAII) 701326938Sdim return false; 702341825Sdim SchedModel.init(&ST); 703326938Sdim if (!SchedModel.hasInstrSchedModel()) 704326938Sdim return false; 705326938Sdim 706326938Sdim bool Changed = false; 707326938Sdim for (auto OptimizationKind : {VectorElem, Interleave}) { 708326938Sdim if (!shouldExitEarly(&MF, OptimizationKind)) { 709326938Sdim SmallVector<MachineInstr *, 8> RemoveMIs; 710326938Sdim for (MachineBasicBlock &MBB : MF) { 711326938Sdim for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); 712326938Sdim MII != MIE;) { 713326938Sdim MachineInstr &MI = *MII; 714326938Sdim bool InstRewrite; 715326938Sdim if (OptimizationKind == VectorElem) 716326938Sdim InstRewrite = optimizeVectElement(MI) ; 717326938Sdim else 718326938Sdim InstRewrite = optimizeLdStInterleave(MI); 719326938Sdim if (InstRewrite) { 720326938Sdim // Add MI to the list of instructions to be removed given that it 721326938Sdim // has been replaced. 722326938Sdim RemoveMIs.push_back(&MI); 723326938Sdim Changed = true; 724326938Sdim } 725326938Sdim ++MII; 726326938Sdim } 727326938Sdim } 728326938Sdim for (MachineInstr *MI : RemoveMIs) 729326938Sdim MI->eraseFromParent(); 730326938Sdim } 731326938Sdim } 732326938Sdim 733326938Sdim return Changed; 734326938Sdim} 735326938Sdim 736326938Sdim/// Returns an instance of the high cost ASIMD instruction replacement 737326938Sdim/// optimization pass. 738326938SdimFunctionPass *llvm::createAArch64SIMDInstrOptPass() { 739326938Sdim return new AArch64SIMDInstrOpt(); 740326938Sdim} 741