//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
"llvm/Support/Casting.h" 94#include "llvm/Support/CodeGen.h" 95#include "llvm/Support/CommandLine.h" 96#include "llvm/Support/Compiler.h" 97#include "llvm/Support/Debug.h" 98#include "llvm/Support/ErrorHandling.h" 99#include "llvm/Support/KnownBits.h" 100#include "llvm/Support/MachineValueType.h" 101#include "llvm/Support/MathExtras.h" 102#include "llvm/Support/raw_ostream.h" 103#include "llvm/Target/TargetMachine.h" 104#include "llvm/Target/TargetOptions.h" 105#include <algorithm> 106#include <cassert> 107#include <cstdint> 108#include <cstdlib> 109#include <iterator> 110#include <limits> 111#include <string> 112#include <tuple> 113#include <utility> 114#include <vector> 115 116using namespace llvm; 117using namespace llvm::PatternMatch; 118 119#define DEBUG_TYPE "arm-isel" 120 121STATISTIC(NumTailCalls, "Number of tail calls"); 122STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 123STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 124STATISTIC(NumConstpoolPromoted, 125 "Number of constants with their storage promoted into constant pools"); 126 127static cl::opt<bool> 128ARMInterworking("arm-interworking", cl::Hidden, 129 cl::desc("Enable / disable ARM interworking (for debugging only)"), 130 cl::init(true)); 131 132static cl::opt<bool> EnableConstpoolPromotion( 133 "arm-promote-constant", cl::Hidden, 134 cl::desc("Enable / disable promotion of unnamed_addr constants into " 135 "constant pools"), 136 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 137static cl::opt<unsigned> ConstpoolPromotionMaxSize( 138 "arm-promote-constant-max-size", cl::Hidden, 139 cl::desc("Maximum size of constant to promote into a constant pool"), 140 cl::init(64)); 141static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 142 "arm-promote-constant-max-total", cl::Hidden, 143 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 144 cl::init(128)); 145 146static cl::opt<unsigned> 147MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, 148 cl::desc("Maximum interleave factor for MVE VLDn to generate."), 149 cl::init(2)); 150 151// The APCS parameter registers. 
// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Promote all bit-wise operations.
  if (VT.isInteger() && VT != PromotedBitwiseVT) {
    setOperationAction(ISD::AND, VT, Promote);
    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
    setOperationAction(ISD::OR, VT, Promote);
    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
    setOperationAction(ISD::XOR, VT, Promote);
    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
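// Illustrative note (not from the original source): given the Promote actions
// registered above, a bitwise op on a D-register type such as v8i8 is
// performed in v2i32, that is, the operands are bitcast to v2i32, the
// AND/OR/XOR is done there, and the result is bitcast back, while its loads
// and stores are issued as f64 loads and stores.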
void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}

void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}

void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);
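    // Illustrative note (not from the original source): because the
    // saturating ops above are Legal, a saddsat on v16i8 can select directly
    // to an MVE VQADD.S8 rather than being expanded into a compare/select
    // sequence.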
    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);

      // No native support for these.
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }
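  // Illustrative note (not from the original source): when MVE.fp is absent,
  // setAllExpand(VT) above marks everything Expand and the calls that follow
  // re-legalize only data movement, so e.g. an fadd on v4f32 gets scalarized
  // while v4f32 loads, stores and shuffles stay legal.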
  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. Only doing FP data processing on the FP
  // vector types is inhibited at integer-only level.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  }
  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v8i8 to v8i16, and from v4i8 or v4i16 to
  // v4i32.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);

  // Some truncating stores are legal too.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  // Predicate types
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
  }
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }
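  // Illustrative note (not from the original source): with -float-abi=hard,
  // the loop above selects ARM_AAPCS_VFP for every runtime libcall, so
  // floating-point libcall arguments are passed in s0-s15/d0-d7 rather than
  // in core registers.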
  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
        { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
        { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }
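  // Illustrative note (not from the original source): with the table above, a
  // float addition that ends up as a libcall on such a MachO Thumb target is
  // emitted as a call to __addsf3vfp, a helper that takes its arguments in
  // the soft-float parameter convention but performs the arithmetic in VFP
  // registers.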
  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);
  setLibcallName(RTLIB::MUL_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
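      // Illustrative note (not from the original source): each comparison
      // helper above returns an i32 that is tested against zero using the
      // Cond recorded via setCmpLibcallCC; that is why OEQ and UNE can share
      // __aeabi_fcmpeq with opposite conditions (SETNE vs. SETEQ).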
      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // Memory operations
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchOS platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }
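  // Illustrative note (not from the original source): with these defaults, an
  // fpext from half to float becomes a call to the __gnu_-prefixed helper
  // (__gnu_h2f_ieee) using the calling convention chosen above; AEABI targets
  // rename it to __aeabi_h2f just below.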
  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
      Subtarget->hasFPRegs()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
    if (!Subtarget->hasVFP2Base())
      setAllExpand(MVT::f32);
    if (!Subtarget->hasFP64())
      setAllExpand(MVT::f64);
  }

  if (Subtarget->hasFullFP16()) {
    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
  }

  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      addAllExtLoads(VT, InnerVT, Expand);
    }

    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasMVEIntegerOps())
    addMVEVectorTypes(Subtarget->hasMVEFloatOps());

  // Combine low-overhead loop intrinsics so that we can lower i1 types.
  if (Subtarget->hasLOB()) {
    setTargetDAGCombine(ISD::BRCOND);
    setTargetDAGCombine(ISD::BR_CC);
  }

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    if (Subtarget->hasFullFP16()) {
      addQRTypeForNEON(MVT::v8f16);
      addDRTypeForNEON(MVT::v4f16);
    }
  }

  if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // none of Neon, MVE or VFP supports any arithmetic operations on it.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  }

  if (Subtarget->hasNEON()) {
    // The same goes for v4f32, but keep in mind that vadd, vsub and vmul are
    // natively supported for v4f32.
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does it have a
    // FP_TO_[SU]INT instruction with a narrower destination than source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
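    // Illustrative note (not from the original source): the custom CTPOP
    // lowering referred to above builds on VCNT.8, e.g. a ctpop of v4i32
    // becomes a VCNT.8 followed by pairwise widening adds (VPADDL) from i8
    // lanes up to i32 lanes.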
    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4Base()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from narrow vector types such as v4i8 to wider
    // integer vector types such as v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
  }

  if (!Subtarget->hasFP64()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions
    // which are present. However, no double-precision operations other than
    // moves, loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  }

  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    }
  }

  if (!Subtarget->hasFP16()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }

  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
  if (Subtarget->hasDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
  }
  if (Subtarget->hasBaseDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
  }
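  // Illustrative note (not from the original source): on DSP-capable cores
  // the Legal i32 markings above let scalar saddsat/ssubsat select directly
  // to QADD/QSUB, while the i8/i16 Custom actions lower narrow saturating
  // arithmetic onto the same family of saturating instructions.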
  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);

  // MVE lowers 64 bit shifts to lsll and lsrl
  // assuming that ISD::SRL and SRA of i64 are already marked custom
  if (Subtarget->hasMVEIntegerOps())
    setOperationAction(ISD::SHL, MVT::i64, Custom);

  // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  }

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  }

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
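  // Illustrative note (not from the original source): when hasDivide is
  // false, the LibCall markings below turn an i32 sdiv into a runtime call,
  // e.g. __aeabi_idiv on AEABI targets (named in the RTLIB table earlier in
  // this constructor).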
  if (!hasDivide) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    }

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // Use the default implementation.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion.
  InsertFencesForAtomic = false;
  if (Subtarget->hasAnyDataBarrier() &&
      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
    // to ldrex/strex loops already.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    if (!Subtarget->isThumb() || !Subtarget->isMClass())
      setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

    // On v8, we have particularly efficient implementations of atomic fences
    // if they can be combined with nearby atomic loads and stores.
    if (!Subtarget->hasAcquireRelease() ||
        getTargetMachine().getOptLevel() == 0) {
      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
      InsertFencesForAtomic = true;
    }
  } else {
    // If there's anything we can use as a barrier, go through custom lowering
    // for ATOMIC_FENCE.
    // If target has DMB in thumb, Fences can be inserted.
    if (Subtarget->hasDataBarrier())
      InsertFencesForAtomic = true;

    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);

    // Set them all for expansion, which will force libcalls.
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
    // Unordered/Monotonic case.
    if (!InsertFencesForAtomic) {
      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
    }
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
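  // Illustrative note (not from the original source): when
  // InsertFencesForAtomic is set above, the AtomicExpand pass brackets atomic
  // operations with fences that this target lowers to DMB, so a seq_cst store
  // roughly becomes "dmb ish; str; dmb ish".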
  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
      !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
    // iff target supports vfp2.
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (Subtarget->useSjLjEH())
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::SETCC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::SETCC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT, MVT::f16, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  }

  setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);

  // We don't support sin/cos/fmod/copysign/pow
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
      !Subtarget->isThumb1Only()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  }
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  if (!Subtarget->hasVFP4Base()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  }

  // Various VFP goodness
  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
    if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    }

    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
    if (!Subtarget->hasFP16()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    }

    // Strict floating-point comparisons need custom lowering.
    setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  }

  // Use __sincos_stret if available.
  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  }

  // FP-ARMv8 implements a lot of rounding-like FP operations.
  if (Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
    if (Subtarget->hasNEON()) {
      setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
      setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
    }

    if (Subtarget->hasFP64()) {
      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
      setOperationAction(ISD::FROUND, MVT::f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
      setOperationAction(ISD::FRINT, MVT::f64, Legal);
      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
    }
  }

  // FP16 operations often need to be promoted to call library functions.
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::FREM, MVT::f16, Promote);
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
    setOperationAction(ISD::FPOWI, MVT::f16, Promote);
    setOperationAction(ISD::FPOW, MVT::f16, Promote);
    setOperationAction(ISD::FEXP, MVT::f16, Promote);
    setOperationAction(ISD::FEXP2, MVT::f16, Promote);
    setOperationAction(ISD::FLOG, MVT::f16, Promote);
    setOperationAction(ISD::FLOG10, MVT::f16, Promote);
    setOperationAction(ISD::FLOG2, MVT::f16, Promote);

    setOperationAction(ISD::FROUND, MVT::f16, Legal);
  }

  if (Subtarget->hasNEON()) {
    // vmin and vmax aren't available in a scalar form, so we use
    // a NEON instruction with an undef lane instead.
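    // Editor's note (illustrative sketch, not from the original source): a
    // scalar f32 minimum can be emitted as "vmin.f32 d0, d0, d1", where only
    // lane 0 of each D register holds a defined value and the result is read
    // back from lane 0.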
    setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
    setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
    setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);

      setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
      setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
    }
  }

  // We have target-specific dag combine patterns for the following nodes:
  // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);

  if (Subtarget->hasV6Ops())
    setTargetDAGCombine(ISD::SRL);
  if (Subtarget->isThumb1Only())
    setTargetDAGCombine(ISD::SHL);

  setStackPointerRegisterToSaveRestore(ARM::SP);

  if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
      !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Hybrid);

  //// temporary - rewrite interface to use type
  MaxStoresPerMemset = 8;
  MaxStoresPerMemsetOptSize = 4;
  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 2;
  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 2;

  // On ARM arguments smaller than 4 bytes are extended, so all arguments
  // are at least 4 bytes aligned.
  setMinStackArgumentAlignment(Align(4));

  // Prefer likely predicted branches to selects on out-of-order cores.
  PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();

  setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));

  setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));

  if (Subtarget->isThumb() || Subtarget->isThumb2())
    setTargetDAGCombine(ISD::ABS);
}

bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}

// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively.
// We have not implemented this because of the difficulty, prior to coalescing,
// of modeling operand register classes due to the common occurrence of
// cross-class copies and subregister insertions and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as the representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER:   break;
  case ARMISD::Wrapper:        return "ARMISD::Wrapper";
  case ARMISD::WrapperPIC:     return "ARMISD::WrapperPIC";
  case ARMISD::WrapperJT:      return "ARMISD::WrapperJT";
  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
  case ARMISD::CALL:           return "ARMISD::CALL";
  case ARMISD::CALL_PRED:      return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK:    return "ARMISD::CALL_NOLINK";
  case ARMISD::BRCOND:         return "ARMISD::BRCOND";
  case ARMISD::BR_JT:          return "ARMISD::BR_JT";
  case ARMISD::BR2_JT:         return "ARMISD::BR2_JT";
  case ARMISD::RET_FLAG:       return "ARMISD::RET_FLAG";
  case ARMISD::INTRET_FLAG:    return "ARMISD::INTRET_FLAG";
  case ARMISD::PIC_ADD:        return "ARMISD::PIC_ADD";
  case ARMISD::CMP:            return "ARMISD::CMP";
  case ARMISD::CMN:            return "ARMISD::CMN";
  case ARMISD::CMPZ:           return "ARMISD::CMPZ";
  case ARMISD::CMPFP:          return "ARMISD::CMPFP";
  case ARMISD::CMPFPE:         return "ARMISD::CMPFPE";
  case ARMISD::CMPFPw0:        return "ARMISD::CMPFPw0";
  case ARMISD::CMPFPEw0:       return "ARMISD::CMPFPEw0";
  case ARMISD::BCC_i64:        return "ARMISD::BCC_i64";
  case ARMISD::FMSTAT:         return "ARMISD::FMSTAT";

  case ARMISD::CMOV:           return "ARMISD::CMOV";
  case ARMISD::SUBS:           return "ARMISD::SUBS";

  case ARMISD::SSAT:           return "ARMISD::SSAT";
  case ARMISD::USAT:           return "ARMISD::USAT";

  case ARMISD::ASRL:           return "ARMISD::ASRL";
  case ARMISD::LSRL:           return "ARMISD::LSRL";
  case ARMISD::LSLL:           return "ARMISD::LSLL";

  case ARMISD::SRL_FLAG:       return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG:       return "ARMISD::SRA_FLAG";
  case ARMISD::RRX:            return "ARMISD::RRX";

  case ARMISD::ADDC:           return "ARMISD::ADDC";
  case ARMISD::ADDE:           return "ARMISD::ADDE";
  case ARMISD::SUBC:           return "ARMISD::SUBC";
  case ARMISD::SUBE:           return "ARMISD::SUBE";
  case ARMISD::LSLS:           return "ARMISD::LSLS";

  case ARMISD::VMOVRRD:        return "ARMISD::VMOVRRD";
  case ARMISD::VMOVDRR:        return "ARMISD::VMOVDRR";
  case ARMISD::VMOVhr:         return "ARMISD::VMOVhr";
  case ARMISD::VMOVrh:         return "ARMISD::VMOVrh";
  case ARMISD::VMOVSR:         return "ARMISD::VMOVSR";

  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";

  case ARMISD::TC_RETURN:      return "ARMISD::TC_RETURN";

  case ARMISD::THREAD_POINTER: return "ARMISD::THREAD_POINTER";

  case ARMISD::DYN_ALLOC:      return "ARMISD::DYN_ALLOC";

  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

  case ARMISD::PRELOAD:        return "ARMISD::PRELOAD";

  case ARMISD::WIN__CHKSTK:    return "ARMISD::WIN__CHKSTK";
  case ARMISD::WIN__DBZCHK:    return "ARMISD::WIN__DBZCHK";

  case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
  case ARMISD::VCMP:           return "ARMISD::VCMP";
  case ARMISD::VCMPZ:          return "ARMISD::VCMPZ";
  case ARMISD::VTST:           return "ARMISD::VTST";

  case ARMISD::VSHLs:          return "ARMISD::VSHLs";
  case ARMISD::VSHLu:          return "ARMISD::VSHLu";
  case ARMISD::VSHLIMM:        return "ARMISD::VSHLIMM";
  case ARMISD::VSHRsIMM:       return "ARMISD::VSHRsIMM";
  case ARMISD::VSHRuIMM:       return "ARMISD::VSHRuIMM";
  case ARMISD::VRSHRsIMM:      return "ARMISD::VRSHRsIMM";
  case ARMISD::VRSHRuIMM:      return "ARMISD::VRSHRuIMM";
  case ARMISD::VRSHRNIMM:      return "ARMISD::VRSHRNIMM";
  case ARMISD::VQSHLsIMM:      return "ARMISD::VQSHLsIMM";
  case ARMISD::VQSHLuIMM:      return "ARMISD::VQSHLuIMM";
  case ARMISD::VQSHLsuIMM:     return "ARMISD::VQSHLsuIMM";
  case ARMISD::VQSHRNsIMM:     return "ARMISD::VQSHRNsIMM";
  case ARMISD::VQSHRNuIMM:     return "ARMISD::VQSHRNuIMM";
  case ARMISD::VQSHRNsuIMM:    return "ARMISD::VQSHRNsuIMM";
  case ARMISD::VQRSHRNsIMM:    return "ARMISD::VQRSHRNsIMM";
  case ARMISD::VQRSHRNuIMM:    return "ARMISD::VQRSHRNuIMM";
  case ARMISD::VQRSHRNsuIMM:   return "ARMISD::VQRSHRNsuIMM";
  case ARMISD::VSLIIMM:        return "ARMISD::VSLIIMM";
  case ARMISD::VSRIIMM:        return "ARMISD::VSRIIMM";
  case ARMISD::VGETLANEu:      return "ARMISD::VGETLANEu";
  case ARMISD::VGETLANEs:      return "ARMISD::VGETLANEs";
  case ARMISD::VMOVIMM:        return "ARMISD::VMOVIMM";
  case ARMISD::VMVNIMM:        return "ARMISD::VMVNIMM";
  case ARMISD::VMOVFPIMM:      return "ARMISD::VMOVFPIMM";
  case ARMISD::VDUP:           return "ARMISD::VDUP";
  case ARMISD::VDUPLANE:       return "ARMISD::VDUPLANE";
  case ARMISD::VEXT:           return "ARMISD::VEXT";
  case ARMISD::VREV64:         return "ARMISD::VREV64";
  case ARMISD::VREV32:         return "ARMISD::VREV32";
  case ARMISD::VREV16:         return "ARMISD::VREV16";
  case ARMISD::VZIP:           return "ARMISD::VZIP";
  case ARMISD::VUZP:           return "ARMISD::VUZP";
  case ARMISD::VTRN:           return "ARMISD::VTRN";
  case ARMISD::VTBL1:          return "ARMISD::VTBL1";
  case ARMISD::VTBL2:          return "ARMISD::VTBL2";
  case ARMISD::VMOVN:          return "ARMISD::VMOVN";
  case ARMISD::VMULLs:         return "ARMISD::VMULLs";
  case ARMISD::VMULLu:         return "ARMISD::VMULLu";
  case ARMISD::UMAAL:          return "ARMISD::UMAAL";
  case ARMISD::UMLAL:          return "ARMISD::UMLAL";
  case ARMISD::SMLAL:          return "ARMISD::SMLAL";
  case ARMISD::SMLALBB:        return "ARMISD::SMLALBB";
  case ARMISD::SMLALBT:        return "ARMISD::SMLALBT";
  case ARMISD::SMLALTB:        return "ARMISD::SMLALTB";
  case ARMISD::SMLALTT:        return "ARMISD::SMLALTT";
  case ARMISD::SMULWB:         return "ARMISD::SMULWB";
  case ARMISD::SMULWT:         return "ARMISD::SMULWT";
  case ARMISD::SMLALD:         return "ARMISD::SMLALD";
  case ARMISD::SMLALDX:        return "ARMISD::SMLALDX";
  case ARMISD::SMLSLD:         return "ARMISD::SMLSLD";
  case ARMISD::SMLSLDX:        return "ARMISD::SMLSLDX";
  case ARMISD::SMMLAR:         return "ARMISD::SMMLAR";
  case ARMISD::SMMLSR:         return "ARMISD::SMMLSR";
  case ARMISD::QADD16b:        return "ARMISD::QADD16b";
  case ARMISD::QSUB16b:        return "ARMISD::QSUB16b";
  case ARMISD::QADD8b:         return "ARMISD::QADD8b";
  case ARMISD::QSUB8b:         return "ARMISD::QSUB8b";
  case ARMISD::BUILD_VECTOR:   return "ARMISD::BUILD_VECTOR";
  case ARMISD::BFI:            return "ARMISD::BFI";
  case ARMISD::VORRIMM:        return "ARMISD::VORRIMM";
  case ARMISD::VBICIMM:        return "ARMISD::VBICIMM";
  case ARMISD::VBSL:           return "ARMISD::VBSL";
  case ARMISD::MEMCPY:         return "ARMISD::MEMCPY";
  case ARMISD::VLD1DUP:        return "ARMISD::VLD1DUP";
  case ARMISD::VLD2DUP:        return "ARMISD::VLD2DUP";
  case ARMISD::VLD3DUP:        return "ARMISD::VLD3DUP";
  case ARMISD::VLD4DUP:        return "ARMISD::VLD4DUP";
  case ARMISD::VLD1_UPD:       return "ARMISD::VLD1_UPD";
  case ARMISD::VLD2_UPD:       return "ARMISD::VLD2_UPD";
  case ARMISD::VLD3_UPD:       return "ARMISD::VLD3_UPD";
  case ARMISD::VLD4_UPD:       return "ARMISD::VLD4_UPD";
  case ARMISD::VLD2LN_UPD:     return "ARMISD::VLD2LN_UPD";
  case ARMISD::VLD3LN_UPD:     return "ARMISD::VLD3LN_UPD";
  case ARMISD::VLD4LN_UPD:     return "ARMISD::VLD4LN_UPD";
  case ARMISD::VLD1DUP_UPD:    return "ARMISD::VLD1DUP_UPD";
  case ARMISD::VLD2DUP_UPD:    return "ARMISD::VLD2DUP_UPD";
  case ARMISD::VLD3DUP_UPD:    return "ARMISD::VLD3DUP_UPD";
  case ARMISD::VLD4DUP_UPD:    return "ARMISD::VLD4DUP_UPD";
  case ARMISD::VST1_UPD:       return "ARMISD::VST1_UPD";
  case ARMISD::VST2_UPD:       return "ARMISD::VST2_UPD";
  case ARMISD::VST3_UPD:       return "ARMISD::VST3_UPD";
  case ARMISD::VST4_UPD:       return "ARMISD::VST4_UPD";
  case ARMISD::VST2LN_UPD:     return "ARMISD::VST2LN_UPD";
  case ARMISD::VST3LN_UPD:     return "ARMISD::VST3LN_UPD";
  case ARMISD::VST4LN_UPD:     return "ARMISD::VST4LN_UPD";
  case ARMISD::WLS:            return "ARMISD::WLS";
  case ARMISD::LE:             return "ARMISD::LE";
  case ARMISD::LOOP_DEC:       return "ARMISD::LOOP_DEC";
  case ARMISD::CSINV:          return "ARMISD::CSINV";
  case ARMISD::CSNEG:          return "ARMISD::CSNEG";
  case ARMISD::CSINC:          return "ARMISD::CSINC";
  }
  return nullptr;
}

EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                          EVT VT) const {
  if (!VT.isVector())
    return getPointerTy(DL);

  // MVE has a predicate register.
  if (Subtarget->hasMVEIntegerOps() &&
      (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8))
    return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
  return VT.changeVectorElementTypeToInteger();
}

/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
const TargetRegisterClass *
ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  (void)isDivergent;
  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
  // MVE Q registers.
  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
  return TargetLowering::getRegClassFor(VT);
}

// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               unsigned &PrefAlign) const {
  if (!isa<MemIntrinsic>(CI))
    return false;
  MinSize = 8;
  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  // cycle faster than 4-byte aligned LDM.
  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
  return true;
}

// Create a fast isel object.
FastISel *
ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return ARM::createFastISel(funcInfo, libInfo);
}

Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  unsigned NumVals = N->getNumValues();
  if (!NumVals)
    return Sched::RegPressure;

  for (unsigned i = 0; i != NumVals; ++i) {
    EVT VT = N->getValueType(i);
    if (VT == MVT::Glue || VT == MVT::Other)
      continue;
    if (VT.isFloatingPoint() || VT.isVector())
      return Sched::ILP;
  }

  if (!N->isMachineOpcode())
    return Sched::RegPressure;

  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());

  if (MCID.getNumDefs() == 0)
    return Sched::RegPressure;
  if (!Itins->isEmpty() &&
      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
    return Sched::ILP;

  return Sched::RegPressure;
}

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

static bool isSRL16(const SDValue &Op) {
  if (Op.getOpcode() != ISD::SRL)
    return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return Const->getZExtValue() == 16;
  return false;
}

static bool isSRA16(const SDValue &Op) {
  if (Op.getOpcode() != ISD::SRA)
    return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return Const->getZExtValue() == 16;
  return false;
}

static bool isSHL16(const SDValue &Op) {
  if (Op.getOpcode() != ISD::SHL)
    return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return Const->getZExtValue() == 16;
  return false;
}

// Check for a signed 16-bit value. We special-case SRA because it makes it
// simpler when also looking for SRAs that aren't sign-extending a smaller
// value. Without the check, we'd need to take extra care with checking
// order for some operations.
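// Editor's note (illustrative, not from the original source): the pattern
// accepted directly here is (sra (shl X, 16), 16), the canonical
// sign-extension of an i16 value held in an i32; otherwise, 17 known sign
// bits in an i32 mean the value fits in a signed 16-bit range.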
static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
  if (isSRA16(Op))
    return isSHL16(Op.getOperand(0));
  return DAG.ComputeNumSignBits(Op) == 17;
}

/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETEQ:  return ARMCC::EQ;
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}

/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2) {
  CondCode2 = ARMCC::AL;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
}

//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//

/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account the presence of floating-point hardware and calling convention
/// limitations, such as support for variadic functions.
CallingConv::ID
ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
                                           bool isVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_AAPCS:
  case CallingConv::ARM_APCS:
  case CallingConv::GHC:
  case CallingConv::CFGuard_Check:
    return CC;
  case CallingConv::PreserveMost:
    return CallingConv::PreserveMost;
  case CallingConv::ARM_AAPCS_VFP:
  case CallingConv::Swift:
    return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
  case CallingConv::C:
    if (!Subtarget->isAAPCS_ABI())
      return CallingConv::ARM_APCS;
    else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
             !isVarArg)
      return CallingConv::ARM_AAPCS_VFP;
    else
      return CallingConv::ARM_AAPCS;
  case CallingConv::Fast:
  case CallingConv::CXX_FAST_TLS:
    if (!Subtarget->isAAPCS_ABI()) {
      if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
        return CallingConv::Fast;
      return CallingConv::ARM_APCS;
    } else if (Subtarget->hasVFP2Base() &&
               !Subtarget->isThumb1Only() && !isVarArg)
      return CallingConv::ARM_AAPCS_VFP;
    else
      return CallingConv::ARM_AAPCS;
  }
}

CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                 bool isVarArg) const {
  return CCAssignFnForNode(CC, false, isVarArg);
}

CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                   bool isVarArg) const {
  return CCAssignFnForNode(CC, true, isVarArg);
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention.
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  switch (getEffectiveCallingConv(CC, isVarArg)) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_APCS:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
  case CallingConv::ARM_AAPCS:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::ARM_AAPCS_VFP:
    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  case CallingConv::Fast:
    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  case CallingConv::GHC:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
  case CallingConv::PreserveMost:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::CFGuard_Check:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
  }
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass the 'this' value directly from the argument to the return value,
    // to avoid reg unit interference.
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom()) {
      // Handle f64 or half of a v2f64.
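      // Editor's note (illustrative, not from the original source): with the
      // soft-float AAPCS an f64 result comes back in a GPR pair such as
      // r0/r1; the two halves are read with CopyFromReg below and reassembled
      // into an f64 with an ARMISD::VMOVDRR node.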
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      if (!Subtarget->isLittle())
        std::swap(Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap(Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

/// LowerMemOpCallTo - Store the argument to the stack.
SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                       StackPtr, PtrOff);
  return DAG.getStore(
      Chain, dl, Arg, PtrOff,
      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}

void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                         SDValue Chain, SDValue &Arg,
                                         RegsToPassVector &RegsToPass,
                                         CCValAssign &VA, CCValAssign &NextVA,
                                         SDValue &StackPtr,
                                         SmallVectorImpl<SDValue> &MemOpChains,
                                         ISD::ArgFlagsTy Flags) const {
  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
  unsigned id = Subtarget->isLittle() ? 0 : 1;
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));

  if (NextVA.isRegLoc())
    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  else {
    assert(NextVA.isMemLoc());
    if (!StackPtr.getNode())
      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
                                    getPointerTy(DAG.getDataLayout()));

    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
                                           dl, DAG, NextVA,
                                           Flags));
  }
}

/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
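/// Editor's note (illustrative sketch, not from the original source): for a
/// simple non-tail call the emitted chain looks roughly like
///   callseq_start -> CopyToReg(args) -> ARMISD::CALL -> callseq_end
///     -> CopyFromReg(results)
/// with glue linking the argument copies to the call node.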
SDValue
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool doesNotRet = CLI.DoesNotReturn;
  bool isVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFunction::CallSiteInfo CSInfo;
  bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
  bool isThisReturn = false;
  bool PreferIndirect = false;

  // Disable tail calls if they're not supported.
  if (!Subtarget->supportsTailCall())
    isTailCall = false;

  if (isa<GlobalAddressSDNode>(Callee)) {
    // If we're optimizing for minimum size and the function is called three or
    // more times in this block, we can improve codesize by calling indirectly
    // as BLXr has a 16-bit encoding.
    auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
    if (CLI.CS) {
      auto *BB = CLI.CS.getParent();
      PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
                       count_if(GV->users(), [&BB](const User *U) {
                         return isa<Instruction>(U) &&
                                cast<Instruction>(U)->getParent() == BB;
                       }) > 2;
    }
  }
  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(
        Callee, CallConv, isVarArg, isStructRet,
        MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
        PreferIndirect);
    if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");
    // We don't support GuaranteedTailCallOpt for ARM, only automatically
    // detected sibcalls.
    if (isTailCall)
      ++NumTailCalls;
  }

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  if (isTailCall) {
    // For tail calls, memory operands are available in our caller's stack.
    NumBytes = 0;
  } else {
    // Adjust the stack pointer for the new arguments...
    // These operations are automatically eliminated by the prolog/epilog pass.
    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  }

  SDValue StackPtr =
      DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));

  RegsToPassVector RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
       i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];
    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
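    // Editor's note (illustrative, not from the original source): e.g. an i8
    // argument assigned a SExt location is widened here with a SIGN_EXTEND to
    // the i32 location type before being copied into its register.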
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces.
    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(0, dl, MVT::i32));
        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(1, dl, MVT::i32));

        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);

        VA = ArgLocs[++i]; // skip ahead to next loc
        if (VA.isRegLoc()) {
          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
        } else {
          assert(VA.isMemLoc());

          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
                                                 dl, DAG, VA, Flags));
        }
      } else {
        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
                         StackPtr, MemOpChains, Flags);
      }
    } else if (VA.isRegLoc()) {
      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
          Outs[0].VT == MVT::i32) {
        assert(VA.getLocVT() == MVT::i32 &&
               "unexpected calling convention register assignment");
        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
               "unexpected use of 'returned'");
        isThisReturn = true;
      }
      const TargetOptions &Options = DAG.getTarget().Options;
      if (Options.EnableDebugEntryValues)
        CSInfo.emplace_back(VA.getLocReg(), i);
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else if (isByVal) {
      assert(VA.isMemLoc());
      unsigned offset = 0;

      // True if this byval aggregate will be split between registers
      // and memory.
      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();

      if (CurByValIdx < ByValArgsCount) {

        unsigned RegBegin, RegEnd;
        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);

        EVT PtrVT =
            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
        unsigned int i, j;
        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
          SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                     MachinePointerInfo(),
                                     DAG.InferPtrAlignment(AddArg));
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(j, Load));
        }

        // If the parameter size exceeds the register area, the "offset" value
        // helps us calculate the stack slot for the remaining part properly.
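        // Editor's note (illustrative, not from the original source): for a
        // 24-byte byval whose register area is r2-r3, RegEnd - RegBegin == 2,
        // so the first 8 bytes travel in registers and the remaining 16 bytes
        // are copied to the stack by the code below.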
        offset = RegEnd - RegBegin;

        CCInfo.nextInRegsParam();
      }

      if (Flags.getByValSize() > 4*offset) {
        auto PtrVT = getPointerTy(DAG.getDataLayout());
        unsigned LocMemOffset = VA.getLocMemOffset();
        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
        SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
        SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
                                           MVT::i32);
        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
                                            MVT::i32);

        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
                                          Ops));
      }
    } else if (!isTailCall) {
      assert(VA.isMemLoc());

      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  bool isDirect = false;

  const TargetMachine &TM = getTargetMachine();
  const Module *Mod = MF.getFunction().getParent();
  const GlobalValue *GV = nullptr;
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    GV = G->getGlobal();
  bool isStub =
      !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();

  bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
  bool isLocalARMFunc = false;
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  auto PtrVt = getPointerTy(DAG.getDataLayout());

  if (Subtarget->genLongCalls()) {
    assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
           "long-calls codegen is not position independent!");
    // Handle a global address or an external symbol. If it's not one of
    // those, the target's already in a register, so we don't need to do
    // anything extra.
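    // Editor's note (illustrative, not from the original source): in
    // long-calls mode the callee address is materialized from a constant
    // pool entry and the call goes through a register, so the call site is
    // not limited by the range of a direct BL branch.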
    if (isa<GlobalAddressSDNode>(Callee)) {
      // Create a constant pool entry for the callee address.
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
          ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);

      // Get the address of the callee into a register.
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
      const char *Sym = S->getSymbol();

      // Create a constant pool entry for the callee address.
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
          ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                        ARMPCLabelIndex, 0);
      // Get the address of the callee into a register.
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    }
  } else if (isa<GlobalAddressSDNode>(Callee)) {
    if (!PreferIndirect) {
      isDirect = true;
      bool isDef = GV->isStrongDefinitionForLinker();

      // An ARM call to a local ARM function is predicable.
      isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
      // tBX takes a register source operand.
      if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
        assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
        Callee = DAG.getNode(
            ARMISD::WrapperPIC, dl, PtrVt,
            DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
        Callee = DAG.getLoad(
            PtrVt, dl, DAG.getEntryNode(), Callee,
            MachinePointerInfo::getGOT(DAG.getMachineFunction()),
            /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
                                     MachineMemOperand::MOInvariant);
      } else if (Subtarget->isTargetCOFF()) {
        assert(Subtarget->isTargetWindows() &&
               "Windows is the only supported COFF target");
        unsigned TargetFlags = ARMII::MO_NO_FLAG;
        if (GV->hasDLLImportStorageClass())
          TargetFlags = ARMII::MO_DLLIMPORT;
        else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
          TargetFlags = ARMII::MO_COFFSTUB;
        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
                                            TargetFlags);
        if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
          Callee =
              DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
                          DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
      } else {
        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
      }
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    isDirect = true;
    // tBX takes a register source operand.
    const char *Sym = S->getSymbol();
    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
          ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                        ARMPCLabelIndex, 4);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
    } else {
      Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
    }
  }

  // FIXME: handle tail calls differently.
  unsigned CallOpc;
  if (Subtarget->isThumb()) {
    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
      CallOpc = ARMISD::CALL_NOLINK;
    else
      CallOpc = ARMISD::CALL;
  } else {
    if (!isDirect && !Subtarget->hasV5TOps())
      CallOpc = ARMISD::CALL_NOLINK;
    else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
             // Emit a regular call when code size is the priority.
             !Subtarget->hasMinSize())
      // "mov lr, pc; b _foo" to avoid confusing the RSP.
      CallOpc = ARMISD::CALL_NOLINK;
    else
      CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  if (!isTailCall) {
    const uint32_t *Mask;
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    if (isThisReturn) {
      // For 'this' returns, use the R0-preserving mask if applicable.
      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
      if (!Mask) {
        // Set isThisReturn to false if the calling convention is not one that
        // allows 'returned' to be modeled in this way, so LowerCallResult does
        // not try to pass 'this' straight through.
        isThisReturn = false;
        Mask = ARI->getCallPreservedMask(MF, CallConv);
      }
    } else
      Mask = ARI->getCallPreservedMask(MF, CallConv);

    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
  }

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  if (isTailCall) {
    MF.getFrameInfo().setHasTailCall();
    SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
    return Ret;
  }

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);
  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, isThisReturn,
                         isThisReturn ? OutVals[0] : SDValue());
}

/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack. Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    unsigned Align) const {
  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Align = std::max(Align, 4U);

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  unsigned AlignInRegs = Align / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and the parameter size is greater than the
  // size of all remaining GPR registers. In that case we can't split the
  // parameter; we must send it all to the stack, and waste all remaining
  // registers by setting NCRN to R4.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // The first register for the byval parameter is the first register that
  // wasn't allocated before this method call, so it would be "reg".
  // If the parameter is small enough to be saved in the range [reg, r4), then
  // the end (first-after-last) register would be reg + param-size-in-regs;
  // else the parameter would be split between registers and stack, and the
  // end register would be r4 in this case.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note: the first register was already allocated at the beginning of this
  // function; allocate the remaining registers we need.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  int FI = std::numeric_limits<int>::max();
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!Register::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // A byval argument is passed in as a pointer but it's now being
      // dereferenced, e.g.:
      //   define @foo(%struct.X* %A) {
      //     tail call @bar(%struct.X* byval %A)
      //   }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != std::numeric_limits<int>::max());
  if (!MFI.isFixedObjectIndex(FI))
    return false;
  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool ARMTargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
    const bool isIndirect) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  assert(Subtarget->supportsTailCall());

  // Indirect tail calls cannot be optimized for Thumb1 if the args
  // to the call take up r0-r3. The reason is that there are no legal registers
  // left to hold the pointer to the function to be called.
  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
      (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
    return false;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Exception-handling functions need a special set of instructions to indicate
  // a return to the hardware. Tail-calling another function would probably
  // break this.
  if (CallerF.hasFnAttribute("interrupt"))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Externally-defined functions with weak linkage should not be
  // tail-called on ARM when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Check that the call results are passed in the same way.
  LLVMContext &C = *DAG.getContext();
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForReturn(CalleeCC, isVarArg),
                                  CCAssignFnForReturn(CallerCC, isVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
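  // Editor's note (illustrative, not from the original source): if the caller
  // and callee use different conventions, the callee's preserved-register
  // mask must cover every register the caller's mask preserves; otherwise a
  // sibcall could clobber a register the caller's own caller expects to
  // survive the call.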
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (CalleeCC != CallerCC) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // If the caller's vararg or byval argument has been split between registers
  // and stack, do not perform a tail call, since part of the argument is in
  // the caller's local frame.
  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  if (AFI_Caller->getArgRegsSaveSize())
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom()) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations. The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  return true;
}

bool
ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
}

static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
                                    const SDLoc &DL, SelectionDAG &DAG) {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();

  StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();

  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
  // version of the "preferred return address". These offsets affect the return
  // instruction if this is a return from PL1 without hypervisor extensions.
2733 // IRQ/FIQ: +4 "subs pc, lr, #4" 2734 // SWI: 0 "subs pc, lr, #0" 2735 // ABORT: +4 "subs pc, lr, #4" 2736 // UNDEF: +4/+2 "subs pc, lr, #0" 2737 // UNDEF varies depending on where the exception came from ARM or Thumb 2738 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 2739 2740 int64_t LROffset; 2741 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2742 IntKind == "ABORT") 2743 LROffset = 4; 2744 else if (IntKind == "SWI" || IntKind == "UNDEF") 2745 LROffset = 0; 2746 else 2747 report_fatal_error("Unsupported interrupt attribute. If present, value " 2748 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2749 2750 RetOps.insert(RetOps.begin() + 1, 2751 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2752 2753 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2754} 2755 2756SDValue 2757ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2758 bool isVarArg, 2759 const SmallVectorImpl<ISD::OutputArg> &Outs, 2760 const SmallVectorImpl<SDValue> &OutVals, 2761 const SDLoc &dl, SelectionDAG &DAG) const { 2762 // CCValAssign - represent the assignment of the return value to a location. 2763 SmallVector<CCValAssign, 16> RVLocs; 2764 2765 // CCState - Info about the registers and stack slots. 2766 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2767 *DAG.getContext()); 2768 2769 // Analyze outgoing return values. 2770 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2771 2772 SDValue Flag; 2773 SmallVector<SDValue, 4> RetOps; 2774 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2775 bool isLittleEndian = Subtarget->isLittle(); 2776 2777 MachineFunction &MF = DAG.getMachineFunction(); 2778 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2779 AFI->setReturnRegsCount(RVLocs.size()); 2780 2781 // Copy the result values into the output registers. 2782 for (unsigned i = 0, realRVLocIdx = 0; 2783 i != RVLocs.size(); 2784 ++i, ++realRVLocIdx) { 2785 CCValAssign &VA = RVLocs[i]; 2786 assert(VA.isRegLoc() && "Can only return in registers!"); 2787 2788 SDValue Arg = OutVals[realRVLocIdx]; 2789 bool ReturnF16 = false; 2790 2791 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 2792 // Half-precision return values can be returned like this: 2793 // 2794 // t11 f16 = fadd ... 2795 // t12: i16 = bitcast t11 2796 // t13: i32 = zero_extend t12 2797 // t14: f32 = bitcast t13 <~~~~~~~ Arg 2798 // 2799 // to avoid code generation for bitcasts, we simply set Arg to the node 2800 // that produces the f16 value, t11 in this case. 2801 // 2802 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 2803 SDValue ZE = Arg.getOperand(0); 2804 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 2805 SDValue BC = ZE.getOperand(0); 2806 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 2807 Arg = BC.getOperand(0); 2808 ReturnF16 = true; 2809 } 2810 } 2811 } 2812 } 2813 2814 switch (VA.getLocInfo()) { 2815 default: llvm_unreachable("Unknown loc info!"); 2816 case CCValAssign::Full: break; 2817 case CCValAssign::BCvt: 2818 if (!ReturnF16) 2819 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2820 break; 2821 } 2822 2823 if (VA.needsCustom()) { 2824 if (VA.getLocVT() == MVT::v2f64) { 2825 // Extract the first half and return it in two registers. 
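        // Illustrative sketch (assuming the common little-endian case): the
        // v2f64 value is returned entirely in GPRs, consuming four RVLocs:
        //   lane 0 -> VMOVRRD -> first GPR pair (e.g. r0:r1)
        //   lane 1 -> VMOVRRD -> second GPR pair (e.g. r2:r3)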
2826 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2827 DAG.getConstant(0, dl, MVT::i32)); 2828 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 2829 DAG.getVTList(MVT::i32, MVT::i32), Half); 2830 2831 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2832 HalfGPRs.getValue(isLittleEndian ? 0 : 1), 2833 Flag); 2834 Flag = Chain.getValue(1); 2835 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2836 VA = RVLocs[++i]; // skip ahead to next loc 2837 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2838 HalfGPRs.getValue(isLittleEndian ? 1 : 0), 2839 Flag); 2840 Flag = Chain.getValue(1); 2841 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2842 VA = RVLocs[++i]; // skip ahead to next loc 2843 2844 // Extract the 2nd half and fall through to handle it as an f64 value. 2845 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2846 DAG.getConstant(1, dl, MVT::i32)); 2847 } 2848 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 2849 // available. 2850 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2851 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2852 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2853 fmrrd.getValue(isLittleEndian ? 0 : 1), 2854 Flag); 2855 Flag = Chain.getValue(1); 2856 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2857 VA = RVLocs[++i]; // skip ahead to next loc 2858 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2859 fmrrd.getValue(isLittleEndian ? 1 : 0), 2860 Flag); 2861 } else 2862 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 2863 2864 // Guarantee that all emitted copies are 2865 // stuck together, avoiding something bad. 2866 Flag = Chain.getValue(1); 2867 RetOps.push_back(DAG.getRegister(VA.getLocReg(), 2868 ReturnF16 ? MVT::f16 : VA.getLocVT())); 2869 } 2870 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2871 const MCPhysReg *I = 2872 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 2873 if (I) { 2874 for (; *I; ++I) { 2875 if (ARM::GPRRegClass.contains(*I)) 2876 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 2877 else if (ARM::DPRRegClass.contains(*I)) 2878 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 2879 else 2880 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2881 } 2882 } 2883 2884 // Update chain and glue. 2885 RetOps[0] = Chain; 2886 if (Flag.getNode()) 2887 RetOps.push_back(Flag); 2888 2889 // CPUs which aren't M-class use a special sequence to return from 2890 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 2891 // though we use "subs pc, lr, #N"). 2892 // 2893 // M-class CPUs actually use a normal return sequence with a special 2894 // (hardware-provided) value in LR, so the normal code path works. 
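  // (On exception entry, M-class hardware places a magic EXC_RETURN value
  // such as 0xFFFFFFF9 in LR, so an ordinary "bx lr" performs the exception
  // return -- no "subs pc, lr, #N" is required.)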
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
}

bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
    SDNode *VMov = Copy;
    // f64 returned in a pair of GPRs.
    SmallPtrSet<SDNode*, 2> Copies;
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      if (UI->getOpcode() != ISD::CopyToReg)
        return false;
      Copies.insert(*UI);
    }
    if (Copies.size() > 2)
      return false;

    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      SDValue UseChain = UI->getOperand(0);
      if (Copies.count(UseChain.getNode()))
        // Second CopyToReg
        Copy = *UI;
      else {
        // We are at the top of this chain.
        // If the copy has a glue operand, we conservatively assume it
        // isn't safe to perform a tail call.
        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
          return false;
        // First CopyToReg
        TCChain = UseChain;
      }
    }
  } else if (Copy->getOpcode() == ISD::BITCAST) {
    // f32 returned in a single GPR.
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!Subtarget->supportsTailCall())
    return false;

  if (!CI->isTailCall())
    return false;

  return true;
}

// Writing a 64-bit value requires splitting it into two 32-bit halves first,
// then passing the low and high parts through.
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue WriteValue = Op->getOperand(2);

  // This function is only supposed to be called for an i64 type argument.
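  //
  // A sketch of the split (illustrative only; "reg" stands for whatever
  // register name the metadata carries):
  //   write_register "reg", i64 %v
  // becomes
  //   %lo = extract_element %v, 0      ; bits 31..0
  //   %hi = extract_element %v, 1      ; bits 63..32
  //   write_register "reg", %lo, %hi   ; one WRITE_REGISTER node, two halves
  //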
  assert(WriteValue.getValueType() == MVT::i64
      && "LowerWRITE_REGISTER called for non-i64 type argument.");

  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(1, DL, MVT::i32));
  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  SDLoc dl(Op);
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;

  // When generating execute-only code Constant Pools must be promoted to the
  // global data section. It's a bit ugly that we can't share them across basic
  // blocks, but this way we guarantee that execute-only behaves correctly with
  // position-independent addressing modes.
  if (Subtarget->genExecuteOnly()) {
    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
    auto T = const_cast<Type*>(CP->getType());
    auto C = const_cast<Constant*>(CP->getConstVal());
    auto M = const_cast<Module*>(DAG.getMachineFunction().
                                 getFunction().getParent());
    auto GV = new GlobalVariable(
                    *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
                    Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
                    Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
                    Twine(AFI->createPICLabelUId())
                  );
    SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
                                            dl, PtrVT);
    return LowerGlobalAddress(GA, DAG);
  }

  if (CP->isMachineConstantPoolEntry())
    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
                                    CP->getAlignment());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
                                    CP->getAlignment());
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}

unsigned ARMTargetLowering::getJumpTableEncoding() const {
  return MachineJumpTableInfo::EK_Inline;
}

SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = 0;
  SDLoc DL(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  SDValue CPAddr;
  bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
  if (!IsPositionIndependent) {
    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
  } else {
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
                                      ARMCP::CPBlockAddress, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  SDValue Result = DAG.getLoad(
      PtrVT, DL, DAG.getEntryNode(), CPAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  if (!IsPositionIndependent)
    return Result;
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}

/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.

/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i32] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first word, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is where
  // the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      /* Alignment = */ 4,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
3147 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); 3148 Chain = 3149 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3150 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), 3151 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3152 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); 3153} 3154 3155SDValue 3156ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, 3157 SelectionDAG &DAG) const { 3158 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 3159 3160 SDValue Chain = DAG.getEntryNode(); 3161 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3162 SDLoc DL(Op); 3163 3164 // Load the current TEB (thread environment block) 3165 SDValue Ops[] = {Chain, 3166 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 3167 DAG.getTargetConstant(15, DL, MVT::i32), 3168 DAG.getTargetConstant(0, DL, MVT::i32), 3169 DAG.getTargetConstant(13, DL, MVT::i32), 3170 DAG.getTargetConstant(0, DL, MVT::i32), 3171 DAG.getTargetConstant(2, DL, MVT::i32)}; 3172 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 3173 DAG.getVTList(MVT::i32, MVT::Other), Ops); 3174 3175 SDValue TEB = CurrentTEB.getValue(0); 3176 Chain = CurrentTEB.getValue(1); 3177 3178 // Load the ThreadLocalStoragePointer from the TEB 3179 // A pointer to the TLS array is located at offset 0x2c from the TEB. 3180 SDValue TLSArray = 3181 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 3182 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 3183 3184 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 3185 // offset into the TLSArray. 3186 3187 // Load the TLS index from the C runtime 3188 SDValue TLSIndex = 3189 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 3190 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 3191 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 3192 3193 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 3194 DAG.getConstant(2, DL, MVT::i32)); 3195 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 3196 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 3197 MachinePointerInfo()); 3198 3199 // Get the offset of the start of the .tls section (section base) 3200 const auto *GA = cast<GlobalAddressSDNode>(Op); 3201 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 3202 SDValue Offset = DAG.getLoad( 3203 PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 3204 DAG.getTargetConstantPool(CPV, PtrVT, 4)), 3205 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3206 3207 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 3208} 3209 3210// Lower ISD::GlobalTLSAddress using the "general dynamic" model 3211SDValue 3212ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 3213 SelectionDAG &DAG) const { 3214 SDLoc dl(GA); 3215 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3216 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 3217 MachineFunction &MF = DAG.getMachineFunction(); 3218 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3219 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3220 ARMConstantPoolValue *CPV = 3221 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3222 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 3223 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3224 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 3225 Argument = DAG.getLoad( 3226 PtrVT, dl, DAG.getEntryNode(), Argument, 3227 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3228 SDValue Chain = Argument.getValue(1); 3229 3230 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3231 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 3232 3233 // call __tls_get_addr. 3234 ArgListTy Args; 3235 ArgListEntry Entry; 3236 Entry.Node = Argument; 3237 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 3238 Args.push_back(Entry); 3239 3240 // FIXME: is there useful debug info available here? 3241 TargetLowering::CallLoweringInfo CLI(DAG); 3242 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 3243 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 3244 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 3245 3246 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3247 return CallResult.first; 3248} 3249 3250// Lower ISD::GlobalTLSAddress using the "initial exec" or 3251// "local exec" model. 3252SDValue 3253ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 3254 SelectionDAG &DAG, 3255 TLSModel::Model model) const { 3256 const GlobalValue *GV = GA->getGlobal(); 3257 SDLoc dl(GA); 3258 SDValue Offset; 3259 SDValue Chain = DAG.getEntryNode(); 3260 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3261 // Get the Thread Pointer 3262 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3263 3264 if (model == TLSModel::InitialExec) { 3265 MachineFunction &MF = DAG.getMachineFunction(); 3266 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3267 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3268 // Initial exec model. 3269 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3270 ARMConstantPoolValue *CPV = 3271 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3272 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 3273 true); 3274 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3275 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3276 Offset = DAG.getLoad( 3277 PtrVT, dl, Chain, Offset, 3278 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3279 Chain = Offset.getValue(1); 3280 3281 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3282 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 3283 3284 Offset = DAG.getLoad( 3285 PtrVT, dl, Chain, Offset, 3286 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3287 } else { 3288 // local exec model 3289 assert(model == TLSModel::LocalExec); 3290 ARMConstantPoolValue *CPV = 3291 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3292 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3293 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3294 Offset = DAG.getLoad( 3295 PtrVT, dl, Chain, Offset, 3296 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3297 } 3298 3299 // The address of the thread local variable is the add of the thread 3300 // pointer with the offset of the variable. 
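  //
  // Sketch of the final code (assuming a target with the hardware thread
  // register; register names are illustrative):
  //   mrc p15, 0, rT, c13, c0, 3   ; rT = TPIDRURO, the thread pointer
  //   ...rO = the offset computed above (gottpoff load or tpoff constant)...
  //   add r0, rT, rO               ; address of the variable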
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  if (Subtarget->isTargetDarwin())
    return LowerGlobalTLSAddressDarwin(Op, DAG);

  if (Subtarget->isTargetWindows())
    return LowerGlobalTLSAddressWindows(Op, DAG);

  // TODO: implement the "local dynamic" model
  assert(Subtarget->isTargetELF() && "Only ELF implemented here");
  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());

  switch (model) {
    case TLSModel::GeneralDynamic:
    case TLSModel::LocalDynamic:
      return LowerToTLSGeneralDynamicModel(GA, DAG);
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModels(GA, DAG, model);
  }
  llvm_unreachable("bogus TLS model");
}

/// Return true if all users of V are within function F, looking through
/// ConstantExprs.
static bool allUsersAreInFunction(const Value *V, const Function *F) {
  SmallVector<const User*,4> Worklist;
  for (auto *U : V->users())
    Worklist.push_back(U);
  while (!Worklist.empty()) {
    auto *U = Worklist.pop_back_val();
    if (isa<ConstantExpr>(U)) {
      for (auto *UU : U->users())
        Worklist.push_back(UU);
      continue;
    }

    auto *I = dyn_cast<Instruction>(U);
    if (!I || I->getParent()->getParent() != F)
      return false;
  }
  return true;
}

static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
                                     const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed address,
  // and the global is small enough, we can emit it inline into the constant pool
  // to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled; else
  // we could decide to inline here (and thus never emit the GV) but require
  // the GV from fast-isel generated code.
  if (!EnableConstpoolPromotion ||
      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
    return SDValue();

  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // If we inline a value that contains relocations, we move the relocations
  // from .data to .text. This is not allowed in position-independent code.
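  //
  // For example, a hypothetical global such as
  //   static void *p = &other_global;
  // has an initializer that needsRelocation(): the loader must patch in
  // other_global's final address, and inlining p into the constant pool
  // would turn that into a text relocation.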
3382 auto *Init = GVar->getInitializer(); 3383 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && 3384 Init->needsRelocation()) 3385 return SDValue(); 3386 3387 // The constant islands pass can only really deal with alignment requests 3388 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote 3389 // any type wanting greater alignment requirements than 4 bytes. We also 3390 // can only promote constants that are multiples of 4 bytes in size or 3391 // are paddable to a multiple of 4. Currently we only try and pad constants 3392 // that are strings for simplicity. 3393 auto *CDAInit = dyn_cast<ConstantDataArray>(Init); 3394 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); 3395 unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); 3396 unsigned RequiredPadding = 4 - (Size % 4); 3397 bool PaddingPossible = 3398 RequiredPadding == 4 || (CDAInit && CDAInit->isString()); 3399 if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || 3400 Size == 0) 3401 return SDValue(); 3402 3403 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); 3404 MachineFunction &MF = DAG.getMachineFunction(); 3405 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3406 3407 // We can't bloat the constant pool too much, else the ConstantIslands pass 3408 // may fail to converge. If we haven't promoted this global yet (it may have 3409 // multiple uses), and promoting it would increase the constant pool size (Sz 3410 // > 4), ensure we have space to do so up to MaxTotal. 3411 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3412 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3413 ConstpoolPromotionMaxTotal) 3414 return SDValue(); 3415 3416 // This is only valid if all users are in a single function; we can't clone 3417 // the constant in general. The LLVM IR unnamed_addr allows merging 3418 // constants, but not cloning them. 3419 // 3420 // We could potentially allow cloning if we could prove all uses of the 3421 // constant in the current function don't care about the address, like 3422 // printf format strings. But that isn't implemented for now. 3423 if (!allUsersAreInFunction(GVar, &F)) 3424 return SDValue(); 3425 3426 // We're going to inline this global. Pad it out if needed. 
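  // Worked example: a 6-byte string gives RequiredPadding = 4 - (6 % 4) = 2,
  // so PaddedSize was computed above as 8 and two NUL bytes get appended
  // below. A size already a multiple of 4 yields RequiredPadding == 4, which
  // is treated as "no padding needed".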
3427 if (RequiredPadding != 4) { 3428 StringRef S = CDAInit->getAsString(); 3429 3430 SmallVector<uint8_t,16> V(S.size()); 3431 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3432 while (RequiredPadding--) 3433 V.push_back(0); 3434 Init = ConstantDataArray::get(*DAG.getContext(), V); 3435 } 3436 3437 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3438 SDValue CPAddr = 3439 DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); 3440 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3441 AFI->markGlobalAsPromotedToConstantPool(GVar); 3442 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3443 PaddedSize - 4); 3444 } 3445 ++NumConstpoolPromoted; 3446 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3447} 3448 3449bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3450 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3451 if (!(GV = GA->getBaseObject())) 3452 return false; 3453 if (const auto *V = dyn_cast<GlobalVariable>(GV)) 3454 return V->isConstant(); 3455 return isa<Function>(GV); 3456} 3457 3458SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3459 SelectionDAG &DAG) const { 3460 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3461 default: llvm_unreachable("unknown object format"); 3462 case Triple::COFF: 3463 return LowerGlobalAddressWindows(Op, DAG); 3464 case Triple::ELF: 3465 return LowerGlobalAddressELF(Op, DAG); 3466 case Triple::MachO: 3467 return LowerGlobalAddressDarwin(Op, DAG); 3468 } 3469} 3470 3471SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3472 SelectionDAG &DAG) const { 3473 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3474 SDLoc dl(Op); 3475 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3476 const TargetMachine &TM = getTargetMachine(); 3477 bool IsRO = isReadOnly(GV); 3478 3479 // promoteToConstantPool only if not generating XO text section 3480 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3481 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3482 return V; 3483 3484 if (isPositionIndependent()) { 3485 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3486 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3487 UseGOT_PREL ? ARMII::MO_GOT : 0); 3488 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3489 if (UseGOT_PREL) 3490 Result = 3491 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3492 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3493 return Result; 3494 } else if (Subtarget->isROPI() && IsRO) { 3495 // PC-relative. 3496 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3497 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3498 return Result; 3499 } else if (Subtarget->isRWPI() && !IsRO) { 3500 // SB-relative. 
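    // Sketch of the output (assuming movw/movt are available; "var" is a
    // placeholder symbol):
    //   movw rT, :lower16:var(sbrel)
    //   movt rT, :upper16:var(sbrel)
    //   add  rD, r9, rT             ; r9 holds the static base (SB)
    // Without movt, the sbrel offset comes from a literal pool load instead.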
3501 SDValue RelAddr; 3502 if (Subtarget->useMovt()) { 3503 ++NumMovwMovt; 3504 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3505 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3506 } else { // use literal pool for address constant 3507 ARMConstantPoolValue *CPV = 3508 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3509 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3510 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3511 RelAddr = DAG.getLoad( 3512 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3513 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3514 } 3515 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3516 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3517 return Result; 3518 } 3519 3520 // If we have T2 ops, we can materialize the address directly via movt/movw 3521 // pair. This is always cheaper. 3522 if (Subtarget->useMovt()) { 3523 ++NumMovwMovt; 3524 // FIXME: Once remat is capable of dealing with instructions with register 3525 // operands, expand this into two nodes. 3526 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3527 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3528 } else { 3529 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 3530 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3531 return DAG.getLoad( 3532 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3533 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3534 } 3535} 3536 3537SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3538 SelectionDAG &DAG) const { 3539 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3540 "ROPI/RWPI not currently supported for Darwin"); 3541 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3542 SDLoc dl(Op); 3543 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3544 3545 if (Subtarget->useMovt()) 3546 ++NumMovwMovt; 3547 3548 // FIXME: Once remat is capable of dealing with instructions with register 3549 // operands, expand this into multiple nodes 3550 unsigned Wrapper = 3551 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper; 3552 3553 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3554 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3555 3556 if (Subtarget->isGVIndirectSymbol(GV)) 3557 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3558 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3559 return Result; 3560} 3561 3562SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3563 SelectionDAG &DAG) const { 3564 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3565 assert(Subtarget->useMovt() && 3566 "Windows on ARM expects to use movw/movt"); 3567 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3568 "ROPI/RWPI not currently supported for Windows"); 3569 3570 const TargetMachine &TM = getTargetMachine(); 3571 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3572 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3573 if (GV->hasDLLImportStorageClass()) 3574 TargetFlags = ARMII::MO_DLLIMPORT; 3575 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3576 TargetFlags = ARMII::MO_COFFSTUB; 3577 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3578 SDValue Result; 3579 SDLoc DL(Op); 3580 3581 ++NumMovwMovt; 3582 3583 // FIXME: Once remat is capable of dealing with instructions with register 3584 // operands, expand this into two nodes. 
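  //
  // Sketch of the expected sequence for a dllimport'ed global ("var" is a
  // placeholder symbol):
  //   movw r0, :lower16:__imp_var
  //   movt r0, :upper16:__imp_var
  //   ldr  r0, [r0]               ; load the real address through the stub
  // Locally-resolved symbols skip the final load.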
3585 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3586 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, 3587 TargetFlags)); 3588 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3589 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3590 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3591 return Result; 3592} 3593 3594SDValue 3595ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3596 SDLoc dl(Op); 3597 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3598 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3599 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3600 Op.getOperand(1), Val); 3601} 3602 3603SDValue 3604ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3605 SDLoc dl(Op); 3606 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3607 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3608} 3609 3610SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3611 SelectionDAG &DAG) const { 3612 SDLoc dl(Op); 3613 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3614 Op.getOperand(0)); 3615} 3616 3617SDValue ARMTargetLowering::LowerINTRINSIC_VOID( 3618 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { 3619 unsigned IntNo = 3620 cast<ConstantSDNode>( 3621 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) 3622 ->getZExtValue(); 3623 switch (IntNo) { 3624 default: 3625 return SDValue(); // Don't custom lower most intrinsics. 3626 case Intrinsic::arm_gnu_eabi_mcount: { 3627 MachineFunction &MF = DAG.getMachineFunction(); 3628 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3629 SDLoc dl(Op); 3630 SDValue Chain = Op.getOperand(0); 3631 // call "\01__gnu_mcount_nc" 3632 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 3633 const uint32_t *Mask = 3634 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); 3635 assert(Mask && "Missing call preserved mask for calling convention"); 3636 // Mark LR an implicit live-in. 3637 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 3638 SDValue ReturnAddress = 3639 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); 3640 std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; 3641 SDValue Callee = 3642 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); 3643 SDValue RegisterMask = DAG.getRegisterMask(Mask); 3644 if (Subtarget->isThumb()) 3645 return SDValue( 3646 DAG.getMachineNode( 3647 ARM::tBL_PUSHLR, dl, ResultTys, 3648 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), 3649 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), 3650 0); 3651 return SDValue( 3652 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, 3653 {ReturnAddress, Callee, RegisterMask, Chain}), 3654 0); 3655 } 3656 } 3657} 3658 3659SDValue 3660ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3661 const ARMSubtarget *Subtarget) const { 3662 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3663 SDLoc dl(Op); 3664 switch (IntNo) { 3665 default: return SDValue(); // Don't custom lower most intrinsics. 
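  // A note on the arm_cls expansion a few cases below: it computes
  //   cls(x) = ctlz(((x ^ (x >> 31)) << 1) | 1)
  // Worked example for x = 0xFFFFFFF0: x >> 31 (arithmetic) = 0xFFFFFFFF,
  // the xor gives 0x0000000F, (<< 1) | 1 gives 0x0000001F, and ctlz = 27 --
  // i.e. 27 copies of the sign bit follow the sign bit itself.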
3666 case Intrinsic::thread_pointer: { 3667 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3668 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3669 } 3670 case Intrinsic::arm_cls: { 3671 const SDValue &Operand = Op.getOperand(1); 3672 const EVT VTy = Op.getValueType(); 3673 SDValue SRA = 3674 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); 3675 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); 3676 SDValue SHL = 3677 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); 3678 SDValue OR = 3679 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); 3680 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); 3681 return Result; 3682 } 3683 case Intrinsic::arm_cls64: { 3684 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) 3685 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) 3686 const SDValue &Operand = Op.getOperand(1); 3687 const EVT VTy = Op.getValueType(); 3688 3689 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3690 DAG.getConstant(1, dl, VTy)); 3691 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3692 DAG.getConstant(0, dl, VTy)); 3693 SDValue Constant0 = DAG.getConstant(0, dl, VTy); 3694 SDValue Constant1 = DAG.getConstant(1, dl, VTy); 3695 SDValue Constant31 = DAG.getConstant(31, dl, VTy); 3696 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); 3697 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); 3698 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); 3699 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); 3700 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); 3701 SDValue CheckLo = 3702 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); 3703 SDValue HiIsZero = 3704 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); 3705 SDValue AdjustedLo = 3706 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); 3707 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo); 3708 SDValue Result = 3709 DAG.getSelect(dl, VTy, CheckLo, 3710 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); 3711 return Result; 3712 } 3713 case Intrinsic::eh_sjlj_lsda: { 3714 MachineFunction &MF = DAG.getMachineFunction(); 3715 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3716 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3717 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3718 SDValue CPAddr; 3719 bool IsPositionIndependent = isPositionIndependent(); 3720 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 3721 ARMConstantPoolValue *CPV = 3722 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 3723 ARMCP::CPLSDA, PCAdj); 3724 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3725 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3726 SDValue Result = DAG.getLoad( 3727 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3728 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3729 3730 if (IsPositionIndependent) { 3731 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3732 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3733 } 3734 return Result; 3735 } 3736 case Intrinsic::arm_neon_vabs: 3737 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 3738 Op.getOperand(1)); 3739 case Intrinsic::arm_neon_vmulls: 3740 case Intrinsic::arm_neon_vmullu: { 3741 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3742 ? 
ARMISD::VMULLs : ARMISD::VMULLu; 3743 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3744 Op.getOperand(1), Op.getOperand(2)); 3745 } 3746 case Intrinsic::arm_neon_vminnm: 3747 case Intrinsic::arm_neon_vmaxnm: { 3748 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3749 ? ISD::FMINNUM : ISD::FMAXNUM; 3750 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3751 Op.getOperand(1), Op.getOperand(2)); 3752 } 3753 case Intrinsic::arm_neon_vminu: 3754 case Intrinsic::arm_neon_vmaxu: { 3755 if (Op.getValueType().isFloatingPoint()) 3756 return SDValue(); 3757 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 3758 ? ISD::UMIN : ISD::UMAX; 3759 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3760 Op.getOperand(1), Op.getOperand(2)); 3761 } 3762 case Intrinsic::arm_neon_vmins: 3763 case Intrinsic::arm_neon_vmaxs: { 3764 // v{min,max}s is overloaded between signed integers and floats. 3765 if (!Op.getValueType().isFloatingPoint()) { 3766 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3767 ? ISD::SMIN : ISD::SMAX; 3768 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3769 Op.getOperand(1), Op.getOperand(2)); 3770 } 3771 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3772 ? ISD::FMINIMUM : ISD::FMAXIMUM; 3773 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3774 Op.getOperand(1), Op.getOperand(2)); 3775 } 3776 case Intrinsic::arm_neon_vtbl1: 3777 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 3778 Op.getOperand(1), Op.getOperand(2)); 3779 case Intrinsic::arm_neon_vtbl2: 3780 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 3781 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3782 case Intrinsic::arm_mve_pred_i2v: 3783 case Intrinsic::arm_mve_pred_v2i: 3784 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), 3785 Op.getOperand(1)); 3786 } 3787} 3788 3789static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 3790 const ARMSubtarget *Subtarget) { 3791 SDLoc dl(Op); 3792 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 3793 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 3794 if (SSID == SyncScope::SingleThread) 3795 return Op; 3796 3797 if (!Subtarget->hasDataBarrier()) { 3798 // Some ARMv6 cpus can support data barriers with an mcr instruction. 3799 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 3800 // here. 3801 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 3802 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 3803 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 3804 DAG.getConstant(0, dl, MVT::i32)); 3805 } 3806 3807 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 3808 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 3809 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 3810 if (Subtarget->isMClass()) { 3811 // Only a full system barrier exists in the M-class architectures. 3812 Domain = ARM_MB::SY; 3813 } else if (Subtarget->preferISHSTBarriers() && 3814 Ord == AtomicOrdering::Release) { 3815 // Swift happens to implement ISHST barriers in a way that's compatible with 3816 // Release semantics but weaker than ISH so we'd be fools not to use 3817 // it. Beware: other processors probably don't! 
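    // (So the overall mapping is: M-class -> "dmb sy", Swift release fences
    // -> "dmb ishst", everything else -> "dmb ish".)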
    Domain = ARM_MB::ISHST;
  }

  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}

static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 do not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  SDLoc dl(Op);
  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // ARMv7 with MP extension has PLDW.
    return Op.getOperand(0);

  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  if (Subtarget->isThumb()) {
    // Invert the bits.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
                     DAG.getConstant(isData, dl, MVT::i32));
}

static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDLoc dl(Op);
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}

SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                CCValAssign &NextVA,
                                                SDValue &Root,
                                                SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    ArgValue2 = DAG.getLoad(
        MVT::i32, dl, Root, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  } else {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }
  if (!Subtarget->isLittle())
    std::swap(ArgValue, ArgValue2);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}

// The remaining GPRs hold either the beginning of variable-argument
// data, or the beginning of an aggregate passed by value (usually
// byval).
// Either way, we allocate stack slots adjacent to the data
// provided by our caller, and store the unallocated registers there.
// If this is a variadic function, the va_list pointer will begin with
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Return: The frame index the registers were stored into.
int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                      const SDLoc &dl, SDValue &Chain,
                                      const Value *OrigArg,
                                      unsigned InRegsParamRecordIdx,
                                      int ArgOffset, unsigned ArgSize) const {
  // Currently, two use cases are possible:
  // Case #1. Non-var-args function, and we meet the first byval parameter.
  //          Set up the first unallocated register as the first byval register;
  //          eat all remaining registers
  //          (these two actions are performed by the HandleByVal method).
  //          Then, here, we initialize the stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function that doesn't contain byval parameters.
  //          The same: eat all remaining unallocated registers,
  //          initialize the stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }

  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(OrigArg, 4 * i));
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}

// Set up the stack frame that the va_list pointer will start from.
void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                             const SDLoc &dl, SDValue &Chain,
                                             unsigned ArgOffset,
                                             unsigned TotalArgRegsSaveSize,
                                             bool ForceMutable) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Try to store any remaining integer argument regs
  // to their spots on the stack so that they may be loaded by dereferencing
  // the result of va_next.
  // If there are no regs to be stored, just point the address after the last
  // argument passed via stack.
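  //
  // Worked example: for a hypothetical "void f(int a, ...)", a takes r0 and
  // r1-r3 remain unallocated, so StoreByValRegs spills r1-r3 into a 12-byte
  // fixed object at offset -12 from the incoming SP; the va_list starts there
  // and flows straight into any varargs the caller passed on the stack.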
3979 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 3980 CCInfo.getInRegsParamsCount(), 3981 CCInfo.getNextStackOffset(), 3982 std::max(4U, TotalArgRegsSaveSize)); 3983 AFI->setVarArgsFrameIndex(FrameIndex); 3984} 3985 3986SDValue ARMTargetLowering::LowerFormalArguments( 3987 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3988 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3989 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3990 MachineFunction &MF = DAG.getMachineFunction(); 3991 MachineFrameInfo &MFI = MF.getFrameInfo(); 3992 3993 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3994 3995 // Assign locations to all of the incoming arguments. 3996 SmallVector<CCValAssign, 16> ArgLocs; 3997 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3998 *DAG.getContext()); 3999 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 4000 4001 SmallVector<SDValue, 16> ArgValues; 4002 SDValue ArgValue; 4003 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 4004 unsigned CurArgIdx = 0; 4005 4006 // Initially ArgRegsSaveSize is zero. 4007 // Then we increase this value each time we meet byval parameter. 4008 // We also increase this value in case of varargs function. 4009 AFI->setArgRegsSaveSize(0); 4010 4011 // Calculate the amount of stack space that we need to allocate to store 4012 // byval and variadic arguments that are passed in registers. 4013 // We need to know this before we allocate the first byval or variadic 4014 // argument, as they will be allocated a stack slot below the CFA (Canonical 4015 // Frame Address, the stack pointer at entry to the function). 4016 unsigned ArgRegBegin = ARM::R4; 4017 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4018 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 4019 break; 4020 4021 CCValAssign &VA = ArgLocs[i]; 4022 unsigned Index = VA.getValNo(); 4023 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 4024 if (!Flags.isByVal()) 4025 continue; 4026 4027 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 4028 unsigned RBegin, REnd; 4029 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 4030 ArgRegBegin = std::min(ArgRegBegin, RBegin); 4031 4032 CCInfo.nextInRegsParam(); 4033 } 4034 CCInfo.rewindByValRegsInfo(); 4035 4036 int lastInsIndex = -1; 4037 if (isVarArg && MFI.hasVAStart()) { 4038 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4039 if (RegIdx != array_lengthof(GPRArgRegs)) 4040 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 4041 } 4042 4043 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 4044 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 4045 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4046 4047 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4048 CCValAssign &VA = ArgLocs[i]; 4049 if (Ins[VA.getValNo()].isOrigArg()) { 4050 std::advance(CurOrigArg, 4051 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 4052 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 4053 } 4054 // Arguments stored in registers. 4055 if (VA.isRegLoc()) { 4056 EVT RegVT = VA.getLocVT(); 4057 4058 if (VA.needsCustom()) { 4059 // f64 and vector types are split up into multiple registers or 4060 // combinations of registers and stack slots. 
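        // Illustrative sketch under the soft-float AAPCS: an f64 argument can
        // arrive as r0:r1 or r2:r3, or straddle r3 and the first stack slot;
        // GetF64FormalArgument reassembles each pair with ARMISD::VMOVDRR,
        // and v2f64 simply does this twice.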
4061 if (VA.getLocVT() == MVT::v2f64) { 4062 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 4063 Chain, DAG, dl); 4064 VA = ArgLocs[++i]; // skip ahead to next loc 4065 SDValue ArgValue2; 4066 if (VA.isMemLoc()) { 4067 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 4068 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4069 ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, 4070 MachinePointerInfo::getFixedStack( 4071 DAG.getMachineFunction(), FI)); 4072 } else { 4073 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 4074 Chain, DAG, dl); 4075 } 4076 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 4077 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 4078 ArgValue, ArgValue1, 4079 DAG.getIntPtrConstant(0, dl)); 4080 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 4081 ArgValue, ArgValue2, 4082 DAG.getIntPtrConstant(1, dl)); 4083 } else 4084 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4085 } else { 4086 const TargetRegisterClass *RC; 4087 4088 4089 if (RegVT == MVT::f16) 4090 RC = &ARM::HPRRegClass; 4091 else if (RegVT == MVT::f32) 4092 RC = &ARM::SPRRegClass; 4093 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) 4094 RC = &ARM::DPRRegClass; 4095 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) 4096 RC = &ARM::QPRRegClass; 4097 else if (RegVT == MVT::i32) 4098 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 4099 : &ARM::GPRRegClass; 4100 else 4101 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 4102 4103 // Transform the arguments in physical registers into virtual ones. 4104 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 4105 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 4106 4107 // If this value is passed in r0 and has the returned attribute (e.g. 4108 // C++ 'structors), record this fact for later use. 4109 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { 4110 AFI->setPreservesR0(); 4111 } 4112 } 4113 4114 // If this is an 8 or 16-bit value, it is really passed promoted 4115 // to 32 bits. Insert an assert[sz]ext to capture this, then 4116 // truncate to the right size. 4117 switch (VA.getLocInfo()) { 4118 default: llvm_unreachable("Unknown loc info!"); 4119 case CCValAssign::Full: break; 4120 case CCValAssign::BCvt: 4121 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 4122 break; 4123 case CCValAssign::SExt: 4124 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 4125 DAG.getValueType(VA.getValVT())); 4126 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4127 break; 4128 case CCValAssign::ZExt: 4129 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 4130 DAG.getValueType(VA.getValVT())); 4131 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4132 break; 4133 } 4134 4135 InVals.push_back(ArgValue); 4136 } else { // VA.isRegLoc() 4137 // sanity check 4138 assert(VA.isMemLoc()); 4139 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 4140 4141 int index = VA.getValNo(); 4142 4143 // Some Ins[] entries become multiple ArgLoc[] entries. 4144 // Process them only once. 4145 if (index != lastInsIndex) 4146 { 4147 ISD::ArgFlagsTy Flags = Ins[index].Flags; 4148 // FIXME: For now, all byval parameter objects are marked mutable. 4149 // This can be changed with more analysis. 4150 // In case of tail call optimization mark all arguments mutable. 4151 // Since they could be overwritten by lowering of arguments in case of 4152 // a tail call. 
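        // For illustration: a hypothetical 16-byte byval struct whose first
        // 8 bytes were assigned r2-r3 already has its tail in the caller's
        // outgoing stack area; StoreByValRegs spills r2-r3 into an adjacent
        // fixed object, recreating the struct as one contiguous block.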
4153 if (Flags.isByVal()) { 4154 assert(Ins[index].isOrigArg() && 4155 "Byval arguments cannot be implicit"); 4156 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 4157 4158 int FrameIndex = StoreByValRegs( 4159 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 4160 VA.getLocMemOffset(), Flags.getByValSize()); 4161 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 4162 CCInfo.nextInRegsParam(); 4163 } else { 4164 unsigned FIOffset = VA.getLocMemOffset(); 4165 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 4166 FIOffset, true); 4167 4168 // Create load nodes to retrieve arguments from the stack. 4169 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4170 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 4171 MachinePointerInfo::getFixedStack( 4172 DAG.getMachineFunction(), FI))); 4173 } 4174 lastInsIndex = index; 4175 } 4176 } 4177 } 4178 4179 // varargs 4180 if (isVarArg && MFI.hasVAStart()) 4181 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 4182 CCInfo.getNextStackOffset(), 4183 TotalArgRegsSaveSize); 4184 4185 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 4186 4187 return Chain; 4188} 4189 4190/// isFloatingPointZero - Return true if this is +0.0. 4191static bool isFloatingPointZero(SDValue Op) { 4192 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 4193 return CFP->getValueAPF().isPosZero(); 4194 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 4195 // Maybe this has already been legalized into the constant pool? 4196 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 4197 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 4198 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 4199 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 4200 return CFP->getValueAPF().isPosZero(); 4201 } 4202 } else if (Op->getOpcode() == ISD::BITCAST && 4203 Op->getValueType(0) == MVT::f64) { 4204 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 4205 // created by LowerConstantFP(). 4206 SDValue BitcastOp = Op->getOperand(0); 4207 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 4208 isNullConstant(BitcastOp->getOperand(0))) 4209 return true; 4210 } 4211 return false; 4212} 4213 4214/// Returns appropriate ARM CMP (cmp) and corresponding condition code for 4215/// the given operands. 4216SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 4217 SDValue &ARMcc, SelectionDAG &DAG, 4218 const SDLoc &dl) const { 4219 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 4220 unsigned C = RHSC->getZExtValue(); 4221 if (!isLegalICmpImmediate((int32_t)C)) { 4222 // Constant does not fit, try adjusting it by one. 4223 switch (CC) { 4224 default: break; 4225 case ISD::SETLT: 4226 case ISD::SETGE: 4227 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 4228 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 4229 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4230 } 4231 break; 4232 case ISD::SETULT: 4233 case ISD::SETUGE: 4234 if (C != 0 && isLegalICmpImmediate(C-1)) { 4235 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 4236 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4237 } 4238 break; 4239 case ISD::SETLE: 4240 case ISD::SETGT: 4241 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 4242 CC = (CC == ISD::SETLE) ? 
ISD::SETLT : ISD::SETGE; 4243 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4244 } 4245 break; 4246 case ISD::SETULE: 4247 case ISD::SETUGT: 4248 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 4249 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 4250 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4251 } 4252 break; 4253 } 4254 } 4255 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 4256 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 4257 // In ARM and Thumb-2, the compare instructions can shift their second 4258 // operand. 4259 CC = ISD::getSetCCSwappedOperands(CC); 4260 std::swap(LHS, RHS); 4261 } 4262 4263 // Thumb1 has very limited immediate modes, so turning an "and" into a 4264 // shift can save multiple instructions. 4265 // 4266 // If we have (x & C1), and C1 is an appropriate mask, we can transform it 4267 // into "((x << n) >> n)". But that isn't necessarily profitable on its 4268 // own. If it's the operand to an unsigned comparison with an immediate, 4269 // we can eliminate one of the shifts: we transform 4270 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". 4271 // 4272 // We avoid transforming cases which aren't profitable due to encoding 4273 // details: 4274 // 4275 // 1. C2 fits into the immediate field of a cmp, and the transformed version 4276 // would not; in that case, we're essentially trading one immediate load for 4277 // another. 4278 // 2. C1 is 255 or 65535, so we can use uxtb or uxth. 4279 // 3. C2 is zero; we have other code for this special case. 4280 // 4281 // FIXME: Figure out profitability for Thumb2; we usually can't save an 4282 // instruction, since the AND is always one instruction anyway, but we could 4283 // use narrow instructions in some cases. 4284 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && 4285 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && 4286 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && 4287 !isSignedIntSetCC(CC)) { 4288 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); 4289 auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); 4290 uint64_t RHSV = RHSC->getZExtValue(); 4291 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { 4292 unsigned ShiftBits = countLeadingZeros(Mask); 4293 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { 4294 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); 4295 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); 4296 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); 4297 } 4298 } 4299 } 4300 4301 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a 4302 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same 4303 // way a cmp would. 4304 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and 4305 // some tweaks to the heuristics for the previous and->shift transform. 4306 // FIXME: Optimize cases where the LHS isn't a shift. 
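  // For example, "(x << c) > 0x80000000U" tests that bit (31-c) of x is set
  // and that at least one of bits (30-c)..0 is also set; "lsls x, c+1" puts
  // bit (31-c) into C and makes Z the "remaining bits all zero" test, so the
  // HI condition (C set and Z clear) is exactly the original comparison.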
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
      isa<ConstantSDNode>(RHS) &&
      cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
      CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
    unsigned ShiftAmt =
        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
    SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
                                DAG.getVTList(MVT::i32, MVT::i32),
                                LHS.getOperand(0),
                                DAG.getConstant(ShiftAmt, dl, MVT::i32));
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                     Shift.getValue(1), SDValue());
    ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
    return Chain.getValue(1);
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);

  // If the RHS is a constant zero then the V (overflow) flag will never be
  // set. This can allow us to simplify GE to PL or LT to MI, which can be
  // simpler for other passes (like the peephole optimiser) to deal with.
  if (isNullConstant(RHS)) {
    switch (CondCode) {
    default: break;
    case ARMCC::GE:
      CondCode = ARMCC::PL;
      break;
    case ARMCC::LT:
      CondCode = ARMCC::MI;
      break;
    }
  }

  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                     SelectionDAG &DAG, const SDLoc &dl,
                                     bool Signaling) const {
  assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
  SDValue Cmp;
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
                      dl, MVT::Glue, LHS, RHS);
  else
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
                      dl, MVT::Glue, LHS);
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}

/// duplicateCmp - Glue values can have only one use, so this function
/// duplicates a comparison node.
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  unsigned Opc = Cmp.getOpcode();
  SDLoc DL(Cmp);
  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));

  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  Cmp = Cmp.getOperand(0);
  Opc = Cmp.getOpcode();
  if (Opc == ARMISD::CMPFP)
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
  else {
    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}

// This function returns three things: the arithmetic computation itself
// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
// comparison and the condition code define the case in which the arithmetic
// computation *does not* overflow.
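// For example, for (saddo a, b) the pair is roughly (a+b, cmp (a+b), a) with
// ARMcc == VC: subtracting 'a' back out of the wrapped sum overflows (sets V)
// in exactly the cases where the original addition did, so V clear means the
// addition was exact.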
4396std::pair<SDValue, SDValue> 4397ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 4398 SDValue &ARMcc) const { 4399 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 4400 4401 SDValue Value, OverflowCmp; 4402 SDValue LHS = Op.getOperand(0); 4403 SDValue RHS = Op.getOperand(1); 4404 SDLoc dl(Op); 4405 4406 // FIXME: We are currently always generating CMPs because we don't support 4407 // generating CMN through the backend. This is not as good as the natural 4408 // CMP case because it causes a register dependency and cannot be folded 4409 // later. 4410 4411 switch (Op.getOpcode()) { 4412 default: 4413 llvm_unreachable("Unknown overflow instruction!"); 4414 case ISD::SADDO: 4415 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4416 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 4417 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4418 break; 4419 case ISD::UADDO: 4420 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4421 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 4422 // We do not use it in the USUBO case as Value may not be used. 4423 Value = DAG.getNode(ARMISD::ADDC, dl, 4424 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 4425 .getValue(0); 4426 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4427 break; 4428 case ISD::SSUBO: 4429 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4430 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4431 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4432 break; 4433 case ISD::USUBO: 4434 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4435 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4436 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4437 break; 4438 case ISD::UMULO: 4439 // We generate a UMUL_LOHI and then check if the high word is 0. 4440 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4441 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4442 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4443 LHS, RHS); 4444 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4445 DAG.getConstant(0, dl, MVT::i32)); 4446 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4447 break; 4448 case ISD::SMULO: 4449 // We generate a SMUL_LOHI and then check if all the bits of the high word 4450 // are the same as the sign bit of the low word. 4451 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4452 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4453 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4454 LHS, RHS); 4455 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4456 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4457 Value.getValue(0), 4458 DAG.getConstant(31, dl, MVT::i32))); 4459 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4460 break; 4461 } // switch (...) 4462 4463 return std::make_pair(Value, OverflowCmp); 4464} 4465 4466SDValue 4467ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4468 // Let legalize expand this if it isn't a legal type yet. 4469 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4470 return SDValue(); 4471 4472 SDValue Value, OverflowCmp; 4473 SDValue ARMcc; 4474 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4475 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4476 SDLoc dl(Op); 4477 // We use 0 and 1 as false and true values. 
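  // ARMcc computed above encodes the *no overflow* case, so the CMOV below
  // produces FVal (0) when that condition holds and TVal (1) otherwise,
  // i.e. the boolean overflow result.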
  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  EVT VT = Op.getValueType();

  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
                                 ARMcc, CCR, OverflowCmp);

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
                                              SelectionDAG &DAG) {
  SDLoc DL(BoolCarry);
  EVT CarryVT = BoolCarry.getValueType();

  // This converts the boolean value carry into the carry flag by doing
  // ARMISD::SUBC Carry, 1
  SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
                              DAG.getVTList(CarryVT, MVT::i32),
                              BoolCarry, DAG.getConstant(1, DL, CarryVT));
  return Carry.getValue(1);
}

static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
                                              SelectionDAG &DAG) {
  SDLoc DL(Flags);

  // Now convert the carry flag into a boolean carry. We do this
  // using ARMISD::ADDE 0, 0, Carry
  return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32), Flags);
}

SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  EVT VT = Op.getValueType();
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDValue Value;
  SDValue Overflow;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::UADDO:
    Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    break;
  case ISD::USUBO: {
    Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    // ARMISD::SUBC returns 0 when we have to borrow, so turn it into an
    // overflow value by computing 1 - C.
    Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
                           DAG.getConstant(1, dl, MVT::i32), Overflow);
    break;
  }
  }

  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *Subtarget) {
  EVT VT = Op.getValueType();
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();
  if (!VT.isSimple())
    return SDValue();

  unsigned NewOpcode;
  bool IsAdd = Op->getOpcode() == ISD::SADDSAT;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::i8:
    NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b;
    break;
  case MVT::i16:
    NewOpcode = IsAdd ?
ARMISD::QADD16b : ARMISD::QSUB16b; 4568 break; 4569 } 4570 4571 SDLoc dl(Op); 4572 SDValue Add = 4573 DAG.getNode(NewOpcode, dl, MVT::i32, 4574 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), 4575 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); 4576 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); 4577} 4578 4579SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 4580 SDValue Cond = Op.getOperand(0); 4581 SDValue SelectTrue = Op.getOperand(1); 4582 SDValue SelectFalse = Op.getOperand(2); 4583 SDLoc dl(Op); 4584 unsigned Opc = Cond.getOpcode(); 4585 4586 if (Cond.getResNo() == 1 && 4587 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4588 Opc == ISD::USUBO)) { 4589 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 4590 return SDValue(); 4591 4592 SDValue Value, OverflowCmp; 4593 SDValue ARMcc; 4594 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 4595 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4596 EVT VT = Op.getValueType(); 4597 4598 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 4599 OverflowCmp, DAG); 4600 } 4601 4602 // Convert: 4603 // 4604 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 4605 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 4606 // 4607 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 4608 const ConstantSDNode *CMOVTrue = 4609 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 4610 const ConstantSDNode *CMOVFalse = 4611 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 4612 4613 if (CMOVTrue && CMOVFalse) { 4614 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 4615 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 4616 4617 SDValue True; 4618 SDValue False; 4619 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 4620 True = SelectTrue; 4621 False = SelectFalse; 4622 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 4623 True = SelectFalse; 4624 False = SelectTrue; 4625 } 4626 4627 if (True.getNode() && False.getNode()) { 4628 EVT VT = Op.getValueType(); 4629 SDValue ARMcc = Cond.getOperand(2); 4630 SDValue CCR = Cond.getOperand(3); 4631 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 4632 assert(True.getValueType() == VT); 4633 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 4634 } 4635 } 4636 } 4637 4638 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 4639 // undefined bits before doing a full-word comparison with zero. 4640 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 4641 DAG.getConstant(1, dl, Cond.getValueType())); 4642 4643 return DAG.getSelectCC(dl, Cond, 4644 DAG.getConstant(0, dl, Cond.getValueType()), 4645 SelectTrue, SelectFalse, ISD::SETNE); 4646} 4647 4648static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 4649 bool &swpCmpOps, bool &swpVselOps) { 4650 // Start by selecting the GE condition code for opcodes that return true for 4651 // 'equality' 4652 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 4653 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 4654 CondCode = ARMCC::GE; 4655 4656 // and GT for opcodes that return false for 'equality'. 4657 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 4658 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 4659 CondCode = ARMCC::GT; 4660 4661 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 4662 // to swap the compare operands. 
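  // For example, SETOGT maps directly to GT with no swaps, while SETOLT also
  // selects GT but swaps the compare operands: "vcmp b, a" with GT then fires
  // exactly when a < b and the operands are ordered.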
4663 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 4664 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 4665 swpCmpOps = true; 4666 4667 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 4668 // If we have an unordered opcode, we need to swap the operands to the VSEL 4669 // instruction (effectively negating the condition). 4670 // 4671 // This also has the effect of swapping which one of 'less' or 'greater' 4672 // returns true, so we also swap the compare operands. It also switches 4673 // whether we return true for 'equality', so we compensate by picking the 4674 // opposite condition code to our original choice. 4675 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 4676 CC == ISD::SETUGT) { 4677 swpCmpOps = !swpCmpOps; 4678 swpVselOps = !swpVselOps; 4679 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 4680 } 4681 4682 // 'ordered' is 'anything but unordered', so use the VS condition code and 4683 // swap the VSEL operands. 4684 if (CC == ISD::SETO) { 4685 CondCode = ARMCC::VS; 4686 swpVselOps = true; 4687 } 4688 4689 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 4690 // code and swap the VSEL operands. Also do this if we don't care about the 4691 // unordered case. 4692 if (CC == ISD::SETUNE || CC == ISD::SETNE) { 4693 CondCode = ARMCC::EQ; 4694 swpVselOps = true; 4695 } 4696} 4697 4698SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 4699 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 4700 SDValue Cmp, SelectionDAG &DAG) const { 4701 if (!Subtarget->hasFP64() && VT == MVT::f64) { 4702 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4703 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 4704 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4705 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 4706 4707 SDValue TrueLow = TrueVal.getValue(0); 4708 SDValue TrueHigh = TrueVal.getValue(1); 4709 SDValue FalseLow = FalseVal.getValue(0); 4710 SDValue FalseHigh = FalseVal.getValue(1); 4711 4712 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 4713 ARMcc, CCR, Cmp); 4714 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 4715 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 4716 4717 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 4718 } else { 4719 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 4720 Cmp); 4721 } 4722} 4723 4724static bool isGTorGE(ISD::CondCode CC) { 4725 return CC == ISD::SETGT || CC == ISD::SETGE; 4726} 4727 4728static bool isLTorLE(ISD::CondCode CC) { 4729 return CC == ISD::SETLT || CC == ISD::SETLE; 4730} 4731 4732// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 4733// All of these conditions (and their <= and >= counterparts) will do: 4734// x < k ? k : x 4735// x > k ? x : k 4736// k < x ? x : k 4737// k > x ? k : x 4738static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 4739 const SDValue TrueVal, const SDValue FalseVal, 4740 const ISD::CondCode CC, const SDValue K) { 4741 return (isGTorGE(CC) && 4742 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 4743 (isLTorLE(CC) && 4744 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 4745} 4746 4747// Similar to isLowerSaturate(), but checks for upper-saturating conditions. 
static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
}

// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to
// an interval of type [k, ~k] when k + 1 is a power of 2. Here are some
// examples:
//
// x < -k ? -k : (x > k ? k : x)
// x < -k ? -k : (x < k ? x : k)
// x > -k ? (x > k ? k : x) : -k
// x < k ? (x < -k ? -k : x) : k
// etc.
//
// USAT works similarly to SSAT but bounds the value to the interval [0, k],
// where k + 1 is a power of 2.
//
// It returns true if the conversion can be done, false otherwise.
// Additionally, the variable is returned in parameter V, the constant in K,
// and usat is set to true if the conditional represents an unsigned
// saturation.
static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
                                    uint64_t &K, bool &usat) {
  SDValue LHS1 = Op.getOperand(0);
  SDValue RHS1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return false;

  SDValue LHS2 = Op2.getOperand(0);
  SDValue RHS2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  // Find out which are the constants and which are the variables
  // in each conditional
  SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
                                                        ? &RHS1
                                                        : nullptr;
  SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
                                                        ? &RHS2
                                                        : nullptr;
  SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
  SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
  SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
  SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;

  // We must detect cases where the original operations worked with 16- or
  // 8-bit values. In such cases, V2Tmp != V2 because the comparison
  // operations must work with sign-extended values but the select operations
  // return the original non-extended value.
  SDValue V2TmpReg = V2Tmp;
  if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
    V2TmpReg = V2Tmp->getOperand(0);

  // Check that the registers and the constants have the correct values
  // in both conditionals
  if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
      V2TmpReg != V2)
    return false;

  // Figure out which conditional is saturating the lower/upper bound.
  const SDValue *LowerCheckOp =
      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;
  const SDValue *UpperCheckOp =
      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ?
                &Op2
                : nullptr;

  if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
    return false;

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
  int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  if (((Val1 > Val2 && UpperCheckOp == &Op) ||
       (Val1 < Val2 && UpperCheckOp == &Op2)) &&
      isPowerOf2_64(PosVal + 1)) {

    // Handle the difference between USAT (unsigned) and SSAT (signed)
    // saturation.
    if (Val1 == ~Val2)
      usat = false;
    else if (NegVal == 0)
      usat = true;
    else
      return false;

    V = V2;
    K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive

    return true;
  }

  return false;
}

// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition; that's up to the caller.
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
                                         SDValue &SatK)
{
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
                                                     ? &RHS
                                                     : nullptr;

  // No constant operand in the comparison, early out
  if (!K)
    return false;

  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant in the comparison does not match the constant in the
  // select, or the variables do not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}

bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
  if (VT == MVT::f32)
    return !Subtarget->hasVFP2Base();
  if (VT == MVT::f64)
    return !Subtarget->hasFP64();
  if (VT == MVT::f16)
    return !Subtarget->hasFullFP16();
  return false;
}

SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  SDValue SatValue;
  uint64_t SatConstant;
  bool SatUSat;
  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
    if (SatUSat)
      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
    else
      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
  }

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1.
  // On ARM and Thumb-2, which have a flexible operand 2, this will result in
  // a single instruction. On Thumb the shift and the bit operation will be
  // two instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);

  if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
      LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
    unsigned TVal = CTVal->getZExtValue();
    unsigned FVal = CFVal->getZExtValue();
    unsigned Opcode = 0;

    if (TVal == ~FVal) {
      Opcode = ARMISD::CSINV;
    } else if (TVal == ~FVal + 1) {
      Opcode = ARMISD::CSNEG;
    } else if (TVal + 1 == FVal) {
      Opcode = ARMISD::CSINC;
    } else if (TVal == FVal + 1) {
      Opcode = ARMISD::CSINC;
      std::swap(TrueVal, FalseVal);
      std::swap(TVal, FVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    }

    if (Opcode) {
      // If one of the constants is cheaper than another, materialise the
      // cheaper one and let the csel generate the other.
      if (Opcode != ARMISD::CSINC &&
          HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Attempt to use ZR, checking whether TVal is 0, possibly inverting the
      // condition to get there. CSINC is not invertible like the other two
      // (~(~a) == a, -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
      if (TVal == 0)
        TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);

      // Drops F's value because we can get it by inverting/negating TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition
    // didn't).
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support
    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
  if (Subtarget->hasFPARMv8Base() &&
      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  }
  return Result;
}

/// canChangeToInt - Given the fp compare operand, return true if it is suitable
/// to morph to an integer compare sequence.
static bool canChangeToInt(SDValue Op, bool &SeenZero,
                           const ARMSubtarget *Subtarget) {
  SDNode *N = Op.getNode();
  if (!N->hasOneUse())
    // Otherwise it requires moving the value from fp to integer registers.
    return false;
  if (!N->getNumValues())
    return false;
  EVT VT = Op.getValueType();
  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
    // f32 case is generally profitable.
f64 case only makes sense when vcmpe + 5106 // vmrs are very slow, e.g. cortex-a8. 5107 return false; 5108 5109 if (isFloatingPointZero(Op)) { 5110 SeenZero = true; 5111 return true; 5112 } 5113 return ISD::isNormalLoad(N); 5114} 5115 5116static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 5117 if (isFloatingPointZero(Op)) 5118 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 5119 5120 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 5121 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 5122 Ld->getPointerInfo(), Ld->getAlignment(), 5123 Ld->getMemOperand()->getFlags()); 5124 5125 llvm_unreachable("Unknown VFP cmp argument!"); 5126} 5127 5128static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 5129 SDValue &RetVal1, SDValue &RetVal2) { 5130 SDLoc dl(Op); 5131 5132 if (isFloatingPointZero(Op)) { 5133 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 5134 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 5135 return; 5136 } 5137 5138 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 5139 SDValue Ptr = Ld->getBasePtr(); 5140 RetVal1 = 5141 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 5142 Ld->getAlignment(), Ld->getMemOperand()->getFlags()); 5143 5144 EVT PtrType = Ptr.getValueType(); 5145 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 5146 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 5147 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 5148 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 5149 Ld->getPointerInfo().getWithOffset(4), NewAlign, 5150 Ld->getMemOperand()->getFlags()); 5151 return; 5152 } 5153 5154 llvm_unreachable("Unknown VFP cmp argument!"); 5155} 5156 5157/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 5158/// f32 and even f64 comparisons to integer ones. 5159SDValue 5160ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 5161 SDValue Chain = Op.getOperand(0); 5162 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5163 SDValue LHS = Op.getOperand(2); 5164 SDValue RHS = Op.getOperand(3); 5165 SDValue Dest = Op.getOperand(4); 5166 SDLoc dl(Op); 5167 5168 bool LHSSeenZero = false; 5169 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 5170 bool RHSSeenZero = false; 5171 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 5172 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 5173 // If unsafe fp math optimization is enabled and there are no other uses of 5174 // the CMP operands, and the condition code is EQ or NE, we can optimize it 5175 // to an integer comparison. 
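    // For example, "x == 0.0f" can become an integer compare of
    // "bits(x) & 0x7fffffff" against 0: masking the sign bit keeps +0.0 and
    // -0.0 equal, and a NaN input still compares unequal to zero, matching
    // the FP semantics of the EQ/NE cases handled here.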
5176 if (CC == ISD::SETOEQ) 5177 CC = ISD::SETEQ; 5178 else if (CC == ISD::SETUNE) 5179 CC = ISD::SETNE; 5180 5181 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5182 SDValue ARMcc; 5183 if (LHS.getValueType() == MVT::f32) { 5184 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5185 bitcastf32Toi32(LHS, DAG), Mask); 5186 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5187 bitcastf32Toi32(RHS, DAG), Mask); 5188 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5189 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5190 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5191 Chain, Dest, ARMcc, CCR, Cmp); 5192 } 5193 5194 SDValue LHS1, LHS2; 5195 SDValue RHS1, RHS2; 5196 expandf64Toi32(LHS, DAG, LHS1, LHS2); 5197 expandf64Toi32(RHS, DAG, RHS1, RHS2); 5198 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 5199 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 5200 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 5201 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5202 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5203 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 5204 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 5205 } 5206 5207 return SDValue(); 5208} 5209 5210SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 5211 SDValue Chain = Op.getOperand(0); 5212 SDValue Cond = Op.getOperand(1); 5213 SDValue Dest = Op.getOperand(2); 5214 SDLoc dl(Op); 5215 5216 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5217 // instruction. 5218 unsigned Opc = Cond.getOpcode(); 5219 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5220 !Subtarget->isThumb1Only(); 5221 if (Cond.getResNo() == 1 && 5222 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5223 Opc == ISD::USUBO || OptimizeMul)) { 5224 // Only lower legal XALUO ops. 5225 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5226 return SDValue(); 5227 5228 // The actual operation with overflow check. 5229 SDValue Value, OverflowCmp; 5230 SDValue ARMcc; 5231 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5232 5233 // Reverse the condition code. 5234 ARMCC::CondCodes CondCode = 5235 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5236 CondCode = ARMCC::getOppositeCondition(CondCode); 5237 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5238 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5239 5240 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5241 OverflowCmp); 5242 } 5243 5244 return SDValue(); 5245} 5246 5247SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 5248 SDValue Chain = Op.getOperand(0); 5249 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5250 SDValue LHS = Op.getOperand(2); 5251 SDValue RHS = Op.getOperand(3); 5252 SDValue Dest = Op.getOperand(4); 5253 SDLoc dl(Op); 5254 5255 if (isUnsupportedFloatingType(LHS.getValueType())) { 5256 DAG.getTargetLoweringInfo().softenSetCCOperands( 5257 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); 5258 5259 // If softenSetCCOperands only returned one value, we should compare it to 5260 // zero. 5261 if (!RHS.getNode()) { 5262 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5263 CC = ISD::SETNE; 5264 } 5265 } 5266 5267 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5268 // instruction. 
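  // For example, "if (__builtin_sadd_overflow(a, b, &s))" reaches us as a
  // branch on the overflow result of (saddo a, b); we emit the add plus a
  // single conditional branch on the inverted (overflow) condition instead
  // of materialising a boolean first.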
5269 unsigned Opc = LHS.getOpcode(); 5270 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5271 !Subtarget->isThumb1Only(); 5272 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && 5273 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5274 Opc == ISD::USUBO || OptimizeMul) && 5275 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5276 // Only lower legal XALUO ops. 5277 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 5278 return SDValue(); 5279 5280 // The actual operation with overflow check. 5281 SDValue Value, OverflowCmp; 5282 SDValue ARMcc; 5283 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); 5284 5285 if ((CC == ISD::SETNE) != isOneConstant(RHS)) { 5286 // Reverse the condition code. 5287 ARMCC::CondCodes CondCode = 5288 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5289 CondCode = ARMCC::getOppositeCondition(CondCode); 5290 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5291 } 5292 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5293 5294 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5295 OverflowCmp); 5296 } 5297 5298 if (LHS.getValueType() == MVT::i32) { 5299 SDValue ARMcc; 5300 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5301 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5302 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5303 Chain, Dest, ARMcc, CCR, Cmp); 5304 } 5305 5306 if (getTargetMachine().Options.UnsafeFPMath && 5307 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 5308 CC == ISD::SETNE || CC == ISD::SETUNE)) { 5309 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 5310 return Result; 5311 } 5312 5313 ARMCC::CondCodes CondCode, CondCode2; 5314 FPCCToARMCC(CC, CondCode, CondCode2); 5315 5316 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5317 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5318 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5319 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5320 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 5321 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5322 if (CondCode2 != ARMCC::AL) { 5323 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 5324 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 5325 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5326 } 5327 return Res; 5328} 5329 5330SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 5331 SDValue Chain = Op.getOperand(0); 5332 SDValue Table = Op.getOperand(1); 5333 SDValue Index = Op.getOperand(2); 5334 SDLoc dl(Op); 5335 5336 EVT PTy = getPointerTy(DAG.getDataLayout()); 5337 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 5338 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 5339 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 5340 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 5341 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); 5342 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 5343 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 5344 // which does another jump to the destination. This also makes it easier 5345 // to translate it to TBB / TBH later (Thumb2 only). 5346 // FIXME: This might not work if the function is extremely large. 
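    // For example, a dense C switch first branches into the table, and each
    // table entry branches on to its case block; Thumb-2's TBB/TBH forms can
    // later shrink the entries to byte or halfword (scaled) offsets.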
5347 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 5348 Addr, Op.getOperand(2), JTI); 5349 } 5350 if (isPositionIndependent() || Subtarget->isROPI()) { 5351 Addr = 5352 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 5353 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5354 Chain = Addr.getValue(1); 5355 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); 5356 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5357 } else { 5358 Addr = 5359 DAG.getLoad(PTy, dl, Chain, Addr, 5360 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5361 Chain = Addr.getValue(1); 5362 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5363 } 5364} 5365 5366static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 5367 EVT VT = Op.getValueType(); 5368 SDLoc dl(Op); 5369 5370 if (Op.getValueType().getVectorElementType() == MVT::i32) { 5371 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 5372 return Op; 5373 return DAG.UnrollVectorOp(Op.getNode()); 5374 } 5375 5376 const bool HasFullFP16 = 5377 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5378 5379 EVT NewTy; 5380 const EVT OpTy = Op.getOperand(0).getValueType(); 5381 if (OpTy == MVT::v4f32) 5382 NewTy = MVT::v4i32; 5383 else if (OpTy == MVT::v4f16 && HasFullFP16) 5384 NewTy = MVT::v4i16; 5385 else if (OpTy == MVT::v8f16 && HasFullFP16) 5386 NewTy = MVT::v8i16; 5387 else 5388 llvm_unreachable("Invalid type for custom lowering!"); 5389 5390 if (VT != MVT::v4i16 && VT != MVT::v8i16) 5391 return DAG.UnrollVectorOp(Op.getNode()); 5392 5393 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); 5394 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 5395} 5396 5397SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 5398 EVT VT = Op.getValueType(); 5399 if (VT.isVector()) 5400 return LowerVectorFP_TO_INT(Op, DAG); 5401 5402 bool IsStrict = Op->isStrictFPOpcode(); 5403 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 5404 5405 if (isUnsupportedFloatingType(SrcVal.getValueType())) { 5406 RTLIB::Libcall LC; 5407 if (Op.getOpcode() == ISD::FP_TO_SINT || 5408 Op.getOpcode() == ISD::STRICT_FP_TO_SINT) 5409 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), 5410 Op.getValueType()); 5411 else 5412 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), 5413 Op.getValueType()); 5414 SDLoc Loc(Op); 5415 MakeLibCallOptions CallOptions; 5416 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 5417 SDValue Result; 5418 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, 5419 CallOptions, Loc, Chain); 5420 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; 5421 } 5422 5423 // FIXME: Remove this when we have strict fp instruction selection patterns 5424 if (IsStrict) { 5425 SDLoc Loc(Op); 5426 SDValue Result = 5427 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? 
ISD::FP_TO_SINT 5428 : ISD::FP_TO_UINT, 5429 Loc, Op.getValueType(), SrcVal); 5430 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 5431 } 5432 5433 return Op; 5434} 5435 5436static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5437 EVT VT = Op.getValueType(); 5438 SDLoc dl(Op); 5439 5440 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 5441 if (VT.getVectorElementType() == MVT::f32) 5442 return Op; 5443 return DAG.UnrollVectorOp(Op.getNode()); 5444 } 5445 5446 assert((Op.getOperand(0).getValueType() == MVT::v4i16 || 5447 Op.getOperand(0).getValueType() == MVT::v8i16) && 5448 "Invalid type for custom lowering!"); 5449 5450 const bool HasFullFP16 = 5451 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5452 5453 EVT DestVecType; 5454 if (VT == MVT::v4f32) 5455 DestVecType = MVT::v4i32; 5456 else if (VT == MVT::v4f16 && HasFullFP16) 5457 DestVecType = MVT::v4i16; 5458 else if (VT == MVT::v8f16 && HasFullFP16) 5459 DestVecType = MVT::v8i16; 5460 else 5461 return DAG.UnrollVectorOp(Op.getNode()); 5462 5463 unsigned CastOpc; 5464 unsigned Opc; 5465 switch (Op.getOpcode()) { 5466 default: llvm_unreachable("Invalid opcode!"); 5467 case ISD::SINT_TO_FP: 5468 CastOpc = ISD::SIGN_EXTEND; 5469 Opc = ISD::SINT_TO_FP; 5470 break; 5471 case ISD::UINT_TO_FP: 5472 CastOpc = ISD::ZERO_EXTEND; 5473 Opc = ISD::UINT_TO_FP; 5474 break; 5475 } 5476 5477 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); 5478 return DAG.getNode(Opc, dl, VT, Op); 5479} 5480 5481SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 5482 EVT VT = Op.getValueType(); 5483 if (VT.isVector()) 5484 return LowerVectorINT_TO_FP(Op, DAG); 5485 if (isUnsupportedFloatingType(VT)) { 5486 RTLIB::Libcall LC; 5487 if (Op.getOpcode() == ISD::SINT_TO_FP) 5488 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 5489 Op.getValueType()); 5490 else 5491 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 5492 Op.getValueType()); 5493 MakeLibCallOptions CallOptions; 5494 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 5495 CallOptions, SDLoc(Op)).first; 5496 } 5497 5498 return Op; 5499} 5500 5501SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 5502 // Implement fcopysign with a fabs and a conditional fneg. 5503 SDValue Tmp0 = Op.getOperand(0); 5504 SDValue Tmp1 = Op.getOperand(1); 5505 SDLoc dl(Op); 5506 EVT VT = Op.getValueType(); 5507 EVT SrcVT = Tmp1.getValueType(); 5508 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 5509 Tmp0.getOpcode() == ARMISD::VMOVDRR; 5510 bool UseNEON = !InGPR && Subtarget->hasNEON(); 5511 5512 if (UseNEON) { 5513 // Use VBSL to copy the sign bit. 5514 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); 5515 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 5516 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 5517 EVT OpVT = (VT == MVT::f32) ? 
MVT::v2i32 : MVT::v1i64; 5518 if (VT == MVT::f64) 5519 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5520 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 5521 DAG.getConstant(32, dl, MVT::i32)); 5522 else /*if (VT == MVT::f32)*/ 5523 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 5524 if (SrcVT == MVT::f32) { 5525 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 5526 if (VT == MVT::f64) 5527 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5528 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 5529 DAG.getConstant(32, dl, MVT::i32)); 5530 } else if (VT == MVT::f32) 5531 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, 5532 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 5533 DAG.getConstant(32, dl, MVT::i32)); 5534 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 5535 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 5536 5537 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), 5538 dl, MVT::i32); 5539 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 5540 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 5541 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 5542 5543 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 5544 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 5545 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 5546 if (VT == MVT::f32) { 5547 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 5548 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 5549 DAG.getConstant(0, dl, MVT::i32)); 5550 } else { 5551 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 5552 } 5553 5554 return Res; 5555 } 5556 5557 // Bitcast operand 1 to i32. 5558 if (SrcVT == MVT::f64) 5559 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5560 Tmp1).getValue(1); 5561 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 5562 5563 // Or in the signbit with integer operations. 5564 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 5565 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5566 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 5567 if (VT == MVT::f32) { 5568 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 5569 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 5570 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 5571 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 5572 } 5573 5574 // f64: Or the high part with signbit and then combine two parts. 5575 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5576 Tmp0); 5577 SDValue Lo = Tmp0.getValue(0); 5578 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 5579 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 5580 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 5581} 5582 5583SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 5584 MachineFunction &MF = DAG.getMachineFunction(); 5585 MachineFrameInfo &MFI = MF.getFrameInfo(); 5586 MFI.setReturnAddressIsTaken(true); 5587 5588 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 5589 return SDValue(); 5590 5591 EVT VT = Op.getValueType(); 5592 SDLoc dl(Op); 5593 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5594 if (Depth) { 5595 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 5596 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 5597 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 5598 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 5599 MachinePointerInfo()); 5600 } 5601 5602 // Return LR, which contains the return address. Mark it an implicit live-in. 
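  // For example, __builtin_return_address(0) reduces to a single copy out of
  // LR at function entry; only the Depth > 0 path above needs frame loads.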
  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}

SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  const ARMBaseRegisterInfo &ARI =
      *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  Register FrameReg = ARI.getFrameRegister(MF);
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
  Register Reg = StringSwitch<unsigned>(RegName)
                     .Case("sp", ARM::SP)
                     .Default(0);
  if (Reg)
    return Reg;
  report_fatal_error(Twine("Invalid register name \""
                           + StringRef(RegName) + "\"."));
}

// The result is a 64-bit value, so split it into two 32-bit values and
// return them as a pair of values.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                SelectionDAG &DAG) {
  SDLoc DL(N);

  // This function is only supposed to be called for i64 type destination.
  assert(N->getValueType(0) == MVT::i64
         && "ExpandREAD_REGISTER called for non-i64 type result.");

  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
                             N->getOperand(0),
                             N->getOperand(1));

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
                                Read.getValue(1)));
  Results.push_back(Read.getOperand(0));
}

/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
/// When \p DstVT, the destination type of \p BC, is on the vector
/// register bank and the source of bitcast, \p Op, operates on the same bank,
/// it might be possible to combine them, such that everything stays on the
/// vector register bank.
/// \return The node that would replace \p BC, if the combine
/// is possible.
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
                                                SelectionDAG &DAG) {
  SDValue Op = BC->getOperand(0);
  EVT DstVT = BC->getValueType(0);

  // The only vector instruction that can produce a scalar (remember,
  // since the bitcast was about to be turned into VMOVDRR, the source
  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  // Moreover, we can do this combine only if there is one use.
  // Finally, if the destination type is not a vector, there is not
  // much point in forcing everything on the vector bank.
  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !Op.hasOneUse())
    return SDValue();

  // If the index is not constant, we will introduce an additional
  // multiply that will stick.
  // Give up in that case.
5683 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5684 if (!Index) 5685 return SDValue(); 5686 unsigned DstNumElt = DstVT.getVectorNumElements(); 5687 5688 // Compute the new index. 5689 const APInt &APIntIndex = Index->getAPIntValue(); 5690 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 5691 NewIndex *= APIntIndex; 5692 // Check if the new constant index fits into i32. 5693 if (NewIndex.getBitWidth() > 32) 5694 return SDValue(); 5695 5696 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 5697 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) 5698 SDLoc dl(Op); 5699 SDValue ExtractSrc = Op.getOperand(0); 5700 EVT VecVT = EVT::getVectorVT( 5701 *DAG.getContext(), DstVT.getScalarType(), 5702 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 5703 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 5704 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 5705 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 5706} 5707 5708/// ExpandBITCAST - If the target supports VFP, this function is called to 5709/// expand a bit convert where either the source or destination type is i64 to 5710/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 5711/// operand type is illegal (e.g., v2f32 for a target that doesn't support 5712/// vectors), since the legalizer won't know what to do with that. 5713static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, 5714 const ARMSubtarget *Subtarget) { 5715 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5716 SDLoc dl(N); 5717 SDValue Op = N->getOperand(0); 5718 5719 // This function is only supposed to be called for i64 types, either as the 5720 // source or destination of the bit convert. 5721 EVT SrcVT = Op.getValueType(); 5722 EVT DstVT = N->getValueType(0); 5723 const bool HasFullFP16 = Subtarget->hasFullFP16(); 5724 5725 if (SrcVT == MVT::f32 && DstVT == MVT::i32) { 5726 // FullFP16: half values are passed in S-registers, and we don't 5727 // need any of the bitcast and moves: 5728 // 5729 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 5730 // t5: i32 = bitcast t2 5731 // t18: f16 = ARMISD::VMOVhr t5 5732 if (Op.getOpcode() != ISD::CopyFromReg || 5733 Op.getValueType() != MVT::f32) 5734 return SDValue(); 5735 5736 auto Move = N->use_begin(); 5737 if (Move->getOpcode() != ARMISD::VMOVhr) 5738 return SDValue(); 5739 5740 SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; 5741 SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); 5742 DAG.ReplaceAllUsesWith(*Move, &Copy); 5743 return Copy; 5744 } 5745 5746 if (SrcVT == MVT::i16 && DstVT == MVT::f16) { 5747 if (!HasFullFP16) 5748 return SDValue(); 5749 // SoftFP: read half-precision arguments: 5750 // 5751 // t2: i32,ch = ... 
5752 // t7: i16 = truncate t2 <~~~~ Op 5753 // t8: f16 = bitcast t7 <~~~~ N 5754 // 5755 if (Op.getOperand(0).getValueType() == MVT::i32) 5756 return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), 5757 MVT::f16, Op.getOperand(0)); 5758 5759 return SDValue(); 5760 } 5761 5762 // Half-precision return values 5763 if (SrcVT == MVT::f16 && DstVT == MVT::i16) { 5764 if (!HasFullFP16) 5765 return SDValue(); 5766 // 5767 // t11: f16 = fadd t8, t10 5768 // t12: i16 = bitcast t11 <~~~ SDNode N 5769 // t13: i32 = zero_extend t12 5770 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 5771 // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 5772 // 5773 // transform this into: 5774 // 5775 // t20: i32 = ARMISD::VMOVrh t11 5776 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 5777 // 5778 auto ZeroExtend = N->use_begin(); 5779 if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || 5780 ZeroExtend->getValueType(0) != MVT::i32) 5781 return SDValue(); 5782 5783 auto Copy = ZeroExtend->use_begin(); 5784 if (Copy->getOpcode() == ISD::CopyToReg && 5785 Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { 5786 SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); 5787 DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); 5788 return Cvt; 5789 } 5790 return SDValue(); 5791 } 5792 5793 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 5794 return SDValue(); 5795 5796 // Turn i64->f64 into VMOVDRR. 5797 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 5798 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 5799 // if we can combine the bitcast with its source. 5800 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 5801 return Val; 5802 5803 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5804 DAG.getConstant(0, dl, MVT::i32)); 5805 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5806 DAG.getConstant(1, dl, MVT::i32)); 5807 return DAG.getNode(ISD::BITCAST, dl, DstVT, 5808 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 5809 } 5810 5811 // Turn f64->i64 into VMOVRRD. 5812 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 5813 SDValue Cvt; 5814 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 5815 SrcVT.getVectorNumElements() > 1) 5816 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5817 DAG.getVTList(MVT::i32, MVT::i32), 5818 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 5819 else 5820 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5821 DAG.getVTList(MVT::i32, MVT::i32), Op); 5822 // Merge the pieces into a single i64 value. 5823 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 5824 } 5825 5826 return SDValue(); 5827} 5828 5829/// getZeroVector - Returns a vector of specified type with all zero elements. 5830/// Zero vectors are used to represent vector negation and in those cases 5831/// will be implemented with the NEON VNEG instruction. However, VNEG does 5832/// not support i64 elements, so sometimes the zero vectors will need to be 5833/// explicitly constructed. Regardless, use a canonical VMOV to create the 5834/// zero vector. 5835static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 5836 assert(VT.isVector() && "Expected a vector type"); 5837 // The canonical modified immediate encoding of a zero vector is....0! 5838 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 5839 EVT VmovVT = VT.is128BitVector() ? 
                                    MVT::v4i32 : MVT::v2i32;
  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
}

/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
                           ARMcc, CCR, CmpLo);

  SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  SDValue HiBigShift = Opc == ISD::SRA
                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
                                         DAG.getConstant(VTBits - 1, dl, VT))
                           : DAG.getConstant(0, dl, VT);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);

  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
                           DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
  // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
  // written so that the shift and the AND get folded into a bitfield extract.
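  // Worked example of the mapping: round-to-nearest has FPSCR[23:22] == 0b00,
  // and ((0 + 1) & 3) == 1, the FLT_ROUNDS value for round-to-nearest, while
  // round-towards-zero has FPSCR[23:22] == 0b11, and ((3 + 1) & 3) == 0.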
5933 SDLoc dl(Op); 5934 SDValue Ops[] = { DAG.getEntryNode(), 5935 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; 5936 5937 SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); 5938 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 5939 DAG.getConstant(1U << 22, dl, MVT::i32)); 5940 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 5941 DAG.getConstant(22, dl, MVT::i32)); 5942 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 5943 DAG.getConstant(3, dl, MVT::i32)); 5944} 5945 5946static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 5947 const ARMSubtarget *ST) { 5948 SDLoc dl(N); 5949 EVT VT = N->getValueType(0); 5950 if (VT.isVector() && ST->hasNEON()) { 5951 5952 // Compute the least significant set bit: LSB = X & -X 5953 SDValue X = N->getOperand(0); 5954 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 5955 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 5956 5957 EVT ElemTy = VT.getVectorElementType(); 5958 5959 if (ElemTy == MVT::i8) { 5960 // Compute with: cttz(x) = ctpop(lsb - 1) 5961 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5962 DAG.getTargetConstant(1, dl, ElemTy)); 5963 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5964 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5965 } 5966 5967 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 5968 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 5969 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 5970 unsigned NumBits = ElemTy.getSizeInBits(); 5971 SDValue WidthMinus1 = 5972 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5973 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 5974 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 5975 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 5976 } 5977 5978 // Compute with: cttz(x) = ctpop(lsb - 1) 5979 5980 // Compute LSB - 1. 5981 SDValue Bits; 5982 if (ElemTy == MVT::i64) { 5983 // Load constant 0xffff'ffff'ffff'ffff to register. 5984 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5985 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 5986 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 5987 } else { 5988 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5989 DAG.getTargetConstant(1, dl, ElemTy)); 5990 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5991 } 5992 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5993 } 5994 5995 if (!ST->hasV6T2Ops()) 5996 return SDValue(); 5997 5998 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 5999 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 6000} 6001 6002static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 6003 const ARMSubtarget *ST) { 6004 EVT VT = N->getValueType(0); 6005 SDLoc DL(N); 6006 6007 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 6008 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 6009 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 6010 "Unexpected type for custom ctpop lowering"); 6011 6012 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6013 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 6014 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 6015 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 6016 6017 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 6018 unsigned EltSize = 8; 6019 unsigned NumElts = VT.is64BitVector() ? 
                                           8 : 16;
  while (EltSize != VT.getScalarSizeInBits()) {
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
                                  TLI.getPointerTy(DAG.getDataLayout())));
    Ops.push_back(Res);

    EltSize *= 2;
    NumElts /= 2;
    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  }

  return Res;
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN ||
      !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                            ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. For a shift opcode, the value
/// is positive, but for an intrinsic the value must be negative. The
/// absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (!isIntrinsic)
    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
  if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
    Cnt = -Cnt;
    return true;
  }
  return false;
}

static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  int64_t Cnt;

  if (!VT.isVector())
    return SDValue();

  // We essentially have two forms here. Shift by an immediate and shift by a
  // vector register (there is also a shift by a GPR, but that is just handled
  // with a tablegen pattern). We cannot easily match shift by an immediate in
  // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
  // For shifting by a vector, we don't have VSHR, only VSHL (which can be
  // signed or unsigned, and a negative shift indicates a shift right).
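  // For example, assuming a v4i32 value %x and a non-immediate amount %n,
  // an SRL lowers to
  //   VSHLu %x, (sub (zero vector), %n)
  // because VSHL with a negative per-lane count shifts right by that amount.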
  if (N->getOpcode() == ISD::SHL) {
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
                       N->getOperand(1));
  }

  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
         "unexpected vector shift opcode");

  if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
    unsigned VShiftOpc =
        (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
    return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  // Other right shifts we don't have operations for (we use a shift left by a
  // negative number).
  EVT ShiftVT = N->getOperand(1).getValueType();
  SDValue NegatedCount = DAG.getNode(
      ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
  unsigned VShiftOpc =
      (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
  return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
}

static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
                                const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  // We can get here for a node like i32 = ISD::SHL i32, i64.
  if (VT != MVT::i64)
    return SDValue();

  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SHL) &&
         "Unknown shift to lower!");

  unsigned ShOpc = N->getOpcode();
  if (ST->hasMVEIntegerOps()) {
    SDValue ShAmt = N->getOperand(1);
    unsigned ShPartsOpc = ARMISD::LSLL;
    ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);

    // If the shift amount is wider than 64 bits, or is a constant that is
    // zero or at least 32, fall back to the default lowering.
    if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
        (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
      return SDValue();

    // Extract the lower 32 bits of the shift amount if it's not an i32.
    if (ShAmt->getValueType(0) != MVT::i32)
      ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);

    if (ShOpc == ISD::SRL) {
      if (!Con)
        // There is no t2LSRLr instruction so negate and perform an lsll if the
        // shift amount is in a register, emulating a right shift.
        ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                            DAG.getConstant(0, dl, MVT::i32), ShAmt);
      else
        // Otherwise generate an lsrl on the immediate shift amount.
        ShPartsOpc = ARMISD::LSRL;
    } else if (ShOpc == ISD::SRA)
      ShPartsOpc = ARMISD::ASRL;

    // Lower 32 bits of the destination/source.
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
                             DAG.getConstant(0, dl, MVT::i32));
    // Upper 32 bits of the destination/source.
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
                             DAG.getConstant(1, dl, MVT::i32));

    // Generate the shift operation as computed above.
    Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
                     ShAmt);
    // The upper 32 bits come from the second return value of lsll.
    Hi = SDValue(Lo.getNode(), 1);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  }

  // We only lower SRA and SRL of 1 here; all others use the generic lowering.
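  // Sketch of the idea: for a 64-bit logical shift right by one,
  //   hi' = hi >> 1                       (bit 0 of hi goes to the carry)
  //   lo' = (lo >> 1) | (carry << 31)
  // which is exactly the SRL_FLAG/SRA_FLAG plus RRX pair built below.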
6188 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6189 return SDValue(); 6190 6191 // If we are in thumb mode, we don't have RRX. 6192 if (ST->isThumb1Only()) 6193 return SDValue(); 6194 6195 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6196 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6197 DAG.getConstant(0, dl, MVT::i32)); 6198 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6199 DAG.getConstant(1, dl, MVT::i32)); 6200 6201 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6202 // captures the result into a carry flag. 6203 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6204 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6205 6206 // The low part is an ARMISD::RRX operand, which shifts the carry in. 6207 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6208 6209 // Merge the pieces into a single i64 value. 6210 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6211} 6212 6213static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6214 const ARMSubtarget *ST) { 6215 bool Invert = false; 6216 bool Swap = false; 6217 unsigned Opc = ARMCC::AL; 6218 6219 SDValue Op0 = Op.getOperand(0); 6220 SDValue Op1 = Op.getOperand(1); 6221 SDValue CC = Op.getOperand(2); 6222 EVT VT = Op.getValueType(); 6223 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6224 SDLoc dl(Op); 6225 6226 EVT CmpVT; 6227 if (ST->hasNEON()) 6228 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6229 else { 6230 assert(ST->hasMVEIntegerOps() && 6231 "No hardware support for integer vector comparison!"); 6232 6233 if (Op.getValueType().getVectorElementType() != MVT::i1) 6234 return SDValue(); 6235 6236 // Make sure we expand floating point setcc to scalar if we do not have 6237 // mve.fp, so that we can handle them from there. 6238 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6239 return SDValue(); 6240 6241 CmpVT = VT; 6242 } 6243 6244 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6245 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6246 // Special-case integer 64-bit equality comparisons. They aren't legal, 6247 // but they can be lowered with a few vector instructions. 6248 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6249 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6250 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6251 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6252 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6253 DAG.getCondCode(ISD::SETEQ)); 6254 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6255 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6256 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6257 if (SetCCOpcode == ISD::SETNE) 6258 Merged = DAG.getNOT(dl, Merged, CmpVT); 6259 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6260 return Merged; 6261 } 6262 6263 if (CmpVT.getVectorElementType() == MVT::i64) 6264 // 64-bit comparisons are not legal in general. 
6265 return SDValue(); 6266 6267 if (Op1.getValueType().isFloatingPoint()) { 6268 switch (SetCCOpcode) { 6269 default: llvm_unreachable("Illegal FP comparison"); 6270 case ISD::SETUNE: 6271 case ISD::SETNE: 6272 if (ST->hasMVEFloatOps()) { 6273 Opc = ARMCC::NE; break; 6274 } else { 6275 Invert = true; LLVM_FALLTHROUGH; 6276 } 6277 case ISD::SETOEQ: 6278 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6279 case ISD::SETOLT: 6280 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6281 case ISD::SETOGT: 6282 case ISD::SETGT: Opc = ARMCC::GT; break; 6283 case ISD::SETOLE: 6284 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6285 case ISD::SETOGE: 6286 case ISD::SETGE: Opc = ARMCC::GE; break; 6287 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6288 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6289 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6290 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6291 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6292 case ISD::SETONE: { 6293 // Expand this to (OLT | OGT). 6294 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6295 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6296 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6297 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6298 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6299 if (Invert) 6300 Result = DAG.getNOT(dl, Result, VT); 6301 return Result; 6302 } 6303 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6304 case ISD::SETO: { 6305 // Expand this to (OLT | OGE). 6306 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6307 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6308 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6309 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6310 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6311 if (Invert) 6312 Result = DAG.getNOT(dl, Result, VT); 6313 return Result; 6314 } 6315 } 6316 } else { 6317 // Integer comparisons. 6318 switch (SetCCOpcode) { 6319 default: llvm_unreachable("Illegal integer comparison"); 6320 case ISD::SETNE: 6321 if (ST->hasMVEIntegerOps()) { 6322 Opc = ARMCC::NE; break; 6323 } else { 6324 Invert = true; LLVM_FALLTHROUGH; 6325 } 6326 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6327 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6328 case ISD::SETGT: Opc = ARMCC::GT; break; 6329 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6330 case ISD::SETGE: Opc = ARMCC::GE; break; 6331 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6332 case ISD::SETUGT: Opc = ARMCC::HI; break; 6333 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6334 case ISD::SETUGE: Opc = ARMCC::HS; break; 6335 } 6336 6337 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6338 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6339 SDValue AndOp; 6340 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6341 AndOp = Op0; 6342 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6343 AndOp = Op1; 6344 6345 // Ignore bitconvert. 
      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
        AndOp = AndOp.getOperand(0);

      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
        Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
        Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
        SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
        if (!Invert)
          Result = DAG.getNOT(dl, Result, VT);
        return Result;
      }
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // If one of the operands is a constant vector zero, attempt to fold the
  // comparison to a specialized compare-against-zero form.
  SDValue SingleOp;
  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
    SingleOp = Op0;
  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
    if (Opc == ARMCC::GE)
      Opc = ARMCC::LE;
    else if (Opc == ARMCC::GT)
      Opc = ARMCC::LT;
    SingleOp = Op1;
  }

  SDValue Result;
  if (SingleOp.getNode()) {
    Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
                         DAG.getConstant(Opc, dl, MVT::i32));
  } else {
    Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
                         DAG.getConstant(Opc, dl, MVT::i32));
  }

  Result = DAG.getSExtOrTrunc(Result, dl, VT);

  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");

  // ARMISD::SUBE expects a carry, not a borrow like ISD::SUBCARRY, so we
  // have to invert the carry first.
  Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                      DAG.getConstant(1, DL, MVT::i32), Carry);
  // This converts the boolean value carry into the carry flag.
  Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);

  SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
  SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
  SDValue ARMcc = DAG.getConstant(
      IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
                                   Cmp.getValue(1), SDValue());
  return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
                     CCR, Chain.getValue(1));
}

/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON or MVE instruction with a "modified
/// immediate" operand (e.g., VMOV). If so, return the encoded value.
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 const SDLoc &dl, EVT &VT, bool is128Bits,
                                 VMOVModImmType type) {
  unsigned OpCmode, Imm;

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8. However, NEON modified
  // immediate instructions other than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
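  // A couple of example encodings this produces (bit layout as used by
  // ARM_AM::createVMOVModImm below): a splat of 0x00000000 encodes as Op=0,
  // Cmode=0000, Imm=0x00 (the canonical zero), and a splat of 0x0000ff00 as
  // Op=0, Cmode=0010, Imm=0xff.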
6437 if (SplatBits == 0) 6438 SplatBitSize = 32; 6439 6440 switch (SplatBitSize) { 6441 case 8: 6442 if (type != VMOVModImm) 6443 return SDValue(); 6444 // Any 1-byte value is OK. Op=0, Cmode=1110. 6445 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 6446 OpCmode = 0xe; 6447 Imm = SplatBits; 6448 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 6449 break; 6450 6451 case 16: 6452 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 6453 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 6454 if ((SplatBits & ~0xff) == 0) { 6455 // Value = 0x00nn: Op=x, Cmode=100x. 6456 OpCmode = 0x8; 6457 Imm = SplatBits; 6458 break; 6459 } 6460 if ((SplatBits & ~0xff00) == 0) { 6461 // Value = 0xnn00: Op=x, Cmode=101x. 6462 OpCmode = 0xa; 6463 Imm = SplatBits >> 8; 6464 break; 6465 } 6466 return SDValue(); 6467 6468 case 32: 6469 // NEON's 32-bit VMOV supports splat values where: 6470 // * only one byte is nonzero, or 6471 // * the least significant byte is 0xff and the second byte is nonzero, or 6472 // * the least significant 2 bytes are 0xff and the third is nonzero. 6473 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 6474 if ((SplatBits & ~0xff) == 0) { 6475 // Value = 0x000000nn: Op=x, Cmode=000x. 6476 OpCmode = 0; 6477 Imm = SplatBits; 6478 break; 6479 } 6480 if ((SplatBits & ~0xff00) == 0) { 6481 // Value = 0x0000nn00: Op=x, Cmode=001x. 6482 OpCmode = 0x2; 6483 Imm = SplatBits >> 8; 6484 break; 6485 } 6486 if ((SplatBits & ~0xff0000) == 0) { 6487 // Value = 0x00nn0000: Op=x, Cmode=010x. 6488 OpCmode = 0x4; 6489 Imm = SplatBits >> 16; 6490 break; 6491 } 6492 if ((SplatBits & ~0xff000000) == 0) { 6493 // Value = 0xnn000000: Op=x, Cmode=011x. 6494 OpCmode = 0x6; 6495 Imm = SplatBits >> 24; 6496 break; 6497 } 6498 6499 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 6500 if (type == OtherModImm) return SDValue(); 6501 6502 if ((SplatBits & ~0xffff) == 0 && 6503 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 6504 // Value = 0x0000nnff: Op=x, Cmode=1100. 6505 OpCmode = 0xc; 6506 Imm = SplatBits >> 8; 6507 break; 6508 } 6509 6510 // cmode == 0b1101 is not supported for MVE VMVN 6511 if (type == MVEVMVNModImm) 6512 return SDValue(); 6513 6514 if ((SplatBits & ~0xffffff) == 0 && 6515 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 6516 // Value = 0x00nnffff: Op=x, Cmode=1101. 6517 OpCmode = 0xd; 6518 Imm = SplatBits >> 16; 6519 break; 6520 } 6521 6522 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 6523 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 6524 // VMOV.I32. A (very) minor optimization would be to replicate the value 6525 // and fall through here to test for a valid 64-bit splat. But, then the 6526 // caller would also need to check and handle the change in size. 6527 return SDValue(); 6528 6529 case 64: { 6530 if (type != VMOVModImm) 6531 return SDValue(); 6532 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 6533 uint64_t BitMask = 0xff; 6534 uint64_t Val = 0; 6535 unsigned ImmMask = 1; 6536 Imm = 0; 6537 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 6538 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 6539 Val |= BitMask; 6540 Imm |= ImmMask; 6541 } else if ((SplatBits & BitMask) != 0) { 6542 return SDValue(); 6543 } 6544 BitMask <<= 8; 6545 ImmMask <<= 1; 6546 } 6547 6548 if (DAG.getDataLayout().isBigEndian()) 6549 // swap higher and lower 32 bit word 6550 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 6551 6552 // Op=1, Cmode=1110. 
6553 OpCmode = 0x1e; 6554 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 6555 break; 6556 } 6557 6558 default: 6559 llvm_unreachable("unexpected size for isVMOVModifiedImm"); 6560 } 6561 6562 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); 6563 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 6564} 6565 6566SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 6567 const ARMSubtarget *ST) const { 6568 EVT VT = Op.getValueType(); 6569 bool IsDouble = (VT == MVT::f64); 6570 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 6571 const APFloat &FPVal = CFP->getValueAPF(); 6572 6573 // Prevent floating-point constants from using literal loads 6574 // when execute-only is enabled. 6575 if (ST->genExecuteOnly()) { 6576 // If we can represent the constant as an immediate, don't lower it 6577 if (isFPImmLegal(FPVal, VT)) 6578 return Op; 6579 // Otherwise, construct as integer, and move to float register 6580 APInt INTVal = FPVal.bitcastToAPInt(); 6581 SDLoc DL(CFP); 6582 switch (VT.getSimpleVT().SimpleTy) { 6583 default: 6584 llvm_unreachable("Unknown floating point type!"); 6585 break; 6586 case MVT::f64: { 6587 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 6588 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 6589 if (!ST->isLittle()) 6590 std::swap(Lo, Hi); 6591 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 6592 } 6593 case MVT::f32: 6594 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 6595 DAG.getConstant(INTVal, DL, MVT::i32)); 6596 } 6597 } 6598 6599 if (!ST->hasVFP3Base()) 6600 return SDValue(); 6601 6602 // Use the default (constant pool) lowering for double constants when we have 6603 // an SP-only FPU 6604 if (IsDouble && !Subtarget->hasFP64()) 6605 return SDValue(); 6606 6607 // Try splatting with a VMOV.f32... 6608 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 6609 6610 if (ImmVal != -1) { 6611 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 6612 // We have code in place to select a valid ConstantFP already, no need to 6613 // do any mangling. 6614 return Op; 6615 } 6616 6617 // It's a float and we are trying to use NEON operations where 6618 // possible. Lower it to a splat followed by an extract. 6619 SDLoc DL(Op); 6620 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 6621 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 6622 NewVal); 6623 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 6624 DAG.getConstant(0, DL, MVT::i32)); 6625 } 6626 6627 // The rest of our options are NEON only, make sure that's allowed before 6628 // proceeding.. 6629 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 6630 return SDValue(); 6631 6632 EVT VMovVT; 6633 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 6634 6635 // It wouldn't really be worth bothering for doubles except for one very 6636 // important value, which does happen to match: 0.0. So make sure we don't do 6637 // anything stupid. 6638 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 6639 return SDValue(); 6640 6641 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 
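  // For instance, f64 0.0 has two identical 32-bit halves and is reachable as
  // a VMOV.i32 splat bitcast back to f64, while f64 -0.0
  // (0x8000000000000000) fails the halves check above because only the high
  // word has the sign bit set.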
  SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
                                     VMovVT, false, VMOVModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                      NewVal);
    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // Finally, try a VMVN.i32.
  NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
                             false, VMVNModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);

    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  return SDValue();
}

// Check if a VEXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF. Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element. The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index. If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0) continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}

static bool isVEXTMask(ArrayRef<int> M, EVT VT,
                       bool &ReverseVEXT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();
  ReverseVEXT = false;

  // Assume that the first shuffle index is not UNDEF. Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element. The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index. If it wraps around, it may still be
    // a VEXT but the source vectors must be swapped.
    ExpectedElt += 1;
    if (ExpectedElt == NumElts * 2) {
      ExpectedElt = 0;
      ReverseVEXT = true;
    }

    if (M[i] < 0) continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  // Adjust the index value if the source operands will be swapped.
  if (ReverseVEXT)
    Imm -= NumElts;

  return true;
}

/// isVREVMask - Check if a vector shuffle corresponds to a VREV
/// instruction with the specified blocksize.
(The order of the elements 6746/// within each block of the vector is reversed.) 6747static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 6748 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 6749 "Only possible block sizes for VREV are: 16, 32, 64"); 6750 6751 unsigned EltSz = VT.getScalarSizeInBits(); 6752 if (EltSz == 64) 6753 return false; 6754 6755 unsigned NumElts = VT.getVectorNumElements(); 6756 unsigned BlockElts = M[0] + 1; 6757 // If the first shuffle index is UNDEF, be optimistic. 6758 if (M[0] < 0) 6759 BlockElts = BlockSize / EltSz; 6760 6761 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 6762 return false; 6763 6764 for (unsigned i = 0; i < NumElts; ++i) { 6765 if (M[i] < 0) continue; // ignore UNDEF indices 6766 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 6767 return false; 6768 } 6769 6770 return true; 6771} 6772 6773static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 6774 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 6775 // range, then 0 is placed into the resulting vector. So pretty much any mask 6776 // of 8 elements can work here. 6777 return VT == MVT::v8i8 && M.size() == 8; 6778} 6779 6780static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 6781 unsigned Index) { 6782 if (Mask.size() == Elements * 2) 6783 return Index / Elements; 6784 return Mask[Index] == 0 ? 0 : 1; 6785} 6786 6787// Checks whether the shuffle mask represents a vector transpose (VTRN) by 6788// checking that pairs of elements in the shuffle mask represent the same index 6789// in each vector, incrementing the expected index by 2 at each step. 6790// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 6791// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 6792// v2={e,f,g,h} 6793// WhichResult gives the offset for each element in the mask based on which 6794// of the two results it belongs to. 6795// 6796// The transpose can be represented either as: 6797// result1 = shufflevector v1, v2, result1_shuffle_mask 6798// result2 = shufflevector v1, v2, result2_shuffle_mask 6799// where v1/v2 and the shuffle masks have the same number of elements 6800// (here WhichResult (see below) indicates which result is being checked) 6801// 6802// or as: 6803// results = shufflevector v1, v2, shuffle_mask 6804// where both results are returned in one vector and the shuffle mask has twice 6805// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 6806// want to check the low half and high half of the shuffle mask as if it were 6807// the other case 6808static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6809 unsigned EltSz = VT.getScalarSizeInBits(); 6810 if (EltSz == 64) 6811 return false; 6812 6813 unsigned NumElts = VT.getVectorNumElements(); 6814 if (M.size() != NumElts && M.size() != NumElts*2) 6815 return false; 6816 6817 // If the mask is twice as long as the input vector then we need to check the 6818 // upper and lower parts of the mask with a matching value for WhichResult 6819 // FIXME: A mask with only even values will be rejected in case the first 6820 // element is undefined, e.g. 
  // [-1, 4, 2, 6] will be rejected, because only M[0] is used to
  // determine WhichResult.
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  return true;
}

/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  return true;
}

// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
// that the mask elements are either all even and in steps of size 2 or all odd
// and in steps of size 2.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
//   v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
//   v2={e,f,g,h}
// Requires checks similar to those of isVTRNMask with respect to how the
// results are returned.
static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; ++j) {
      if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  unsigned Half = NumElts / 2;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += Half) {
      unsigned Idx = WhichResult;
      for (unsigned k = 0; k < Half; ++k) {
        int MIdx = M[i + j + k];
        if (MIdx >= 0 && (unsigned) MIdx != Idx)
          return false;
        Idx += 2;
      }
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
// that pairs of elements of the shufflemask represent the same index in each
// vector, incrementing sequentially through the vectors.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
//   v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
//   v2={e,f,g,h}
// Requires checks similar to those of isVTRNMask with respect to how the
// results are returned.
static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
                                           unsigned &WhichResult,
                                           bool &isV_UNDEF) {
  isV_UNDEF = false;
  if (isVTRNMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  isV_UNDEF = true;
  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  return 0;
}

/// \return true if this is a reverse operation on a vector.
static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size())
    return false;

  // Look for <15, ..., 3, -1, 1, 0>.
  for (unsigned i = 0; i != NumElts; ++i)
    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
      return false;

  return true;
}

static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
    return false;

  // If Top
  //   Look for <0, N, 2, N+2, 4, N+4, ..>.
  //   This inserts Input2 into Input1
  // else if not Top
  //   Look for <0, N+1, 2, N+3, 4, N+5, ..>
  //   This inserts Input1 into Input2
  unsigned Offset = Top ? 0 : 1;
  for (unsigned i = 0; i < NumElts; i += 2) {
    if (M[i] >= 0 && M[i] != (int)i)
      return false;
    if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset))
      return false;
  }

  return true;
}

// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (it will become a MOV
// instruction). Otherwise return null.
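// For example, in ARM mode 0xff000000 is a valid rotated 8-bit immediate
// (ARM_AM::getSOImmVal succeeds), so a single MOV materializes it, while
// 0xffffff00 is covered through its complement 0xff and becomes a single MVN.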
7071static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 7072 const ARMSubtarget *ST, const SDLoc &dl) { 7073 uint64_t Val; 7074 if (!isa<ConstantSDNode>(N)) 7075 return SDValue(); 7076 Val = cast<ConstantSDNode>(N)->getZExtValue(); 7077 7078 if (ST->isThumb1Only()) { 7079 if (Val <= 255 || ~Val <= 255) 7080 return DAG.getConstant(Val, dl, MVT::i32); 7081 } else { 7082 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 7083 return DAG.getConstant(Val, dl, MVT::i32); 7084 } 7085 return SDValue(); 7086} 7087 7088static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, 7089 const ARMSubtarget *ST) { 7090 SDLoc dl(Op); 7091 EVT VT = Op.getValueType(); 7092 7093 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); 7094 7095 unsigned NumElts = VT.getVectorNumElements(); 7096 unsigned BoolMask; 7097 unsigned BitsPerBool; 7098 if (NumElts == 4) { 7099 BitsPerBool = 4; 7100 BoolMask = 0xf; 7101 } else if (NumElts == 8) { 7102 BitsPerBool = 2; 7103 BoolMask = 0x3; 7104 } else if (NumElts == 16) { 7105 BitsPerBool = 1; 7106 BoolMask = 0x1; 7107 } else 7108 return SDValue(); 7109 7110 // If this is a single value copied into all lanes (a splat), we can just sign 7111 // extend that single value 7112 SDValue FirstOp = Op.getOperand(0); 7113 if (!isa<ConstantSDNode>(FirstOp) && 7114 std::all_of(std::next(Op->op_begin()), Op->op_end(), 7115 [&FirstOp](SDUse &U) { 7116 return U.get().isUndef() || U.get() == FirstOp; 7117 })) { 7118 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, 7119 DAG.getValueType(MVT::i1)); 7120 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); 7121 } 7122 7123 // First create base with bits set where known 7124 unsigned Bits32 = 0; 7125 for (unsigned i = 0; i < NumElts; ++i) { 7126 SDValue V = Op.getOperand(i); 7127 if (!isa<ConstantSDNode>(V) && !V.isUndef()) 7128 continue; 7129 bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue(); 7130 if (BitSet) 7131 Bits32 |= BoolMask << (i * BitsPerBool); 7132 } 7133 7134 // Add in unknown nodes 7135 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 7136 DAG.getConstant(Bits32, dl, MVT::i32)); 7137 for (unsigned i = 0; i < NumElts; ++i) { 7138 SDValue V = Op.getOperand(i); 7139 if (isa<ConstantSDNode>(V) || V.isUndef()) 7140 continue; 7141 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, 7142 DAG.getConstant(i, dl, MVT::i32)); 7143 } 7144 7145 return Base; 7146} 7147 7148// If this is a case we can't handle, return null and let the default 7149// expansion code take care of it. 7150SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 7151 const ARMSubtarget *ST) const { 7152 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7153 SDLoc dl(Op); 7154 EVT VT = Op.getValueType(); 7155 7156 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 7157 return LowerBUILD_VECTOR_i1(Op, DAG, ST); 7158 7159 APInt SplatBits, SplatUndef; 7160 unsigned SplatBitSize; 7161 bool HasAnyUndefs; 7162 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 7163 if (SplatUndef.isAllOnesValue()) 7164 return DAG.getUNDEF(VT); 7165 7166 if ((ST->hasNEON() && SplatBitSize <= 64) || 7167 (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { 7168 // Check if an immediate VMOV works. 
7169 EVT VmovVT; 7170 SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), 7171 SplatUndef.getZExtValue(), SplatBitSize, 7172 DAG, dl, VmovVT, VT.is128BitVector(), 7173 VMOVModImm); 7174 7175 if (Val.getNode()) { 7176 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 7177 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7178 } 7179 7180 // Try an immediate VMVN. 7181 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 7182 Val = isVMOVModifiedImm( 7183 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, 7184 DAG, dl, VmovVT, VT.is128BitVector(), 7185 ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); 7186 if (Val.getNode()) { 7187 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 7188 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7189 } 7190 7191 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 7192 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 7193 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 7194 if (ImmVal != -1) { 7195 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 7196 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 7197 } 7198 } 7199 } 7200 } 7201 7202 // Scan through the operands to see if only one value is used. 7203 // 7204 // As an optimisation, even if more than one value is used it may be more 7205 // profitable to splat with one value then change some lanes. 7206 // 7207 // Heuristically we decide to do this if the vector has a "dominant" value, 7208 // defined as splatted to more than half of the lanes. 7209 unsigned NumElts = VT.getVectorNumElements(); 7210 bool isOnlyLowElement = true; 7211 bool usesOnlyOneValue = true; 7212 bool hasDominantValue = false; 7213 bool isConstant = true; 7214 7215 // Map of the number of times a particular SDValue appears in the 7216 // element list. 7217 DenseMap<SDValue, unsigned> ValueCounts; 7218 SDValue Value; 7219 for (unsigned i = 0; i < NumElts; ++i) { 7220 SDValue V = Op.getOperand(i); 7221 if (V.isUndef()) 7222 continue; 7223 if (i > 0) 7224 isOnlyLowElement = false; 7225 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7226 isConstant = false; 7227 7228 ValueCounts.insert(std::make_pair(V, 0)); 7229 unsigned &Count = ValueCounts[V]; 7230 7231 // Is this value dominant? (takes up more than half of the lanes) 7232 if (++Count > (NumElts / 2)) { 7233 hasDominantValue = true; 7234 Value = V; 7235 } 7236 } 7237 if (ValueCounts.size() != 1) 7238 usesOnlyOneValue = false; 7239 if (!Value.getNode() && !ValueCounts.empty()) 7240 Value = ValueCounts.begin()->first; 7241 7242 if (ValueCounts.empty()) 7243 return DAG.getUNDEF(VT); 7244 7245 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 7246 // Keep going if we are hitting this case. 7247 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 7248 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7249 7250 unsigned EltSize = VT.getScalarSizeInBits(); 7251 7252 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 7253 // i32 and try again. 7254 if (hasDominantValue && EltSize <= 32) { 7255 if (!isConstant) { 7256 SDValue N; 7257 7258 // If we are VDUPing a value that comes directly from a vector, that will 7259 // cause an unnecessary move to and from a GPR, where instead we could 7260 // just use VDUPLANE. We can only do this if the lane being extracted 7261 // is at a constant index, as the VDUP from lane instructions only have 7262 // constant-index forms. 
      ConstantSDNode *constIndex;
      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
        // We need to create a new undef vector to use for the VDUPLANE if the
        // size of the vector from which we get the value is different from
        // the size of the vector that we need to create. We will insert the
        // element such that the register coalescer will remove unnecessary
        // copies.
        if (VT != Value->getOperand(0).getValueType()) {
          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
                           VT.getVectorNumElements();
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT,
                                      DAG.getUNDEF(VT), Value,
                                      DAG.getConstant(index, dl, MVT::i32)),
                          DAG.getConstant(index, dl, MVT::i32));
        } else
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          Value->getOperand(0), Value->getOperand(1));
      } else
        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);

      if (!usesOnlyOneValue) {
        // The dominant value was splatted as 'N', but we now have to insert
        // all differing elements.
        for (unsigned I = 0; I < NumElts; ++I) {
          if (Op.getOperand(I) == Value)
            continue;
          SmallVector<SDValue, 3> Ops;
          Ops.push_back(N);
          Ops.push_back(Op.getOperand(I));
          Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
        }
      }
      return N;
    }
    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      MVT FVT = VT.getVectorElementType().getSimpleVT();
      assert(FVT == MVT::f32 || FVT == MVT::f16);
      MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
      for (unsigned i = 0; i < NumElts; ++i)
        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
                                  Op.getOperand(i)));
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
      Val = LowerBUILD_VECTOR(Val, DAG, ST);
      if (Val.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
    if (usesOnlyOneValue) {
      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
      if (isConstant && Val.getNode())
        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
    }
  }

  // If all elements are constants and the case above didn't get hit, fall
  // back to the default expansion, which will generate a load from the
  // constant pool.
  if (isConstant)
    return SDValue();

  // Empirical tests suggest this is rarely worth it for vectors of
  // length <= 2.
  if (NumElts >= 4) {
    SDValue shuffle = ReconstructShuffle(Op, DAG);
    if (shuffle != SDValue())
      return shuffle;
  }

  if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 &&
      VT != MVT::v4f32) {
    // If we haven't found an efficient lowering, try splitting a 128-bit
    // vector into two 64-bit vectors; we might discover a better way to
    // lower it.
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
    EVT ExtVT = VT.getVectorElementType();
    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
    if (Lower.getOpcode() == ISD::BUILD_VECTOR)
      Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
    if (Upper.getOpcode() == ISD::BUILD_VECTOR)
      Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
    if (Lower && Upper)
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
  }

  // Vectors with 32- or 64-bit elements can be built by directly assigning
  // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
  // will be legalized.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i)
      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that's
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target) and materialization element by element
  // on the stack followed by a load for everything else.
  if (!isConstant && !usesOnlyOneValue) {
    SDValue Vec = DAG.getUNDEF(VT);
    for (unsigned i = 0 ; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.isUndef())
        continue;
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  return SDValue();
}

// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  struct ShuffleSourceInfo {
    SDValue Vec;
    unsigned MinElt = std::numeric_limits<unsigned>::max();
    unsigned MaxElt = 0;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec
    // to be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
    int WindowBase = 0;
    int WindowScale = 1;

    ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}

    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this
  // BUILD_VECTOR node.
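  // For example (illustrative), a BUILD_VECTOR of the form
  //   (extractelt %a, 0), (extractelt %a, 1),
  //   (extractelt %b, 0), (extractelt %b, 1)
  // can be rebuilt as shufflevector %a, %b, <0, 1, 4, 5> once both sources
  // have been massaged into a compatible width and element type below.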
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
      // Furthermore, shuffles require a constant mask, whereas extractelts
      // accept variable indices.
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = llvm::find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // Currently only do something sane when at most two source vectors
  // are involved.
  if (Sources.size() > 2)
    return SDValue();

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy))
      SmallestEltTy = SrcEltTy;
  }
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be
  // able to construct a compatible shuffle either by concatenating it with
  // UNDEF or extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
      continue;

    // This stage of the search produces a source with the same element type
    // as the original, but with a total width matching the BUILD_VECTOR
    // output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
      if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
        return SDValue();
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
      return SDValue();

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
    } else {
      // An actual VEXT is needed
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));

      Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Src.MinElt, dl, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final sanity check before we try to actually produce a shuffle.
  LLVM_DEBUG(for (auto Src
                  : Sources)
                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

  // The stars all align, our next step is to produce the mask for the
  // shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    auto Src = llvm::find(Sources, Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an
    // implicit trunc. So only std::min(SrcBits, DestBits) actually get
    // defined in this segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
                               VT.getScalarSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final
    // shuffle, starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }

  // We can't handle more than two sources. This should have already
  // been checked before this point.
  assert(Sources.size() <= 2 && "Too many sources!");

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                            ShuffleOps[1], Mask, DAG);
  if (!Shuffle)
    return SDValue();
  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}

enum ShuffleOpCodes {
  OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV,
  OP_VDUP0,
  OP_VDUP1,
  OP_VDUP2,
  OP_VDUP3,
  OP_VEXT1,
  OP_VEXT2,
  OP_VEXT3,
  OP_VUZPL, // VUZP, left result
  OP_VUZPR, // VUZP, right result
  OP_VZIPL, // VZIP, left result
  OP_VZIPR, // VZIP, right result
  OP_VTRNL, // VTRN, left result
  OP_VTRNR  // VTRN, right result
};

static bool isLegalMVEShuffleOp(unsigned PFEntry) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  switch (OpNum) {
  case OP_COPY:
  case OP_VREV:
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    return true;
  }
  return false;
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (M[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = M[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
        PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
      return true;
  }

  bool ReverseVEXT, isV_UNDEF;
  unsigned Imm, WhichResult;

  unsigned EltSize = VT.getScalarSizeInBits();
  if (EltSize >= 32 ||
      ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
      ShuffleVectorInst::isIdentityMask(M) ||
      isVREVMask(M, VT, 64) ||
      isVREVMask(M, VT, 32) ||
      isVREVMask(M, VT, 16))
    return true;
  else if (Subtarget->hasNEON() &&
           (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
            isVTBLMask(M, VT) ||
            isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
    return true;
  else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
           isReverseMask(M, VT))
    return true;
  else if (Subtarget->hasMVEIntegerOps() &&
           (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1)))
    return true;
  else
    return false;
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
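/// The 32-bit PFEntry is decoded below as: the cost in bits 31:30, the
/// shuffle opcode in bits 29:26, and the LHS/RHS operand table indices in
/// bits 25:13 and 12:0 respectively.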
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);

  if (OpNum == OP_COPY) {
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default: llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within each half.
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32)
      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> VREV32
    if (VT.getVectorElementType() == MVT::i16)
      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> VREV16
    assert(VT.getVectorElementType() == MVT::i8);
    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3:
    return DAG.getNode(ARMISD::VEXT, dl, VT,
                       OpLHS, OpRHS,
                       DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
  case OP_VUZPL:
  case OP_VUZPR:
    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
  case OP_VZIPL:
  case OP_VZIPR:
    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
  case OP_VTRNL:
  case OP_VTRNR:
    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
  }
}

static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
                                       ArrayRef<int> ShuffleMask,
                                       SelectionDAG &DAG) {
  // Check to see if we can use the VTBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  SmallVector<SDValue, 8> VTBLMask;
  for (ArrayRef<int>::iterator
         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
    VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));

  if (V2.getNode()->isUndef())
    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
                       DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));

  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
                     DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
}

static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
                                                      SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue OpLHS = Op.getOperand(0);
  EVT VT = OpLHS.getValueType();

  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
         "Expect a v8i16/v16i8 type");
  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
  // For a v16i8 type: after the VREV64 the vector is <7, ..., 0, 15, ..., 8>.
  // A VEXT with an offset of 8 then swaps the two double words, producing
  // the fully reversed <15, ..., 0>. The v8i16 case is similar.
  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
                     DAG.getConstant(ExtractNum, DL, MVT::i32));
}

static EVT getVectorTyFromPredicateVector(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4i1:
    return MVT::v4i32;
  case MVT::v8i1:
    return MVT::v8i16;
  case MVT::v16i1:
    return MVT::v16i8;
  default:
    llvm_unreachable("Unexpected vector predicate type");
  }
}

static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
                                    SelectionDAG &DAG) {
  // Converting from boolean predicates to integers involves creating a vector
  // of all ones or all zeroes and selecting the lanes based upon the real
  // predicate.
  SDValue AllOnes =
      DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
  AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);

  SDValue AllZeroes =
      DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
  AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);

  // Get full vector type from predicate type
  EVT NewVT = getVectorTyFromPredicateVector(VT);

  SDValue RecastV1;
  // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to
  // recast this to a v16i1. This cannot be done with an ordinary bitcast
  // because the sizes are not the same. We have to use an MVE-specific
  // PREDICATE_CAST node, since we know in hardware the sizes are really the
  // same.
  if (VT != MVT::v16i1)
    RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
  else
    RecastV1 = Pred;

  // Select either all ones or zeroes depending upon the real predicate bits.
  SDValue PredAsVector =
      DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);

  // Recast our new predicate-as-integer v16i8 vector into something
  // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
  return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
}

static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  ArrayRef<int> ShuffleMask = SVN->getMask();

  assert(ST->hasMVEIntegerOps() &&
         "No support for vector shuffle of boolean predicates");

  SDValue V1 = Op.getOperand(0);
  SDLoc dl(Op);
  if (isReverseMask(ShuffleMask, VT)) {
    SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
    SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
    SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
                              DAG.getConstant(16, dl, MVT::i32));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
  }

  // Until we can come up with optimised cases for every single vector
  // shuffle in existence, we have chosen the least painful strategy. This is
  // to essentially promote the boolean predicate to an 8-bit integer, where
  // each predicate lane is widened to a byte. Then we fall back on a normal
  // integer vector shuffle and convert the result back into a predicate
  // vector. In many cases the generated code might be even better than
  // scalar code operating on bits. Just imagine trying to shuffle 8
  // arbitrary 2-bit fields in a register into 8 other arbitrary 2-bit
  // fields!
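  // A sketch of the idea for a v4i1 predicate (using the layout produced by
  // PromoteMVEPredVector below): each i1 lane becomes an all-ones or
  // all-zeroes 32-bit lane of a v4i32, that vector is shuffled as a normal
  // integer vector, and a compare-not-equal-to-zero (VCMPZ NE) collapses the
  // result back into a v4i1 predicate.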
  SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
  EVT NewVT = PredAsVector.getValueType();

  // Do the shuffle!
  SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
                                          DAG.getUNDEF(NewVT), ShuffleMask);

  // Now return the result of comparing the shuffled vector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
                                            ArrayRef<int> ShuffleMask,
                                            SelectionDAG &DAG) {
  // Attempt to lower the vector shuffle using as many whole-register movs as
  // possible. This is useful for types smaller than 32 bits, which would
  // otherwise often become a series of GPR movs.
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  if (VT.getScalarSizeInBits() >= 32)
    return SDValue();

  assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
         "Unexpected vector type");
  int NumElts = VT.getVectorNumElements();
  int QuarterSize = NumElts / 4;
  // The four final parts of the vector, as i32's
  SDValue Parts[4];

  // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc. (but not
  // <u,u,u,u>), returning the vmov lane index
  auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
    // Detect which mov lane this would be from the first non-undef element.
    int MovIdx = -1;
    for (int i = 0; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0) {
        if (ShuffleMask[Start + i] % Length != i)
          return -1;
        MovIdx = ShuffleMask[Start + i] / Length;
        break;
      }
    }
    // If all items are undef, leave this for other combines
    if (MovIdx == -1)
      return -1;
    // Check the remaining values are the correct part of the same mov
    for (int i = 1; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0 &&
          (ShuffleMask[Start + i] / Length != MovIdx ||
           ShuffleMask[Start + i] % Length != i))
        return -1;
    }
    return MovIdx;
  };

  for (int Part = 0; Part < 4; ++Part) {
    // Does this part look like a mov?
    int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
    if (Elt != -1) {
      SDValue Input = Op->getOperand(0);
      if (Elt >= 4) {
        Input = Op->getOperand(1);
        Elt -= 4;
      }
      SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input);
      Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast,
                                DAG.getConstant(Elt, dl, MVT::i32));
    }
  }

  // Nothing interesting found, just return
  if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
    return SDValue();

  // The other parts need to be built with the old shuffle vector, cast to a
  // v4i32 and extract_vector_elts
  if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
    SmallVector<int, 16> NewShuffleMask;
    for (int Part = 0; Part < 4; ++Part)
      for (int i = 0; i < QuarterSize; i++)
        NewShuffleMask.push_back(
            Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
    SDValue NewShuffle = DAG.getVectorShuffle(
        VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
    SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle);

    for (int Part = 0; Part < 4; ++Part)
      if (!Parts[Part])
        Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  BitCast, DAG.getConstant(Part, dl, MVT::i32));
  }
  // Build a vector out of the various parts and bitcast it back to the
  // original type.
  SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts);
  return DAG.getBitcast(VT, NewVec);
}

static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                   const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  unsigned EltSize = VT.getScalarSizeInBits();

  if (ST->hasMVEIntegerOps() && EltSize == 1)
    return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection. This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same type so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  if (EltSize <= 32) {
    if (SVN->isSplat()) {
      int Lane = SVN->getSplatIndex();
      // If this is an undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
      // reaches it).
      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
          !isa<ConstantSDNode>(V1.getOperand(0))) {
        bool IsScalarToVector = true;
        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
          if (!V1.getOperand(i).isUndef()) {
            IsScalarToVector = false;
            break;
          }
        if (IsScalarToVector)
          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                         DAG.getConstant(Lane, dl, MVT::i32));
    }

    bool ReverseVEXT = false;
    unsigned Imm = 0;
    if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
      if (ReverseVEXT)
        std::swap(V1, V2);
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    if (isVREVMask(ShuffleMask, VT, 64))
      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 32))
      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 16))
      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

    if (ST->hasNEON() && V2->isUndef() &&
        isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    // Check for Neon shuffles that modify both input vectors in place.
    // If both results are used, i.e., if there are two shuffles with the same
    // source operands and with masks corresponding to both results of one of
    // these operations, DAG memoization will ensure that a single node is
    // used for both shuffles.
    unsigned WhichResult = 0;
    bool isV_UNDEF = false;
    if (ST->hasNEON()) {
      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
              ShuffleMask, VT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          V2 = V1;
        return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
            .getValue(WhichResult);
      }
    }
    if (ST->hasMVEIntegerOps()) {
      if (isVMOVNMask(ShuffleMask, VT, 0))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
                           DAG.getConstant(0, dl, MVT::i32));
      if (isVMOVNMask(ShuffleMask, VT, 1))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
                           DAG.getConstant(1, dl, MVT::i32));
    }

    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
    // shuffles that produce a result larger than their operands with:
    //   shuffle(concat(v1, undef), concat(v2, undef))
    // ->
    //   shuffle(concat(v1, v2), undef)
    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
    //
    // This is useful in the general case, but there are special cases where
    // native shuffles produce larger results: the two-result ops.
    //
    // Look through the concat when lowering them:
    //   shuffle(concat(v1, v2), undef)
    // ->
    //   concat(VZIP(v1, v2):0, :1)
    //
    if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS &&
        V2->isUndef()) {
      SDValue SubV1 = V1->getOperand(0);
      SDValue SubV2 = V1->getOperand(1);
      EVT SubVT = SubV1.getValueType();

      // We expect these to have been canonicalized to -1.
      assert(llvm::all_of(ShuffleMask, [&](int i) {
               return i < (int)VT.getVectorNumElements();
             }) && "Unexpected shuffle index into UNDEF operand!");

      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          SubV2 = SubV1;
        assert((WhichResult == 0) &&
               "In-place shuffle of concat can only have one result!");
        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
                                  SubV1, SubV2);
        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
                           Res.getValue(1));
      }
    }
  }

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
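    // The table is indexed in base 9: each of the four mask elements
    // contributes one digit in [0, 8], with 8 standing for an undef lane.
    // For example, the mask <1, 1, 3, 3> yields
    // 1*729 + 1*81 + 3*9 + 3 = 840.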
    unsigned PFTableIndex =
        PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4) {
      if (ST->hasNEON())
        return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
      else if (isLegalMVEShuffleOp(PFEntry)) {
        unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
        unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
        unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
        unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
        if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
          return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
      }
    }
  }

  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                  dl, MVT::i32)));
    }
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
      isReverseMask(ShuffleMask, VT))
    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);

  if (ST->hasNEON() && VT == MVT::v8i8)
    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
      return NewOp;

  if (ST->hasMVEIntegerOps())
    if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
      return NewOp;

  return SDValue();
}

static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
                                         const ARMSubtarget *ST) {
  EVT VecVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  assert(ST->hasMVEIntegerOps() &&
         "LowerINSERT_VECTOR_ELT_i1 called without MVE!");

  SDValue Conv =
      DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned LaneWidth =
      getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
  SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
                            Op.getOperand(1), DAG.getValueType(MVT::i1));
  SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
                            DAG.getConstant(~Mask, dl, MVT::i32));
  return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
}

SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // INSERT_VECTOR_ELT is legal only for immediate indexes.
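  // The lane number is encoded in the instruction itself (e.g.
  // "vmov.32 d0[1], r0"), so a variable index cannot be selected directly;
  // it is left to the generic expansion, which typically goes via a stack
  // temporary.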
  SDValue Lane = Op.getOperand(2);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Elt = Op.getOperand(1);
  EVT EltVT = Elt.getValueType();

  if (Subtarget->hasMVEIntegerOps() &&
      Op.getValueType().getScalarSizeInBits() == 1)
    return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);

  if (getTypeAction(*DAG.getContext(), EltVT) ==
      TargetLowering::TypePromoteFloat) {
    // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
    // but the type system will try to do that if we don't intervene.
    // Reinterpret any such vector-element insertion as one with the
    // corresponding integer types.

    SDLoc dl(Op);

    EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
    assert(getTypeAction(*DAG.getContext(), IEltVT) !=
           TargetLowering::TypePromoteFloat);

    SDValue VecIn = Op.getOperand(0);
    EVT VecVT = VecIn.getValueType();
    EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
                                  VecVT.getVectorNumElements());

    SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
    SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
    SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
                                  IVecIn, IElt, Lane);
    return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
  }

  return Op;
}

static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
                                          const ARMSubtarget *ST) {
  EVT VecVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  assert(ST->hasMVEIntegerOps() &&
         "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");

  SDValue Conv =
      DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned LaneWidth =
      getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
                              DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
  return Shift;
}

static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
                                       const ARMSubtarget *ST) {
  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
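  // As above, the lane must be a constant. For sub-32-bit elements extracted
  // to an i32, the code below uses ARMISD::VGETLANEu, the zero-extending
  // get-lane form (e.g. "vmov.u16 r0, d0[2]").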
  SDValue Lane = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  EVT VT = Vec.getValueType();

  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);

  if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
    SDLoc dl(Op);
    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  }

  return Op;
}

static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  EVT Op2VT = V2.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  assert(Op1VT == Op2VT && "Operand types don't match!");
  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom CONCAT_VECTORS lowering");
  assert(ST->hasMVEIntegerOps() &&
         "CONCAT_VECTORS lowering only supported for MVE");

  SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
  SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);

  // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType =
      getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  // Extract the vector elements from Op1 and Op2 one by one and truncate them
  // to be the right size for the destination. For example, if Op1 is v4i1
  // then the promoted vector is v4i32. The result of concatenation gives a
  // v8i1, which when promoted is v8i16. That means each i32 element from Op1
  // needs truncating to i16 and inserting in the result.
  EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
  SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
  auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
    EVT NewVT = NewV.getValueType();
    EVT ConcatVT = ConVec.getValueType();
    for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
                                DAG.getIntPtrConstant(i, dl));
      ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
                           DAG.getConstant(j, dl, MVT::i32));
    }
    return ConVec;
  };
  unsigned j = 0;
  ConVec = ExtractInto(NewV1, ConVec, j);
  ConVec = ExtractInto(NewV2, ConVec, j);

  // Now return the result of comparing the concatenated vector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
                                   const ARMSubtarget *ST) {
  EVT VT = Op->getValueType(0);
  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerCONCAT_VECTORS_i1(Op, DAG, ST);

  // The only time a CONCAT_VECTORS operation can have legal types is when
  // two 64-bit vectors are concatenated to a 128-bit vector.
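  // For example, concat(v8i8, v8i8) -> v16i8. The lowering below performs
  // the concatenation by inserting each 64-bit half as an f64 element of a
  // v2f64 and bitcasting back.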
  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
         "unexpected CONCAT_VECTORS");
  SDLoc dl(Op);
  SDValue Val = DAG.getUNDEF(MVT::v2f64);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  if (!Op0.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
                      DAG.getIntPtrConstant(0, dl));
  if (!Op1.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
                      DAG.getIntPtrConstant(1, dl));
  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
}

static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();

  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom EXTRACT_SUBVECTOR lowering");
  assert(ST->hasMVEIntegerOps() &&
         "EXTRACT_SUBVECTOR lowering only supported for MVE");

  SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);

  // We now have Op1 promoted to a vector of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType =
      getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  EVT SubVT = MVT::getVectorVT(ElType, NumElts);
  SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
  for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
                              DAG.getIntPtrConstant(i, dl));
    SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
                         DAG.getConstant(j, dl, MVT::i32));
  }

  // Now return the result of comparing the subvector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  EVT VT = N->getValueType(0);
  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    if (BVN->getValueType(0) != MVT::v4i32 ||
        BVN->getOpcode() != ISD::BUILD_VECTOR)
      return false;
    unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    unsigned HiElt = 1 - LoElt;
    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
      return false;
    if (isSigned) {
      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
        return true;
    } else {
      if (Hi0->isNullValue() && Hi1->isNullValue())
        return true;
    }
    return false;
  }

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
      continue;
    }
    return false;
  }

  return true;
}

/// isSignExtended - Check if a node is a vector value that is sign-extended
/// or a constant BUILD_VECTOR with sign-extended elements.
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, true))
    return true;
  return false;
}

/// isZeroExtended - Check if a node is a vector value that is zero-extended
/// or a constant BUILD_VECTOR with zero-extended elements.
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, false))
    return true;
  return false;
}

static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  if (OrigVT.getSizeInBits() >= 64)
    return OrigVT;

  assert(OrigVT.isSimple() && "Expecting a simple value type");

  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
  switch (OrigSimpleTy) {
  default: llvm_unreachable("Unexpected Vector Type");
  case MVT::v2i8:
  case MVT::v2i16:
    return MVT::v2i32;
  case MVT::v4i8:
    return MVT::v4i16;
  }
}

/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the
/// total value size to 64 bits. We need a 64-bit D register as an operand to
/// VMULL. We insert the required extension here to get the vector to fill a
/// D register.
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
                                            const EVT &OrigTy,
                                            const EVT &ExtTy,
                                            unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  // 64-bits we need to insert a new extension so that it will be 64-bits.
  assert(ExtTy.is128BitVector() && "Unexpected extension size");
  if (OrigTy.getSizeInBits() >= 64)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
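  // For example (illustrative): a v4i8 operand that was extended to v4i32 is
  // re-extended here only as far as v4i16, so that it fills a D register and
  // a single VMULL can produce the v4i32 result.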
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}

/// SkipLoadExtensionForVMULL - Return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());

  // The load already has the right type.
  if (ExtendedTy == LD->getMemoryVT())
    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                       LD->getBasePtr(), LD->getPointerInfo(),
                       LD->getAlignment(), LD->getMemOperand()->getFlags());

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/sext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                        LD->getMemoryVT(), LD->getAlignment(),
                        LD->getMemOperand()->getFlags());
}

/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// extending load, or BUILD_VECTOR with extended elements, return the
/// unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits, we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
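  // For example, a v8i16 BUILD_VECTOR whose constants all fit in i8 is
  // rebuilt as a v8i8 vector; the i32 element values are implicitly
  // truncated, leaving a valid 64-bit VMULL operand.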
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}

static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}

static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}

static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  if (isN0SExt && isN1SExt)
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    if (isN0ZExt && isN1ZExt)
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this. It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    Op0 = SkipExtensionForVMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  //   vmull q0, d4, d6
  //   vmlal q0, d5, d6
  // is faster than
  //   vaddl q0, d4, d5
  //   vmovl q1, d6
  //   vmul  q0, q0, q1
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00),
                                 Op1),
                     DAG.getNode(NewOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01),
                                 Op1));
}

static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
                              SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                  DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                  Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any Newton steps. This requires that we use a weird bias
  // of 0xb000, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}

static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                               SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
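  // VRECPE only gives an estimate of the reciprocal; each VRECPS step
  // performs one Newton-Raphson iteration, recip = recip * (2 - y * recip),
  // since VRECPS(a, b) computes 2 - a * b.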
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single Newton step. This requires that we use a weird bias
  // of 0x89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}

static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::SDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
    return N0;
  }
  return LowerSDIV_v4i16(N0, N1, dl, DAG);
}

static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  // TODO: Should this propagate fast-math-flags?
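  // The strategy mirrors LowerSDIV: widen the lanes to i32, divide via a
  // float reciprocal estimate, and narrow the result. v8i8 is zero-extended
  // to v8i16 and split into two v4i16 halves (which, being non-negative, can
  // reuse LowerSDIV_v4i16), then saturated back to v8i8 with vqmovnsu.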
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                     MVT::i32),
                     N0);
    return N0;
  }

  // v4i16 udiv: convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and two refinement steps.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   BN1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large).
  // float4 result = as_float4(as_int4(xf*recip) + 2);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(2, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_u32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}

static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  EVT VT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  SDValue Carry = Op.getOperand(2);

  SDLoc DL(Op);

  SDValue Result;
  if (Op.getOpcode() == ISD::ADDCARRY) {
    // This converts the boolean value carry into the carry flag.
8840 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 8841 8842 // Do the addition proper using the carry flag we wanted. 8843 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 8844 Op.getOperand(1), Carry); 8845 8846 // Now convert the carry flag into a boolean value. 8847 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 8848 } else { 8849 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 8850 // have to invert the carry first. 8851 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 8852 DAG.getConstant(1, DL, MVT::i32), Carry); 8853 // This converts the boolean value carry into the carry flag. 8854 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 8855 8856 // Do the subtraction proper using the carry flag we wanted. 8857 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 8858 Op.getOperand(1), Carry); 8859 8860 // Now convert the carry flag into a boolean value. 8861 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 8862 // But the carry returned by ARMISD::SUBE is not a borrow as expected 8863 // by ISD::SUBCARRY, so compute 1 - C. 8864 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 8865 DAG.getConstant(1, DL, MVT::i32), Carry); 8866 } 8867 8868 // Return both values. 8869 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 8870} 8871 8872SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 8873 assert(Subtarget->isTargetDarwin()); 8874 8875 // For iOS, we want to call an alternative entry point: __sincos_stret, 8876 // return values are passed via sret. 8877 SDLoc dl(Op); 8878 SDValue Arg = Op.getOperand(0); 8879 EVT ArgVT = Arg.getValueType(); 8880 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 8881 auto PtrVT = getPointerTy(DAG.getDataLayout()); 8882 8883 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8884 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8885 8886 // Pair of floats / doubles used to pass the result. 8887 Type *RetTy = StructType::get(ArgTy, ArgTy); 8888 auto &DL = DAG.getDataLayout(); 8889 8890 ArgListTy Args; 8891 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 8892 SDValue SRet; 8893 if (ShouldUseSRet) { 8894 // Create stack object for sret. 8895 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 8896 const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); 8897 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 8898 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 8899 8900 ArgListEntry Entry; 8901 Entry.Node = SRet; 8902 Entry.Ty = RetTy->getPointerTo(); 8903 Entry.IsSExt = false; 8904 Entry.IsZExt = false; 8905 Entry.IsSRet = true; 8906 Args.push_back(Entry); 8907 RetTy = Type::getVoidTy(*DAG.getContext()); 8908 } 8909 8910 ArgListEntry Entry; 8911 Entry.Node = Arg; 8912 Entry.Ty = ArgTy; 8913 Entry.IsSExt = false; 8914 Entry.IsZExt = false; 8915 Args.push_back(Entry); 8916 8917 RTLIB::Libcall LC = 8918 (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 8919 const char *LibcallName = getLibcallName(LC); 8920 CallingConv::ID CC = getLibcallCallingConv(LC); 8921 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 8922 8923 TargetLowering::CallLoweringInfo CLI(DAG); 8924 CLI.setDebugLoc(dl) 8925 .setChain(DAG.getEntryNode()) 8926 .setCallee(CC, RetTy, Callee, std::move(Args)) 8927 .setDiscardResult(ShouldUseSRet); 8928 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 8929 8930 if (!ShouldUseSRet) 8931 return CallResult.first; 8932 8933 SDValue LoadSin = 8934 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 8935 8936 // Address of cos field. 8937 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 8938 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 8939 SDValue LoadCos = 8940 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 8941 8942 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 8943 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 8944 LoadSin.getValue(0), LoadCos.getValue(0)); 8945} 8946 8947SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 8948 bool Signed, 8949 SDValue &Chain) const { 8950 EVT VT = Op.getValueType(); 8951 assert((VT == MVT::i32 || VT == MVT::i64) && 8952 "unexpected type for custom lowering DIV"); 8953 SDLoc dl(Op); 8954 8955 const auto &DL = DAG.getDataLayout(); 8956 const auto &TLI = DAG.getTargetLoweringInfo(); 8957 8958 const char *Name = nullptr; 8959 if (Signed) 8960 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 8961 else 8962 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; 8963 8964 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 8965 8966 ARMTargetLowering::ArgListTy Args; 8967 8968 for (auto AI : {1, 0}) { 8969 ArgListEntry Arg; 8970 Arg.Node = Op.getOperand(AI); 8971 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 8972 Args.push_back(Arg); 8973 } 8974 8975 CallLoweringInfo CLI(DAG); 8976 CLI.setDebugLoc(dl) 8977 .setChain(Chain) 8978 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 8979 ES, std::move(Args)); 8980 8981 return LowerCallTo(CLI).first; 8982} 8983 8984// This is a code size optimisation: return the original SDIV node to 8985// DAGCombiner when we don't want to expand SDIV into a sequence of 8986// instructions, and an empty node otherwise which will cause the 8987// SDIV to be expanded in DAGCombine. 8988SDValue 8989ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 8990 SelectionDAG &DAG, 8991 SmallVectorImpl<SDNode *> &Created) const { 8992 // TODO: Support SREM 8993 if (N->getOpcode() != ISD::SDIV) 8994 return SDValue(); 8995 8996 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); 8997 const bool MinSize = ST.hasMinSize(); 8998 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 8999 : ST.hasDivideInARMMode(); 9000 9001 // Don't touch vector types; rewriting this may lead to scalarizing 9002 // the int divs. 9003 if (N->getOperand(0).getValueType().isVector()) 9004 return SDValue(); 9005 9006 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 9007 // hwdiv support for this to be really profitable. 9008 if (!(MinSize && HasDivide)) 9009 return SDValue(); 9010 9011 // ARM mode is a bit simpler than Thumb: we can handle large power 9012 // of 2 immediates with 1 mov instruction; no further checks required, 9013 // just return the sdiv node. 
  if (!ST.isThumb())
    return SDValue(N, 0);

  // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV and
  // thus lose the code size benefit of a 2-byte MOVS.
  // TargetTransformInfo's 'getIntImmCodeSizeCost' could be helpful here, but
  // as this check does exactly the same thing, it's not worth the trouble of
  // querying TTI.
  if (Divisor.sgt(128))
    return SDValue();

  return SDValue(N, 0);
}

SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
                                            bool Signed) const {
  assert(Op.getValueType() == MVT::i32 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
                               DAG.getEntryNode(), Op.getOperand(1));

  return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
}

static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N,
                                      SDValue InChain) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(1);
  if (N->getValueType(0) == MVT::i32)
    return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(1, DL, MVT::i32));
  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
                     DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
}

void ARMTargetLowering::ExpandDIV_Windows(
    SDValue Op, SelectionDAG &DAG, bool Signed,
    SmallVectorImpl<SDValue> &Results) const {
  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Op.getValueType() == MVT::i64 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK =
      WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());

  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);

  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);

  Results.push_back(Lower);
  Results.push_back(Upper);
}

static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == Op.getValueType());
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
         "Expected a non-extending load");
  assert(LD->isUnindexed() && "Expected an unindexed load");

  // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16-bit
  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
  // need to make sure that the 8/4 bits are actually loaded into the correct
  // place, which means loading the value and then shuffling the values into
  // the bottom bits of the predicate.
  // Equally, VLDR for a v16i1 will actually load 32 bits (so will be
  // incorrect for BE).
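  // For example, a v4i1 load is built roughly as (illustrative sketch only):
  //   Load: i32,ch = extload [base]             ; widen the predicate bits
  //   Pred: v16i1  = ARMISD::PREDICATE_CAST Load
  //   Res:  v4i1   = extract_subvector Pred, 0  ; keep the bottom 4 lanes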

  SDLoc dl(Op);
  SDValue Load = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      LD->getMemOperand());
  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
  if (MemVT != MVT::v16i1)
    Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
                       DAG.getConstant(0, dl, MVT::i32));
  return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}

static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == ST->getValue().getValueType());
  assert(!ST->isTruncatingStore() && "Expected a non-truncating store");
  assert(ST->isUnindexed() && "Expected an unindexed store");

  // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top
  // bits unset and a scalar store.
  SDLoc dl(Op);
  SDValue Build = ST->getValue();
  if (MemVT != MVT::v16i1) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
                                DAG.getConstant(I, dl, MVT::i32)));
    for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
      Ops.push_back(DAG.getUNDEF(MVT::i32));
    Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
  }
  SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
  return DAG.getTruncStore(
      ST->getChain(), dl, GRP, ST->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      ST->getMemOperand());
}

static bool isZeroVector(SDValue N) {
  return (ISD::isBuildVectorAllZeros(N.getNode()) ||
          (N->getOpcode() == ARMISD::VMOVIMM &&
           isNullConstant(N->getOperand(0))));
}

static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  SDValue Mask = N->getMask();
  SDValue PassThru = N->getPassThru();
  SDLoc dl(Op);

  if (isZeroVector(PassThru))
    return Op;

  // MVE masked loads use zero as the passthru value. Here we convert undef
  // to zero too, and other values are lowered to a select.
  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(0, dl, MVT::i32));
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
      N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
      N->getExtensionType(), N->isExpandingLoad());
  SDValue Combo = NewLoad;
  if (!PassThru.isUndef() &&
      (PassThru.getOpcode() != ISD::BITCAST ||
       !isZeroVector(PassThru->getOperand(0))))
    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}

static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
    // Acquire/Release load/store is not legal for targets without a dmb or
    // equivalent available.
    return SDValue();

  // Monotonic load/store is legal for all targets.
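  // (Monotonic ordering only requires the access itself to be atomic, which
  //  a plain, naturally-aligned LDR/STR already is, so no barrier is needed.)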
9173 return Op; 9174} 9175 9176static void ReplaceREADCYCLECOUNTER(SDNode *N, 9177 SmallVectorImpl<SDValue> &Results, 9178 SelectionDAG &DAG, 9179 const ARMSubtarget *Subtarget) { 9180 SDLoc DL(N); 9181 // Under Power Management extensions, the cycle-count is: 9182 // mrc p15, #0, <Rt>, c9, c13, #0 9183 SDValue Ops[] = { N->getOperand(0), // Chain 9184 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 9185 DAG.getTargetConstant(15, DL, MVT::i32), 9186 DAG.getTargetConstant(0, DL, MVT::i32), 9187 DAG.getTargetConstant(9, DL, MVT::i32), 9188 DAG.getTargetConstant(13, DL, MVT::i32), 9189 DAG.getTargetConstant(0, DL, MVT::i32) 9190 }; 9191 9192 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 9193 DAG.getVTList(MVT::i32, MVT::Other), Ops); 9194 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 9195 DAG.getConstant(0, DL, MVT::i32))); 9196 Results.push_back(Cycles32.getValue(1)); 9197} 9198 9199static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 9200 SDLoc dl(V.getNode()); 9201 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 9202 SDValue VHi = DAG.getAnyExtOrTrunc( 9203 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 9204 dl, MVT::i32); 9205 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9206 if (isBigEndian) 9207 std::swap (VLo, VHi); 9208 SDValue RegClass = 9209 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 9210 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 9211 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 9212 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 9213 return SDValue( 9214 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 9215} 9216 9217static void ReplaceCMP_SWAP_64Results(SDNode *N, 9218 SmallVectorImpl<SDValue> & Results, 9219 SelectionDAG &DAG) { 9220 assert(N->getValueType(0) == MVT::i64 && 9221 "AtomicCmpSwap on types less than 64 should be legal"); 9222 SDValue Ops[] = {N->getOperand(1), 9223 createGPRPairNode(DAG, N->getOperand(2)), 9224 createGPRPairNode(DAG, N->getOperand(3)), 9225 N->getOperand(0)}; 9226 SDNode *CmpSwap = DAG.getMachineNode( 9227 ARM::CMP_SWAP_64, SDLoc(N), 9228 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 9229 9230 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 9231 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 9232 9233 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9234 9235 Results.push_back( 9236 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 9237 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 9238 Results.push_back( 9239 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, 9240 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 9241 Results.push_back(SDValue(CmpSwap, 2)); 9242} 9243 9244SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { 9245 SDLoc dl(Op); 9246 EVT VT = Op.getValueType(); 9247 SDValue Chain = Op.getOperand(0); 9248 SDValue LHS = Op.getOperand(1); 9249 SDValue RHS = Op.getOperand(2); 9250 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get(); 9251 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; 9252 9253 // If we don't have instructions of this float type then soften to a libcall 9254 // and use SETCC instead. 
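  // For example, an f64 SETOLT would be softened to the corresponding
  // comparison libcall (e.g. __aeabi_dcmplt under AEABI), whose i32 result
  // is then tested with an integer SETCC.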
9255 if (isUnsupportedFloatingType(LHS.getValueType())) { 9256 DAG.getTargetLoweringInfo().softenSetCCOperands( 9257 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling); 9258 if (!RHS.getNode()) { 9259 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 9260 CC = ISD::SETNE; 9261 } 9262 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS, 9263 DAG.getCondCode(CC)); 9264 return DAG.getMergeValues({Result, Chain}, dl); 9265 } 9266 9267 ARMCC::CondCodes CondCode, CondCode2; 9268 FPCCToARMCC(CC, CondCode, CondCode2); 9269 9270 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit 9271 // in CMPFP and CMPFPE, but instead it should be made explicit by these 9272 // instructions using a chain instead of glue. This would also fix the problem 9273 // here (and also in LowerSELECT_CC) where we generate two comparisons when 9274 // CondCode2 != AL. 9275 SDValue True = DAG.getConstant(1, dl, VT); 9276 SDValue False = DAG.getConstant(0, dl, VT); 9277 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 9278 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 9279 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 9280 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG); 9281 if (CondCode2 != ARMCC::AL) { 9282 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 9283 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 9284 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG); 9285 } 9286 return DAG.getMergeValues({Result, Chain}, dl); 9287} 9288 9289SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9290 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); 9291 switch (Op.getOpcode()) { 9292 default: llvm_unreachable("Don't know how to custom lower this!"); 9293 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 9294 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9295 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9296 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9297 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9298 case ISD::SELECT: return LowerSELECT(Op, DAG); 9299 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 9300 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9301 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 9302 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 9303 case ISD::VASTART: return LowerVASTART(Op, DAG); 9304 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 9305 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 9306 case ISD::SINT_TO_FP: 9307 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 9308 case ISD::STRICT_FP_TO_SINT: 9309 case ISD::STRICT_FP_TO_UINT: 9310 case ISD::FP_TO_SINT: 9311 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 9312 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9313 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 9314 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9315 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 9316 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 9317 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 9318 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); 9319 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 9320 Subtarget); 9321 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 9322 case ISD::SHL: 9323 case ISD::SRL: 9324 case ISD::SRA: return LowerShift(Op.getNode(), DAG, 
Subtarget); 9325 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 9326 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 9327 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 9328 case ISD::SRL_PARTS: 9329 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 9330 case ISD::CTTZ: 9331 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 9332 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 9333 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); 9334 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); 9335 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 9336 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 9337 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); 9338 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); 9339 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9340 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); 9341 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); 9342 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9343 case ISD::MUL: return LowerMUL(Op, DAG); 9344 case ISD::SDIV: 9345 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9346 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 9347 return LowerSDIV(Op, DAG, Subtarget); 9348 case ISD::UDIV: 9349 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9350 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 9351 return LowerUDIV(Op, DAG, Subtarget); 9352 case ISD::ADDCARRY: 9353 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); 9354 case ISD::SADDO: 9355 case ISD::SSUBO: 9356 return LowerSignedALUO(Op, DAG); 9357 case ISD::UADDO: 9358 case ISD::USUBO: 9359 return LowerUnsignedALUO(Op, DAG); 9360 case ISD::SADDSAT: 9361 case ISD::SSUBSAT: 9362 return LowerSADDSUBSAT(Op, DAG, Subtarget); 9363 case ISD::LOAD: 9364 return LowerPredicateLoad(Op, DAG); 9365 case ISD::STORE: 9366 return LowerPredicateStore(Op, DAG); 9367 case ISD::MLOAD: 9368 return LowerMLOAD(Op, DAG); 9369 case ISD::ATOMIC_LOAD: 9370 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 9371 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 9372 case ISD::SDIVREM: 9373 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 9374 case ISD::DYNAMIC_STACKALLOC: 9375 if (Subtarget->isTargetWindows()) 9376 return LowerDYNAMIC_STACKALLOC(Op, DAG); 9377 llvm_unreachable("Don't know how to custom lower this!"); 9378 case ISD::STRICT_FP_ROUND: 9379 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 9380 case ISD::STRICT_FP_EXTEND: 9381 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 9382 case ISD::STRICT_FSETCC: 9383 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); 9384 case ARMISD::WIN__DBZCHK: return SDValue(); 9385 } 9386} 9387 9388static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 9389 SelectionDAG &DAG) { 9390 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 9391 unsigned Opc = 0; 9392 if (IntNo == Intrinsic::arm_smlald) 9393 Opc = ARMISD::SMLALD; 9394 else if (IntNo == Intrinsic::arm_smlaldx) 9395 Opc = ARMISD::SMLALDX; 9396 else if (IntNo == Intrinsic::arm_smlsld) 9397 Opc = ARMISD::SMLSLD; 9398 else if (IntNo == Intrinsic::arm_smlsldx) 9399 Opc = ARMISD::SMLSLDX; 9400 else 9401 return; 9402 9403 SDLoc dl(N); 9404 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9405 N->getOperand(3), 9406 DAG.getConstant(0, 
dl, MVT::i32)); 9407 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9408 N->getOperand(3), 9409 DAG.getConstant(1, dl, MVT::i32)); 9410 9411 SDValue LongMul = DAG.getNode(Opc, dl, 9412 DAG.getVTList(MVT::i32, MVT::i32), 9413 N->getOperand(1), N->getOperand(2), 9414 Lo, Hi); 9415 Results.push_back(LongMul.getValue(0)); 9416 Results.push_back(LongMul.getValue(1)); 9417} 9418 9419/// ReplaceNodeResults - Replace the results of node with an illegal result 9420/// type with new values built out of custom code. 9421void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 9422 SmallVectorImpl<SDValue> &Results, 9423 SelectionDAG &DAG) const { 9424 SDValue Res; 9425 switch (N->getOpcode()) { 9426 default: 9427 llvm_unreachable("Don't know how to custom expand this!"); 9428 case ISD::READ_REGISTER: 9429 ExpandREAD_REGISTER(N, Results, DAG); 9430 break; 9431 case ISD::BITCAST: 9432 Res = ExpandBITCAST(N, DAG, Subtarget); 9433 break; 9434 case ISD::SRL: 9435 case ISD::SRA: 9436 case ISD::SHL: 9437 Res = Expand64BitShift(N, DAG, Subtarget); 9438 break; 9439 case ISD::SREM: 9440 case ISD::UREM: 9441 Res = LowerREM(N, DAG); 9442 break; 9443 case ISD::SDIVREM: 9444 case ISD::UDIVREM: 9445 Res = LowerDivRem(SDValue(N, 0), DAG); 9446 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 9447 Results.push_back(Res.getValue(0)); 9448 Results.push_back(Res.getValue(1)); 9449 return; 9450 case ISD::SADDSAT: 9451 case ISD::SSUBSAT: 9452 Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); 9453 break; 9454 case ISD::READCYCLECOUNTER: 9455 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 9456 return; 9457 case ISD::UDIV: 9458 case ISD::SDIV: 9459 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 9460 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 9461 Results); 9462 case ISD::ATOMIC_CMP_SWAP: 9463 ReplaceCMP_SWAP_64Results(N, Results, DAG); 9464 return; 9465 case ISD::INTRINSIC_WO_CHAIN: 9466 return ReplaceLongIntrinsic(N, Results, DAG); 9467 case ISD::ABS: 9468 lowerABS(N, Results, DAG); 9469 return ; 9470 9471 } 9472 if (Res.getNode()) 9473 Results.push_back(Res); 9474} 9475 9476//===----------------------------------------------------------------------===// 9477// ARM Scheduler Hooks 9478//===----------------------------------------------------------------------===// 9479 9480/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 9481/// registers the function context. 9482void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 9483 MachineBasicBlock *MBB, 9484 MachineBasicBlock *DispatchBB, 9485 int FI) const { 9486 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 9487 "ROPI/RWPI not currently supported with SjLj"); 9488 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9489 DebugLoc dl = MI.getDebugLoc(); 9490 MachineFunction *MF = MBB->getParent(); 9491 MachineRegisterInfo *MRI = &MF->getRegInfo(); 9492 MachineConstantPool *MCP = MF->getConstantPool(); 9493 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 9494 const Function &F = MF->getFunction(); 9495 9496 bool isThumb = Subtarget->isThumb(); 9497 bool isThumb2 = Subtarget->isThumb2(); 9498 9499 unsigned PCLabelId = AFI->createPICLabelUId(); 9500 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 9501 ARMConstantPoolValue *CPV = 9502 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 9503 unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); 9504 9505 const TargetRegisterClass *TRC = isThumb ? 
&ARM::tGPRRegClass 9506 : &ARM::GPRRegClass; 9507 9508 // Grab constant pool and fixed stack memory operands. 9509 MachineMemOperand *CPMMO = 9510 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 9511 MachineMemOperand::MOLoad, 4, 4); 9512 9513 MachineMemOperand *FIMMOSt = 9514 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 9515 MachineMemOperand::MOStore, 4, 4); 9516 9517 // Load the address of the dispatch MBB into the jump buffer. 9518 if (isThumb2) { 9519 // Incoming value: jbuf 9520 // ldr.n r5, LCPI1_1 9521 // orr r5, r5, #1 9522 // add r5, pc 9523 // str r5, [$jbuf, #+4] ; &jbuf[1] 9524 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9525 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 9526 .addConstantPoolIndex(CPI) 9527 .addMemOperand(CPMMO) 9528 .add(predOps(ARMCC::AL)); 9529 // Set the low bit because of thumb mode. 9530 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9531 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 9532 .addReg(NewVReg1, RegState::Kill) 9533 .addImm(0x01) 9534 .add(predOps(ARMCC::AL)) 9535 .add(condCodeOp()); 9536 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9537 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 9538 .addReg(NewVReg2, RegState::Kill) 9539 .addImm(PCLabelId); 9540 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 9541 .addReg(NewVReg3, RegState::Kill) 9542 .addFrameIndex(FI) 9543 .addImm(36) // &jbuf[1] :: pc 9544 .addMemOperand(FIMMOSt) 9545 .add(predOps(ARMCC::AL)); 9546 } else if (isThumb) { 9547 // Incoming value: jbuf 9548 // ldr.n r1, LCPI1_4 9549 // add r1, pc 9550 // mov r2, #1 9551 // orrs r1, r2 9552 // add r2, $jbuf, #+4 ; &jbuf[1] 9553 // str r1, [r2] 9554 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9555 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 9556 .addConstantPoolIndex(CPI) 9557 .addMemOperand(CPMMO) 9558 .add(predOps(ARMCC::AL)); 9559 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9560 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 9561 .addReg(NewVReg1, RegState::Kill) 9562 .addImm(PCLabelId); 9563 // Set the low bit because of thumb mode. 
9564 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9565 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 9566 .addReg(ARM::CPSR, RegState::Define) 9567 .addImm(1) 9568 .add(predOps(ARMCC::AL)); 9569 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9570 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 9571 .addReg(ARM::CPSR, RegState::Define) 9572 .addReg(NewVReg2, RegState::Kill) 9573 .addReg(NewVReg3, RegState::Kill) 9574 .add(predOps(ARMCC::AL)); 9575 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9576 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 9577 .addFrameIndex(FI) 9578 .addImm(36); // &jbuf[1] :: pc 9579 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 9580 .addReg(NewVReg4, RegState::Kill) 9581 .addReg(NewVReg5, RegState::Kill) 9582 .addImm(0) 9583 .addMemOperand(FIMMOSt) 9584 .add(predOps(ARMCC::AL)); 9585 } else { 9586 // Incoming value: jbuf 9587 // ldr r1, LCPI1_1 9588 // add r1, pc, r1 9589 // str r1, [$jbuf, #+4] ; &jbuf[1] 9590 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9591 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 9592 .addConstantPoolIndex(CPI) 9593 .addImm(0) 9594 .addMemOperand(CPMMO) 9595 .add(predOps(ARMCC::AL)); 9596 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9597 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 9598 .addReg(NewVReg1, RegState::Kill) 9599 .addImm(PCLabelId) 9600 .add(predOps(ARMCC::AL)); 9601 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 9602 .addReg(NewVReg2, RegState::Kill) 9603 .addFrameIndex(FI) 9604 .addImm(36) // &jbuf[1] :: pc 9605 .addMemOperand(FIMMOSt) 9606 .add(predOps(ARMCC::AL)); 9607 } 9608} 9609 9610void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 9611 MachineBasicBlock *MBB) const { 9612 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9613 DebugLoc dl = MI.getDebugLoc(); 9614 MachineFunction *MF = MBB->getParent(); 9615 MachineRegisterInfo *MRI = &MF->getRegInfo(); 9616 MachineFrameInfo &MFI = MF->getFrameInfo(); 9617 int FI = MFI.getFunctionContextIndex(); 9618 9619 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 9620 : &ARM::GPRnopcRegClass; 9621 9622 // Get a mapping of the call site numbers to all of the landing pads they're 9623 // associated with. 9624 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 9625 unsigned MaxCSNum = 0; 9626 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 9627 ++BB) { 9628 if (!BB->isEHPad()) continue; 9629 9630 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 9631 // pad. 9632 for (MachineBasicBlock::iterator 9633 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 9634 if (!II->isEHLabel()) continue; 9635 9636 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 9637 if (!MF->hasCallSiteLandingPad(Sym)) continue; 9638 9639 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 9640 for (SmallVectorImpl<unsigned>::iterator 9641 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 9642 CSI != CSE; ++CSI) { 9643 CallSiteNumToLPad[*CSI].push_back(&*BB); 9644 MaxCSNum = std::max(MaxCSNum, *CSI); 9645 } 9646 break; 9647 } 9648 } 9649 9650 // Get an ordered list of the machine basic blocks for the jump table. 
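  // The jump table is indexed by call site number. Illustratively, if call
  // sites 1 and 3 share landing pad LP1 and call site 2 uses LP2, LPadList
  // ends up as [LP1, LP2, LP1].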
  std::vector<MachineBasicBlock*> LPadList;
  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());
  for (unsigned I = 1; I <= MaxCSNum; ++I) {
    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
    for (SmallVectorImpl<MachineBasicBlock*>::iterator
           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
      LPadList.push_back(*II);
      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the jump table and associated information.
  MachineJumpTableInfo *JTI =
      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function
  // context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad();

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  unsigned trap_opcode;
  if (Subtarget->isThumb())
    trap_opcode = ARM::tTRAP;
  else
    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;

  BuildMI(TrapBB, dl, TII->get(trap_opcode));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert the MBBs into the function.
  MF->insert(MF->end(), DispatchBB);
  MF->insert(MF->end(), DispContBB);
  MF->insert(MF->end(), TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);

  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);

  MachineInstrBuilder MIB;
  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));

  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();

  // Add a register mask with no preserved registers. This results in all
  // registers being marked as clobbered. This can't work if the dispatch
  // block is in a Thumb1 function and is linked with ARM code which uses
  // the FP registers, as there is no way to preserve the FP registers in
  // Thumb1 mode.
9713 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 9714 9715 bool IsPositionIndependent = isPositionIndependent(); 9716 unsigned NumLPads = LPadList.size(); 9717 if (Subtarget->isThumb2()) { 9718 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9719 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 9720 .addFrameIndex(FI) 9721 .addImm(4) 9722 .addMemOperand(FIMMOLd) 9723 .add(predOps(ARMCC::AL)); 9724 9725 if (NumLPads < 256) { 9726 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 9727 .addReg(NewVReg1) 9728 .addImm(LPadList.size()) 9729 .add(predOps(ARMCC::AL)); 9730 } else { 9731 Register VReg1 = MRI->createVirtualRegister(TRC); 9732 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 9733 .addImm(NumLPads & 0xFFFF) 9734 .add(predOps(ARMCC::AL)); 9735 9736 unsigned VReg2 = VReg1; 9737 if ((NumLPads & 0xFFFF0000) != 0) { 9738 VReg2 = MRI->createVirtualRegister(TRC); 9739 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 9740 .addReg(VReg1) 9741 .addImm(NumLPads >> 16) 9742 .add(predOps(ARMCC::AL)); 9743 } 9744 9745 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 9746 .addReg(NewVReg1) 9747 .addReg(VReg2) 9748 .add(predOps(ARMCC::AL)); 9749 } 9750 9751 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 9752 .addMBB(TrapBB) 9753 .addImm(ARMCC::HI) 9754 .addReg(ARM::CPSR); 9755 9756 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9757 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 9758 .addJumpTableIndex(MJTI) 9759 .add(predOps(ARMCC::AL)); 9760 9761 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9762 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 9763 .addReg(NewVReg3, RegState::Kill) 9764 .addReg(NewVReg1) 9765 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 9766 .add(predOps(ARMCC::AL)) 9767 .add(condCodeOp()); 9768 9769 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 9770 .addReg(NewVReg4, RegState::Kill) 9771 .addReg(NewVReg1) 9772 .addJumpTableIndex(MJTI); 9773 } else if (Subtarget->isThumb()) { 9774 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9775 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 9776 .addFrameIndex(FI) 9777 .addImm(1) 9778 .addMemOperand(FIMMOLd) 9779 .add(predOps(ARMCC::AL)); 9780 9781 if (NumLPads < 256) { 9782 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 9783 .addReg(NewVReg1) 9784 .addImm(NumLPads) 9785 .add(predOps(ARMCC::AL)); 9786 } else { 9787 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9788 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9789 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 9790 9791 // MachineConstantPool wants an explicit alignment. 
9792 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9793 if (Align == 0) 9794 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9795 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9796 9797 Register VReg1 = MRI->createVirtualRegister(TRC); 9798 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 9799 .addReg(VReg1, RegState::Define) 9800 .addConstantPoolIndex(Idx) 9801 .add(predOps(ARMCC::AL)); 9802 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 9803 .addReg(NewVReg1) 9804 .addReg(VReg1) 9805 .add(predOps(ARMCC::AL)); 9806 } 9807 9808 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 9809 .addMBB(TrapBB) 9810 .addImm(ARMCC::HI) 9811 .addReg(ARM::CPSR); 9812 9813 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9814 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 9815 .addReg(ARM::CPSR, RegState::Define) 9816 .addReg(NewVReg1) 9817 .addImm(2) 9818 .add(predOps(ARMCC::AL)); 9819 9820 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9821 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 9822 .addJumpTableIndex(MJTI) 9823 .add(predOps(ARMCC::AL)); 9824 9825 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9826 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 9827 .addReg(ARM::CPSR, RegState::Define) 9828 .addReg(NewVReg2, RegState::Kill) 9829 .addReg(NewVReg3) 9830 .add(predOps(ARMCC::AL)); 9831 9832 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 9833 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 9834 9835 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9836 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 9837 .addReg(NewVReg4, RegState::Kill) 9838 .addImm(0) 9839 .addMemOperand(JTMMOLd) 9840 .add(predOps(ARMCC::AL)); 9841 9842 unsigned NewVReg6 = NewVReg5; 9843 if (IsPositionIndependent) { 9844 NewVReg6 = MRI->createVirtualRegister(TRC); 9845 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 9846 .addReg(ARM::CPSR, RegState::Define) 9847 .addReg(NewVReg5, RegState::Kill) 9848 .addReg(NewVReg3) 9849 .add(predOps(ARMCC::AL)); 9850 } 9851 9852 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 9853 .addReg(NewVReg6, RegState::Kill) 9854 .addJumpTableIndex(MJTI); 9855 } else { 9856 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9857 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 9858 .addFrameIndex(FI) 9859 .addImm(4) 9860 .addMemOperand(FIMMOLd) 9861 .add(predOps(ARMCC::AL)); 9862 9863 if (NumLPads < 256) { 9864 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 9865 .addReg(NewVReg1) 9866 .addImm(NumLPads) 9867 .add(predOps(ARMCC::AL)); 9868 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 9869 Register VReg1 = MRI->createVirtualRegister(TRC); 9870 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 9871 .addImm(NumLPads & 0xFFFF) 9872 .add(predOps(ARMCC::AL)); 9873 9874 unsigned VReg2 = VReg1; 9875 if ((NumLPads & 0xFFFF0000) != 0) { 9876 VReg2 = MRI->createVirtualRegister(TRC); 9877 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 9878 .addReg(VReg1) 9879 .addImm(NumLPads >> 16) 9880 .add(predOps(ARMCC::AL)); 9881 } 9882 9883 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 9884 .addReg(NewVReg1) 9885 .addReg(VReg2) 9886 .add(predOps(ARMCC::AL)); 9887 } else { 9888 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9889 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9890 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 9891 9892 // MachineConstantPool wants an explicit alignment. 
9893 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9894 if (Align == 0) 9895 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9896 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9897 9898 Register VReg1 = MRI->createVirtualRegister(TRC); 9899 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 9900 .addReg(VReg1, RegState::Define) 9901 .addConstantPoolIndex(Idx) 9902 .addImm(0) 9903 .add(predOps(ARMCC::AL)); 9904 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 9905 .addReg(NewVReg1) 9906 .addReg(VReg1, RegState::Kill) 9907 .add(predOps(ARMCC::AL)); 9908 } 9909 9910 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 9911 .addMBB(TrapBB) 9912 .addImm(ARMCC::HI) 9913 .addReg(ARM::CPSR); 9914 9915 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9916 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 9917 .addReg(NewVReg1) 9918 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 9919 .add(predOps(ARMCC::AL)) 9920 .add(condCodeOp()); 9921 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9922 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 9923 .addJumpTableIndex(MJTI) 9924 .add(predOps(ARMCC::AL)); 9925 9926 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 9927 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 9928 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9929 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 9930 .addReg(NewVReg3, RegState::Kill) 9931 .addReg(NewVReg4) 9932 .addImm(0) 9933 .addMemOperand(JTMMOLd) 9934 .add(predOps(ARMCC::AL)); 9935 9936 if (IsPositionIndependent) { 9937 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 9938 .addReg(NewVReg5, RegState::Kill) 9939 .addReg(NewVReg4) 9940 .addJumpTableIndex(MJTI); 9941 } else { 9942 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 9943 .addReg(NewVReg5, RegState::Kill) 9944 .addJumpTableIndex(MJTI); 9945 } 9946 } 9947 9948 // Add the jump table entries as successors to the MBB. 9949 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 9950 for (std::vector<MachineBasicBlock*>::iterator 9951 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 9952 MachineBasicBlock *CurMBB = *I; 9953 if (SeenMBBs.insert(CurMBB).second) 9954 DispContBB->addSuccessor(CurMBB); 9955 } 9956 9957 // N.B. the order the invoke BBs are processed in doesn't matter here. 9958 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 9959 SmallVector<MachineBasicBlock*, 64> MBBLPads; 9960 for (MachineBasicBlock *BB : InvokeBBs) { 9961 9962 // Remove the landing pad successor from the invoke block and replace it 9963 // with the new dispatch block. 9964 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 9965 BB->succ_end()); 9966 while (!Successors.empty()) { 9967 MachineBasicBlock *SMBB = Successors.pop_back_val(); 9968 if (SMBB->isEHPad()) { 9969 BB->removeSuccessor(SMBB); 9970 MBBLPads.push_back(SMBB); 9971 } 9972 } 9973 9974 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 9975 BB->normalizeSuccProbs(); 9976 9977 // Find the invoke call and mark all of the callee-saved registers as 9978 // 'implicit defined' so that they're spilled. This prevents code from 9979 // moving instructions to before the EH block, where they will never be 9980 // executed. 
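    // (Each such register ends up as an "implicit-def dead" operand on the
    //  call instruction found below, unless the call already defines it.)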
9981 for (MachineBasicBlock::reverse_iterator 9982 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 9983 if (!II->isCall()) continue; 9984 9985 DenseMap<unsigned, bool> DefRegs; 9986 for (MachineInstr::mop_iterator 9987 OI = II->operands_begin(), OE = II->operands_end(); 9988 OI != OE; ++OI) { 9989 if (!OI->isReg()) continue; 9990 DefRegs[OI->getReg()] = true; 9991 } 9992 9993 MachineInstrBuilder MIB(*MF, &*II); 9994 9995 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 9996 unsigned Reg = SavedRegs[i]; 9997 if (Subtarget->isThumb2() && 9998 !ARM::tGPRRegClass.contains(Reg) && 9999 !ARM::hGPRRegClass.contains(Reg)) 10000 continue; 10001 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 10002 continue; 10003 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 10004 continue; 10005 if (!DefRegs[Reg]) 10006 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 10007 } 10008 10009 break; 10010 } 10011 } 10012 10013 // Mark all former landing pads as non-landing pads. The dispatch is the only 10014 // landing pad now. 10015 for (SmallVectorImpl<MachineBasicBlock*>::iterator 10016 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 10017 (*I)->setIsEHPad(false); 10018 10019 // The instruction is gone now. 10020 MI.eraseFromParent(); 10021} 10022 10023static 10024MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 10025 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 10026 E = MBB->succ_end(); I != E; ++I) 10027 if (*I != Succ) 10028 return *I; 10029 llvm_unreachable("Expecting a BB with two successors!"); 10030} 10031 10032/// Return the load opcode for a given load size. If load size >= 8, 10033/// neon opcode will be returned. 10034static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 10035 if (LdSize >= 8) 10036 return LdSize == 16 ? ARM::VLD1q32wb_fixed 10037 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 10038 if (IsThumb1) 10039 return LdSize == 4 ? ARM::tLDRi 10040 : LdSize == 2 ? ARM::tLDRHi 10041 : LdSize == 1 ? ARM::tLDRBi : 0; 10042 if (IsThumb2) 10043 return LdSize == 4 ? ARM::t2LDR_POST 10044 : LdSize == 2 ? ARM::t2LDRH_POST 10045 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 10046 return LdSize == 4 ? ARM::LDR_POST_IMM 10047 : LdSize == 2 ? ARM::LDRH_POST 10048 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 10049} 10050 10051/// Return the store opcode for a given store size. If store size >= 8, 10052/// neon opcode will be returned. 10053static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 10054 if (StSize >= 8) 10055 return StSize == 16 ? ARM::VST1q32wb_fixed 10056 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 10057 if (IsThumb1) 10058 return StSize == 4 ? ARM::tSTRi 10059 : StSize == 2 ? ARM::tSTRHi 10060 : StSize == 1 ? ARM::tSTRBi : 0; 10061 if (IsThumb2) 10062 return StSize == 4 ? ARM::t2STR_POST 10063 : StSize == 2 ? ARM::t2STRH_POST 10064 : StSize == 1 ? ARM::t2STRB_POST : 0; 10065 return StSize == 4 ? ARM::STR_POST_IMM 10066 : StSize == 2 ? ARM::STRH_POST 10067 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 10068} 10069 10070/// Emit a post-increment load operation with given size. The instructions 10071/// will be added to BB at Pos. 
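/// For example, with LdSize == 4 in ARM mode this emits, roughly:
///   ldr Data, [AddrIn], #4   ; LDR_POST_IMM; AddrOut := AddrIn + 4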
10072static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10073 const TargetInstrInfo *TII, const DebugLoc &dl, 10074 unsigned LdSize, unsigned Data, unsigned AddrIn, 10075 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10076 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 10077 assert(LdOpc != 0 && "Should have a load opcode"); 10078 if (LdSize >= 8) { 10079 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10080 .addReg(AddrOut, RegState::Define) 10081 .addReg(AddrIn) 10082 .addImm(0) 10083 .add(predOps(ARMCC::AL)); 10084 } else if (IsThumb1) { 10085 // load + update AddrIn 10086 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10087 .addReg(AddrIn) 10088 .addImm(0) 10089 .add(predOps(ARMCC::AL)); 10090 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10091 .add(t1CondCodeOp()) 10092 .addReg(AddrIn) 10093 .addImm(LdSize) 10094 .add(predOps(ARMCC::AL)); 10095 } else if (IsThumb2) { 10096 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10097 .addReg(AddrOut, RegState::Define) 10098 .addReg(AddrIn) 10099 .addImm(LdSize) 10100 .add(predOps(ARMCC::AL)); 10101 } else { // arm 10102 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10103 .addReg(AddrOut, RegState::Define) 10104 .addReg(AddrIn) 10105 .addReg(0) 10106 .addImm(LdSize) 10107 .add(predOps(ARMCC::AL)); 10108 } 10109} 10110 10111/// Emit a post-increment store operation with given size. The instructions 10112/// will be added to BB at Pos. 10113static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10114 const TargetInstrInfo *TII, const DebugLoc &dl, 10115 unsigned StSize, unsigned Data, unsigned AddrIn, 10116 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10117 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 10118 assert(StOpc != 0 && "Should have a store opcode"); 10119 if (StSize >= 8) { 10120 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10121 .addReg(AddrIn) 10122 .addImm(0) 10123 .addReg(Data) 10124 .add(predOps(ARMCC::AL)); 10125 } else if (IsThumb1) { 10126 // store + update AddrIn 10127 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 10128 .addReg(Data) 10129 .addReg(AddrIn) 10130 .addImm(0) 10131 .add(predOps(ARMCC::AL)); 10132 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10133 .add(t1CondCodeOp()) 10134 .addReg(AddrIn) 10135 .addImm(StSize) 10136 .add(predOps(ARMCC::AL)); 10137 } else if (IsThumb2) { 10138 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10139 .addReg(Data) 10140 .addReg(AddrIn) 10141 .addImm(StSize) 10142 .add(predOps(ARMCC::AL)); 10143 } else { // arm 10144 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10145 .addReg(Data) 10146 .addReg(AddrIn) 10147 .addReg(0) 10148 .addImm(StSize) 10149 .add(predOps(ARMCC::AL)); 10150 } 10151} 10152 10153MachineBasicBlock * 10154ARMTargetLowering::EmitStructByval(MachineInstr &MI, 10155 MachineBasicBlock *BB) const { 10156 // This pseudo instruction has 3 operands: dst, src, size 10157 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 10158 // Otherwise, we will generate unrolled scalar copies. 
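  // For example (assuming the size is below getMaxInlineSizeThreshold()),
  // copying 10 bytes with 4-byte alignment gives UnitSize = 4, LoopSize = 8
  // and BytesLeft = 2: two post-increment LDR/STR pairs followed by two
  // LDRB/STRB pairs, all fully unrolled.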
10159 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10160 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10161 MachineFunction::iterator It = ++BB->getIterator(); 10162 10163 Register dest = MI.getOperand(0).getReg(); 10164 Register src = MI.getOperand(1).getReg(); 10165 unsigned SizeVal = MI.getOperand(2).getImm(); 10166 unsigned Align = MI.getOperand(3).getImm(); 10167 DebugLoc dl = MI.getDebugLoc(); 10168 10169 MachineFunction *MF = BB->getParent(); 10170 MachineRegisterInfo &MRI = MF->getRegInfo(); 10171 unsigned UnitSize = 0; 10172 const TargetRegisterClass *TRC = nullptr; 10173 const TargetRegisterClass *VecTRC = nullptr; 10174 10175 bool IsThumb1 = Subtarget->isThumb1Only(); 10176 bool IsThumb2 = Subtarget->isThumb2(); 10177 bool IsThumb = Subtarget->isThumb(); 10178 10179 if (Align & 1) { 10180 UnitSize = 1; 10181 } else if (Align & 2) { 10182 UnitSize = 2; 10183 } else { 10184 // Check whether we can use NEON instructions. 10185 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 10186 Subtarget->hasNEON()) { 10187 if ((Align % 16 == 0) && SizeVal >= 16) 10188 UnitSize = 16; 10189 else if ((Align % 8 == 0) && SizeVal >= 8) 10190 UnitSize = 8; 10191 } 10192 // Can't use NEON instructions. 10193 if (UnitSize == 0) 10194 UnitSize = 4; 10195 } 10196 10197 // Select the correct opcode and register class for unit size load/store 10198 bool IsNeon = UnitSize >= 8; 10199 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 10200 if (IsNeon) 10201 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 10202 : UnitSize == 8 ? &ARM::DPRRegClass 10203 : nullptr; 10204 10205 unsigned BytesLeft = SizeVal % UnitSize; 10206 unsigned LoopSize = SizeVal - BytesLeft; 10207 10208 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 10209 // Use LDR and STR to copy. 10210 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 10211 // [destOut] = STR_POST(scratch, destIn, UnitSize) 10212 unsigned srcIn = src; 10213 unsigned destIn = dest; 10214 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 10215 Register srcOut = MRI.createVirtualRegister(TRC); 10216 Register destOut = MRI.createVirtualRegister(TRC); 10217 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 10218 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 10219 IsThumb1, IsThumb2); 10220 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 10221 IsThumb1, IsThumb2); 10222 srcIn = srcOut; 10223 destIn = destOut; 10224 } 10225 10226 // Handle the leftover bytes with LDRB and STRB. 10227 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 10228 // [destOut] = STRB_POST(scratch, destIn, 1) 10229 for (unsigned i = 0; i < BytesLeft; i++) { 10230 Register srcOut = MRI.createVirtualRegister(TRC); 10231 Register destOut = MRI.createVirtualRegister(TRC); 10232 Register scratch = MRI.createVirtualRegister(TRC); 10233 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 10234 IsThumb1, IsThumb2); 10235 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 10236 IsThumb1, IsThumb2); 10237 srcIn = srcOut; 10238 destIn = destOut; 10239 } 10240 MI.eraseFromParent(); // The instruction is gone now. 10241 return BB; 10242 } 10243 10244 // Expand the pseudo op to a loop. 10245 // thisMBB: 10246 // ... 
10247 // movw varEnd, # --> with thumb2 10248 // movt varEnd, # 10249 // ldrcp varEnd, idx --> without thumb2 10250 // fallthrough --> loopMBB 10251 // loopMBB: 10252 // PHI varPhi, varEnd, varLoop 10253 // PHI srcPhi, src, srcLoop 10254 // PHI destPhi, dst, destLoop 10255 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 10256 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 10257 // subs varLoop, varPhi, #UnitSize 10258 // bne loopMBB 10259 // fallthrough --> exitMBB 10260 // exitMBB: 10261 // epilogue to handle left-over bytes 10262 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 10263 // [destOut] = STRB_POST(scratch, destLoop, 1) 10264 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10265 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10266 MF->insert(It, loopMBB); 10267 MF->insert(It, exitMBB); 10268 10269 // Transfer the remainder of BB and its successor edges to exitMBB. 10270 exitMBB->splice(exitMBB->begin(), BB, 10271 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10272 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10273 10274 // Load an immediate to varEnd. 10275 Register varEnd = MRI.createVirtualRegister(TRC); 10276 if (Subtarget->useMovt()) { 10277 unsigned Vtmp = varEnd; 10278 if ((LoopSize & 0xFFFF0000) != 0) 10279 Vtmp = MRI.createVirtualRegister(TRC); 10280 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 10281 .addImm(LoopSize & 0xFFFF) 10282 .add(predOps(ARMCC::AL)); 10283 10284 if ((LoopSize & 0xFFFF0000) != 0) 10285 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 10286 .addReg(Vtmp) 10287 .addImm(LoopSize >> 16) 10288 .add(predOps(ARMCC::AL)); 10289 } else { 10290 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10291 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10292 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 10293 10294 // MachineConstantPool wants an explicit alignment. 
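    // (The code below treats a preferred alignment of 0 as "unspecified" and
    // falls back to the type's allocation size in that case.)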
    unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
    if (Align == 0)
      Align = MF->getDataLayout().getTypeAllocSize(C->getType());
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
    MachineMemOperand *CPMMO =
        MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                                 MachineMemOperand::MOLoad, 4, 4);

    if (IsThumb)
      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
    else
      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
  }
  BB->addSuccessor(loopMBB);

  // Generate the loop body:
  //   varPhi = PHI(varLoop, varEnd)
  //   srcPhi = PHI(srcLoop, src)
  //   destPhi = PHI(destLoop, dst)
  MachineBasicBlock *entryBB = BB;
  BB = loopMBB;
  Register varLoop = MRI.createVirtualRegister(TRC);
  Register varPhi = MRI.createVirtualRegister(TRC);
  Register srcLoop = MRI.createVirtualRegister(TRC);
  Register srcPhi = MRI.createVirtualRegister(TRC);
  Register destLoop = MRI.createVirtualRegister(TRC);
  Register destPhi = MRI.createVirtualRegister(TRC);

  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
      .addReg(varLoop).addMBB(loopMBB)
      .addReg(varEnd).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
      .addReg(srcLoop).addMBB(loopMBB)
      .addReg(src).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
      .addReg(destLoop).addMBB(loopMBB)
      .addReg(dest).addMBB(entryBB);

  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
             IsThumb1, IsThumb2);
  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
             IsThumb1, IsThumb2);

  // Decrement loop variable by UnitSize.
  if (IsThumb1) {
    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
        .add(t1CondCodeOp())
        .addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL));
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl,
                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    MIB.addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    MIB->getOperand(5).setReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  }
  BuildMI(*BB, BB->end(), dl,
          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  // loopMBB can loop back to loopMBB or fall through to exitMBB.
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // Add epilogue to handle BytesLeft.
  BB = exitMBB;
  auto StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    Register srcOut = MRI.createVirtualRegister(TRC);
    Register destOut = MRI.createVirtualRegister(TRC);
    Register scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI.eraseFromParent(); // The instruction is gone now.
  return BB;
}

MachineBasicBlock *
ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget->isTargetWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk takes the number of words to allocate on the stack in R4, and
  // returns the stack adjustment in number of bytes in R4. This will not
  // clobber any other registers (other than the obvious lr).
  //
  // Although, technically, IP should be considered a register which may be
  // clobbered, the call itself will not touch it. Windows on ARM is a pure
  // Thumb-2 environment, so there is no interworking required. As a result, we
  // do not expect a veneer to be emitted by the linker, clobbering IP.
  //
  // Each module receives its own copy of __chkstk, so no import thunk is
  // required, again, ensuring that IP is not clobbered.
  //
  // Finally, although some linkers may theoretically provide a trampoline for
  // out-of-range calls (which is quite common due to the 32M range limitation
  // of branches for Thumb), we can generate the long-call version via
  // -mcmodel=large, alleviating the need for the trampoline which may clobber
  // IP.
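  //
  // As an illustrative sketch (not verbatim output), the large code model
  // therefore emits roughly:
  //   movw r12, :lower16:__chkstk
  //   movt r12, :upper16:__chkstk
  //   blx  r12                 ; r4: words in, byte adjustment out
  //   sub.w sp, sp, r4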
10428 10429 switch (TM.getCodeModel()) { 10430 case CodeModel::Tiny: 10431 llvm_unreachable("Tiny code model not available on ARM."); 10432 case CodeModel::Small: 10433 case CodeModel::Medium: 10434 case CodeModel::Kernel: 10435 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 10436 .add(predOps(ARMCC::AL)) 10437 .addExternalSymbol("__chkstk") 10438 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 10439 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 10440 .addReg(ARM::R12, 10441 RegState::Implicit | RegState::Define | RegState::Dead) 10442 .addReg(ARM::CPSR, 10443 RegState::Implicit | RegState::Define | RegState::Dead); 10444 break; 10445 case CodeModel::Large: { 10446 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 10447 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 10448 10449 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 10450 .addExternalSymbol("__chkstk"); 10451 BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) 10452 .add(predOps(ARMCC::AL)) 10453 .addReg(Reg, RegState::Kill) 10454 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 10455 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 10456 .addReg(ARM::R12, 10457 RegState::Implicit | RegState::Define | RegState::Dead) 10458 .addReg(ARM::CPSR, 10459 RegState::Implicit | RegState::Define | RegState::Dead); 10460 break; 10461 } 10462 } 10463 10464 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 10465 .addReg(ARM::SP, RegState::Kill) 10466 .addReg(ARM::R4, RegState::Kill) 10467 .setMIFlags(MachineInstr::FrameSetup) 10468 .add(predOps(ARMCC::AL)) 10469 .add(condCodeOp()); 10470 10471 MI.eraseFromParent(); 10472 return MBB; 10473} 10474 10475MachineBasicBlock * 10476ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 10477 MachineBasicBlock *MBB) const { 10478 DebugLoc DL = MI.getDebugLoc(); 10479 MachineFunction *MF = MBB->getParent(); 10480 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10481 10482 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 10483 MF->insert(++MBB->getIterator(), ContBB); 10484 ContBB->splice(ContBB->begin(), MBB, 10485 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 10486 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 10487 MBB->addSuccessor(ContBB); 10488 10489 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 10490 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 10491 MF->push_back(TrapBB); 10492 MBB->addSuccessor(TrapBB); 10493 10494 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 10495 .addReg(MI.getOperand(0).getReg()) 10496 .addImm(0) 10497 .add(predOps(ARMCC::AL)); 10498 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 10499 .addMBB(TrapBB) 10500 .addImm(ARMCC::EQ) 10501 .addReg(ARM::CPSR); 10502 10503 MI.eraseFromParent(); 10504 return ContBB; 10505} 10506 10507// The CPSR operand of SelectItr might be missing a kill marker 10508// because there were multiple uses of CPSR, and ISel didn't know 10509// which to mark. Figure out whether SelectItr should have had a 10510// kill marker, and set it if it should. Returns the correct kill 10511// marker value. 10512static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, 10513 MachineBasicBlock* BB, 10514 const TargetRegisterInfo* TRI) { 10515 // Scan forward through BB for a use/def of CPSR. 
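  // Three outcomes are possible: a later read of CPSR (no kill flag is
  // wanted), a later def of CPSR (SelectItr holds the last use, so it gets
  // the kill flag), or we fall off the end of the block, in which case CPSR
  // liveness in the successors decides.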
10516 MachineBasicBlock::iterator miI(std::next(SelectItr)); 10517 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 10518 const MachineInstr& mi = *miI; 10519 if (mi.readsRegister(ARM::CPSR)) 10520 return false; 10521 if (mi.definesRegister(ARM::CPSR)) 10522 break; // Should have kill-flag - update below. 10523 } 10524 10525 // If we hit the end of the block, check whether CPSR is live into a 10526 // successor. 10527 if (miI == BB->end()) { 10528 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 10529 sEnd = BB->succ_end(); 10530 sItr != sEnd; ++sItr) { 10531 MachineBasicBlock* succ = *sItr; 10532 if (succ->isLiveIn(ARM::CPSR)) 10533 return false; 10534 } 10535 } 10536 10537 // We found a def, or hit the end of the basic block and CPSR wasn't live 10538 // out. SelectMI should have a kill flag on CPSR. 10539 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 10540 return true; 10541} 10542 10543MachineBasicBlock * 10544ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 10545 MachineBasicBlock *BB) const { 10546 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10547 DebugLoc dl = MI.getDebugLoc(); 10548 bool isThumb2 = Subtarget->isThumb2(); 10549 switch (MI.getOpcode()) { 10550 default: { 10551 MI.print(errs()); 10552 llvm_unreachable("Unexpected instr type to insert"); 10553 } 10554 10555 // Thumb1 post-indexed loads are really just single-register LDMs. 10556 case ARM::tLDR_postidx: { 10557 MachineOperand Def(MI.getOperand(1)); 10558 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 10559 .add(Def) // Rn_wb 10560 .add(MI.getOperand(2)) // Rn 10561 .add(MI.getOperand(3)) // PredImm 10562 .add(MI.getOperand(4)) // PredReg 10563 .add(MI.getOperand(0)) // Rt 10564 .cloneMemRefs(MI); 10565 MI.eraseFromParent(); 10566 return BB; 10567 } 10568 10569 // The Thumb2 pre-indexed stores have the same MI operands, they just 10570 // define them differently in the .td files from the isel patterns, so 10571 // they need pseudos. 10572 case ARM::t2STR_preidx: 10573 MI.setDesc(TII->get(ARM::t2STR_PRE)); 10574 return BB; 10575 case ARM::t2STRB_preidx: 10576 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 10577 return BB; 10578 case ARM::t2STRH_preidx: 10579 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 10580 return BB; 10581 10582 case ARM::STRi_preidx: 10583 case ARM::STRBi_preidx: { 10584 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM 10585 : ARM::STRB_PRE_IMM; 10586 // Decode the offset. 
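    // The AM2 immediate packs the add/sub direction and shift information
    // together with the offset value; e.g. an operand built (hypothetically)
    // as ARM_AM::getAM2Opc(ARM_AM::sub, 4, ARM_AM::no_shift) decodes back to
    // an effective offset of -4 here.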
10587 unsigned Offset = MI.getOperand(4).getImm(); 10588 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 10589 Offset = ARM_AM::getAM2Offset(Offset); 10590 if (isSub) 10591 Offset = -Offset; 10592 10593 MachineMemOperand *MMO = *MI.memoperands_begin(); 10594 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 10595 .add(MI.getOperand(0)) // Rn_wb 10596 .add(MI.getOperand(1)) // Rt 10597 .add(MI.getOperand(2)) // Rn 10598 .addImm(Offset) // offset (skip GPR==zero_reg) 10599 .add(MI.getOperand(5)) // pred 10600 .add(MI.getOperand(6)) 10601 .addMemOperand(MMO); 10602 MI.eraseFromParent(); 10603 return BB; 10604 } 10605 case ARM::STRr_preidx: 10606 case ARM::STRBr_preidx: 10607 case ARM::STRH_preidx: { 10608 unsigned NewOpc; 10609 switch (MI.getOpcode()) { 10610 default: llvm_unreachable("unexpected opcode!"); 10611 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 10612 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 10613 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 10614 } 10615 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 10616 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 10617 MIB.add(MI.getOperand(i)); 10618 MI.eraseFromParent(); 10619 return BB; 10620 } 10621 10622 case ARM::tMOVCCr_pseudo: { 10623 // To "insert" a SELECT_CC instruction, we actually have to insert the 10624 // diamond control-flow pattern. The incoming instruction knows the 10625 // destination vreg to set, the condition code register to branch on, the 10626 // true/false values to select between, and a branch opcode to use. 10627 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10628 MachineFunction::iterator It = ++BB->getIterator(); 10629 10630 // thisMBB: 10631 // ... 10632 // TrueVal = ... 10633 // cmpTY ccX, r1, r2 10634 // bCC copy1MBB 10635 // fallthrough --> copy0MBB 10636 MachineBasicBlock *thisMBB = BB; 10637 MachineFunction *F = BB->getParent(); 10638 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10639 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10640 F->insert(It, copy0MBB); 10641 F->insert(It, sinkMBB); 10642 10643 // Check whether CPSR is live past the tMOVCCr_pseudo. 10644 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 10645 if (!MI.killsRegister(ARM::CPSR) && 10646 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 10647 copy0MBB->addLiveIn(ARM::CPSR); 10648 sinkMBB->addLiveIn(ARM::CPSR); 10649 } 10650 10651 // Transfer the remainder of BB and its successor edges to sinkMBB. 10652 sinkMBB->splice(sinkMBB->begin(), BB, 10653 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10654 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10655 10656 BB->addSuccessor(copy0MBB); 10657 BB->addSuccessor(sinkMBB); 10658 10659 BuildMI(BB, dl, TII->get(ARM::tBcc)) 10660 .addMBB(sinkMBB) 10661 .addImm(MI.getOperand(3).getImm()) 10662 .addReg(MI.getOperand(4).getReg()); 10663 10664 // copy0MBB: 10665 // %FalseValue = ... 10666 // # fallthrough to sinkMBB 10667 BB = copy0MBB; 10668 10669 // Update machine-CFG edges 10670 BB->addSuccessor(sinkMBB); 10671 10672 // sinkMBB: 10673 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10674 // ... 10675 BB = sinkMBB; 10676 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 10677 .addReg(MI.getOperand(1).getReg()) 10678 .addMBB(copy0MBB) 10679 .addReg(MI.getOperand(2).getReg()) 10680 .addMBB(thisMBB); 10681 10682 MI.eraseFromParent(); // The pseudo instruction is gone now. 
10683 return BB; 10684 } 10685 10686 case ARM::BCCi64: 10687 case ARM::BCCZi64: { 10688 // If there is an unconditional branch to the other successor, remove it. 10689 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10690 10691 // Compare both parts that make up the double comparison separately for 10692 // equality. 10693 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 10694 10695 Register LHS1 = MI.getOperand(1).getReg(); 10696 Register LHS2 = MI.getOperand(2).getReg(); 10697 if (RHSisZero) { 10698 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10699 .addReg(LHS1) 10700 .addImm(0) 10701 .add(predOps(ARMCC::AL)); 10702 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10703 .addReg(LHS2).addImm(0) 10704 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10705 } else { 10706 Register RHS1 = MI.getOperand(3).getReg(); 10707 Register RHS2 = MI.getOperand(4).getReg(); 10708 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10709 .addReg(LHS1) 10710 .addReg(RHS1) 10711 .add(predOps(ARMCC::AL)); 10712 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10713 .addReg(LHS2).addReg(RHS2) 10714 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10715 } 10716 10717 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 10718 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 10719 if (MI.getOperand(0).getImm() == ARMCC::NE) 10720 std::swap(destMBB, exitMBB); 10721 10722 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10723 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 10724 if (isThumb2) 10725 BuildMI(BB, dl, TII->get(ARM::t2B)) 10726 .addMBB(exitMBB) 10727 .add(predOps(ARMCC::AL)); 10728 else 10729 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 10730 10731 MI.eraseFromParent(); // The pseudo instruction is gone now. 10732 return BB; 10733 } 10734 10735 case ARM::Int_eh_sjlj_setjmp: 10736 case ARM::Int_eh_sjlj_setjmp_nofp: 10737 case ARM::tInt_eh_sjlj_setjmp: 10738 case ARM::t2Int_eh_sjlj_setjmp: 10739 case ARM::t2Int_eh_sjlj_setjmp_nofp: 10740 return BB; 10741 10742 case ARM::Int_eh_sjlj_setup_dispatch: 10743 EmitSjLjDispatchBlock(MI, BB); 10744 return BB; 10745 10746 case ARM::ABS: 10747 case ARM::t2ABS: { 10748 // To insert an ABS instruction, we have to insert the 10749 // diamond control-flow pattern. The incoming instruction knows the 10750 // source vreg to test against 0, the destination vreg to set, 10751 // the condition code register to branch on, the 10752 // true/false values to select between, and a branch opcode to use. 
    // It transforms
    //   V1 = ABS V0
    // into
    //   V2 = MOVS V0
    //   BCC                      (branch to SinkBB if V0 >= 0)
    //   RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
    //   SinkBB: V1 = PHI(V2, V3)
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator BBI = ++BB->getIterator();
    MachineFunction *Fn = BB->getParent();
    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
    Fn->insert(BBI, RSBBB);
    Fn->insert(BBI, SinkBB);

    Register ABSSrcReg = MI.getOperand(1).getReg();
    Register ABSDstReg = MI.getOperand(0).getReg();
    bool ABSSrcKill = MI.getOperand(1).isKill();
    bool isThumb2 = Subtarget->isThumb2();
    MachineRegisterInfo &MRI = Fn->getRegInfo();
    // In Thumb mode S must not be specified if the source register is the SP
    // or PC, nor if the destination register is the SP, so restrict the
    // register class.
    Register NewRsbDstReg = MRI.createVirtualRegister(
        isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);

    // Transfer the remainder of BB and its successor edges to SinkBB.
    SinkBB->splice(SinkBB->begin(), BB,
                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
    SinkBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(RSBBB);
    BB->addSuccessor(SinkBB);

    // RSBBB falls through to SinkBB.
    RSBBB->addSuccessor(SinkBB);

    // Insert a cmp at the end of BB.
    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
        .addReg(ABSSrcReg)
        .addImm(0)
        .add(predOps(ARMCC::AL));

    // Insert a bcc with the opposite CC to ARMCC::MI at the end of BB.
    BuildMI(BB, dl,
            TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
        .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);

    // Insert the RSBri in RSBBB.
    // Note: BCC and RSBri will be converted into a predicated RSBmi by the
    // if-conversion pass.
    BuildMI(*RSBBB, RSBBB->begin(), dl,
            TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
        .addReg(ABSSrcReg, ABSSrcKill ? RegState::Kill : 0)
        .addImm(0)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());

    // Insert the PHI in SinkBB, reusing ABSDstReg so uses of the ABS
    // instruction are unchanged.
    BuildMI(*SinkBB, SinkBB->begin(), dl,
            TII->get(ARM::PHI), ABSDstReg)
        .addReg(NewRsbDstReg).addMBB(RSBBB)
        .addReg(ABSSrcReg).addMBB(BB);

    // Remove the ABS instruction.
    MI.eraseFromParent();

    // Return the last added BB.
    return SinkBB;
  }
  case ARM::COPY_STRUCT_BYVAL_I32:
    ++NumLoopByVals;
    return EmitStructByval(MI, BB);
  case ARM::WIN__CHKSTK:
    return EmitLowered__chkstk(MI, BB);
  case ARM::WIN__DBZCHK:
    return EmitLowered__dbzchk(MI, BB);
  }
}

/// Attaches vregs to MEMCPY that it will use as scratch registers when it is
/// expanded into LDM/STM. This is done as a post-isel lowering instead of as a
/// custom inserter because we need the use list from the SDNode.
10836static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 10837 MachineInstr &MI, const SDNode *Node) { 10838 bool isThumb1 = Subtarget->isThumb1Only(); 10839 10840 DebugLoc DL = MI.getDebugLoc(); 10841 MachineFunction *MF = MI.getParent()->getParent(); 10842 MachineRegisterInfo &MRI = MF->getRegInfo(); 10843 MachineInstrBuilder MIB(*MF, MI); 10844 10845 // If the new dst/src is unused mark it as dead. 10846 if (!Node->hasAnyUseOfValue(0)) { 10847 MI.getOperand(0).setIsDead(true); 10848 } 10849 if (!Node->hasAnyUseOfValue(1)) { 10850 MI.getOperand(1).setIsDead(true); 10851 } 10852 10853 // The MEMCPY both defines and kills the scratch registers. 10854 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 10855 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 10856 : &ARM::GPRRegClass); 10857 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 10858 } 10859} 10860 10861void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 10862 SDNode *Node) const { 10863 if (MI.getOpcode() == ARM::MEMCPY) { 10864 attachMEMCPYScratchRegs(Subtarget, MI, Node); 10865 return; 10866 } 10867 10868 const MCInstrDesc *MCID = &MI.getDesc(); 10869 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 10870 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 10871 // operand is still set to noreg. If needed, set the optional operand's 10872 // register to CPSR, and remove the redundant implicit def. 10873 // 10874 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 10875 10876 // Rename pseudo opcodes. 10877 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 10878 unsigned ccOutIdx; 10879 if (NewOpc) { 10880 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 10881 MCID = &TII->get(NewOpc); 10882 10883 assert(MCID->getNumOperands() == 10884 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 10885 && "converted opcode should be the same except for cc_out" 10886 " (and, on Thumb1, pred)"); 10887 10888 MI.setDesc(*MCID); 10889 10890 // Add the optional cc_out operand 10891 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 10892 10893 // On Thumb1, move all input operands to the end, then add the predicate 10894 if (Subtarget->isThumb1Only()) { 10895 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 10896 MI.addOperand(MI.getOperand(1)); 10897 MI.RemoveOperand(1); 10898 } 10899 10900 // Restore the ties 10901 for (unsigned i = MI.getNumOperands(); i--;) { 10902 const MachineOperand& op = MI.getOperand(i); 10903 if (op.isReg() && op.isUse()) { 10904 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 10905 if (DefIdx != -1) 10906 MI.tieOperands(DefIdx, i); 10907 } 10908 } 10909 10910 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 10911 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 10912 ccOutIdx = 1; 10913 } else 10914 ccOutIdx = MCID->getNumOperands() - 1; 10915 } else 10916 ccOutIdx = MCID->getNumOperands() - 1; 10917 10918 // Any ARM instruction that sets the 's' bit should specify an optional 10919 // "cc_out" operand in the last operand position. 10920 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 10921 assert(!NewOpc && "Optional cc_out operand required"); 10922 return; 10923 } 10924 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 10925 // since we already have an optional CPSR def. 
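  // For example (illustrative opcode), a t2ADDSri renamed to t2ADDri above may
  // still carry "implicit-def dead %cpsr" from its construction; that operand
  // is deleted here so only the optional cc_out def remains.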
10926 bool definesCPSR = false; 10927 bool deadCPSR = false; 10928 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 10929 ++i) { 10930 const MachineOperand &MO = MI.getOperand(i); 10931 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 10932 definesCPSR = true; 10933 if (MO.isDead()) 10934 deadCPSR = true; 10935 MI.RemoveOperand(i); 10936 break; 10937 } 10938 } 10939 if (!definesCPSR) { 10940 assert(!NewOpc && "Optional cc_out operand required"); 10941 return; 10942 } 10943 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 10944 if (deadCPSR) { 10945 assert(!MI.getOperand(ccOutIdx).getReg() && 10946 "expect uninitialized optional cc_out operand"); 10947 // Thumb1 instructions must have the S bit even if the CPSR is dead. 10948 if (!Subtarget->isThumb1Only()) 10949 return; 10950 } 10951 10952 // If this instruction was defined with an optional CPSR def and its dag node 10953 // had a live implicit CPSR def, then activate the optional CPSR def. 10954 MachineOperand &MO = MI.getOperand(ccOutIdx); 10955 MO.setReg(ARM::CPSR); 10956 MO.setIsDef(true); 10957} 10958 10959//===----------------------------------------------------------------------===// 10960// ARM Optimization Hooks 10961//===----------------------------------------------------------------------===// 10962 10963// Helper function that checks if N is a null or all ones constant. 10964static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 10965 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 10966} 10967 10968// Return true if N is conditionally 0 or all ones. 10969// Detects these expressions where cc is an i1 value: 10970// 10971// (select cc 0, y) [AllOnes=0] 10972// (select cc y, 0) [AllOnes=0] 10973// (zext cc) [AllOnes=0] 10974// (sext cc) [AllOnes=0/1] 10975// (select cc -1, y) [AllOnes=1] 10976// (select cc y, -1) [AllOnes=1] 10977// 10978// Invert is set when N is the null/all ones constant when CC is false. 10979// OtherOp is set to the alternative value of N. 10980static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 10981 SDValue &CC, bool &Invert, 10982 SDValue &OtherOp, 10983 SelectionDAG &DAG) { 10984 switch (N->getOpcode()) { 10985 default: return false; 10986 case ISD::SELECT: { 10987 CC = N->getOperand(0); 10988 SDValue N1 = N->getOperand(1); 10989 SDValue N2 = N->getOperand(2); 10990 if (isZeroOrAllOnes(N1, AllOnes)) { 10991 Invert = false; 10992 OtherOp = N2; 10993 return true; 10994 } 10995 if (isZeroOrAllOnes(N2, AllOnes)) { 10996 Invert = true; 10997 OtherOp = N1; 10998 return true; 10999 } 11000 return false; 11001 } 11002 case ISD::ZERO_EXTEND: 11003 // (zext cc) can never be the all ones value. 11004 if (AllOnes) 11005 return false; 11006 LLVM_FALLTHROUGH; 11007 case ISD::SIGN_EXTEND: { 11008 SDLoc dl(N); 11009 EVT VT = N->getValueType(0); 11010 CC = N->getOperand(0); 11011 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 11012 return false; 11013 Invert = !AllOnes; 11014 if (AllOnes) 11015 // When looking for an AllOnes constant, N is an sext, and the 'other' 11016 // value is 0. 11017 OtherOp = DAG.getConstant(0, dl, VT); 11018 else if (N->getOpcode() == ISD::ZERO_EXTEND) 11019 // When looking for a 0 constant, N can be zext or sext. 
      OtherOp = DAG.getConstant(1, dl, VT);
    else
      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
                                VT);
    return true;
  }
  }
}

// Combine a constant select operand into its use:
//
// (add (select cc, 0, c), x)  -> (select cc, x, (add x, c))
// (sub x, (select cc, 0, c))  -> (select cc, x, (sub x, c))
// (and (select cc, -1, c), x) -> (select cc, x, (and x, c)) [AllOnes=1]
// (or  (select cc, 0, c), x)  -> (select cc, x, (or x, c))
// (xor (select cc, 0, c), x)  -> (select cc, x, (xor x, c))
//
// The transform is rejected if the select doesn't have a constant operand that
// is null, or all ones when AllOnes is set.
//
// Also recognize sext/zext from i1:
//
// (add (zext cc), x) -> (select cc (add x, 1), x)
// (add (sext cc), x) -> (select cc (add x, -1), x)
//
// These transformations eventually create predicated instructions.
//
// @param N       The node to transform.
// @param Slct    The N operand that is a select.
// @param OtherOp The other N operand (x above).
// @param DCI     Context.
// @param AllOnes Require the select constant to be all ones instead of null.
// @returns The new node, or SDValue() on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                            TargetLowering::DAGCombinerInfo &DCI,
                            bool AllOnes = false) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue NonConstantVal;
  SDValue CCOp;
  bool SwapSelectOps;
  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
                                  NonConstantVal, DAG))
    return SDValue();

  // Slct is now known to be the desired identity constant when CC is true.
  SDValue TrueVal = OtherOp;
  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                 OtherOp, NonConstantVal);
  // Unless SwapSelectOps says CC should be false.
  if (SwapSelectOps)
    std::swap(TrueVal, FalseVal);

  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
                     CCOp, TrueVal, FalseVal);
}

// Attempt combineSelectAndUse on each operand of a commutative operator N.
static
SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
      return Result;
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
      return Result;
  return SDValue();
}

static bool IsVUZPShuffleNode(SDNode *N) {
  // VUZP shuffle node.
  if (N->getOpcode() == ARMISD::VUZP)
    return true;

  // "VUZP" on i32 is an alias for VTRN.
  if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
    return true;

  return false;
}

static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Look for ADD(VUZP.0, VUZP.1).
  if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
      N0 == N1)
    return SDValue();

  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
  if (!N->getValueType(0).is64BitVector())
    return SDValue();

  // Generate vpadd.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDNode *Unzip = N0.getNode();
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  Ops.push_back(Unzip->getOperand(0));
  Ops.push_back(Unzip->getOperand(1));

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // Check for two extended operands.
  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
        N1.getOpcode() == ISD::SIGN_EXTEND) &&
      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
        N1.getOpcode() == ISD::ZERO_EXTEND))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);

  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
      N00 == N10)
    return SDValue();

  // We only recognize Q register paddl here; this can't be reached until
  // after type legalization.
  if (!N00.getValueType().is64BitVector() ||
      !N0.getValueType().is128BitVector())
    return SDValue();

  // Generate vpaddl.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
  unsigned Opcode;
  if (N0.getOpcode() == ISD::SIGN_EXTEND)
    Opcode = Intrinsic::arm_neon_vpaddls;
  else
    Opcode = Intrinsic::arm_neon_vpaddlu;
  Ops.push_back(DAG.getConstant(Opcode, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  EVT ElemTy = N00.getValueType().getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
                               N00.getOperand(0), N00.getOperand(1));
  Ops.push_back(Concat);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
// much easier to match.
static SDValue
AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Only perform the optimization after legalize, and if NEON is available.
  // We also expect both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
      || N0.getOpcode() != ISD::BUILD_VECTOR
      || N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR should have the same input vector and an odd or even
  // index such that we have a pairwise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  unsigned nextIndex = 0;

  // For each operand of the ADD that is a BUILD_VECTOR,
  // check to see if each of its operands is an EXTRACT_VECTOR with
  // the same vector and appropriate index.
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector; verify it's the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant; verify it's correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constants, we want to see all the even or all the odd.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
          || C1->getZExtValue() != nextIndex+1)
        return SDValue();

      // Increment index.
      nextIndex+=2;
    } else
      return SDValue();
  }

  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  // we're using the entire input vector, otherwise there's a size/legality
  // mismatch somewhere.
  if (nextIndex != Vec.getValueType().getVectorNumElements() ||
      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
    return SDValue();

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc dl(N);

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();

  EVT inputLaneType = Vec.getValueType().getVectorElementType();
  switch (inputLaneType.getSimpleVT().SimpleTy) {
    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
    default:
      llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ?
                       ISD::ANY_EXTEND : ISD::TRUNCATE;
  return DAG.getNode(ExtOp, dl, VT, tmp);
}

static SDValue findMUL_LOHI(SDValue V) {
  if (V->getOpcode() == ISD::UMUL_LOHI ||
      V->getOpcode() == ISD::SMUL_LOHI)
    return V;
  return SDValue();
}

static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasBaseDSP())
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulate the product into a 64-bit value. The 16-bit values will
  // be sign-extended somehow or SRA'd into 32-bit values:
  // (addc (adde (mul 16bit, 16bit), lo), hi)
  SDValue Mul = AddcNode->getOperand(0);
  SDValue Lo = AddcNode->getOperand(1);
  if (Mul.getOpcode() != ISD::MUL) {
    Lo = AddcNode->getOperand(0);
    Mul = AddcNode->getOperand(1);
    if (Mul.getOpcode() != ISD::MUL)
      return SDValue();
  }

  SDValue SRA = AddeNode->getOperand(0);
  SDValue Hi = AddeNode->getOperand(1);
  if (SRA.getOpcode() != ISD::SRA) {
    SRA = AddeNode->getOperand(1);
    Hi = AddeNode->getOperand(0);
    if (SRA.getOpcode() != ISD::SRA)
      return SDValue();
  }
  if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
    if (Const->getZExtValue() != 31)
      return SDValue();
  } else
    return SDValue();

  if (SRA.getOperand(0) != Mul)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(AddcNode);
  unsigned Opcode = 0;
  SDValue Op0;
  SDValue Op1;

  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALBB;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALBT;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1).getOperand(0);
  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALTB;
    Op0 = Mul.getOperand(0).getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALTT;
    Op0 = Mul->getOperand(0).getOperand(0);
    Op1 = Mul->getOperand(1).getOperand(0);
  }

  if (!Op0 || !Op1)
    return SDValue();

  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                              Op0, Op1, Lo, Hi);
  // Replace the ADD nodes' uses with the SMLAL node's values.
  SDValue HiMLALResult(SMLAL.getNode(), 1);
  SDValue LoMLALResult(SMLAL.getNode(), 0);

  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  // Return the original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}

static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // Look for multiply add opportunities.
  // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
  // each add node consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //                  UMUL_LOHI
  //                 / :lo    \ :hi
  //                V          \          [no multiline comment]
  //    loAdd ->  ADDC         |
  //                 \ :carry /
  //                  V      V
  //                    ADDE <- hiAdd
  //
  // In the special case where only the higher part of a signed result is used
  // and the add to the low part of the result of ISD::UMUL_LOHI adds or
  // subtracts a constant with the exact value of 0x80000000, we recognize we
  // are dealing with a "rounded multiply and add" (or subtract) and transform
  // it into either an ARMISD::SMMLAR or an ARMISD::SMMLSR respectively.

  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
         "Expect an ADDE or SUBE");

  assert(AddeSubeNode->getNumOperands() == 3 &&
         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
         "ADDE node has the wrong inputs");

  // Check that we are chained to the right ADDC or SUBC node.
  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
       AddcSubcNode->getOpcode() != ARMISD::SUBC))
    return SDValue();

  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);

  // Check if the two operands are from the same mul_lohi node.
  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
    return SDValue();

  assert(AddcSubcNode->getNumValues() == 2 &&
         AddcSubcNode->getValueType(0) == MVT::i32 &&
         "Expect ADDC with two result values. First: i32");

  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
  // may be an SMLAL which multiplies two 16-bit values.
  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);

  // Check for the triangle shape.
  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);

  // Make sure that the ADDE/SUBE operands are not coming from the same node.
  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
    return SDValue();

  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
  bool IsLeftOperandMUL = false;
  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
  if (MULOp == SDValue())
    MULOp = findMUL_LOHI(AddeSubeOp1);
  else
    IsLeftOperandMUL = true;
  if (MULOp == SDValue())
    return SDValue();

  // Figure out the right opcode.
  unsigned Opc = MULOp->getOpcode();
  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

  // Figure out the high and low input values to the MLAL node.
  SDValue *HiAddSub = nullptr;
  SDValue *LoMul = nullptr;
  SDValue *LowAddSub = nullptr;

  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
    return SDValue();

  if (IsLeftOperandMUL)
    HiAddSub = &AddeSubeOp1;
  else
    HiAddSub = &AddeSubeOp0;

  // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI
  // node whose low result is fed to the ADDC/SUBC we are checking.

  if (AddcSubcOp0 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp0;
    LowAddSub = &AddcSubcOp1;
  }
  if (AddcSubcOp1 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp1;
    LowAddSub = &AddcSubcOp0;
  }

  if (!LoMul)
    return SDValue();

  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of
  // ADDC/SUBC, the replacement below will create a cycle.
  if (AddcSubcNode == HiAddSub->getNode() ||
      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
    return SDValue();

  // Create the merged node.
  SelectionDAG &DAG = DCI.DAG;

  // Start building operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(LoMul->getOperand(0));
  Ops.push_back(LoMul->getOperand(1));

  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
  // the case, we must be doing signed multiplication and only use the higher
  // part of the result of the MLAL; furthermore, the LowAddSub must be a
  // constant addition or subtraction with the value 0x80000000.
  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
          0x80000000) {
    Ops.push_back(*HiAddSub);
    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
      FinalOpc = ARMISD::SMMLSR;
    } else {
      FinalOpc = ARMISD::SMMLAR;
    }
    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);

    return SDValue(AddeSubeNode, 0);
  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
    // SMMLS is generated during instruction selection and the rest of this
    // function cannot handle the case where AddcSubcNode is a SUBC.
    return SDValue();

  // Finish building the operand list for {U/S}MLAL.
  Ops.push_back(*LowAddSub);
  Ops.push_back(*HiAddSub);

  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Replace the ADD nodes' uses with the MLAL node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);

  // Return the original node to notify the driver to stop replacing.
  return SDValue(AddeSubeNode, 0);
}

static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // UMAAL is similar to UMLAL except that it adds two unsigned values.
  // While trying to combine for the other MLAL nodes, first search for the
  // chance to use UMAAL. Check if Addc uses a node which has already
  // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
  // as the addend, and it's handled in PerformUMLALCombine.

  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);

  // Check that we have a glued ADDC node.
  SDNode *AddcNode = AddeNode->getOperand(2).getNode();
  if (AddcNode->getOpcode() != ARMISD::ADDC)
    return SDValue();

  // Find the converted UMAAL or quit if it doesn't exist.
  SDNode *UmlalNode = nullptr;
  SDValue AddHi;
  if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(0).getNode();
    AddHi = AddcNode->getOperand(1);
  } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(1).getNode();
    AddHi = AddcNode->getOperand(0);
  } else {
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  }

  // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
  // the ADDC, as well as zero.
  if (!isNullConstant(UmlalNode->getOperand(3)))
    return SDValue();

  if ((isNullConstant(AddeNode->getOperand(0)) &&
       AddeNode->getOperand(1).getNode() == UmlalNode) ||
      (AddeNode->getOperand(0).getNode() == UmlalNode &&
       isNullConstant(AddeNode->getOperand(1)))) {
    SelectionDAG &DAG = DCI.DAG;
    SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
                      UmlalNode->getOperand(2), AddHi };
    SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
                                DAG.getVTList(MVT::i32, MVT::i32), Ops);

    // Replace the ADD nodes' uses with the UMAAL node's values.
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0),
                                  SDValue(UMAAL.getNode(), 1));
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0),
                                  SDValue(UMAAL.getNode(), 0));

    // Return the original node to notify the driver to stop replacing.
    return SDValue(AddeNode, 0);
  }
  return SDValue();
}

static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();

  // Check that we have a pair of ADDC and ADDE as operands.
  // Both addends of the ADDE must be zero.
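  // Illustrative sketch of the matched shape (results written node:value):
  //   UMLAL(x, y, ADDC(a, b):0, ADDE(0, 0, ADDC(a, b):1):0)
  //     -> UMAAL(x, y, a, b)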
11595 SDNode* AddcNode = N->getOperand(2).getNode(); 11596 SDNode* AddeNode = N->getOperand(3).getNode(); 11597 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 11598 (AddeNode->getOpcode() == ARMISD::ADDE) && 11599 isNullConstant(AddeNode->getOperand(0)) && 11600 isNullConstant(AddeNode->getOperand(1)) && 11601 (AddeNode->getOperand(2).getNode() == AddcNode)) 11602 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 11603 DAG.getVTList(MVT::i32, MVT::i32), 11604 {N->getOperand(0), N->getOperand(1), 11605 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 11606 else 11607 return SDValue(); 11608} 11609 11610static SDValue PerformAddcSubcCombine(SDNode *N, 11611 TargetLowering::DAGCombinerInfo &DCI, 11612 const ARMSubtarget *Subtarget) { 11613 SelectionDAG &DAG(DCI.DAG); 11614 11615 if (N->getOpcode() == ARMISD::SUBC) { 11616 // (SUBC (ADDE 0, 0, C), 1) -> C 11617 SDValue LHS = N->getOperand(0); 11618 SDValue RHS = N->getOperand(1); 11619 if (LHS->getOpcode() == ARMISD::ADDE && 11620 isNullConstant(LHS->getOperand(0)) && 11621 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 11622 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 11623 } 11624 } 11625 11626 if (Subtarget->isThumb1Only()) { 11627 SDValue RHS = N->getOperand(1); 11628 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 11629 int32_t imm = C->getSExtValue(); 11630 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 11631 SDLoc DL(N); 11632 RHS = DAG.getConstant(-imm, DL, MVT::i32); 11633 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC 11634 : ARMISD::ADDC; 11635 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 11636 } 11637 } 11638 } 11639 11640 return SDValue(); 11641} 11642 11643static SDValue PerformAddeSubeCombine(SDNode *N, 11644 TargetLowering::DAGCombinerInfo &DCI, 11645 const ARMSubtarget *Subtarget) { 11646 if (Subtarget->isThumb1Only()) { 11647 SelectionDAG &DAG = DCI.DAG; 11648 SDValue RHS = N->getOperand(1); 11649 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 11650 int64_t imm = C->getSExtValue(); 11651 if (imm < 0) { 11652 SDLoc DL(N); 11653 11654 // The with-carry-in form matches bitwise not instead of the negation. 11655 // Effectively, the inverse interpretation of the carry flag already 11656 // accounts for part of the negation. 11657 RHS = DAG.getConstant(~imm, DL, MVT::i32); 11658 11659 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? 
ARMISD::SUBE 11660 : ARMISD::ADDE; 11661 return DAG.getNode(Opcode, DL, N->getVTList(), 11662 N->getOperand(0), RHS, N->getOperand(2)); 11663 } 11664 } 11665 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 11666 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 11667 } 11668 return SDValue(); 11669} 11670 11671static SDValue PerformABSCombine(SDNode *N, 11672 TargetLowering::DAGCombinerInfo &DCI, 11673 const ARMSubtarget *Subtarget) { 11674 SDValue res; 11675 SelectionDAG &DAG = DCI.DAG; 11676 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11677 11678 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) 11679 return SDValue(); 11680 11681 if (!TLI.expandABS(N, res, DAG)) 11682 return SDValue(); 11683 11684 return res; 11685} 11686 11687/// PerformADDECombine - Target-specific dag combine transform from 11688/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 11689/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 11690static SDValue PerformADDECombine(SDNode *N, 11691 TargetLowering::DAGCombinerInfo &DCI, 11692 const ARMSubtarget *Subtarget) { 11693 // Only ARM and Thumb2 support UMLAL/SMLAL. 11694 if (Subtarget->isThumb1Only()) 11695 return PerformAddeSubeCombine(N, DCI, Subtarget); 11696 11697 // Only perform the checks after legalize when the pattern is available. 11698 if (DCI.isBeforeLegalize()) return SDValue(); 11699 11700 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 11701} 11702 11703/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 11704/// operands N0 and N1. This is a helper for PerformADDCombine that is 11705/// called with the default operands, and if that fails, with commuted 11706/// operands. 11707static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 11708 TargetLowering::DAGCombinerInfo &DCI, 11709 const ARMSubtarget *Subtarget){ 11710 // Attempt to create vpadd for this add. 11711 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 11712 return Result; 11713 11714 // Attempt to create vpaddl for this add. 11715 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 11716 return Result; 11717 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 11718 Subtarget)) 11719 return Result; 11720 11721 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 11722 if (N0.getNode()->hasOneUse()) 11723 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 11724 return Result; 11725 return SDValue(); 11726} 11727 11728bool 11729ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 11730 CombineLevel Level) const { 11731 if (Level == BeforeLegalizeTypes) 11732 return true; 11733 11734 if (N->getOpcode() != ISD::SHL) 11735 return true; 11736 11737 if (Subtarget->isThumb1Only()) { 11738 // Avoid making expensive immediates by commuting shifts. (This logic 11739 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted 11740 // for free.) 
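    // For example (illustrative values): with (shl (add x, 200), 4),
    // commuting would give (add (shl x, 4), 3200); 200 fits a Thumb1
    // 8-bit immediate but 3200 does not, so we return false below to
    // keep the cheaper form. Any constant below 256 behaves the same way.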
    if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}

bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  if (!Subtarget->isThumb1Only())
    return true;

  if (Level == BeforeLegalizeTypes)
    return true;

  return false;
}

bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  if (!Subtarget->hasNEON()) {
    if (Subtarget->isThumb1Only())
      return VT.getScalarSizeInBits() <= 32;
    return true;
  }
  return VT.isScalarInteger();
}

static SDValue PerformSHLSimplify(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2)  -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform a shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto U : N->uses()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
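      // (A user can fold at most one shifted operand: the flexible second
      // operand of an ARM data-processing instruction shifts a single
      // register, so a user already consuming a SHL has no room for
      // another one. Noted here for clarity; see the check below.)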
11830 if (U->getOperand(0).getOpcode() == ISD::SHL || 11831 U->getOperand(1).getOpcode() == ISD::SHL) 11832 return SDValue(); 11833 break; 11834 } 11835 } 11836 11837 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 11838 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 11839 return SDValue(); 11840 11841 if (N->getOperand(0).getOpcode() != ISD::SHL) 11842 return SDValue(); 11843 11844 SDValue SHL = N->getOperand(0); 11845 11846 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11847 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 11848 if (!C1ShlC2 || !C2) 11849 return SDValue(); 11850 11851 APInt C2Int = C2->getAPIntValue(); 11852 APInt C1Int = C1ShlC2->getAPIntValue(); 11853 11854 // Check that performing a lshr will not lose any information. 11855 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 11856 C2Int.getBitWidth() - C2->getZExtValue()); 11857 if ((C1Int & Mask) != C1Int) 11858 return SDValue(); 11859 11860 // Shift the first constant. 11861 C1Int.lshrInPlace(C2Int); 11862 11863 // The immediates are encoded as an 8-bit value that can be rotated. 11864 auto LargeImm = [](const APInt &Imm) { 11865 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 11866 return Imm.getBitWidth() - Zeros > 8; 11867 }; 11868 11869 if (LargeImm(C1Int) || LargeImm(C2Int)) 11870 return SDValue(); 11871 11872 SelectionDAG &DAG = DCI.DAG; 11873 SDLoc dl(N); 11874 SDValue X = SHL.getOperand(0); 11875 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 11876 DAG.getConstant(C1Int, dl, MVT::i32)); 11877 // Shift left to compensate for the lshr of C1Int. 11878 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 11879 11880 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 11881 SHL.dump(); N->dump()); 11882 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 11883 return Res; 11884} 11885 11886 11887/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 11888/// 11889static SDValue PerformADDCombine(SDNode *N, 11890 TargetLowering::DAGCombinerInfo &DCI, 11891 const ARMSubtarget *Subtarget) { 11892 SDValue N0 = N->getOperand(0); 11893 SDValue N1 = N->getOperand(1); 11894 11895 // Only works one way, because it needs an immediate operand. 11896 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 11897 return Result; 11898 11899 // First try with the default operand order. 11900 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 11901 return Result; 11902 11903 // If that didn't work, try again with the operands commuted. 11904 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 11905} 11906 11907/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 11908/// 11909static SDValue PerformSUBCombine(SDNode *N, 11910 TargetLowering::DAGCombinerInfo &DCI, 11911 const ARMSubtarget *Subtarget) { 11912 SDValue N0 = N->getOperand(0); 11913 SDValue N1 = N->getOperand(1); 11914 11915 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 11916 if (N1.getNode()->hasOneUse()) 11917 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 11918 return Result; 11919 11920 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) 11921 return SDValue(); 11922 11923 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) 11924 // so that we can readily pattern match more mve instructions which can use 11925 // a scalar operand. 
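  // E.g. (sub (vmovImm 0), (vdup r0)) becomes (vdup (sub 0, r0)), so later
  // patterns can select an MVE instruction that takes the negated scalar
  // directly from a GPR. (Sketch only; the exact selection depends on the
  // surrounding patterns.)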
11926 SDValue VDup = N->getOperand(1); 11927 if (VDup->getOpcode() != ARMISD::VDUP) 11928 return SDValue(); 11929 11930 SDValue VMov = N->getOperand(0); 11931 if (VMov->getOpcode() == ISD::BITCAST) 11932 VMov = VMov->getOperand(0); 11933 11934 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov)) 11935 return SDValue(); 11936 11937 SDLoc dl(N); 11938 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32, 11939 DCI.DAG.getConstant(0, dl, MVT::i32), 11940 VDup->getOperand(0)); 11941 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate); 11942} 11943 11944/// PerformVMULCombine 11945/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 11946/// special multiplier accumulator forwarding. 11947/// vmul d3, d0, d2 11948/// vmla d3, d1, d2 11949/// is faster than 11950/// vadd d3, d0, d1 11951/// vmul d3, d3, d2 11952// However, for (A + B) * (A + B), 11953// vadd d2, d0, d1 11954// vmul d3, d0, d2 11955// vmla d3, d1, d2 11956// is slower than 11957// vadd d2, d0, d1 11958// vmul d3, d2, d2 11959static SDValue PerformVMULCombine(SDNode *N, 11960 TargetLowering::DAGCombinerInfo &DCI, 11961 const ARMSubtarget *Subtarget) { 11962 if (!Subtarget->hasVMLxForwarding()) 11963 return SDValue(); 11964 11965 SelectionDAG &DAG = DCI.DAG; 11966 SDValue N0 = N->getOperand(0); 11967 SDValue N1 = N->getOperand(1); 11968 unsigned Opcode = N0.getOpcode(); 11969 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 11970 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 11971 Opcode = N1.getOpcode(); 11972 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 11973 Opcode != ISD::FADD && Opcode != ISD::FSUB) 11974 return SDValue(); 11975 std::swap(N0, N1); 11976 } 11977 11978 if (N0 == N1) 11979 return SDValue(); 11980 11981 EVT VT = N->getValueType(0); 11982 SDLoc DL(N); 11983 SDValue N00 = N0->getOperand(0); 11984 SDValue N01 = N0->getOperand(1); 11985 return DAG.getNode(Opcode, DL, VT, 11986 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 11987 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 11988} 11989 11990static SDValue PerformMULCombine(SDNode *N, 11991 TargetLowering::DAGCombinerInfo &DCI, 11992 const ARMSubtarget *Subtarget) { 11993 SelectionDAG &DAG = DCI.DAG; 11994 11995 if (Subtarget->isThumb1Only()) 11996 return SDValue(); 11997 11998 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 11999 return SDValue(); 12000 12001 EVT VT = N->getValueType(0); 12002 if (VT.is64BitVector() || VT.is128BitVector()) 12003 return PerformVMULCombine(N, DCI, Subtarget); 12004 if (VT != MVT::i32) 12005 return SDValue(); 12006 12007 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12008 if (!C) 12009 return SDValue(); 12010 12011 int64_t MulAmt = C->getSExtValue(); 12012 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 12013 12014 ShiftAmt = ShiftAmt & (32 - 1); 12015 SDValue V = N->getOperand(0); 12016 SDLoc DL(N); 12017 12018 SDValue Res; 12019 MulAmt >>= ShiftAmt; 12020 12021 if (MulAmt >= 0) { 12022 if (isPowerOf2_32(MulAmt - 1)) { 12023 // (mul x, 2^N + 1) => (add (shl x, N), x) 12024 Res = DAG.getNode(ISD::ADD, DL, VT, 12025 V, 12026 DAG.getNode(ISD::SHL, DL, VT, 12027 V, 12028 DAG.getConstant(Log2_32(MulAmt - 1), DL, 12029 MVT::i32))); 12030 } else if (isPowerOf2_32(MulAmt + 1)) { 12031 // (mul x, 2^N - 1) => (sub (shl x, N), x) 12032 Res = DAG.getNode(ISD::SUB, DL, VT, 12033 DAG.getNode(ISD::SHL, DL, VT, 12034 V, 12035 DAG.getConstant(Log2_32(MulAmt + 1), DL, 12036 MVT::i32)), 12037 V); 12038 } else 12039 return SDValue(); 12040 } else { 12041 uint64_t MulAmtAbs = 
-MulAmt; 12042 if (isPowerOf2_32(MulAmtAbs + 1)) { 12043 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 12044 Res = DAG.getNode(ISD::SUB, DL, VT, 12045 V, 12046 DAG.getNode(ISD::SHL, DL, VT, 12047 V, 12048 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 12049 MVT::i32))); 12050 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 12051 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 12052 Res = DAG.getNode(ISD::ADD, DL, VT, 12053 V, 12054 DAG.getNode(ISD::SHL, DL, VT, 12055 V, 12056 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 12057 MVT::i32))); 12058 Res = DAG.getNode(ISD::SUB, DL, VT, 12059 DAG.getConstant(0, DL, MVT::i32), Res); 12060 } else 12061 return SDValue(); 12062 } 12063 12064 if (ShiftAmt != 0) 12065 Res = DAG.getNode(ISD::SHL, DL, VT, 12066 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 12067 12068 // Do not add new nodes to DAG combiner worklist. 12069 DCI.CombineTo(N, Res, false); 12070 return SDValue(); 12071} 12072 12073static SDValue CombineANDShift(SDNode *N, 12074 TargetLowering::DAGCombinerInfo &DCI, 12075 const ARMSubtarget *Subtarget) { 12076 // Allow DAGCombine to pattern-match before we touch the canonical form. 12077 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 12078 return SDValue(); 12079 12080 if (N->getValueType(0) != MVT::i32) 12081 return SDValue(); 12082 12083 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12084 if (!N1C) 12085 return SDValue(); 12086 12087 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 12088 // Don't transform uxtb/uxth. 12089 if (C1 == 255 || C1 == 65535) 12090 return SDValue(); 12091 12092 SDNode *N0 = N->getOperand(0).getNode(); 12093 if (!N0->hasOneUse()) 12094 return SDValue(); 12095 12096 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 12097 return SDValue(); 12098 12099 bool LeftShift = N0->getOpcode() == ISD::SHL; 12100 12101 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 12102 if (!N01C) 12103 return SDValue(); 12104 12105 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 12106 if (!C2 || C2 >= 32) 12107 return SDValue(); 12108 12109 // Clear irrelevant bits in the mask. 12110 if (LeftShift) 12111 C1 &= (-1U << C2); 12112 else 12113 C1 &= (-1U >> C2); 12114 12115 SelectionDAG &DAG = DCI.DAG; 12116 SDLoc DL(N); 12117 12118 // We have a pattern of the form "(and (shl x, c2) c1)" or 12119 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 12120 // transform to a pair of shifts, to save materializing c1. 12121 12122 // First pattern: right shift, then mask off leading bits. 12123 // FIXME: Use demanded bits? 12124 if (!LeftShift && isMask_32(C1)) { 12125 uint32_t C3 = countLeadingZeros(C1); 12126 if (C2 < C3) { 12127 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 12128 DAG.getConstant(C3 - C2, DL, MVT::i32)); 12129 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 12130 DAG.getConstant(C3, DL, MVT::i32)); 12131 } 12132 } 12133 12134 // First pattern, reversed: left shift, then mask off trailing bits. 12135 if (LeftShift && isMask_32(~C1)) { 12136 uint32_t C3 = countTrailingZeros(C1); 12137 if (C2 < C3) { 12138 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 12139 DAG.getConstant(C3 - C2, DL, MVT::i32)); 12140 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 12141 DAG.getConstant(C3, DL, MVT::i32)); 12142 } 12143 } 12144 12145 // Second pattern: left shift, then mask off leading bits. 12146 // FIXME: Use demanded bits? 
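  // E.g. with C1 == 0x00000ff0 and C2 == 4 (so Trailing == 4, C3 == 20):
  //   (and (shl x, 4), 0xff0) -> (srl (shl x, 24), 20)
  // which avoids materializing the mask. (Illustrative constants only.)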
12147 if (LeftShift && isShiftedMask_32(C1)) { 12148 uint32_t Trailing = countTrailingZeros(C1); 12149 uint32_t C3 = countLeadingZeros(C1); 12150 if (Trailing == C2 && C2 + C3 < 32) { 12151 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 12152 DAG.getConstant(C2 + C3, DL, MVT::i32)); 12153 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 12154 DAG.getConstant(C3, DL, MVT::i32)); 12155 } 12156 } 12157 12158 // Second pattern, reversed: right shift, then mask off trailing bits. 12159 // FIXME: Handle other patterns of known/demanded bits. 12160 if (!LeftShift && isShiftedMask_32(C1)) { 12161 uint32_t Leading = countLeadingZeros(C1); 12162 uint32_t C3 = countTrailingZeros(C1); 12163 if (Leading == C2 && C2 + C3 < 32) { 12164 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 12165 DAG.getConstant(C2 + C3, DL, MVT::i32)); 12166 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 12167 DAG.getConstant(C3, DL, MVT::i32)); 12168 } 12169 } 12170 12171 // FIXME: Transform "(and (shl x, c2) c1)" -> 12172 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 12173 // c1. 12174 return SDValue(); 12175} 12176 12177static SDValue PerformANDCombine(SDNode *N, 12178 TargetLowering::DAGCombinerInfo &DCI, 12179 const ARMSubtarget *Subtarget) { 12180 // Attempt to use immediate-form VBIC 12181 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 12182 SDLoc dl(N); 12183 EVT VT = N->getValueType(0); 12184 SelectionDAG &DAG = DCI.DAG; 12185 12186 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12187 return SDValue(); 12188 12189 APInt SplatBits, SplatUndef; 12190 unsigned SplatBitSize; 12191 bool HasAnyUndefs; 12192 if (BVN && Subtarget->hasNEON() && 12193 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 12194 if (SplatBitSize <= 64) { 12195 EVT VbicVT; 12196 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), 12197 SplatUndef.getZExtValue(), SplatBitSize, 12198 DAG, dl, VbicVT, VT.is128BitVector(), 12199 OtherModImm); 12200 if (Val.getNode()) { 12201 SDValue Input = 12202 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 12203 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 12204 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 12205 } 12206 } 12207 } 12208 12209 if (!Subtarget->isThumb1Only()) { 12210 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 12211 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 12212 return Result; 12213 12214 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12215 return Result; 12216 } 12217 12218 if (Subtarget->isThumb1Only()) 12219 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 12220 return Result; 12221 12222 return SDValue(); 12223} 12224 12225// Try combining OR nodes to SMULWB, SMULWT. 12226static SDValue PerformORCombineToSMULWBT(SDNode *OR, 12227 TargetLowering::DAGCombinerInfo &DCI, 12228 const ARMSubtarget *Subtarget) { 12229 if (!Subtarget->hasV6Ops() || 12230 (Subtarget->isThumb() && 12231 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 12232 return SDValue(); 12233 12234 SDValue SRL = OR->getOperand(0); 12235 SDValue SHL = OR->getOperand(1); 12236 12237 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 12238 SRL = OR->getOperand(1); 12239 SHL = OR->getOperand(0); 12240 } 12241 if (!isSRL16(SRL) || !isSHL16(SHL)) 12242 return SDValue(); 12243 12244 // The first operands to the shifts need to be the two results from the 12245 // same smul_lohi node. 
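  // Schematically, the target pattern is
  //   (or (srl (smul_lohi a, b):0, 16), (shl (smul_lohi a, b):1, 16))
  // i.e. bits [16,47] of the 64-bit product, which is exactly what a
  // 32x16-bit SMULWB/SMULWT computes. (Sketch; checked in detail below.)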
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
      SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI.
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+.
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & ~mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // Case (2): or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as-is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
12406 return SDValue(N, 0); 12407 } 12408 12409 return SDValue(); 12410} 12411 12412static bool isValidMVECond(unsigned CC, bool IsFloat) { 12413 switch (CC) { 12414 case ARMCC::EQ: 12415 case ARMCC::NE: 12416 case ARMCC::LE: 12417 case ARMCC::GT: 12418 case ARMCC::GE: 12419 case ARMCC::LT: 12420 return true; 12421 case ARMCC::HS: 12422 case ARMCC::HI: 12423 return !IsFloat; 12424 default: 12425 return false; 12426 }; 12427} 12428 12429static SDValue PerformORCombine_i1(SDNode *N, 12430 TargetLowering::DAGCombinerInfo &DCI, 12431 const ARMSubtarget *Subtarget) { 12432 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain 12433 // together with predicates 12434 EVT VT = N->getValueType(0); 12435 SDValue N0 = N->getOperand(0); 12436 SDValue N1 = N->getOperand(1); 12437 12438 ARMCC::CondCodes CondCode0 = ARMCC::AL; 12439 ARMCC::CondCodes CondCode1 = ARMCC::AL; 12440 if (N0->getOpcode() == ARMISD::VCMP) 12441 CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2)) 12442 ->getZExtValue(); 12443 else if (N0->getOpcode() == ARMISD::VCMPZ) 12444 CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1)) 12445 ->getZExtValue(); 12446 if (N1->getOpcode() == ARMISD::VCMP) 12447 CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2)) 12448 ->getZExtValue(); 12449 else if (N1->getOpcode() == ARMISD::VCMPZ) 12450 CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1)) 12451 ->getZExtValue(); 12452 12453 if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) 12454 return SDValue(); 12455 12456 unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); 12457 unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); 12458 12459 if (!isValidMVECond(Opposite0, 12460 N0->getOperand(0)->getValueType(0).isFloatingPoint()) || 12461 !isValidMVECond(Opposite1, 12462 N1->getOperand(0)->getValueType(0).isFloatingPoint())) 12463 return SDValue(); 12464 12465 SmallVector<SDValue, 4> Ops0; 12466 Ops0.push_back(N0->getOperand(0)); 12467 if (N0->getOpcode() == ARMISD::VCMP) 12468 Ops0.push_back(N0->getOperand(1)); 12469 Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); 12470 SmallVector<SDValue, 4> Ops1; 12471 Ops1.push_back(N1->getOperand(0)); 12472 if (N1->getOpcode() == ARMISD::VCMP) 12473 Ops1.push_back(N1->getOperand(1)); 12474 Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); 12475 12476 SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); 12477 SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); 12478 SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); 12479 return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, 12480 DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); 12481} 12482 12483/// PerformORCombine - Target-specific dag combine xforms for ISD::OR 12484static SDValue PerformORCombine(SDNode *N, 12485 TargetLowering::DAGCombinerInfo &DCI, 12486 const ARMSubtarget *Subtarget) { 12487 // Attempt to use immediate-form VORR 12488 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 12489 SDLoc dl(N); 12490 EVT VT = N->getValueType(0); 12491 SelectionDAG &DAG = DCI.DAG; 12492 12493 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12494 return SDValue(); 12495 12496 APInt SplatBits, SplatUndef; 12497 unsigned SplatBitSize; 12498 bool HasAnyUndefs; 12499 if (BVN && Subtarget->hasNEON() && 12500 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 12501 if (SplatBitSize <= 64) { 12502 EVT 
VorrVT; 12503 SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), 12504 SplatUndef.getZExtValue(), SplatBitSize, 12505 DAG, dl, VorrVT, VT.is128BitVector(), 12506 OtherModImm); 12507 if (Val.getNode()) { 12508 SDValue Input = 12509 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 12510 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 12511 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 12512 } 12513 } 12514 } 12515 12516 if (!Subtarget->isThumb1Only()) { 12517 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 12518 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 12519 return Result; 12520 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) 12521 return Result; 12522 } 12523 12524 SDValue N0 = N->getOperand(0); 12525 SDValue N1 = N->getOperand(1); 12526 12527 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 12528 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 12529 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 12530 12531 // The code below optimizes (or (and X, Y), Z). 12532 // The AND operand needs to have a single user to make these optimizations 12533 // profitable. 12534 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 12535 return SDValue(); 12536 12537 APInt SplatUndef; 12538 unsigned SplatBitSize; 12539 bool HasAnyUndefs; 12540 12541 APInt SplatBits0, SplatBits1; 12542 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 12543 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 12544 // Ensure that the second operand of both ands are constants 12545 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 12546 HasAnyUndefs) && !HasAnyUndefs) { 12547 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 12548 HasAnyUndefs) && !HasAnyUndefs) { 12549 // Ensure that the bit width of the constants are the same and that 12550 // the splat arguments are logical inverses as per the pattern we 12551 // are trying to simplify. 12552 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 12553 SplatBits0 == ~SplatBits1) { 12554 // Canonicalize the vector type to make instruction selection 12555 // simpler. 12556 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 12557 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, 12558 N0->getOperand(1), 12559 N0->getOperand(0), 12560 N1->getOperand(0)); 12561 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 12562 } 12563 } 12564 } 12565 } 12566 12567 if (Subtarget->hasMVEIntegerOps() && 12568 (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) 12569 return PerformORCombine_i1(N, DCI, Subtarget); 12570 12571 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 12572 // reasonable. 
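  // E.g. (or (and A, 0xffff00ff), 0x2300) can become
  // (ARMbfi A, 0x23, 0xffff00ff), inserting 0x23 into bits 8-15 of A.
  // (Illustrative constants; PerformORCombineToBFI has the exact rules.)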
12573 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 12574 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) 12575 return Res; 12576 } 12577 12578 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12579 return Result; 12580 12581 return SDValue(); 12582} 12583 12584static SDValue PerformXORCombine(SDNode *N, 12585 TargetLowering::DAGCombinerInfo &DCI, 12586 const ARMSubtarget *Subtarget) { 12587 EVT VT = N->getValueType(0); 12588 SelectionDAG &DAG = DCI.DAG; 12589 12590 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12591 return SDValue(); 12592 12593 if (!Subtarget->isThumb1Only()) { 12594 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 12595 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 12596 return Result; 12597 12598 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12599 return Result; 12600 } 12601 12602 return SDValue(); 12603} 12604 12605// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 12606// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 12607// their position in "to" (Rd). 12608static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 12609 assert(N->getOpcode() == ARMISD::BFI); 12610 12611 SDValue From = N->getOperand(1); 12612 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 12613 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 12614 12615 // If the Base came from a SHR #C, we can deduce that it is really testing bit 12616 // #C in the base of the SHR. 12617 if (From->getOpcode() == ISD::SRL && 12618 isa<ConstantSDNode>(From->getOperand(1))) { 12619 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 12620 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 12621 FromMask <<= Shift.getLimitedValue(31); 12622 From = From->getOperand(0); 12623 } 12624 12625 return From; 12626} 12627 12628// If A and B contain one contiguous set of bits, does A | B == A . B? 12629// 12630// Neither A nor B must be zero. 12631static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 12632 unsigned LastActiveBitInA = A.countTrailingZeros(); 12633 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 12634 return LastActiveBitInA - 1 == FirstActiveBitInB; 12635} 12636 12637static SDValue FindBFIToCombineWith(SDNode *N) { 12638 // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, 12639 // if one exists. 12640 APInt ToMask, FromMask; 12641 SDValue From = ParseBFI(N, ToMask, FromMask); 12642 SDValue To = N->getOperand(0); 12643 12644 // Now check for a compatible BFI to merge with. We can pass through BFIs that 12645 // aren't compatible, but not if they set the same bit in their destination as 12646 // we do (or that of any BFI we're going to combine with). 12647 SDValue V = To; 12648 APInt CombinedToMask = ToMask; 12649 while (V.getOpcode() == ARMISD::BFI) { 12650 APInt NewToMask, NewFromMask; 12651 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 12652 if (NewFrom != From) { 12653 // This BFI has a different base. Keep going. 12654 CombinedToMask |= NewToMask; 12655 V = V.getOperand(0); 12656 continue; 12657 } 12658 12659 // Do the written bits conflict with any we've seen so far? 12660 if ((NewToMask & CombinedToMask).getBoolValue()) 12661 // Conflicting bits - bail out because going further is unsafe. 
12662 return SDValue(); 12663 12664 // Are the new bits contiguous when combined with the old bits? 12665 if (BitsProperlyConcatenate(ToMask, NewToMask) && 12666 BitsProperlyConcatenate(FromMask, NewFromMask)) 12667 return V; 12668 if (BitsProperlyConcatenate(NewToMask, ToMask) && 12669 BitsProperlyConcatenate(NewFromMask, FromMask)) 12670 return V; 12671 12672 // We've seen a write to some bits, so track it. 12673 CombinedToMask |= NewToMask; 12674 // Keep going... 12675 V = V.getOperand(0); 12676 } 12677 12678 return SDValue(); 12679} 12680 12681static SDValue PerformBFICombine(SDNode *N, 12682 TargetLowering::DAGCombinerInfo &DCI) { 12683 SDValue N1 = N->getOperand(1); 12684 if (N1.getOpcode() == ISD::AND) { 12685 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 12686 // the bits being cleared by the AND are not demanded by the BFI. 12687 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 12688 if (!N11C) 12689 return SDValue(); 12690 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 12691 unsigned LSB = countTrailingZeros(~InvMask); 12692 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 12693 assert(Width < 12694 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 12695 "undefined behavior"); 12696 unsigned Mask = (1u << Width) - 1; 12697 unsigned Mask2 = N11C->getZExtValue(); 12698 if ((Mask & (~Mask2)) == 0) 12699 return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 12700 N->getOperand(0), N1.getOperand(0), 12701 N->getOperand(2)); 12702 } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 12703 // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. 12704 // Keep track of any consecutive bits set that all come from the same base 12705 // value. We can combine these together into a single BFI. 12706 SDValue CombineBFI = FindBFIToCombineWith(N); 12707 if (CombineBFI == SDValue()) 12708 return SDValue(); 12709 12710 // We've found a BFI. 12711 APInt ToMask1, FromMask1; 12712 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 12713 12714 APInt ToMask2, FromMask2; 12715 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 12716 assert(From1 == From2); 12717 (void)From2; 12718 12719 // First, unlink CombineBFI. 12720 DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); 12721 // Then create a new BFI, combining the two together. 12722 APInt NewFromMask = FromMask1 | FromMask2; 12723 APInt NewToMask = ToMask1 | ToMask2; 12724 12725 EVT VT = N->getValueType(0); 12726 SDLoc dl(N); 12727 12728 if (NewFromMask[0] == 0) 12729 From1 = DCI.DAG.getNode( 12730 ISD::SRL, dl, VT, From1, 12731 DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 12732 return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, 12733 DCI.DAG.getConstant(~NewToMask, dl, VT)); 12734 } 12735 return SDValue(); 12736} 12737 12738/// PerformVMOVRRDCombine - Target-specific dag combine xforms for 12739/// ARMISD::VMOVRRD. 
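///
/// E.g. (vmovrrd (load f64 [fi])) is split into two i32 loads, from [fi]
/// and [fi+4], so the value never round-trips through a D register.
/// (Sketch of the second transform below; big-endian swaps the halves.)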
12740static SDValue PerformVMOVRRDCombine(SDNode *N, 12741 TargetLowering::DAGCombinerInfo &DCI, 12742 const ARMSubtarget *Subtarget) { 12743 // vmovrrd(vmovdrr x, y) -> x,y 12744 SDValue InDouble = N->getOperand(0); 12745 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64()) 12746 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 12747 12748 // vmovrrd(load f64) -> (load i32), (load i32) 12749 SDNode *InNode = InDouble.getNode(); 12750 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 12751 InNode->getValueType(0) == MVT::f64 && 12752 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 12753 !cast<LoadSDNode>(InNode)->isVolatile()) { 12754 // TODO: Should this be done for non-FrameIndex operands? 12755 LoadSDNode *LD = cast<LoadSDNode>(InNode); 12756 12757 SelectionDAG &DAG = DCI.DAG; 12758 SDLoc DL(LD); 12759 SDValue BasePtr = LD->getBasePtr(); 12760 SDValue NewLD1 = 12761 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 12762 LD->getAlignment(), LD->getMemOperand()->getFlags()); 12763 12764 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 12765 DAG.getConstant(4, DL, MVT::i32)); 12766 12767 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, 12768 LD->getPointerInfo().getWithOffset(4), 12769 std::min(4U, LD->getAlignment()), 12770 LD->getMemOperand()->getFlags()); 12771 12772 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 12773 if (DCI.DAG.getDataLayout().isBigEndian()) 12774 std::swap (NewLD1, NewLD2); 12775 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 12776 return Result; 12777 } 12778 12779 return SDValue(); 12780} 12781 12782/// PerformVMOVDRRCombine - Target-specific dag combine xforms for 12783/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 12784static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 12785 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 12786 SDValue Op0 = N->getOperand(0); 12787 SDValue Op1 = N->getOperand(1); 12788 if (Op0.getOpcode() == ISD::BITCAST) 12789 Op0 = Op0.getOperand(0); 12790 if (Op1.getOpcode() == ISD::BITCAST) 12791 Op1 = Op1.getOperand(0); 12792 if (Op0.getOpcode() == ARMISD::VMOVRRD && 12793 Op0.getNode() == Op1.getNode() && 12794 Op0.getResNo() == 0 && Op1.getResNo() == 1) 12795 return DAG.getNode(ISD::BITCAST, SDLoc(N), 12796 N->getValueType(0), Op0.getOperand(0)); 12797 return SDValue(); 12798} 12799 12800/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 12801/// are normal, non-volatile loads. If so, it is profitable to bitcast an 12802/// i64 vector to have f64 elements, since the value can then be loaded 12803/// directly into a VFP register. 12804static bool hasNormalLoadOperand(SDNode *N) { 12805 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 12806 for (unsigned i = 0; i < NumElts; ++i) { 12807 SDNode *Elt = N->getOperand(i).getNode(); 12808 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 12809 return true; 12810 } 12811 return false; 12812} 12813 12814/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 12815/// ISD::BUILD_VECTOR. 12816static SDValue PerformBUILD_VECTORCombine(SDNode *N, 12817 TargetLowering::DAGCombinerInfo &DCI, 12818 const ARMSubtarget *Subtarget) { 12819 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 12820 // VMOVRRD is introduced when legalizing i64 types. 
It forces the i64 value
  // into a pair of GPRs, which is fine when the value is used as a scalar,
  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
  SelectionDAG &DAG = DCI.DAG;
  if (N->getNumOperands() == 2)
    if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
      return RV;

  // Load i64 elements as f64 values so that type legalization does not split
  // them up into i32 values.
  EVT VT = N->getValueType(0);
  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
    return SDValue();
  SDLoc dl(N);
  SmallVector<SDValue, 8> Ops;
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
    Ops.push_back(V);
    // Make the DAGCombiner fold the bitcast.
    DCI.AddToWorklist(V.getNode());
  }
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}

/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR into something more vector friendly, i.e., one that does not
  // force the use of floating point types.

  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to an integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands is 32 bits (64 bits are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // The model is: if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit casts to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
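  // E.g. (illustrative operand mix) for a 4-element vector with three
  // (bitcast i32-to-f32) operands and one constant: NumOfRelevantElts == 3
  // and NumOfBitCastedElts == 3, so 3 > 3/2 and the rewrite below fires.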
12899 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 12900 return SDValue(); 12901 12902 SelectionDAG &DAG = DCI.DAG; 12903 // Create the new vector type. 12904 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 12905 // Check if the type is legal. 12906 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12907 if (!TLI.isTypeLegal(VecVT)) 12908 return SDValue(); 12909 12910 // Combine: 12911 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 12912 // => BITCAST INSERT_VECTOR_ELT 12913 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 12914 // (BITCAST EN), N. 12915 SDValue Vec = DAG.getUNDEF(VecVT); 12916 SDLoc dl(N); 12917 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 12918 SDValue V = N->getOperand(Idx); 12919 if (V.isUndef()) 12920 continue; 12921 if (V.getOpcode() == ISD::BITCAST && 12922 V->getOperand(0).getValueType() == MVT::i32) 12923 // Fold obvious case. 12924 V = V.getOperand(0); 12925 else { 12926 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 12927 // Make the DAGCombiner fold the bitcasts. 12928 DCI.AddToWorklist(V.getNode()); 12929 } 12930 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 12931 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 12932 } 12933 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 12934 // Make the DAGCombiner fold the bitcasts. 12935 DCI.AddToWorklist(Vec.getNode()); 12936 return Vec; 12937} 12938 12939static SDValue 12940PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12941 EVT VT = N->getValueType(0); 12942 SDValue Op = N->getOperand(0); 12943 SDLoc dl(N); 12944 12945 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) 12946 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { 12947 // If the valuetypes are the same, we can remove the cast entirely. 12948 if (Op->getOperand(0).getValueType() == VT) 12949 return Op->getOperand(0); 12950 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, 12951 Op->getOperand(0).getValueType(), Op->getOperand(0)); 12952 } 12953 12954 return SDValue(); 12955} 12956 12957static SDValue PerformVCMPCombine(SDNode *N, 12958 TargetLowering::DAGCombinerInfo &DCI, 12959 const ARMSubtarget *Subtarget) { 12960 if (!Subtarget->hasMVEIntegerOps()) 12961 return SDValue(); 12962 12963 EVT VT = N->getValueType(0); 12964 SDValue Op0 = N->getOperand(0); 12965 SDValue Op1 = N->getOperand(1); 12966 ARMCC::CondCodes Cond = 12967 (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 12968 SDLoc dl(N); 12969 12970 // vcmp X, 0, cc -> vcmpz X, cc 12971 if (isZeroVector(Op1)) 12972 return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, 12973 N->getOperand(2)); 12974 12975 unsigned SwappedCond = getSwappedCondition(Cond); 12976 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { 12977 // vcmp 0, X, cc -> vcmpz X, reversed(cc) 12978 if (isZeroVector(Op0)) 12979 return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, 12980 DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); 12981 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) 12982 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) 12983 return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, 12984 DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); 12985 } 12986 12987 return SDValue(); 12988} 12989 12990/// PerformInsertEltCombine - Target-specific dag combine xforms for 12991/// ISD::INSERT_VECTOR_ELT. 12992static SDValue PerformInsertEltCombine(SDNode *N, 12993 TargetLowering::DAGCombinerInfo &DCI) { 12994 // Bitcast an i64 load inserted into a vector to f64. 
12995 // Otherwise, the i64 value will be legalized to a pair of i32 values. 12996 EVT VT = N->getValueType(0); 12997 SDNode *Elt = N->getOperand(1).getNode(); 12998 if (VT.getVectorElementType() != MVT::i64 || 12999 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 13000 return SDValue(); 13001 13002 SelectionDAG &DAG = DCI.DAG; 13003 SDLoc dl(N); 13004 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 13005 VT.getVectorNumElements()); 13006 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 13007 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 13008 // Make the DAGCombiner fold the bitcasts. 13009 DCI.AddToWorklist(Vec.getNode()); 13010 DCI.AddToWorklist(V.getNode()); 13011 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 13012 Vec, V, N->getOperand(2)); 13013 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 13014} 13015 13016/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 13017/// ISD::VECTOR_SHUFFLE. 13018static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 13019 // The LLVM shufflevector instruction does not require the shuffle mask 13020 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 13021 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 13022 // operands do not match the mask length, they are extended by concatenating 13023 // them with undef vectors. That is probably the right thing for other 13024 // targets, but for NEON it is better to concatenate two double-register 13025 // size vector operands into a single quad-register size vector. Do that 13026 // transformation here: 13027 // shuffle(concat(v1, undef), concat(v2, undef)) -> 13028 // shuffle(concat(v1, v2), undef) 13029 SDValue Op0 = N->getOperand(0); 13030 SDValue Op1 = N->getOperand(1); 13031 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 13032 Op1.getOpcode() != ISD::CONCAT_VECTORS || 13033 Op0.getNumOperands() != 2 || 13034 Op1.getNumOperands() != 2) 13035 return SDValue(); 13036 SDValue Concat0Op1 = Op0.getOperand(1); 13037 SDValue Concat1Op1 = Op1.getOperand(1); 13038 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 13039 return SDValue(); 13040 // Skip the transformation if any of the types are illegal. 13041 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13042 EVT VT = N->getValueType(0); 13043 if (!TLI.isTypeLegal(VT) || 13044 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 13045 !TLI.isTypeLegal(Concat1Op1.getValueType())) 13046 return SDValue(); 13047 13048 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 13049 Op0.getOperand(0), Op1.getOperand(0)); 13050 // Translate the shuffle mask. 13051 SmallVector<int, 16> NewMask; 13052 unsigned NumElts = VT.getVectorNumElements(); 13053 unsigned HalfElts = NumElts/2; 13054 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 13055 for (unsigned n = 0; n < NumElts; ++n) { 13056 int MaskElt = SVN->getMaskElt(n); 13057 int NewElt = -1; 13058 if (MaskElt < (int)HalfElts) 13059 NewElt = MaskElt; 13060 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 13061 NewElt = HalfElts + MaskElt - NumElts; 13062 NewMask.push_back(NewElt); 13063 } 13064 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 13065 DAG.getUNDEF(VT), NewMask); 13066} 13067 13068/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 13069/// NEON load/store intrinsics, and generic vector load/stores, to merge 13070/// base address updates. 
13071/// For generic load/stores, the memory type is assumed to be a vector. 13072/// The caller is assumed to have checked legality. 13073static SDValue CombineBaseUpdate(SDNode *N, 13074 TargetLowering::DAGCombinerInfo &DCI) { 13075 SelectionDAG &DAG = DCI.DAG; 13076 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 13077 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 13078 const bool isStore = N->getOpcode() == ISD::STORE; 13079 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 13080 SDValue Addr = N->getOperand(AddrOpIdx); 13081 MemSDNode *MemN = cast<MemSDNode>(N); 13082 SDLoc dl(N); 13083 13084 // Search for a use of the address operand that is an increment. 13085 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 13086 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 13087 SDNode *User = *UI; 13088 if (User->getOpcode() != ISD::ADD || 13089 UI.getUse().getResNo() != Addr.getResNo()) 13090 continue; 13091 13092 // Check that the add is independent of the load/store. Otherwise, folding 13093 // it would create a cycle. We can avoid searching through Addr as it's a 13094 // predecessor to both. 13095 SmallPtrSet<const SDNode *, 32> Visited; 13096 SmallVector<const SDNode *, 16> Worklist; 13097 Visited.insert(Addr.getNode()); 13098 Worklist.push_back(N); 13099 Worklist.push_back(User); 13100 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 13101 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 13102 continue; 13103 13104 // Find the new opcode for the updating load/store. 13105 bool isLoadOp = true; 13106 bool isLaneOp = false; 13107 unsigned NewOpc = 0; 13108 unsigned NumVecs = 0; 13109 if (isIntrinsic) { 13110 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 13111 switch (IntNo) { 13112 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 13113 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 13114 NumVecs = 1; break; 13115 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 13116 NumVecs = 2; break; 13117 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 13118 NumVecs = 3; break; 13119 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 13120 NumVecs = 4; break; 13121 case Intrinsic::arm_neon_vld2dup: 13122 case Intrinsic::arm_neon_vld3dup: 13123 case Intrinsic::arm_neon_vld4dup: 13124 // TODO: Support updating VLDxDUP nodes. For now, we just skip 13125 // combining base updates for such intrinsics. 
13126 continue; 13127 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 13128 NumVecs = 2; isLaneOp = true; break; 13129 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 13130 NumVecs = 3; isLaneOp = true; break; 13131 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 13132 NumVecs = 4; isLaneOp = true; break; 13133 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 13134 NumVecs = 1; isLoadOp = false; break; 13135 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 13136 NumVecs = 2; isLoadOp = false; break; 13137 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 13138 NumVecs = 3; isLoadOp = false; break; 13139 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 13140 NumVecs = 4; isLoadOp = false; break; 13141 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 13142 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 13143 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 13144 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 13145 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 13146 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 13147 } 13148 } else { 13149 isLaneOp = true; 13150 switch (N->getOpcode()) { 13151 default: llvm_unreachable("unexpected opcode for Neon base update"); 13152 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 13153 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 13154 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 13155 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 13156 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 13157 NumVecs = 1; isLaneOp = false; break; 13158 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 13159 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 13160 } 13161 } 13162 13163 // Find the size of memory referenced by the load/store. 13164 EVT VecTy; 13165 if (isLoadOp) { 13166 VecTy = N->getValueType(0); 13167 } else if (isIntrinsic) { 13168 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 13169 } else { 13170 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 13171 VecTy = N->getOperand(1).getValueType(); 13172 } 13173 13174 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 13175 if (isLaneOp) 13176 NumBytes /= VecTy.getVectorNumElements(); 13177 13178 // If the increment is a constant, it must match the memory ref size. 13179 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 13180 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 13181 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { 13182 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 13183 // separate instructions that make it harder to use a non-constant update. 13184 continue; 13185 } 13186 13187 // OK, we found an ADD we can fold into the base update. 13188 // Now, create a _UPD node, taking care of not breaking alignment. 13189 13190 EVT AlignedVecTy = VecTy; 13191 unsigned Alignment = MemN->getAlignment(); 13192 13193 // If this is a less-than-standard-aligned load/store, change the type to 13194 // match the standard alignment. 13195 // The alignment is overlooked when selecting _UPD variants; and it's 13196 // easier to introduce bitcasts here than fix that. 
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics). We need to change the
    //   memory type to match the explicit alignment. That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value. Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size()-2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                           MemN->getMemOperand());

    // Update the uses.
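    // As an illustration of the rewiring below (a sketch, not a verbatim
    // DAG): given
    //   vec1, vec2, ch = VLD2 chain, ptr, align  ;  inc = ADD ptr, #32
    // we have just built
    //   upd = VLD2_UPD chain, ptr, inc, align
    // and now replace the old node's vector and chain results with upd's,
    // and replace the ADD with upd's write-back address result.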
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is a non-standard-aligned LOAD, the first result is the loaded
    // value. Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}

static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
13361 if (ResNo == NumVecs) 13362 continue; 13363 SDNode *User = *UI; 13364 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 13365 } 13366 13367 // Now the vldN-lane intrinsic is dead except for its chain result. 13368 // Update uses of the chain. 13369 std::vector<SDValue> VLDDupResults; 13370 for (unsigned n = 0; n < NumVecs; ++n) 13371 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 13372 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 13373 DCI.CombineTo(VLD, VLDDupResults); 13374 13375 return true; 13376} 13377 13378/// PerformVDUPLANECombine - Target-specific dag combine xforms for 13379/// ARMISD::VDUPLANE. 13380static SDValue PerformVDUPLANECombine(SDNode *N, 13381 TargetLowering::DAGCombinerInfo &DCI) { 13382 SDValue Op = N->getOperand(0); 13383 13384 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 13385 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 13386 if (CombineVLDDUP(N, DCI)) 13387 return SDValue(N, 0); 13388 13389 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 13390 // redundant. Ignore bit_converts for now; element sizes are checked below. 13391 while (Op.getOpcode() == ISD::BITCAST) 13392 Op = Op.getOperand(0); 13393 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 13394 return SDValue(); 13395 13396 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 13397 unsigned EltSize = Op.getScalarValueSizeInBits(); 13398 // The canonical VMOV for a zero vector uses a 32-bit element size. 13399 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 13400 unsigned EltBits; 13401 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) 13402 EltSize = 8; 13403 EVT VT = N->getValueType(0); 13404 if (EltSize > VT.getScalarSizeInBits()) 13405 return SDValue(); 13406 13407 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 13408} 13409 13410/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 13411static SDValue PerformVDUPCombine(SDNode *N, 13412 TargetLowering::DAGCombinerInfo &DCI, 13413 const ARMSubtarget *Subtarget) { 13414 SelectionDAG &DAG = DCI.DAG; 13415 SDValue Op = N->getOperand(0); 13416 13417 if (!Subtarget->hasNEON()) 13418 return SDValue(); 13419 13420 // Match VDUP(LOAD) -> VLD1DUP. 13421 // We match this pattern here rather than waiting for isel because the 13422 // transform is only legal for unindexed loads. 13423 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 13424 if (LD && Op.hasOneUse() && LD->isUnindexed() && 13425 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 13426 SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), 13427 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; 13428 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 13429 SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, 13430 Ops, LD->getMemoryVT(), 13431 LD->getMemOperand()); 13432 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 13433 return VLDDup; 13434 } 13435 13436 return SDValue(); 13437} 13438 13439static SDValue PerformLOADCombine(SDNode *N, 13440 TargetLowering::DAGCombinerInfo &DCI) { 13441 EVT VT = N->getValueType(0); 13442 13443 // If this is a legal vector load, try to combine it into a VLD1_UPD. 
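  // For example (a sketch): a legal <4 x i32> load whose address also feeds
  // an (add ptr, #16) can be folded by CombineBaseUpdate into a VLD1_UPD,
  // i.e. a post-incrementing "vld1.32 {d16, d17}, [r0]!".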
13444 if (ISD::isNormalLoad(N) && VT.isVector() && 13445 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13446 return CombineBaseUpdate(N, DCI); 13447 13448 return SDValue(); 13449} 13450 13451// Optimize trunc store (of multiple scalars) to shuffle and store. First, 13452// pack all of the elements in one place. Next, store to memory in fewer 13453// chunks. 13454static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, 13455 SelectionDAG &DAG) { 13456 SDValue StVal = St->getValue(); 13457 EVT VT = StVal.getValueType(); 13458 if (!St->isTruncatingStore() || !VT.isVector()) 13459 return SDValue(); 13460 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13461 EVT StVT = St->getMemoryVT(); 13462 unsigned NumElems = VT.getVectorNumElements(); 13463 assert(StVT != VT && "Cannot truncate to the same type"); 13464 unsigned FromEltSz = VT.getScalarSizeInBits(); 13465 unsigned ToEltSz = StVT.getScalarSizeInBits(); 13466 13467 // From, To sizes and ElemCount must be pow of two 13468 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) 13469 return SDValue(); 13470 13471 // We are going to use the original vector elt for storing. 13472 // Accumulated smaller vector elements must be a multiple of the store size. 13473 if (0 != (NumElems * FromEltSz) % ToEltSz) 13474 return SDValue(); 13475 13476 unsigned SizeRatio = FromEltSz / ToEltSz; 13477 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 13478 13479 // Create a type on which we perform the shuffle. 13480 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 13481 NumElems * SizeRatio); 13482 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 13483 13484 SDLoc DL(St); 13485 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 13486 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 13487 for (unsigned i = 0; i < NumElems; ++i) 13488 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 13489 : i * SizeRatio; 13490 13491 // Can't shuffle using an illegal type. 13492 if (!TLI.isTypeLegal(WideVecVT)) 13493 return SDValue(); 13494 13495 SDValue Shuff = DAG.getVectorShuffle( 13496 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); 13497 // At this point all of the data is stored at the bottom of the 13498 // register. We now need to save it to mem. 13499 13500 // Find the largest store unit 13501 MVT StoreType = MVT::i8; 13502 for (MVT Tp : MVT::integer_valuetypes()) { 13503 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 13504 StoreType = Tp; 13505 } 13506 // Didn't find a legal store type. 13507 if (!TLI.isTypeLegal(StoreType)) 13508 return SDValue(); 13509 13510 // Bitcast the original vector into a vector of store-size units 13511 EVT StoreVecVT = 13512 EVT::getVectorVT(*DAG.getContext(), StoreType, 13513 VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); 13514 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 13515 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 13516 SmallVector<SDValue, 8> Chains; 13517 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 13518 TLI.getPointerTy(DAG.getDataLayout())); 13519 SDValue BasePtr = St->getBasePtr(); 13520 13521 // Perform one or more big stores into memory. 
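  // Worked example (a sketch): for a <4 x i32> store truncated to <4 x i8>,
  // SizeRatio is 4 and the shuffle above packed the four live bytes into the
  // low lanes, so a single legal i32 store (E == 1) writes them all out.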
  unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
  for (unsigned I = 0; I < E; I++) {
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
                                 ShuffWide, DAG.getIntPtrConstant(I, DL));
    SDValue Ch =
        DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    BasePtr =
        DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
    Chains.push_back(Ch);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

// Try taking a single vector store from a truncate (which would otherwise turn
// into an expensive buildvector) and splitting it into a series of narrowing
// stores.
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
                                                 SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Trunc = St->getValue();
  if (Trunc->getOpcode() != ISD::TRUNCATE)
    return SDValue();
  EVT FromVT = Trunc->getOperand(0).getValueType();
  EVT ToVT = Trunc.getValueType();
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  unsigned NumElements = 0;
  if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
    NumElements = 4;
  if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
    NumElements = 8;
  if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
      FromVT.getVectorNumElements() % NumElements != 0)
    return SDValue();

  SDLoc DL(St);
  // Details about the old store
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  unsigned Alignment = St->getOriginalAlignment();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();

  EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
  EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);

  SmallVector<SDValue, 4> Stores;
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
    SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);

    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
                    DAG.getConstant(i * NumElements, DL, MVT::i32));
    SDValue Store = DAG.getTruncStore(
        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
        NewToVT, Alignment, MMOFlags, AAInfo);
    Stores.push_back(Store);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
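///
/// For MVE, PerformSplittingToNarrowingStores above turns (as a sketch) a
/// store of (trunc <16 x i16> %x to <16 x i8>) into two <8 x i16> truncating
/// stores, each of which can select to a narrowing store such as vstrb.16,
/// instead of forcing a buildvector of the illegal <16 x i16> value.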
13592static SDValue PerformSTORECombine(SDNode *N, 13593 TargetLowering::DAGCombinerInfo &DCI, 13594 const ARMSubtarget *Subtarget) { 13595 StoreSDNode *St = cast<StoreSDNode>(N); 13596 if (St->isVolatile()) 13597 return SDValue(); 13598 SDValue StVal = St->getValue(); 13599 EVT VT = StVal.getValueType(); 13600 13601 if (Subtarget->hasNEON()) 13602 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) 13603 return Store; 13604 13605 if (Subtarget->hasMVEIntegerOps()) 13606 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) 13607 return NewToken; 13608 13609 if (!ISD::isNormalStore(St)) 13610 return SDValue(); 13611 13612 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 13613 // ARM stores of arguments in the same cache line. 13614 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 13615 StVal.getNode()->hasOneUse()) { 13616 SelectionDAG &DAG = DCI.DAG; 13617 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 13618 SDLoc DL(St); 13619 SDValue BasePtr = St->getBasePtr(); 13620 SDValue NewST1 = DAG.getStore( 13621 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 13622 BasePtr, St->getPointerInfo(), St->getAlignment(), 13623 St->getMemOperand()->getFlags()); 13624 13625 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 13626 DAG.getConstant(4, DL, MVT::i32)); 13627 return DAG.getStore(NewST1.getValue(0), DL, 13628 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 13629 OffsetPtr, St->getPointerInfo(), 13630 std::min(4U, St->getAlignment() / 2), 13631 St->getMemOperand()->getFlags()); 13632 } 13633 13634 if (StVal.getValueType() == MVT::i64 && 13635 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 13636 13637 // Bitcast an i64 store extracted from a vector to f64. 13638 // Otherwise, the i64 value will be legalized to a pair of i32 values. 13639 SelectionDAG &DAG = DCI.DAG; 13640 SDLoc dl(StVal); 13641 SDValue IntVec = StVal.getOperand(0); 13642 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 13643 IntVec.getValueType().getVectorNumElements()); 13644 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 13645 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 13646 Vec, StVal.getOperand(1)); 13647 dl = SDLoc(N); 13648 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 13649 // Make the DAGCombiner fold the bitcasts. 13650 DCI.AddToWorklist(Vec.getNode()); 13651 DCI.AddToWorklist(ExtElt.getNode()); 13652 DCI.AddToWorklist(V.getNode()); 13653 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), 13654 St->getPointerInfo(), St->getAlignment(), 13655 St->getMemOperand()->getFlags(), St->getAAInfo()); 13656 } 13657 13658 // If this is a legal vector store, try to combine it into a VST1_UPD. 13659 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && 13660 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13661 return CombineBaseUpdate(N, DCI); 13662 13663 return SDValue(); 13664} 13665 13666/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) 13667/// can replace combinations of VMUL and VCVT (floating-point to integer) 13668/// when the VMUL has a constant operand that is a power of 2. 
13669/// 13670/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 13671/// vmul.f32 d16, d17, d16 13672/// vcvt.s32.f32 d16, d16 13673/// becomes: 13674/// vcvt.s32.f32 d16, d16, #3 13675static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, 13676 const ARMSubtarget *Subtarget) { 13677 if (!Subtarget->hasNEON()) 13678 return SDValue(); 13679 13680 SDValue Op = N->getOperand(0); 13681 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 13682 Op.getOpcode() != ISD::FMUL) 13683 return SDValue(); 13684 13685 SDValue ConstVec = Op->getOperand(1); 13686 if (!isa<BuildVectorSDNode>(ConstVec)) 13687 return SDValue(); 13688 13689 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 13690 uint32_t FloatBits = FloatTy.getSizeInBits(); 13691 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 13692 uint32_t IntBits = IntTy.getSizeInBits(); 13693 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 13694 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 13695 // These instructions only exist converting from f32 to i32. We can handle 13696 // smaller integers by generating an extra truncate, but larger ones would 13697 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 13698 // these intructions only support v2i32/v4i32 types. 13699 return SDValue(); 13700 } 13701 13702 BitVector UndefElements; 13703 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 13704 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 13705 if (C == -1 || C == 0 || C > 32) 13706 return SDValue(); 13707 13708 SDLoc dl(N); 13709 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; 13710 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : 13711 Intrinsic::arm_neon_vcvtfp2fxu; 13712 SDValue FixConv = DAG.getNode( 13713 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 13714 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), 13715 DAG.getConstant(C, dl, MVT::i32)); 13716 13717 if (IntBits < FloatBits) 13718 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); 13719 13720 return FixConv; 13721} 13722 13723/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) 13724/// can replace combinations of VCVT (integer to floating-point) and VDIV 13725/// when the VDIV has a constant operand that is a power of 2. 
13726/// 13727/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 13728/// vcvt.f32.s32 d16, d16 13729/// vdiv.f32 d16, d17, d16 13730/// becomes: 13731/// vcvt.f32.s32 d16, d16, #3 13732static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, 13733 const ARMSubtarget *Subtarget) { 13734 if (!Subtarget->hasNEON()) 13735 return SDValue(); 13736 13737 SDValue Op = N->getOperand(0); 13738 unsigned OpOpcode = Op.getNode()->getOpcode(); 13739 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || 13740 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) 13741 return SDValue(); 13742 13743 SDValue ConstVec = N->getOperand(1); 13744 if (!isa<BuildVectorSDNode>(ConstVec)) 13745 return SDValue(); 13746 13747 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 13748 uint32_t FloatBits = FloatTy.getSizeInBits(); 13749 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 13750 uint32_t IntBits = IntTy.getSizeInBits(); 13751 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 13752 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 13753 // These instructions only exist converting from i32 to f32. We can handle 13754 // smaller integers by generating an extra extend, but larger ones would 13755 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 13756 // these intructions only support v2i32/v4i32 types. 13757 return SDValue(); 13758 } 13759 13760 BitVector UndefElements; 13761 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 13762 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 13763 if (C == -1 || C == 0 || C > 32) 13764 return SDValue(); 13765 13766 SDLoc dl(N); 13767 bool isSigned = OpOpcode == ISD::SINT_TO_FP; 13768 SDValue ConvInput = Op.getOperand(0); 13769 if (IntBits < FloatBits) 13770 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 13771 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 13772 ConvInput); 13773 13774 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : 13775 Intrinsic::arm_neon_vcvtfxu2fp; 13776 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, 13777 Op.getValueType(), 13778 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), 13779 ConvInput, DAG.getConstant(C, dl, MVT::i32)); 13780} 13781 13782/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 13783static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { 13784 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 13785 switch (IntNo) { 13786 default: 13787 // Don't do anything for most intrinsics. 13788 break; 13789 13790 // Vector shifts: check for immediate versions and lower them. 13791 // Note: This is done during DAG combining instead of DAG legalizing because 13792 // the build_vectors for 64-bit vector element shift counts are generally 13793 // not legal, and it is hard to see their values after they get legalized to 13794 // loads from a constant pool. 
13795 case Intrinsic::arm_neon_vshifts: 13796 case Intrinsic::arm_neon_vshiftu: 13797 case Intrinsic::arm_neon_vrshifts: 13798 case Intrinsic::arm_neon_vrshiftu: 13799 case Intrinsic::arm_neon_vrshiftn: 13800 case Intrinsic::arm_neon_vqshifts: 13801 case Intrinsic::arm_neon_vqshiftu: 13802 case Intrinsic::arm_neon_vqshiftsu: 13803 case Intrinsic::arm_neon_vqshiftns: 13804 case Intrinsic::arm_neon_vqshiftnu: 13805 case Intrinsic::arm_neon_vqshiftnsu: 13806 case Intrinsic::arm_neon_vqrshiftns: 13807 case Intrinsic::arm_neon_vqrshiftnu: 13808 case Intrinsic::arm_neon_vqrshiftnsu: { 13809 EVT VT = N->getOperand(1).getValueType(); 13810 int64_t Cnt; 13811 unsigned VShiftOpc = 0; 13812 13813 switch (IntNo) { 13814 case Intrinsic::arm_neon_vshifts: 13815 case Intrinsic::arm_neon_vshiftu: 13816 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 13817 VShiftOpc = ARMISD::VSHLIMM; 13818 break; 13819 } 13820 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 13821 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM 13822 : ARMISD::VSHRuIMM); 13823 break; 13824 } 13825 return SDValue(); 13826 13827 case Intrinsic::arm_neon_vrshifts: 13828 case Intrinsic::arm_neon_vrshiftu: 13829 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 13830 break; 13831 return SDValue(); 13832 13833 case Intrinsic::arm_neon_vqshifts: 13834 case Intrinsic::arm_neon_vqshiftu: 13835 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 13836 break; 13837 return SDValue(); 13838 13839 case Intrinsic::arm_neon_vqshiftsu: 13840 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 13841 break; 13842 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 13843 13844 case Intrinsic::arm_neon_vrshiftn: 13845 case Intrinsic::arm_neon_vqshiftns: 13846 case Intrinsic::arm_neon_vqshiftnu: 13847 case Intrinsic::arm_neon_vqshiftnsu: 13848 case Intrinsic::arm_neon_vqrshiftns: 13849 case Intrinsic::arm_neon_vqrshiftnu: 13850 case Intrinsic::arm_neon_vqrshiftnsu: 13851 // Narrowing shifts require an immediate right shift. 13852 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 13853 break; 13854 llvm_unreachable("invalid shift count for narrowing vector shift " 13855 "intrinsic"); 13856 13857 default: 13858 llvm_unreachable("unhandled vector shift"); 13859 } 13860 13861 switch (IntNo) { 13862 case Intrinsic::arm_neon_vshifts: 13863 case Intrinsic::arm_neon_vshiftu: 13864 // Opcode already set above. 
13865 break; 13866 case Intrinsic::arm_neon_vrshifts: 13867 VShiftOpc = ARMISD::VRSHRsIMM; 13868 break; 13869 case Intrinsic::arm_neon_vrshiftu: 13870 VShiftOpc = ARMISD::VRSHRuIMM; 13871 break; 13872 case Intrinsic::arm_neon_vrshiftn: 13873 VShiftOpc = ARMISD::VRSHRNIMM; 13874 break; 13875 case Intrinsic::arm_neon_vqshifts: 13876 VShiftOpc = ARMISD::VQSHLsIMM; 13877 break; 13878 case Intrinsic::arm_neon_vqshiftu: 13879 VShiftOpc = ARMISD::VQSHLuIMM; 13880 break; 13881 case Intrinsic::arm_neon_vqshiftsu: 13882 VShiftOpc = ARMISD::VQSHLsuIMM; 13883 break; 13884 case Intrinsic::arm_neon_vqshiftns: 13885 VShiftOpc = ARMISD::VQSHRNsIMM; 13886 break; 13887 case Intrinsic::arm_neon_vqshiftnu: 13888 VShiftOpc = ARMISD::VQSHRNuIMM; 13889 break; 13890 case Intrinsic::arm_neon_vqshiftnsu: 13891 VShiftOpc = ARMISD::VQSHRNsuIMM; 13892 break; 13893 case Intrinsic::arm_neon_vqrshiftns: 13894 VShiftOpc = ARMISD::VQRSHRNsIMM; 13895 break; 13896 case Intrinsic::arm_neon_vqrshiftnu: 13897 VShiftOpc = ARMISD::VQRSHRNuIMM; 13898 break; 13899 case Intrinsic::arm_neon_vqrshiftnsu: 13900 VShiftOpc = ARMISD::VQRSHRNsuIMM; 13901 break; 13902 } 13903 13904 SDLoc dl(N); 13905 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 13906 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 13907 } 13908 13909 case Intrinsic::arm_neon_vshiftins: { 13910 EVT VT = N->getOperand(1).getValueType(); 13911 int64_t Cnt; 13912 unsigned VShiftOpc = 0; 13913 13914 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 13915 VShiftOpc = ARMISD::VSLIIMM; 13916 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 13917 VShiftOpc = ARMISD::VSRIIMM; 13918 else { 13919 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 13920 } 13921 13922 SDLoc dl(N); 13923 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 13924 N->getOperand(1), N->getOperand(2), 13925 DAG.getConstant(Cnt, dl, MVT::i32)); 13926 } 13927 13928 case Intrinsic::arm_neon_vqrshifts: 13929 case Intrinsic::arm_neon_vqrshiftu: 13930 // No immediate versions of these to check for. 13931 break; 13932 } 13933 13934 return SDValue(); 13935} 13936 13937/// PerformShiftCombine - Checks for immediate versions of vector shifts and 13938/// lowers them. As with the vector shift intrinsics, this is done during DAG 13939/// combining instead of DAG legalizing because the build_vectors for 64-bit 13940/// vector element shift counts are generally not legal, and it is hard to see 13941/// their values after they get legalized to loads from a constant pool. 13942static SDValue PerformShiftCombine(SDNode *N, 13943 TargetLowering::DAGCombinerInfo &DCI, 13944 const ARMSubtarget *ST) { 13945 SelectionDAG &DAG = DCI.DAG; 13946 EVT VT = N->getValueType(0); 13947 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 13948 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 13949 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 
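    // As a sketch, for an x whose top 16 bits are known zero:
    //   rev r0, r0
    //   lsr r0, r0, #16
    // becomes
    //   rev16 r0, r0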
13950 SDValue N1 = N->getOperand(1); 13951 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 13952 SDValue N0 = N->getOperand(0); 13953 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 13954 DAG.MaskedValueIsZero(N0.getOperand(0), 13955 APInt::getHighBitsSet(32, 16))) 13956 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 13957 } 13958 } 13959 13960 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && 13961 N->getOperand(0)->getOpcode() == ISD::AND && 13962 N->getOperand(0)->hasOneUse()) { 13963 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13964 return SDValue(); 13965 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't 13966 // usually show up because instcombine prefers to canonicalize it to 13967 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come 13968 // out of GEP lowering in some cases. 13969 SDValue N0 = N->getOperand(0); 13970 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13971 if (!ShiftAmtNode) 13972 return SDValue(); 13973 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 13974 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 13975 if (!AndMaskNode) 13976 return SDValue(); 13977 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 13978 // Don't transform uxtb/uxth. 13979 if (AndMask == 255 || AndMask == 65535) 13980 return SDValue(); 13981 if (isMask_32(AndMask)) { 13982 uint32_t MaskedBits = countLeadingZeros(AndMask); 13983 if (MaskedBits > ShiftAmt) { 13984 SDLoc DL(N); 13985 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 13986 DAG.getConstant(MaskedBits, DL, MVT::i32)); 13987 return DAG.getNode( 13988 ISD::SRL, DL, MVT::i32, SHL, 13989 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 13990 } 13991 } 13992 } 13993 13994 // Nothing to be done for scalar shifts. 13995 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13996 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 13997 return SDValue(); 13998 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) 13999 return SDValue(); 14000 14001 int64_t Cnt; 14002 14003 switch (N->getOpcode()) { 14004 default: llvm_unreachable("unexpected shift opcode"); 14005 14006 case ISD::SHL: 14007 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 14008 SDLoc dl(N); 14009 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 14010 DAG.getConstant(Cnt, dl, MVT::i32)); 14011 } 14012 break; 14013 14014 case ISD::SRA: 14015 case ISD::SRL: 14016 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 14017 unsigned VShiftOpc = 14018 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 14019 SDLoc dl(N); 14020 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 14021 DAG.getConstant(Cnt, dl, MVT::i32)); 14022 } 14023 } 14024 return SDValue(); 14025} 14026 14027// Look for a sign/zero extend of a larger than legal load. This can be split 14028// into two extending loads, which are simpler to deal with than an arbitrary 14029// sign extend. 
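// E.g. (a sketch): (v8i32 (sext (load v8i16))) becomes two v4i16 -> v4i32
// extending loads of the two halves, concatenated back together.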
14030static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { 14031 SDValue N0 = N->getOperand(0); 14032 if (N0.getOpcode() != ISD::LOAD) 14033 return SDValue(); 14034 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); 14035 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || 14036 LD->getExtensionType() != ISD::NON_EXTLOAD) 14037 return SDValue(); 14038 EVT FromVT = LD->getValueType(0); 14039 EVT ToVT = N->getValueType(0); 14040 if (!ToVT.isVector()) 14041 return SDValue(); 14042 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 14043 EVT ToEltVT = ToVT.getVectorElementType(); 14044 EVT FromEltVT = FromVT.getVectorElementType(); 14045 14046 unsigned NumElements = 0; 14047 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) 14048 NumElements = 4; 14049 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) 14050 NumElements = 8; 14051 if (NumElements == 0 || 14052 FromVT.getVectorNumElements() == NumElements || 14053 FromVT.getVectorNumElements() % NumElements != 0 || 14054 !isPowerOf2_32(NumElements)) 14055 return SDValue(); 14056 14057 SDLoc DL(LD); 14058 // Details about the old load 14059 SDValue Ch = LD->getChain(); 14060 SDValue BasePtr = LD->getBasePtr(); 14061 unsigned Alignment = LD->getOriginalAlignment(); 14062 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 14063 AAMDNodes AAInfo = LD->getAAInfo(); 14064 14065 ISD::LoadExtType NewExtType = 14066 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 14067 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 14068 EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); 14069 EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); 14070 unsigned NewOffset = NewFromVT.getSizeInBits() / 8; 14071 SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); 14072 14073 // Split the load in half, each side of which is extended separately. This 14074 // is good enough, as legalisation will take it from there. They are either 14075 // already legal or they will be split further into something that is 14076 // legal. 14077 SDValue NewLoad1 = 14078 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, 14079 LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); 14080 SDValue NewLoad2 = 14081 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 14082 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 14083 Alignment, MMOFlags, AAInfo); 14084 14085 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 14086 SDValue(NewLoad1.getNode(), 1), 14087 SDValue(NewLoad2.getNode(), 1)); 14088 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 14089 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); 14090} 14091 14092/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 14093/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 14094static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 14095 const ARMSubtarget *ST) { 14096 SDValue N0 = N->getOperand(0); 14097 14098 // Check for sign- and zero-extensions of vector extract operations of 8- and 14099 // 16-bit vector elements. NEON and MVE support these directly. They are 14100 // handled during DAG combining because type legalization will promote them 14101 // to 32-bit types and it is messy to recognize the operations after that. 
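  // For example (a sketch): (i32 (sext (extract_vector_elt (v8i16 %v), 3)))
  // maps onto a single VGETLANEs node, i.e. "vmov.s16 r0, d0[3]".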
14102 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) && 14103 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 14104 SDValue Vec = N0.getOperand(0); 14105 SDValue Lane = N0.getOperand(1); 14106 EVT VT = N->getValueType(0); 14107 EVT EltVT = N0.getValueType(); 14108 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14109 14110 if (VT == MVT::i32 && 14111 (EltVT == MVT::i8 || EltVT == MVT::i16) && 14112 TLI.isTypeLegal(Vec.getValueType()) && 14113 isa<ConstantSDNode>(Lane)) { 14114 14115 unsigned Opc = 0; 14116 switch (N->getOpcode()) { 14117 default: llvm_unreachable("unexpected opcode"); 14118 case ISD::SIGN_EXTEND: 14119 Opc = ARMISD::VGETLANEs; 14120 break; 14121 case ISD::ZERO_EXTEND: 14122 case ISD::ANY_EXTEND: 14123 Opc = ARMISD::VGETLANEu; 14124 break; 14125 } 14126 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 14127 } 14128 } 14129 14130 if (ST->hasMVEIntegerOps()) 14131 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 14132 return NewLoad; 14133 14134 return SDValue(); 14135} 14136 14137static const APInt *isPowerOf2Constant(SDValue V) { 14138 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 14139 if (!C) 14140 return nullptr; 14141 const APInt *CV = &C->getAPIntValue(); 14142 return CV->isPowerOf2() ? CV : nullptr; 14143} 14144 14145SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 14146 // If we have a CMOV, OR and AND combination such as: 14147 // if (x & CN) 14148 // y |= CM; 14149 // 14150 // And: 14151 // * CN is a single bit; 14152 // * All bits covered by CM are known zero in y 14153 // 14154 // Then we can convert this into a sequence of BFI instructions. This will 14155 // always be a win if CM is a single bit, will always be no worse than the 14156 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 14157 // three bits (due to the extra IT instruction). 14158 14159 SDValue Op0 = CMOV->getOperand(0); 14160 SDValue Op1 = CMOV->getOperand(1); 14161 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 14162 auto CC = CCNode->getAPIntValue().getLimitedValue(); 14163 SDValue CmpZ = CMOV->getOperand(4); 14164 14165 // The compare must be against zero. 14166 if (!isNullConstant(CmpZ->getOperand(1))) 14167 return SDValue(); 14168 14169 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 14170 SDValue And = CmpZ->getOperand(0); 14171 if (And->getOpcode() != ISD::AND) 14172 return SDValue(); 14173 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 14174 if (!AndC) 14175 return SDValue(); 14176 SDValue X = And->getOperand(0); 14177 14178 if (CC == ARMCC::EQ) { 14179 // We're performing an "equal to zero" compare. Swap the operands so we 14180 // canonicalize on a "not equal to zero" compare. 14181 std::swap(Op0, Op1); 14182 } else { 14183 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 14184 } 14185 14186 if (Op1->getOpcode() != ISD::OR) 14187 return SDValue(); 14188 14189 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 14190 if (!OrC) 14191 return SDValue(); 14192 SDValue Y = Op1->getOperand(0); 14193 14194 if (Op0 != Y) 14195 return SDValue(); 14196 14197 // Now, is it profitable to continue? 14198 APInt OrCI = OrC->getAPIntValue(); 14199 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 14200 if (OrCI.countPopulation() > Heuristic) 14201 return SDValue(); 14202 14203 // Lastly, can we determine that the bits defined by OrCI 14204 // are zero in Y? 
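  // (That check is what makes the rewrite sound: as a sketch, with
  // CN == (1 << 4) and CM == (1 << 7), instead of
  //   tst r0, #16
  //   orrne r1, r1, #128
  // we can emit
  //   lsr r2, r0, #4
  //   bfi r1, r2, #7, #1
  // since bit 7 of y is known to be zero when the ORR is not taken.)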
  KnownBits Known = DAG.computeKnownBits(Y);
  if ((OrCI & Known.Zero) != OrCI)
    return SDValue();

  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->logBase2();

  if (BitInX != 0) {
    // We must shift X first.
    X = DAG.getNode(ISD::SRL, dl, VT, X,
                    DAG.getConstant(BitInX, dl, VT));
  }

  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }

  return V;
}

// Given N, the value controlling the conditional branch, search for the loop
// intrinsic, returning it, along with how the value is used. We need to handle
// patterns such as the following:
// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
// (brcond (setcc (loop.decrement), 0, eq), exit)
// (brcond (setcc (loop.decrement), 0, ne), header)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
                                   bool &Negate) {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::XOR: {
    if (!isa<ConstantSDNode>(N.getOperand(1)))
      return SDValue();
    if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
      return SDValue();
    Negate = !Negate;
    return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
  }
  case ISD::SETCC: {
    auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!Const)
      return SDValue();
    if (Const->isNullValue())
      Imm = 0;
    else if (Const->isOne())
      Imm = 1;
    else
      return SDValue();
    CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
    return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
    if (IntOp != Intrinsic::test_set_loop_iterations &&
        IntOp != Intrinsic::loop_decrement_reg)
      return SDValue();
    return N;
  }
  }
  return SDValue();
}

static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {

  // The hwloop intrinsics that we're interested in are used for control-flow,
  // either for entering or exiting the loop:
  // - test.set.loop.iterations will test whether its operand is zero. If it
  //   is zero, the proceeding branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the proceeding branch should not branch back to the beginning of
  //   the loop.
  // So here, we need to check how the brcond is using the result of each of
  // the intrinsics to ensure that we're branching to the right place at the
  // right time.
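  // As a sketch, a DAG of the form
  //   (brcond (setcc (test.set.loop.iterations %n), 0, eq), exit)
  // is rewritten below into (WLS %n, exit): branch to the exit when the trip
  // count %n is zero, otherwise fall through into the loop.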
14292 14293 ISD::CondCode CC; 14294 SDValue Cond; 14295 int Imm = 1; 14296 bool Negate = false; 14297 SDValue Chain = N->getOperand(0); 14298 SDValue Dest; 14299 14300 if (N->getOpcode() == ISD::BRCOND) { 14301 CC = ISD::SETEQ; 14302 Cond = N->getOperand(1); 14303 Dest = N->getOperand(2); 14304 } else { 14305 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); 14306 CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 14307 Cond = N->getOperand(2); 14308 Dest = N->getOperand(4); 14309 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { 14310 if (!Const->isOne() && !Const->isNullValue()) 14311 return SDValue(); 14312 Imm = Const->getZExtValue(); 14313 } else 14314 return SDValue(); 14315 } 14316 14317 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); 14318 if (!Int) 14319 return SDValue(); 14320 14321 if (Negate) 14322 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32); 14323 14324 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { 14325 return (CC == ISD::SETEQ && Imm == 0) || 14326 (CC == ISD::SETNE && Imm == 1) || 14327 (CC == ISD::SETLT && Imm == 1) || 14328 (CC == ISD::SETULT && Imm == 1); 14329 }; 14330 14331 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { 14332 return (CC == ISD::SETEQ && Imm == 1) || 14333 (CC == ISD::SETNE && Imm == 0) || 14334 (CC == ISD::SETGT && Imm == 0) || 14335 (CC == ISD::SETUGT && Imm == 0) || 14336 (CC == ISD::SETGE && Imm == 1) || 14337 (CC == ISD::SETUGE && Imm == 1); 14338 }; 14339 14340 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && 14341 "unsupported condition"); 14342 14343 SDLoc dl(Int); 14344 SelectionDAG &DAG = DCI.DAG; 14345 SDValue Elements = Int.getOperand(2); 14346 unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); 14347 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) 14348 && "expected single br user"); 14349 SDNode *Br = *N->use_begin(); 14350 SDValue OtherTarget = Br->getOperand(1); 14351 14352 // Update the unconditional branch to branch to the given Dest. 14353 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { 14354 SDValue NewBrOps[] = { Br->getOperand(0), Dest }; 14355 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); 14356 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); 14357 }; 14358 14359 if (IntOp == Intrinsic::test_set_loop_iterations) { 14360 SDValue Res; 14361 // We expect this 'instruction' to branch when the counter is zero. 14362 if (IsTrueIfZero(CC, Imm)) { 14363 SDValue Ops[] = { Chain, Elements, Dest }; 14364 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 14365 } else { 14366 // The logic is the reverse of what we need for WLS, so find the other 14367 // basic block target: the target of the proceeding br. 14368 UpdateUncondBr(Br, Dest, DAG); 14369 14370 SDValue Ops[] = { Chain, Elements, OtherTarget }; 14371 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 14372 } 14373 DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); 14374 return Res; 14375 } else { 14376 SDValue Size = DAG.getTargetConstant( 14377 cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); 14378 SDValue Args[] = { Int.getOperand(0), Elements, Size, }; 14379 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, 14380 DAG.getVTList(MVT::i32, MVT::Other), Args); 14381 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); 14382 14383 // We expect this instruction to branch when the count is not zero. 14384 SDValue Target = IsFalseIfZero(CC, Imm) ? 
Dest : OtherTarget; 14385 14386 // Update the unconditional branch to target the loop preheader if we've 14387 // found the condition has been reversed. 14388 if (Target == OtherTarget) 14389 UpdateUncondBr(Br, Dest, DAG); 14390 14391 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 14392 SDValue(LoopDec.getNode(), 1), Chain); 14393 14394 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; 14395 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); 14396 } 14397 return SDValue(); 14398} 14399 14400/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 14401SDValue 14402ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 14403 SDValue Cmp = N->getOperand(4); 14404 if (Cmp.getOpcode() != ARMISD::CMPZ) 14405 // Only looking at NE cases. 14406 return SDValue(); 14407 14408 EVT VT = N->getValueType(0); 14409 SDLoc dl(N); 14410 SDValue LHS = Cmp.getOperand(0); 14411 SDValue RHS = Cmp.getOperand(1); 14412 SDValue Chain = N->getOperand(0); 14413 SDValue BB = N->getOperand(1); 14414 SDValue ARMcc = N->getOperand(2); 14415 ARMCC::CondCodes CC = 14416 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 14417 14418 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 14419 // -> (brcond Chain BB CC CPSR Cmp) 14420 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 14421 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 14422 LHS->getOperand(0)->hasOneUse()) { 14423 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 14424 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 14425 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 14426 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 14427 if ((LHS00C && LHS00C->getZExtValue() == 0) && 14428 (LHS01C && LHS01C->getZExtValue() == 1) && 14429 (LHS1C && LHS1C->getZExtValue() == 1) && 14430 (RHSC && RHSC->getZExtValue() == 0)) { 14431 return DAG.getNode( 14432 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 14433 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 14434 } 14435 } 14436 14437 return SDValue(); 14438} 14439 14440/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 14441SDValue 14442ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 14443 SDValue Cmp = N->getOperand(4); 14444 if (Cmp.getOpcode() != ARMISD::CMPZ) 14445 // Only looking at EQ and NE cases. 14446 return SDValue(); 14447 14448 EVT VT = N->getValueType(0); 14449 SDLoc dl(N); 14450 SDValue LHS = Cmp.getOperand(0); 14451 SDValue RHS = Cmp.getOperand(1); 14452 SDValue FalseVal = N->getOperand(0); 14453 SDValue TrueVal = N->getOperand(1); 14454 SDValue ARMcc = N->getOperand(2); 14455 ARMCC::CondCodes CC = 14456 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 14457 14458 // BFI is only available on V6T2+. 14459 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 14460 SDValue R = PerformCMOVToBFICombine(N, DAG); 14461 if (R) 14462 return R; 14463 } 14464 14465 // Simplify 14466 // mov r1, r0 14467 // cmp r1, x 14468 // mov r0, y 14469 // moveq r0, x 14470 // to 14471 // cmp r0, x 14472 // movne r0, y 14473 // 14474 // mov r1, r0 14475 // cmp r1, x 14476 // mov r0, x 14477 // movne r0, y 14478 // to 14479 // cmp r0, x 14480 // movne r0, y 14481 /// FIXME: Turn this into a target neutral optimization? 
14482 SDValue Res; 14483 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 14484 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 14485 N->getOperand(3), Cmp); 14486 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 14487 SDValue ARMcc; 14488 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 14489 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 14490 N->getOperand(3), NewCmp); 14491 } 14492 14493 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 14494 // -> (cmov F T CC CPSR Cmp) 14495 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 14496 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 14497 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 14498 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 14499 if ((LHS0C && LHS0C->getZExtValue() == 0) && 14500 (LHS1C && LHS1C->getZExtValue() == 1) && 14501 (RHSC && RHSC->getZExtValue() == 0)) { 14502 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 14503 LHS->getOperand(2), LHS->getOperand(3), 14504 LHS->getOperand(4)); 14505 } 14506 } 14507 14508 if (!VT.isInteger()) 14509 return SDValue(); 14510 14511 // Materialize a boolean comparison for integers so we can avoid branching. 14512 if (isNullConstant(FalseVal)) { 14513 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { 14514 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { 14515 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it 14516 // right 5 bits will make that 32 be 1, otherwise it will be 0. 14517 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 14518 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 14519 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), 14520 DAG.getConstant(5, dl, MVT::i32)); 14521 } else { 14522 // CMOV 0, 1, ==, (CMPZ x, y) -> 14523 // (ADDCARRY (SUB x, y), t:0, t:1) 14524 // where t = (SUBCARRY 0, (SUB x, y), 0) 14525 // 14526 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when 14527 // x != y. In other words, a carry C == 1 when x == y, C == 0 14528 // otherwise. 14529 // The final ADDCARRY computes 14530 // x - y + (0 - (x - y)) + C == C 14531 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 14532 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 14533 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); 14534 // ISD::SUBCARRY returns a borrow but we want the carry here 14535 // actually. 14536 SDValue Carry = 14537 DAG.getNode(ISD::SUB, dl, MVT::i32, 14538 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); 14539 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); 14540 } 14541 } else if (CC == ARMCC::NE && !isNullConstant(RHS) && 14542 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { 14543 // This seems pointless but will allow us to combine it further below. 
14544 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 14545 SDValue Sub = 14546 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 14547 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 14548 Sub.getValue(1), SDValue()); 14549 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, 14550 N->getOperand(3), CPSRGlue.getValue(1)); 14551 FalseVal = Sub; 14552 } 14553 } else if (isNullConstant(TrueVal)) { 14554 if (CC == ARMCC::EQ && !isNullConstant(RHS) && 14555 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { 14556 // This seems pointless but will allow us to combine it further below 14557 // Note that we change == for != as this is the dual for the case above. 14558 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 14559 SDValue Sub = 14560 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 14561 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 14562 Sub.getValue(1), SDValue()); 14563 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, 14564 DAG.getConstant(ARMCC::NE, dl, MVT::i32), 14565 N->getOperand(3), CPSRGlue.getValue(1)); 14566 FalseVal = Sub; 14567 } 14568 } 14569 14570 // On Thumb1, the DAG above may be further combined if z is a power of 2 14571 // (z == 2 ^ K). 14572 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> 14573 // t1 = (USUBO (SUB x, y), 1) 14574 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) 14575 // Result = if K != 0 then (SHL t2:0, K) else t2:0 14576 // 14577 // This also handles the special case of comparing against zero; it's 14578 // essentially, the same pattern, except there's no SUBS: 14579 // CMOV x, z, !=, (CMPZ x, 0) -> 14580 // t1 = (USUBO x, 1) 14581 // t2 = (SUBCARRY x, t1:0, t1:1) 14582 // Result = if K != 0 then (SHL t2:0, K) else t2:0 14583 const APInt *TrueConst; 14584 if (Subtarget->isThumb1Only() && CC == ARMCC::NE && 14585 ((FalseVal.getOpcode() == ARMISD::SUBS && 14586 FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || 14587 (FalseVal == LHS && isNullConstant(RHS))) && 14588 (TrueConst = isPowerOf2Constant(TrueVal))) { 14589 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 14590 unsigned ShiftAmount = TrueConst->logBase2(); 14591 if (ShiftAmount) 14592 TrueVal = DAG.getConstant(1, dl, VT); 14593 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); 14594 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); 14595 14596 if (ShiftAmount) 14597 Res = DAG.getNode(ISD::SHL, dl, VT, Res, 14598 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 14599 } 14600 14601 if (Res.getNode()) { 14602 KnownBits Known = DAG.computeKnownBits(SDValue(N,0)); 14603 // Capture demanded bits information that would be otherwise lost. 
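    // E.g. (a sketch): when the combined value is known to be 0 or 1,
    // wrapping it as (AssertZext Res, i1) lets later combines remove
    // redundant zero-extensions and masks around the CMOV's result.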
14604 if (Known.Zero == 0xfffffffe) 14605 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14606 DAG.getValueType(MVT::i1)); 14607 else if (Known.Zero == 0xffffff00) 14608 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14609 DAG.getValueType(MVT::i8)); 14610 else if (Known.Zero == 0xffff0000) 14611 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14612 DAG.getValueType(MVT::i16)); 14613 } 14614 14615 return Res; 14616} 14617 14618SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 14619 DAGCombinerInfo &DCI) const { 14620 switch (N->getOpcode()) { 14621 default: break; 14622 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 14623 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 14624 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 14625 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 14626 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); 14627 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 14628 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 14629 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 14630 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 14631 case ISD::BRCOND: 14632 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); 14633 case ARMISD::ADDC: 14634 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 14635 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 14636 case ARMISD::BFI: return PerformBFICombine(N, DCI); 14637 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 14638 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 14639 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); 14640 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 14641 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 14642 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 14643 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); 14644 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); 14645 case ISD::FP_TO_SINT: 14646 case ISD::FP_TO_UINT: 14647 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 14648 case ISD::FDIV: 14649 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 14650 case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); 14651 case ISD::SHL: 14652 case ISD::SRA: 14653 case ISD::SRL: 14654 return PerformShiftCombine(N, DCI, Subtarget); 14655 case ISD::SIGN_EXTEND: 14656 case ISD::ZERO_EXTEND: 14657 case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); 14658 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 14659 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 14660 case ISD::LOAD: return PerformLOADCombine(N, DCI); 14661 case ARMISD::VLD1DUP: 14662 case ARMISD::VLD2DUP: 14663 case ARMISD::VLD3DUP: 14664 case ARMISD::VLD4DUP: 14665 return PerformVLDCombine(N, DCI); 14666 case ARMISD::BUILD_VECTOR: 14667 return PerformARMBUILD_VECTORCombine(N, DCI); 14668 case ARMISD::PREDICATE_CAST: 14669 return PerformPREDICATE_CASTCombine(N, DCI); 14670 case ARMISD::VCMP: 14671 return PerformVCMPCombine(N, DCI, Subtarget); 14672 case ARMISD::SMULWB: { 14673 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14674 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 14675 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 14676 return SDValue(); 14677 break; 14678 } 14679 case ARMISD::SMULWT: { 14680 unsigned 
  case ARMISD::SMULWT: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBB:
  case ARMISD::QADD16b:
  case ARMISD::QSUB16b: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBT: {
    unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTB: {
    unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTT: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::QADD8b:
  case ARMISD::QSUB8b: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld1x2:
    case Intrinsic::arm_neon_vld1x3:
    case Intrinsic::arm_neon_vld1x4:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vld2dup:
    case Intrinsic::arm_neon_vld3dup:
    case Intrinsic::arm_neon_vld4dup:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst1x2:
    case Intrinsic::arm_neon_vst1x3:
    case Intrinsic::arm_neon_vst1x4:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
    default: break;
    }
    break;
  }
  return SDValue();
}
bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}

bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
                                                       unsigned Alignment,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
  // Depends what it gets converted into if the type is weird.
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  auto Ty = VT.getSimpleVT().SimpleTy;

  if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
    // Unaligned access can use (for example) LDRB, LDRH, LDR.
    if (AllowsUnaligned) {
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
  }

  if (Ty == MVT::f64 || Ty == MVT::v2f64) {
    // For any little-endian targets with NEON, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses.
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
  }

  if (!Subtarget->hasMVEIntegerOps())
    return false;

  // These are for predicates.
  if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // These are for truncated stores/narrowing loads. They are fine so long as
  // the alignment is at least the size of the item being loaded.
  if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
      Alignment >= VT.getScalarSizeInBits() / 8) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
  // VSTRW.U32 all store the vector register in exactly the same format, and
  // differ only in the range of their immediate offset field and the required
  // alignment. So there is always a store that can be used, regardless of
  // actual type.
  //
  // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
  // VREV64.8) pair and get the same effect. This will likely be better than
  // aligning the vector through the stack.
  if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
      Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
      Ty == MVT::v2f64) {
    if (Fast)
      *Fast = true;
    return true;
  }

  return false;
}
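// Illustrative (hypothetical) use of the hook above: callers pass an optional
// out-parameter to learn whether the unaligned access is also expected to be
// fast, e.g.
//   bool Fast = false;
//   if (TLI.allowsMisalignedMemoryAccesses(MVT::i32, /*AddrSpace=*/0,
//                                          /*Alignment=*/1,
//                                          MachineMemOperand::MONone, &Fast) &&
//       Fast) {
//     // ... prefer a single unaligned LDR over a byte-wise expansion ...
//   }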
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}

EVT ARMTargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  // See if we can use NEON instructions for this...
  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
    bool Fast;
    if (Size >= 16 &&
        (memOpAlign(SrcAlign, DstAlign, 16) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
                                         MachineMemOperand::MONone, &Fast) &&
          Fast))) {
      return MVT::v2f64;
    } else if (Size >= 8 &&
               (memOpAlign(SrcAlign, DstAlign, 8) ||
                (allowsMisalignedMemoryAccesses(
                     MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}

// 64-bit integers are split into their high and low parts and held in two
// different registers, so the trunc is free since the low register can just
// be used.
bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
    return false;
  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
  unsigned DestBits = DstTy->getPrimitiveSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
      !DstVT.isInteger())
    return false;
  unsigned SrcBits = SrcVT.getSizeInBits();
  unsigned DestBits = DstVT.getSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  EVT VT1 = Val.getValueType();
  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
    return true;
  }

  return false;
}

bool ARMTargetLowering::isFNegFree(EVT VT) const {
  if (!VT.isSimple())
    return false;

  // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
  // negate values directly (fneg is free). So, we don't want to let the DAG
  // combiner rewrite fneg into xors and some other instructions. For f16 and
  // FullFP16 argument passing, some bitcast nodes may be introduced,
  // triggering this DAG combine rewrite, so we are avoiding that with this.
  switch (VT.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::f16:
    return Subtarget->hasFullFP16();
  }

  return false;
}

/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  auto areExtDoubled = [](Instruction *Ext) {
    return Ext->getType()->getScalarSizeInBits() ==
           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  };

  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
      !areExtDoubled(cast<Instruction>(Ext1)) ||
      !areExtDoubled(cast<Instruction>(Ext2)))
    return false;

  return true;
}
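// A minimal IR illustration (not from a specific test) of the shape accepted
// by areExtractExts: both operands are extends that double the element width,
//   %xx = sext <8 x i8> %x to <8 x i16>
//   %yy = sext <8 x i8> %y to <8 x i16>
//   %d  = sub <8 x i16> %xx, %yy
// On NEON, keeping the extends next to the sub lets ISel select a single
// widening vsubl.s8 instead of extending both inputs separately.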
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// sext/zext can be folded into vsubl.
bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  if (!I->getType()->isVectorTy())
    return false;

  if (Subtarget->hasNEON()) {
    switch (I->getOpcode()) {
    case Instruction::Sub:
    case Instruction::Add: {
      if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
        return false;
      Ops.push_back(&I->getOperandUse(0));
      Ops.push_back(&I->getOperandUse(1));
      return true;
    }
    default:
      return false;
    }
  }

  if (!Subtarget->hasMVEIntegerOps())
    return false;

  auto IsSinker = [](Instruction *I, int Operand) {
    switch (I->getOpcode()) {
    case Instruction::Add:
    case Instruction::Mul:
    case Instruction::ICmp:
      return true;
    case Instruction::Sub:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
      return Operand == 1;
    default:
      return false;
    }
  };

  int Op = 0;
  if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
    Op = 1;
  if (!IsSinker(I, Op))
    return false;
  if (!match(I->getOperand(Op),
             m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
                             m_Undef(), m_Zero()))) {
    return false;
  }
  Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
  // All uses of the shuffle should be sunk to avoid duplicating it across GPR
  // and vector registers.
  for (Use &U : Shuffle->uses()) {
    Instruction *Insn = cast<Instruction>(U.getUser());
    if (!IsSinker(Insn, U.getOperandNo()))
      return false;
  }
  Ops.push_back(&Shuffle->getOperandUse(0));
  Ops.push_back(&I->getOperandUse(Op));
  return true;
}
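// Minimal IR illustration of the MVE splat idiom matched above (hypothetical
// values):
//   %ins   = insertelement <4 x i32> undef, i32 %s, i32 0
//   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
//                          <4 x i32> zeroinitializer
//   %r     = add <4 x i32> %v, %splat
// Sinking the insert/shuffle next to the add lets ISel use the
// vector-by-scalar form (e.g. vadd.i32 q0, q1, r0) rather than keeping a
// splat alive in a Q register.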
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  EVT VT = ExtVal.getValueType();

  if (!isTypeLegal(VT))
    return false;

  if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
    if (Ld->isExpandingLoad())
      return false;
  }

  // Don't create a loadext if we can fold the extension into a wide/long
  // instruction.
  // If there's more than one user instruction, the loadext is desirable no
  // matter what. There can be two uses by the same instruction.
  if (ExtVal->use_empty() ||
      !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
    return true;

  SDNode *U = *ExtVal->use_begin();
  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
    return false;

  return true;
}

bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
    if (Subtarget->hasFPAO())
      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
    return 0;
  }
  return -1;
}

/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
/// expanded to fmul + fadd.
///
/// ARM supports both fused and unfused multiply-add operations; we already
/// lower a pair of fmul and fadd to the latter so it's not clear that there
/// would be a gain or that the gain would be worthwhile enough to risk
/// correctness bugs.
///
/// For MVE, we set this to true as it helps simplify the need for some
/// patterns (and we don't have the non-fused floating point instruction).
bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                   EVT VT) const {
  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4f32:
  case MVT::v8f16:
    return Subtarget->hasMVEFloatOps();
  case MVT::f16:
    return Subtarget->useFPVFMx16();
  case MVT::f32:
    return Subtarget->useFPVFMx();
  case MVT::f64:
    return Subtarget->useFPVFMx64();
  default:
    break;
  }

  return false;
}
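// Illustrative consequence of the hook above (assuming an MVE float target):
//   %r = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a,
//                                             <4 x float> %b, <4 x float> %c)
// is expanded to a single fused multiply-add (selected as an MVE VFMA)
// instead of a separate fmul + fadd pair.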
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  if (V < 0)
    return false;

  unsigned Scale = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
    // Scale == 1.
    break;
  case MVT::i16:
    // Scale == 2.
    Scale = 2;
    break;
  default:
    // On Thumb1 we load most things (i32, i64, floats, etc) with an LDR.
    // Scale == 4.
    Scale = 4;
    break;
  }

  if ((V & (Scale - 1)) != 0)
    return false;
  return isUInt<5>(V / Scale);
}

static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                      const ARMSubtarget *Subtarget) {
  if (!VT.isInteger() && !VT.isFloatingPoint())
    return false;
  if (VT.isVector() && Subtarget->hasNEON())
    return false;
  if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
      !Subtarget->hasMVEFloatOps())
    return false;

  bool IsNeg = false;
  if (V < 0) {
    IsNeg = true;
    V = -V;
  }

  unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);

  // MVE: size * imm7
  if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
    switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
    case MVT::i32:
    case MVT::f32:
      return isShiftedUInt<7, 2>(V);
    case MVT::i16:
    case MVT::f16:
      return isShiftedUInt<7, 1>(V);
    case MVT::i8:
      return isUInt<7>(V);
    default:
      return false;
    }
  }

  // half VLDR: 2 * imm8
  if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
    return isShiftedUInt<8, 1>(V);
  // VLDR and LDRD: 4 * imm8
  if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
    return isShiftedUInt<8, 2>(V);

  if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
    // + imm12 or - imm8
    if (IsNeg)
      return isUInt<8>(V);
    return isUInt<12>(V);
  }

  return false;
}

/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
static bool isLegalAddressImmediate(int64_t V, EVT VT,
                                    const ARMSubtarget *Subtarget) {
  if (V == 0)
    return true;

  if (!VT.isSimple())
    return false;

  if (Subtarget->isThumb1Only())
    return isLegalT1AddressImmediate(V, VT);
  else if (Subtarget->isThumb2())
    return isLegalT2AddressImmediate(V, VT, Subtarget);

  // ARM mode.
  if (V < 0)
    V = -V;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i32:
    // +- imm12
    return isUInt<12>(V);
  case MVT::i16:
    // +- imm8
    return isUInt<8>(V);
  case MVT::f32:
  case MVT::f64:
    if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
      return false;
    return isShiftedUInt<8, 2>(V);
  }
}
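// Worked examples for the helpers above (illustrative): in Thumb2, an i32
// load at offset 4092 is legal (+imm12) and at offset -200 is legal (-imm8),
// but an offset of -4000 is not and must be materialised separately. In ARM
// mode, an i16 access (the LDRH / AddressingMode 3 class) only gets a
// +/-imm8 offset.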
bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  int Scale = AM.Scale;
  if (Scale < 0)
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    if (Scale == 1)
      return true;
    // r + r << imm
    Scale = Scale & ~1;
    return Scale == 2 || Scale == 4 || Scale == 8;
  case MVT::i64:
    // FIXME: What are we trying to model here? ldrd doesn't have an r + r
    // version in Thumb mode.
    // r + r
    if (Scale == 1)
      return true;
    // r * 2 (this can be lowered to r + r).
    if (!AM.HasBaseReg && Scale == 2)
      return true;
    return false;
  case MVT::isVoid:
    // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because arm allows folding a scale into many arithmetic
    // operations. This should be made more precise and revisited later.

    // Allow r << imm, but the imm has to be a multiple of two.
    if (Scale & 1) return false;
    return isPowerOf2_32(Scale);
  }
}

bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  const int Scale = AM.Scale;

  // Negative scales are not supported in Thumb1.
  if (Scale < 0)
    return false;

  // Thumb1 addressing modes do not support register scaling, except in the
  // following cases:
  // 1. Scale == 1 means no scaling.
  // 2. Scale == 2: this can be lowered to r + r if there is no base register.
  return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  EVT VT = getValueType(DL, Ty, true);
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0: // no scale reg, must be "r+i" or "r", or "i".
    break;
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    if (!VT.isSimple())
      return false;

    if (Subtarget->isThumb1Only())
      return isLegalT1ScaledAddressingMode(AM, VT);

    if (Subtarget->isThumb2())
      return isLegalT2ScaledAddressingMode(AM, VT);

    int Scale = AM.Scale;
    switch (VT.getSimpleVT().SimpleTy) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
    case MVT::i64:
      // r +/- r
      if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
        return true;
      // r * 2 (this can be lowered to r + r).
      if (!AM.HasBaseReg && Scale == 2)
        return true;
      return false;

    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations. This should be made more precise and revisited later.

      // Allow r << imm, but the imm has to be a multiple of two.
      if (Scale & 1) return false;
      return isPowerOf2_32(Scale);
    }
  }
  return true;
}

/// isLegalICmpImmediate - Return true if the specified immediate is a legal
/// icmp immediate, that is, the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // Thumb2 and ARM modes can use cmn for negative immediates.
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
  // Thumb1 doesn't have cmn, and supports only 8-bit immediates.
  return Imm >= 0 && Imm <= 255;
}
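// Example (illustrative): in ARM/Thumb2 mode "icmp eq i32 %x, -10" is legal
// because it can be selected as "cmn r0, #10" (#10 is encodable even though
// #-10 is not); Thumb1 has no CMN, so it would have to materialise -10 first.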
/// isLegalAddImmediate - Return true if the specified immediate is a legal add
/// *or sub* immediate, that is, the target has add or sub instructions which
/// can add a register with the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Same encoding for add/sub, just flip the sign.
  int64_t AbsImm = std::abs(Imm);
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal(AbsImm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
  // Thumb1 only has 8-bit unsigned immediate.
  return AbsImm >= 0 && AbsImm <= 255;
}

static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -256) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        return true;
      }
    }
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      ARM_AM::ShiftOpc ShOpcVal =
        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  return false;
}

static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
                                     bool isSEXTLoad, SDValue &Base,
                                     SDValue &Offset, bool &isInc,
                                     SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  Base = Ptr->getOperand(0);
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
    int RHSC = (int)RHS->getZExtValue();
    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
      isInc = Ptr->getOpcode() == ISD::ADD;
      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    }
  }

  return false;
}
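// Illustrative outcome of the Thumb2 helper above: for an address of the form
// (add r1, #4) it reports Base = r1, Offset = #4, isInc = true, allowing ISel
// to form "ldr r0, [r1, #4]!" (pre-indexed), since 4 fits the 8-bit range;
// an offset of 0x100 or more is rejected and no indexed form is used.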
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
                                      bool isSEXTLoad, bool IsMasked, bool isLE,
                                      SDValue &Base, SDValue &Offset,
                                      bool &isInc, SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;
  if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
    return false;

  // We allow LE non-masked loads to change the type (for example use a vldrb.8
  // as opposed to a vldrw.32). This can allow extra addressing modes or
  // alignments for what is otherwise an equivalent instruction.
  bool CanChangeType = isLE && !IsMasked;

  ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
  int RHSC = (int)RHS->getZExtValue();

  auto IsInRange = [&](int RHSC, int Limit, int Scale) {
    if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
      isInc = Ptr->getOpcode() == ISD::ADD;
      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    }
    return false;
  };

  // Try to find a matching instruction based on s/zext, Alignment, Offset and
  // (in BE/masked) type.
  Base = Ptr->getOperand(0);
  if (VT == MVT::v4i16) {
    if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
      return true;
  } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
    if (IsInRange(RHSC, 0x80, 1))
      return true;
  } else if (Align >= 4 &&
             (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
             IsInRange(RHSC, 0x80, 4))
    return true;
  else if (Align >= 2 &&
           (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
           IsInRange(RHSC, 0x80, 2))
    return true;
  else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
    return true;
  return false;
}
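// Example of the size * imm7 rule above (illustrative): for a v4i32 access,
// an offset of 508 (= 127 * 4, with Align >= 4) is accepted for vldrw.u32,
// while 512 is out of range and 6 fails the multiple-of-4 requirement.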
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as a pre-indexed load / store address.
bool
ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                             SDValue &Offset,
                                             ISD::MemIndexedMode &AM,
                                             SelectionDAG &DAG) const {
  if (Subtarget->isThumb1Only())
    return false;

  EVT VT;
  SDValue Ptr;
  unsigned Align;
  bool isSEXTLoad = false;
  bool IsMasked = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Align = ST->getAlignment();
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Align = ST->getAlignment();
    IsMasked = true;
  } else
    return false;

  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
                                        IsMasked, Subtarget->isLittle(), Base,
                                        Offset, isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                         Offset, isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                          Offset, isInc, DAG);
  }
  if (!isLegal)
    return false;

  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}

/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  unsigned Align;
  bool isSEXTLoad = false, isNonExt;
  bool IsMasked = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    Align = ST->getAlignment();
    isNonExt = !ST->isTruncatingStore();
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    Align = ST->getAlignment();
    isNonExt = !ST->isTruncatingStore();
    IsMasked = true;
  } else
    return false;

  if (Subtarget->isThumb1Only()) {
    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
    // must be non-extending/truncating, i32, with an offset of 4.
    assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
    if (Op->getOpcode() != ISD::ADD || !isNonExt)
      return false;
    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!RHS || RHS->getZExtValue() != 4)
      return false;

    Offset = Op->getOperand(1);
    Base = Op->getOperand(0);
    AM = ISD::POST_INC;
    return true;
  }

  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked,
                                        Subtarget->isLittle(), Base, Offset,
                                        isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                         isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                          isInc, DAG);
  }
  if (!isLegal)
    return false;

  if (Ptr != Base) {
    // Swap base ptr and offset to catch more post-index load / store when
    // it's legal. In Thumb2 mode, offset must be an immediate.
    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
        !Subtarget->isThumb2())
      std::swap(Base, Offset);

    // Post-indexed load / store update the base pointer.
    if (Ptr != Base)
      return false;
  }

  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}
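// Net effect (illustrative): a "load, then ptr += 4" pair can be folded into
// one updating access, e.g. "ldr r0, [r1], #4" in ARM/Thumb2, or the limited
// Thumb1 updating-LDM form subject to the i32/offset-4 restrictions above.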
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // Special cases when we convert a carry to a boolean.
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
    if (Known.isUnknown())
      return;

    KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
    Known.Zero &= KnownRHS.Zero;
    Known.One &= KnownRHS.One;
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
    const APInt &Mask = CI->getAPIntValue();
    Known.Zero &= Mask;
    Known.One &= Mask;
    return;
  }
  case ARMISD::VGETLANEs:
  case ARMISD::VGETLANEu: {
    const SDValue &SrcSV = Op.getOperand(0);
    EVT VecVT = SrcSV.getValueType();
    assert(VecVT.isVector() && "VGETLANE expected a vector type");
    const unsigned NumSrcElts = VecVT.getVectorNumElements();
    ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
    assert(Pos->getAPIntValue().ult(NumSrcElts) &&
           "VGETLANE index out of bounds");
    unsigned Idx = Pos->getZExtValue();
    APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
    Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);

    EVT VT = Op.getValueType();
    const unsigned DstSz = VT.getScalarSizeInBits();
    const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
    (void)SrcSz;
    assert(SrcSz == Known.getBitWidth());
    assert(DstSz > SrcSz);
    if (Op.getOpcode() == ARMISD::VGETLANEs)
      Known = Known.sext(DstSz);
    else {
      Known = Known.zext(DstSz, true /* extended bits are known zero */);
    }
    assert(DstSz == Known.getBitWidth());
    break;
  }
  }
}
bool
ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
                                                const APInt &DemandedAPInt,
                                                TargetLoweringOpt &TLO) const {
  // Delay optimization, so we don't have to deal with illegal types, or block
  // optimizations.
  if (!TLO.LegalOps)
    return false;

  // Only optimize AND for now.
  if (Op.getOpcode() != ISD::AND)
    return false;

  EVT VT = Op.getValueType();

  // Ignore vectors.
  if (VT.isVector())
    return false;

  assert(VT == MVT::i32 && "Unexpected integer type");

  // Make sure the RHS really is a constant.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;

  unsigned Mask = C->getZExtValue();

  unsigned Demanded = DemandedAPInt.getZExtValue();
  unsigned ShrunkMask = Mask & Demanded;
  unsigned ExpandedMask = Mask | ~Demanded;

  // If the mask is all zeros, let the target-independent code replace the
  // result with zero.
  if (ShrunkMask == 0)
    return false;

  // If the mask is all ones, erase the AND. (Currently, the target-independent
  // code won't do this, so we have to do it explicitly to avoid an infinite
  // loop in obscure cases.)
  if (ExpandedMask == ~0U)
    return TLO.CombineTo(Op, Op.getOperand(0));

  auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
    return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
  };
  auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
    if (NewMask == Mask)
      return true;
    SDLoc DL(Op);
    SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
    SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
    return TLO.CombineTo(Op, NewOp);
  };

  // Prefer uxtb mask.
  if (IsLegalMask(0xFF))
    return UseMask(0xFF);

  // Prefer uxth mask.
  if (IsLegalMask(0xFFFF))
    return UseMask(0xFFFF);

  // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if (ShrunkMask < 256)
    return UseMask(ShrunkMask);

  // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
    return UseMask(ExpandedMask);

  // Potential improvements:
  //
  // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
  // We could try to prefer Thumb1 immediates which can be lowered to a
  // two-instruction sequence.
  // We could try to recognize more legal ARM/Thumb2 immediates here.

  return false;
}
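// Worked example (illustrative): take (and x, 0x2FF) where only the low nine
// bits are demanded. ShrunkMask = 0xFF and ExpandedMask = 0xFFFFFEFF, so
// IsLegalMask(0xFF) holds and the node is rewritten to (and x, 0xFF), which
// selects to a single uxtb.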
//===----------------------------------------------------------------------===//
// ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
  if (!Subtarget->hasV6Ops())
    return false;

  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::string AsmStr = IA->getAsmString();
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t,");

    // rev $0, $1
    if (AsmPieces.size() == 3 &&
        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (Ty && Ty->getBitWidth() == 32)
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  }

  return false;
}
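// Example (illustrative): given V6+ ops, the inline asm
//   int r;  __asm__("rev %0, %1" : "=l"(r) : "l"(x));
// is recognised above and lowered to @llvm.bswap.i32, letting the optimiser
// treat it like any other byte swap instead of an opaque asm blob.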
const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in register, while the X constraint is much more permissive.
  //
  // Although we are correct (we are free to emit anything, without
  // constraints), we might break use cases that would expect us to be more
  // efficient and emit something else.
  if (!Subtarget->hasVFP2Base())
    return "r";
  if (ConstraintVT.isFloatingPoint())
    return "w";
  if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
      (ConstraintVT.getSizeInBits() == 64 ||
       ConstraintVT.getSizeInBits() == 128))
    return "w";

  return "r";
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(StringRef Constraint) const {
  unsigned S = Constraint.size();
  if (S == 1) {
    switch (Constraint[0]) {
    default: break;
    case 'l': return C_RegisterClass;
    case 'w': return C_RegisterClass;
    case 'h': return C_RegisterClass;
    case 'x': return C_RegisterClass;
    case 't': return C_RegisterClass;
    case 'j': return C_Immediate; // Constant for movw.
    // An address with a single base register. Due to the way we
    // currently handle addresses it is the same as an 'r' memory constraint.
    case 'Q': return C_Memory;
    }
  } else if (S == 2) {
    switch (Constraint[0]) {
    default: break;
    case 'T': return C_RegisterClass;
    // All 'U+' constraints are addresses.
    case 'U': return C_Memory;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
ARMTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'l':
    if (type->isIntegerTy()) {
      if (Subtarget->isThumb())
        weight = CW_SpecificReg;
      else
        weight = CW_Register;
    }
    break;
  case 'w':
    if (type->isFloatingPointTy())
      weight = CW_Register;
    break;
  }
  return weight;
}
using RCPair = std::pair<unsigned, const TargetRegisterClass *>;

RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  switch (Constraint.size()) {
  case 1:
    // GCC ARM Constraint Letters
    switch (Constraint[0]) {
    case 'l': // Low regs or general regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'h': // High regs or no regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::hGPRRegClass);
      break;
    case 'r':
      if (Subtarget->isThumb1Only())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'w':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPRRegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPRRegClass);
      break;
    case 'x':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPR_8RegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_8RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_8RegClass);
      break;
    case 't':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32 || VT == MVT::i32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_VFP2RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_VFP2RegClass);
      break;
    }
    break;

  case 2:
    if (Constraint[0] == 'T') {
      switch (Constraint[1]) {
      default:
        break;
      case 'e':
        return RCPair(0U, &ARM::tGPREvenRegClass);
      case 'o':
        return RCPair(0U, &ARM::tGPROddRegClass);
      }
    }
    break;

  default:
    break;
  }

  if (StringRef("{cc}").equals_lower(Constraint))
    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
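// Illustrative mappings produced above (not exhaustive): on Thumb, "l" with
// i32 yields tGPR (the low registers); "w" yields SPR for f32 and QPR for a
// 128-bit vector; "t" with f32 yields SPR; and "{cc}" resolves to CPSR in the
// CCR register class.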
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'j':
  case 'I': case 'J': case 'K': case 'L':
  case 'M': case 'N': case 'O':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    int64_t CVal64 = C->getSExtValue();
    int CVal = (int) CVal64;
    // None of these constraints allow values larger than 32 bits. Check
    // that the value fits in an int.
    if (CVal != CVal64)
      return;

    switch (ConstraintLetter) {
      case 'j':
        // Constant suitable for movw, must be between 0 and
        // 65535.
        if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
          if (CVal >= 0 && CVal <= 65535)
            break;
        return;
      case 'I':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 255, for ADD
          // immediates.
          if (CVal >= 0 && CVal <= 255)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getT2SOImmVal(CVal) != -1)
            break;
        } else {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getSOImmVal(CVal) != -1)
            break;
        }
        return;

      case 'J':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -255 and -1, for negated ADD
          // immediates. This can be used in GCC with an "n" modifier that
          // prints the negated value, for use with SUB instructions. It is
          // not useful otherwise but is implemented for compatibility.
          if (CVal >= -255 && CVal <= -1)
            break;
        } else {
          // This must be a constant between -4095 and 4095. It is not clear
          // what this constraint is intended for. Implemented for
          // compatibility with GCC.
          if (CVal >= -4095 && CVal <= 4095)
            break;
        }
        return;

      case 'K':
        if (Subtarget->isThumb1Only()) {
          // A 32-bit value where only one byte has a nonzero value. Exclude
          // zero to match GCC. This constraint is used by GCC internally for
          // constants that can be loaded with a move/shift combination.
          // It is not useful otherwise but is implemented for compatibility.
          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
            break;
        } else {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getSOImmVal(~CVal) != -1)
            break;
        }
        return;

      case 'L':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -7 and 7,
          // for 3-operand ADD/SUB immediate instructions.
          if (CVal >= -7 && CVal < 7)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
            break;
        } else {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getSOImmVal(-CVal) != -1)
            break;
        }
        return;

      case 'M':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between 0 and 1020, for
          // ADD sp + immediate.
          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
            break;
        } else {
          // A power of two or a constant between 0 and 32. This is used in
          // GCC for the shift amount on shifted register operands, but it is
          // useful in general for any shift amounts.
          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
            break;
        }
        return;

      case 'N':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 31, for shift amounts.
          if (CVal >= 0 && CVal <= 31)
            break;
        }
        return;

      case 'O':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between -508 and 508, for
          // ADD/SUB sp = sp + immediate.
          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
            break;
        }
        return;
    }
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
static RTLIB::Libcall getDivRemLibcall(
    const SDNode *N, MVT::SimpleValueType SVT) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemLibcall");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  RTLIB::Libcall LC;
  switch (SVT) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
  case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
  case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
  case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  }
  return LC;
}

static TargetLowering::ArgListTy getDivRemArgList(
    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemArgList");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    EVT ArgVT = N->getOperand(i).getValueType();
    Type *ArgTy = ArgVT.getTypeForEVT(*Context);
    Entry.Node = N->getOperand(i);
    Entry.Ty = ArgTy;
    Entry.IsSExt = isSigned;
    Entry.IsZExt = !isSigned;
    Args.push_back(Entry);
  }
  if (Subtarget->isTargetWindows() && Args.size() >= 2)
    std::swap(Args[0], Args[1]);
  return Args;
}
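// Background for the two helpers above (illustrative): the RTABI divmod
// helpers, e.g. __aeabi_idivmod for i32, return the quotient/remainder pair
// in {r0, r1}. The argument swap on Windows accounts for the MSVC runtime's
// division helpers taking their operands in the opposite order from the
// AEABI (numerator, denominator) convention.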
SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
          Subtarget->isTargetWindows()) &&
         "Register-based DivRem lowering only");
  unsigned Opcode = Op->getOpcode();
  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
         "Invalid opcode for Div/Rem lowering");
  bool isSigned = (Opcode == ISD::SDIVREM);
  EVT VT = Op->getValueType(0);
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  SDLoc dl(Op);

  // If the target has hardware divide, use divide + multiply + subtract:
  //     div = a / b
  //     rem = a - b * div
  //     return {div, rem}
  // This should be lowered into UDIV/SDIV + MLS later on.
  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (hasDivide && Op->getValueType(0).isSimple() &&
      Op->getSimpleValueType(0) == MVT::i32) {
    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
    const SDValue Dividend = Op->getOperand(0);
    const SDValue Divisor = Op->getOperand(1);
    SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
    SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);

    SDValue Values[2] = {Div, Rem};
    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
  }

  RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
                                       VT.getSimpleVT().SimpleTy);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
                                                    DAG.getContext(),
                                                    Subtarget);

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  Type *RetTy = StructType::get(Ty, Ty);

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(InChain)
    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return CallInfo.first;
}

// Lowers REM using divmod helpers
// see RTABI section 4.2/4.3
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  // Build return types (div and rem)
  std::vector<Type*> RetTyParams;
  Type *RetTyElement;

  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
  case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  }

  RetTyParams.push_back(RetTyElement);
  RetTyParams.push_back(RetTyElement);
  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  Type *RetTy = StructType::get(*DAG.getContext(), ret);

  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
                                             SimpleTy);
  SDValue InChain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
                                                    Subtarget);
  bool isSigned = N->getOpcode() == ISD::SREM;
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, N, InChain);

  // Lower call
  CallLoweringInfo CLI(DAG);
  CLI.setChain(InChain)
     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Return second (rem) result operand (first contains div)
  SDNode *ResNode = CallResult.first.getNode();
  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  return ResNode->getOperand(1);
}
// Lowers REM using divmod helpers
// see RTABI section 4.2/4.3
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  // Build return types (div and rem)
  std::vector<Type*> RetTyParams;
  Type *RetTyElement;

  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
  case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  }

  RetTyParams.push_back(RetTyElement);
  RetTyParams.push_back(RetTyElement);
  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  Type *RetTy = StructType::get(*DAG.getContext(), ret);

  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
                                          SimpleTy);
  SDValue InChain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
                                                    Subtarget);
  bool isSigned = N->getOpcode() == ISD::SREM;
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, N, InChain);

  // Lower call
  CallLoweringInfo CLI(DAG);
  CLI.setChain(InChain)
     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Return second (rem) result operand (first contains div)
  SDNode *ResNode = CallResult.first.getNode();
  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  return ResNode->getOperand(1);
}

SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "unsupported target platform");
  SDLoc DL(Op);

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);

  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
          "no-stack-arg-probe")) {
    unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
    Chain = SP.getValue(1);
    SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
    if (Align)
      SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
    Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
    SDValue Ops[2] = { SP, Chain };
    return DAG.getMergeValues(Ops, DL);
  }

  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
                              DAG.getConstant(2, DL, MVT::i32));

  SDValue Flag;
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);

  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  Chain = NewSP.getValue(1);

  SDValue Ops[2] = { NewSP, Chain };
  return DAG.getMergeValues(Ops, DL);
}
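// Note on the probing path above (illustrative summary of the MSVC
// convention): Windows on ARM expects the requested allocation size in
// words, hence the SRL by 2, passed to __chkstk in r4. The WIN__CHKSTK
// pseudo expands to that call plus the actual stack-pointer adjustment,
// which is why the new SP is simply read back afterwards.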
SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
  assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
         "Unexpected type for custom-lowering FP_EXTEND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
         "With FP16, 16 to 32 conversion is legal!");

  // Converting from 32 -> 64 is valid if we have FP64.
  if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
    // FIXME: Remove this when we have strict fp instruction selection patterns
    if (IsStrict) {
      SDLoc Loc(Op);
      SDValue Result = DAG.getNode(ISD::FP_EXTEND,
                                   Loc, Op.getValueType(), SrcVal);
      return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
    }
    return Op;
  }

  // Either we are converting from 16 -> 64 without FP16 and/or
  // double-precision support, so we must do it in two steps; or we are
  // converting from 32 -> 64 without double-precision, or from 16 -> 32
  // without FP16, so we must use a libcall.
  SDLoc Loc(Op);
  RTLIB::Libcall LC;
  MakeLibCallOptions CallOptions;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
    bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
    MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
    MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
    if (Supported) {
      if (IsStrict) {
        SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
                             {DstVT, MVT::Other}, {Chain, SrcVal});
        Chain = SrcVal.getValue(1);
      } else {
        SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
      }
    } else {
      LC = RTLIB::getFPEXT(SrcVT, DstVT);
      assert(LC != RTLIB::UNKNOWN_LIBCALL &&
             "Unexpected type for custom-lowering FP_EXTEND");
      std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
                                            Loc, Chain);
    }
  }

  return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
}
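// For example, an f16 -> f64 extend on a target with FP16 but without FP64
// goes through the loop above in two steps (illustrative): first a single
// f16 -> f32 extend instruction (e.g. VCVTB.F32.F16), then an f32 -> f64
// libcall selected via RTLIB::getFPEXT (__aeabi_f2d on AEABI targets).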
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();

  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = SrcVal.getValueType();
  EVT DstVT = Op.getValueType();
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVT.getSizeInBits();
  (void)DstSz;
  assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
         "Unexpected type for custom-lowering FP_ROUND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  SDLoc Loc(Op);

  // A 32 -> 16 round is a single instruction if we have FP16.
  if (SrcSz == 32 && Subtarget->hasFP16())
    return Op;

  // Otherwise we need a libcall: 32 -> 16, or 64 -> [32, 16].
  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
         "Unexpected type for custom-lowering FP_ROUND");
  MakeLibCallOptions CallOptions;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue Result;
  std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
                                        Loc, Chain);
  return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
}

void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) const {
  assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
  MVT HalfT = MVT::i32;
  SDLoc dl(N);
  SDValue Hi, Lo, Tmp;

  if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
      !isOperationLegalOrCustom(ISD::UADDO, HalfT))
    return;

  unsigned OpTypeBits = HalfT.getScalarSizeInBits();
  SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);

  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(0, dl, HalfT));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(1, dl, HalfT));

  // Broadcast the sign bit of the operand, add it to both halves (with
  // carry), then XOR it back out: abs(x) == (x + (x >> 63)) ^ (x >> 63).
  Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
                    DAG.getConstant(OpTypeBits - 1, dl,
                                    getShiftAmountTy(HalfT, DAG.getDataLayout())));
  Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
  Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
                   SDValue(Lo.getNode(), 1));
  Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
  Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);

  Results.push_back(Lo);
  Results.push_back(Hi);
}
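// Worked example of the identity above (illustrative): for x = -5,
// sign = x >> 63 = -1 (all ones), so x + sign = -6 and -6 ^ -1 = 5.
// For non-negative x, sign = 0 and both the add and the xor are no-ops.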
bool
ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The ARM target isn't yet aware of offsets.
  return false;
}

bool ARM::isBitFieldInvertedMask(unsigned v) {
  if (v == 0xffffffff)
    return false;

  // There can be 1's on either or both "outsides"; all the "inside"
  // bits must be 0's.
  return isShiftedMask_32(~v);
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  if (!Subtarget->hasVFP3Base())
    return false;
  if (VT == MVT::f16 && Subtarget->hasFullFP16())
    return ARM_AM::getFP16Imm(Imm) != -1;
  if (VT == MVT::f32)
    return ARM_AM::getFP32Imm(Imm) != -1;
  if (VT == MVT::f64 && Subtarget->hasFP64())
    return ARM_AM::getFP64Imm(Imm) != -1;
  return false;
}
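// Examples for the two predicates above (illustrative): 0xf00000ff is an
// inverted bit-field mask, since ~0xf00000ff == 0x0fffff00 is a single
// contiguous run of ones (BFC/BFI territory), whereas 0xff00ff00 is not.
// For FP immediates, 1.0f and -0.5f fit the VFPv3 VMOV immediate encoding,
// while 0.1f does not and is loaded from a constant pool instead.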
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;

  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  default:
    break;
  }

  return false;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}

bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}
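// For example (illustrative), extracting either half of a <4 x i32> as a
// <2 x i32> is considered cheap here (Index 0 or Index 2): on NEON each
// half of a Q register is directly addressable as a D register
// (Q0 == {D0, D1}), so no shuffle is needed.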
Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 CPUs can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0), Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    } else {
      // Instead of using barriers, atomic accesses on these subtargets use
      // libcalls.
      llvm_unreachable("makeDMB on a target so old that it has no barriers");
    }
  } else {
    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}

// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}

Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}
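// Taken together, emitLeadingFence/emitTrailingFence implement the standard
// C++11-to-ARM mapping from the page cited above (illustrative):
//
//   load  monotonic:  ldr
//   load  acquire:    ldr; dmb ish
//   load  seq_cst:    ldr; dmb ish
//   store release:    dmb ish; str
//   store seq_cst:    dmb ish; str; dmb ish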
// Loads and stores less than 64 bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return (Size == 64) && !Subtarget->isMClass();
}

// Loads and stores less than 64 bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}
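// An AtomicExpansionKind::LLSC expansion of, say,
// "atomicrmw add i32* %p, i32 1 seq_cst" ultimately becomes a load-linked/
// store-conditional retry loop (illustrative; seq_cst additionally gets the
// dmb fences from the hooks above):
//
//   .Ltry:
//     ldrex r1, [r0]        @ load-linked the current value
//     add   r2, r1, #1      @ apply the RMW operation
//     strex r3, r2, [r0]    @ try to store; r3 == 0 on success
//     cmp   r3, #0
//     bne   .Ltry           @ lost the exclusive monitor; retry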
bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}

void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    F->addAttribute(1, Attribute::AttrKind::InReg);
}

Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating-point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, it is
  // better to leave those as floats, since we have more freedom in the
  // addressing modes for them.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
  // We can do a store + vector extract on any vector that fits perfectly in a
  // D or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}
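// For example (illustrative), when canCombineStoreAndExtract returns true for
// a <4 x i32> with a constant index, a single one-lane store such as
//
//   vst1.32 {d0[1]}, [r0]
//
// can be formed instead of moving the lane to a core register and doing an
// integer store.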
bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
}

Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldrex, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}

void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}

bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}
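// For example, <16 x i32> is 512 bits wide, so (512 + 127) / 128 == 4
// interleaved accesses are needed, while <8 x i16> (128 bits) needs one.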
bool ARMTargetLowering::isLegalInterleavedAccessType(
    unsigned Factor, VectorType *VecTy, const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
    return false;

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
    return false;
  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger
  // than 128 will be split into multiple interleaved accesses.
  if (Subtarget->hasNEON() && VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}

unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  if (Subtarget->hasMVEIntegerOps())
    return MVEMaxSupportedInterleaveFactor;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}
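// For example (illustrative), a factor-2 interleaved access with <8 x i16>
// sub-vectors (128 bits) is legal on NEON and becomes a single
// @llvm.arm.neon.vld2 / @llvm.arm.neon.vst2 call, while <16 x i32>
// sub-vectors (512 bits) are still legal but are split by the lowerings
// below into four 128-bit accesses via getNumInterleavedAccesses().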
/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  VectorType *VecTy = Shuffles[0]->getType();
  Type *EltTy = VecTy->getVectorElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy =
        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = VectorType::get(VecTy->getVectorElementType(),
                            VecTy->getVectorNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                      LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, Int8Ptr};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      Ops.push_back(Builder.getInt32(LI->getAlignment()));

      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(
          LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, VecEltTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr =
          Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
                                     VecTy->getVectorNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, VectorType::get(SV->getType()->getVectorElementType(),
                                    VecTy->getVectorNumElements()));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}
/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    Type *IntVecTy =
        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                      SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {Int8Ptr, SubVecTy};

      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      for (auto S : Shuffles)
        Ops.push_back(S);
      Ops.push_back(Builder.getInt32(SI->getAlignment()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(
          SI->getPointerAddressSpace());
      Type *Tys[] = {EltPtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
      for (auto S : Shuffles)
        Ops.push_back(S);
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we default to using elements from 0.
        // Note: StartMask cannot be negative; it's checked in
        // isReInterleaveMask.
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getBitWidth() == 64;
    case HA_VECT128:
      return VT->getBitWidth() == 128;
    case HA_UNKNOWN:
      switch (VT->getBitWidth()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
                                                       DataLayout DL) const {
  const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy));
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(ABITypeAlign, DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}
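// For example (illustrative), under AAPCS-VFP:
//   struct S2f  { float x, y; };          // HA: Base=HA_FLOAT, Members=2
//   struct S4d  { double d[4]; };         // HA: Base=HA_DOUBLE, Members=4
//   struct SMix { double d; float f; };   // not an HA: mixed base types
//   struct S5f  { float f[5]; };          // not an HA: more than 4 members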
unsigned ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
}

unsigned ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}